/* NEON optimized code (C) COPYRIGHT 2009 Motorola
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <arm_neon.h>

#include "SkBitmapProcState.h"
#include "SkPerspIter.h"
#include "SkShader.h"
#include "SkUtilsArm.h"

1280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queruextern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
1380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queruextern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];
1480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
1580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
1680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
1780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
/* Logical (zero-filling) right shift by 16 bits. Taking an unsigned argument
 * guarantees the vacated high bits are zero, whatever the sign of the value
 * the caller started from. Used by the repeat-mode TILE*_PROCF macros. */
static inline unsigned SK_USHIFT16(unsigned x) {
    return x >> 16;
}

2280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define MAKENAME(suffix)        ClampX_ClampY ## suffix ## _neon
2380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
2480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
2580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
2680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
2780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define CHECK_FOR_DECAL
2880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include "SkBitmapProcState_matrix_clamp_neon.h"
2980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
3080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix ## _neon
3180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEX_PROCF(fx, max)    SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
3280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEY_PROCF(fy, max)    SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
3380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
3480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
3580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include "SkBitmapProcState_matrix_repeat_neon.h"
3680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
3780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
3880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queruvoid decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
3980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru{
4080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    int i;
4180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
4280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    if (count >= 8) {
4380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        /* SkFixed is 16.16 fixed point */
4480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        SkFixed dx2 = dx+dx;
4580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        SkFixed dx4 = dx2+dx2;
4680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        SkFixed dx8 = dx4+dx4;
4780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
4880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        /* now build fx/fx+dx/fx+2dx/fx+3dx */
4980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        SkFixed fx1, fx2, fx3;
5080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        int32x2_t lower, upper;
5180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        int32x4_t lbase, hbase;
5280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        uint16_t *dst16 = (uint16_t *)dst;
5380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
5480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        fx1 = fx+dx;
5580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        fx2 = fx1+dx;
5680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        fx3 = fx2+dx;
5780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
5880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        /* avoid an 'lbase unitialized' warning */
5980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        lbase = vdupq_n_s32(fx);
6080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        lbase = vsetq_lane_s32(fx1, lbase, 1);
6180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        lbase = vsetq_lane_s32(fx2, lbase, 2);
6280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        lbase = vsetq_lane_s32(fx3, lbase, 3);
6380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
6480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
6580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        /* take upper 16 of each, store, and bump everything */
6680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        do {
6780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            int32x4_t lout, hout;
6880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            uint16x8_t hi16;
6980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
7080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            lout = lbase;
7180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            hout = hbase;
7280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            /* gets hi's of all louts then hi's of all houts */
7380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
7480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            hi16 = vreinterpretq_u16_s32(hout);
7580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            vst1q_u16(dst16, hi16);
7680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
7780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            /* on to the next */
7880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
7980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
8080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            dst16 += 8;
8180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            count -= 8;
8280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            fx += dx8;
8380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        } while (count >= 8);
8480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        dst = (uint32_t *) dst16;
8580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    }
8680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
8780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    uint16_t* xx = (uint16_t*)dst;
8880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    for (i = count; i > 0; --i) {
8980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        *xx++ = SkToU16(fx >> 16); fx += dx;
9080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    }
9180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru}
9280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
9380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queruvoid decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
9480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru{
9580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    if (count >= 8) {
9680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        int32x4_t wide_fx;
9780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        int32x4_t wide_fx2;
9880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
9980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
10080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        wide_fx = vdupq_n_s32(fx);
10180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
10280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
10380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
10480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
10580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
10680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
10780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        while (count >= 8) {
10880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            int32x4_t wide_out;
10980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            int32x4_t wide_out2;
11080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
11180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
11280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            wide_out = vorrq_s32(wide_out,
11380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
11480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
11580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
11680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            wide_out2 = vorrq_s32(wide_out2,
11780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
11880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
11980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
12080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
12180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
12280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            dst += 8;
12380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            fx += dx*8;
12480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            wide_fx = vaddq_s32(wide_fx, wide_dx8);
12580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
12680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru            count -= 8;
12780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        }
12880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    }
12980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
13080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    if (count & 1)
13180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    {
13280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        SkASSERT((fx >> (16 + 14)) == 0);
13380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
13480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        fx += dx;
13580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    }
13680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    while ((count -= 2) >= 0)
13780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    {
13880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        SkASSERT((fx >> (16 + 14)) == 0);
13980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
14080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        fx += dx;
14180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru
14280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
14380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru        fx += dx;
14480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru    }
14580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru}
146