19f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o/* NEON optimized code (C) COPYRIGHT 2009 Motorola 29f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o * 39f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o * Use of this source code is governed by a BSD-style license that can be 49f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o * found in the LICENSE file. 59f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o */ 69f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 79f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#include "SkBitmapProcState.h" 89f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#include "SkPerspIter.h" 99f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#include "SkShader.h" 109f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#include "SkUtilsArm.h" 119f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#include "SkBitmapProcState_utils.h" 129f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 139f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#include <arm_neon.h> 149f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 159f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'oextern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; 169f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'oextern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; 179f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 189f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'ostatic void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 19797f5ef14e92294b329e52971d467d7af5b2993eTheodore Ts'ostatic void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 209f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 219f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o// TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 229f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'ostatic inline int16x8_t sbpsm_clamp_tile8(int32x4_t low, int32x4_t high, unsigned max) { 239f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o int16x8_t res; 249f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 259f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o // get the hi 16s of all those 32s 269f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o res = vuzpq_s16(vreinterpretq_s16_s32(low), vreinterpretq_s16_s32(high)).val[1]; 279f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 289f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o // clamp 299f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o res = vmaxq_s16(res, vdupq_n_s16(0)); 309f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o res = vminq_s16(res, vdupq_n_s16(max)); 319f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 329f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o return res; 339f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o} 349f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 359f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o// TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 369f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'ostatic inline int32x4_t sbpsm_clamp_tile4(int32x4_t f, unsigned max) { 379f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o int32x4_t res; 389f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 399f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o // get the hi 16s of all those 32s 409f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o res = vshrq_n_s32(f, 16); 419f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 429f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o // clamp 430fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger res = vmaxq_s32(res, vdupq_n_s32(0)); 449f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o res = vminq_s32(res, vdupq_n_s32(max)); 450fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger 469f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o return res; 47e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall} 48e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 499f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o// TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 509f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'ostatic inline int32x4_t sbpsm_clamp_tile4_low_bits(int32x4_t fx) { 519f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o int32x4_t ret; 529f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 536d0ed6780285e79360353d4b06b339c00712d754Theodore Ts'o ret = vshrq_n_s32(fx, 12); 54e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 559f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o /* We don't need the mask below because the caller will 569f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o * overwrite the non-masked bits 579f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o */ 589f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o //ret = vandq_s32(ret, vdupq_n_s32(0xF)); 599f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 609f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o return ret; 619f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o} 629f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 639f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) 640fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilgerstatic inline int16x8_t sbpsm_repeat_tile8(int32x4_t low, int32x4_t high, unsigned max) { 650fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger uint16x8_t res; 660fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger uint32x4_t tmpl, tmph; 670fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger 680fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger // get the lower 16 bits 690fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger res = vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).val[0]; 700fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger 710fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger // bare multiplication, not SkFixedMul 720fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger tmpl = vmull_u16(vget_low_u16(res), vdup_n_u16(max+1)); 739f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o tmph = vmull_u16(vget_high_u16(res), vdup_n_u16(max+1)); 749f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 759f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o // extraction of the 16 upper bits 76bec6f49a46ec78a4c5928a22214848a7ea654704Theodore Ts'o res = vuzpq_u16(vreinterpretq_u16_u32(tmpl), vreinterpretq_u16_u32(tmph)).val[1]; 779f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 789f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o return vreinterpretq_s16_u16(res); 799f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o} 809f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 81932a489cdf6bc83d69e59d3f8e0a57b733799ce1Andreas Dilger// TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) 8239dc1c45cb41ce37a56d364103bb852d0b62c835Theodore Ts'ostatic inline int32x4_t sbpsm_repeat_tile4(int32x4_t f, unsigned max) { 837da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger uint16x4_t res; 847da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger uint32x4_t tmp; 859f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 867da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger // get the lower 16 bits 877da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger res = vmovn_u32(vreinterpretq_u32_s32(f)); 880fbd1a8a9a2ef697927f886559344ffa1bb2eefaAndreas Dilger 897da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger // bare multiplication, not SkFixedMul 907da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger tmp = vmull_u16(res, vdup_n_u16(max+1)); 919f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 927da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger // extraction of the 16 upper bits 937da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger tmp = vshrq_n_u32(tmp, 16); 949f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 957da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger return vreinterpretq_s32_u32(tmp); 969f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o} 977da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilger 989f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o// TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 997da78ff17df7d2c1eeebbc2b0727106b63c08a68Andreas Dilgerstatic inline int32x4_t sbpsm_repeat_tile4_low_bits(int32x4_t fx, unsigned max) { 1009f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o uint16x4_t res; 1019f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o uint32x4_t tmp; 102e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall int32x4_t ret; 103e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 104e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall // get the lower 16 bits 105e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall res = vmovn_u32(vreinterpretq_u32_s32(fx)); 106e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 107e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall // bare multiplication, not SkFixedMul 108e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall tmp = vmull_u16(res, vdup_n_u16(max + 1)); 109e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 110e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall // shift and mask 111e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall ret = vshrq_n_s32(vreinterpretq_s32_u32(tmp), 12); 112e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 1133fbfad558e01ffbacc14e5a536c4dfec0a0af4f5Theodore Ts'o /* We don't need the mask below because the caller will 114e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall * overwrite the non-masked bits 115e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall */ 116e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall //ret = vandq_s32(ret, vdupq_n_s32(0xF)); 1173fbfad558e01ffbacc14e5a536c4dfec0a0af4f5Theodore Ts'o 118e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall return ret; 119e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall} 120e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 121e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon 122e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 1233fbfad558e01ffbacc14e5a536c4dfec0a0af4f5Theodore Ts'o#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 1243fbfad558e01ffbacc14e5a536c4dfec0a0af4f5Theodore Ts'o#define TILEX_PROCF_NEON8(l, h, max) sbpsm_clamp_tile8(l, h, max) 1259f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_PROCF_NEON8(l, h, max) sbpsm_clamp_tile8(l, h, max) 1269f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEX_PROCF_NEON4(fx, max) sbpsm_clamp_tile4(fx, max) 1279f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_PROCF_NEON4(fy, max) sbpsm_clamp_tile4(fy, max) 1289f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 1299f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 1309f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEX_LOW_BITS_NEON4(fx, max) sbpsm_clamp_tile4_low_bits(fx) 1319f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_LOW_BITS_NEON4(fy, max) sbpsm_clamp_tile4_low_bits(fy) 1329f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define CHECK_FOR_DECAL 1339f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#include "SkBitmapProcState_matrix_neon.h" 1349f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 1359f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon 1369f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) 1379f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) 1389f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEX_PROCF_NEON8(l, h, max) sbpsm_repeat_tile8(l, h, max) 1399f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_PROCF_NEON8(l, h, max) sbpsm_repeat_tile8(l, h, max) 1409f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEX_PROCF_NEON4(fx, max) sbpsm_repeat_tile4(fx, max) 1419f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_PROCF_NEON4(fy, max) sbpsm_repeat_tile4(fy, max) 1429f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 1439f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 1449f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEX_LOW_BITS_NEON4(fx, max) sbpsm_repeat_tile4_low_bits(fx, max) 1459f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#define TILEY_LOW_BITS_NEON4(fy, max) sbpsm_repeat_tile4_low_bits(fy, max) 1469f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o#include "SkBitmapProcState_matrix_neon.h" 1479f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 1489f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 1499f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 1509f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'ovoid decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { 1518061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o if (count >= 8) { 1528061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o // SkFixed is 16.16 fixed point 1539f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o SkFixed dx8 = dx * 8; 1549f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o int32x4_t vdx8 = vdupq_n_s32(dx8); 1559f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 1569f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o // setup lbase and hbase 157f5fa20078bfc05b554294fe9c5505375d7913e8cTheodore Ts'o int32x4_t lbase, hbase; 158e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall lbase = vdupq_n_s32(fx); 159e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall lbase = vsetq_lane_s32(fx + dx, lbase, 1); 160e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2); 1618815fb8a00f5a441eb62f035353db9e0cca90b38Theodore Ts'o lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3); 162e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall hbase = lbase + vdupq_n_s32(4 * dx); 1639f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o 1649f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o do { 165e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall // store the upper 16 bits 166e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall vst1q_u32(dst, vreinterpretq_u32_s16( 167e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1] 1688061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o )); 1698061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o 1708061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o // on to the next group of 8 1718061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o lbase += vdx8; 1728061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o hbase += vdx8; 1738061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o dst += 4; // we did 8 elements but the result is twice smaller 1748061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o count -= 8; 1758061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o fx += dx8; 176e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall } while (count >= 8); 177e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall } 178e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 179e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall uint16_t* xx = (uint16_t*)dst; 1808061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o for (int i = count; i > 0; --i) { 181e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall *xx++ = SkToU16(fx >> 16); fx += dx; 1828061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o } 1838061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o} 1848061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o 1858061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'ovoid decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { 1868061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o if (count >= 8) { 1878061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o SkFixed dx8 = dx * 8; 188e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall int32x4_t vdx8 = vdupq_n_s32(dx8); 189e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall 190e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall int32x4_t wide_fx, wide_fx2; 191e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall wide_fx = vdupq_n_s32(fx); 192e0ed7404719a9ddd2ba427a80db5365c8bad18c0JP Abgrall wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1); 1938061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2); 1948061d2c144bf22ce3e170e40a194932da4baf8fbTheodore Ts'o wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3); 195f5fa20078bfc05b554294fe9c5505375d7913e8cTheodore Ts'o 196f5fa20078bfc05b554294fe9c5505375d7913e8cTheodore Ts'o wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx)); 197ca2634a46ab9da85a3a015a7772770d9dbe5848eJose R. Santos 198f5fa20078bfc05b554294fe9c5505375d7913e8cTheodore Ts'o while (count >= 8) { 1999f8046fc6dfc13eee2f5c363214e60b533872cacTheodore Ts'o int32x4_t wide_out; 200df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o int32x4_t wide_out2; 201df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o 202df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); 203df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1)); 204df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o 205df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); 206df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1)); 207df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o 208df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); 209df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); 210df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o 211df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o dst += 8; 212df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o fx += dx8; 213df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o wide_fx += vdx8; 214df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o wide_fx2 += vdx8; 215df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o count -= 8; 216df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o } 217df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o } 218df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o 219f77704e416fca7dbe4cc91abba674d2ae3c14f6fTheodore Ts'o if (count & 1) 220f77704e416fca7dbe4cc91abba674d2ae3c14f6fTheodore Ts'o { 221f77704e416fca7dbe4cc91abba674d2ae3c14f6fTheodore Ts'o SkASSERT((fx >> (16 + 14)) == 0); 222f77704e416fca7dbe4cc91abba674d2ae3c14f6fTheodore Ts'o *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 223f77704e416fca7dbe4cc91abba674d2ae3c14f6fTheodore Ts'o fx += dx; 224f77704e416fca7dbe4cc91abba674d2ae3c14f6fTheodore Ts'o } 225503f9e7f6eb331c5b75d7f1ad126f71bcdcfb4e3Theodore Ts'o while ((count -= 2) >= 0) 226503f9e7f6eb331c5b75d7f1ad126f71bcdcfb4e3Theodore Ts'o { 227503f9e7f6eb331c5b75d7f1ad126f71bcdcfb4e3Theodore Ts'o SkASSERT((fx >> (16 + 14)) == 0); 228df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 229df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o fx += dx; 230df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o 231df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 232df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o fx += dx; 233df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o } 234df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o} 235df614db6ef79c767745b8154c26d69398b571605Theodore Ts'o