180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru/* NEON optimized code (C) COPYRIGHT 2009 Motorola 280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * 380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * Use of this source code is governed by a BSD-style license that can be 480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * found in the LICENSE file. 580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include "SkBitmapProcState.h" 880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include "SkPerspIter.h" 980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include "SkShader.h" 1080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include "SkUtilsArm.h" 1180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 1280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queruextern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; 1380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queruextern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; 1480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 1580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 1680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 1780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 1880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic unsigned SK_USHIFT16(unsigned x) { 1980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru return x >> 16; 2080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 2180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 2280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon 2380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 2480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 2580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 2680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 2780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define CHECK_FOR_DECAL 2880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include "SkBitmapProcState_matrix_clamp_neon.h" 2980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 3080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon 3180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) 3280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) 3380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 3480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 3580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include "SkBitmapProcState_matrix_repeat_neon.h" 3680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 3780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 3880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queruvoid decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 3980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru{ 4080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int i; 4180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 4280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (count >= 8) { 4380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* SkFixed is 16.16 fixed point */ 4480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx2 = dx+dx; 4580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx4 = dx2+dx2; 4680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx8 = dx4+dx4; 4780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 4880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* now build fx/fx+dx/fx+2dx/fx+3dx */ 4980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fx1, fx2, fx3; 5080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x2_t lower, upper; 5180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t lbase, hbase; 5280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint16_t *dst16 = (uint16_t *)dst; 5380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 5480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx1 = fx+dx; 5580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx2 = fx1+dx; 5680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx3 = fx2+dx; 5780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 5880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* avoid an 'lbase unitialized' warning */ 5980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vdupq_n_s32(fx); 6080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vsetq_lane_s32(fx1, lbase, 1); 6180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vsetq_lane_s32(fx2, lbase, 2); 6280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vsetq_lane_s32(fx3, lbase, 3); 6380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); 6480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 6580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* take upper 16 of each, store, and bump everything */ 6680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru do { 6780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t lout, hout; 6880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint16x8_t hi16; 6980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 7080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lout = lbase; 7180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hout = hbase; 7280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* gets hi's of all louts then hi's of all houts */ 7380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); 7480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16 = vreinterpretq_u16_s32(hout); 7580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vst1q_u16(dst16, hi16); 7680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 7780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* on to the next */ 7880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); 7980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); 8080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru dst16 += 8; 8180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count -= 8; 8280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx8; 8380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } while (count >= 8); 8480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru dst = (uint32_t *) dst16; 8580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 8680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 8780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint16_t* xx = (uint16_t*)dst; 8880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru for (i = count; i > 0; --i) { 8980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *xx++ = SkToU16(fx >> 16); fx += dx; 9080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 9180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 9280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 9380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queruvoid decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 9480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru{ 9580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (count >= 8) { 9680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t wide_fx; 9780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t wide_fx2; 9880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t wide_dx8 = vdupq_n_s32(dx*8); 9980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 10080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vdupq_n_s32(fx); 10180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 10280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 10380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 10480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 10580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); 10680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 10780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru while (count >= 8) { 10880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t wide_out; 10980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t wide_out2; 11080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 11180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); 11280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_out = vorrq_s32(wide_out, 11380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1))); 11480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 11580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); 11680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_out2 = vorrq_s32(wide_out2, 11780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1))); 11880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 11980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); 12080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); 12180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 12280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru dst += 8; 12380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx*8; 12480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vaddq_s32(wide_fx, wide_dx8); 12580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); 12680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count -= 8; 12780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 12880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 12980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 13080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (count & 1) 13180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 13280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT((fx >> (16 + 14)) == 0); 13380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 13480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx; 13580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 13680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru while ((count -= 2) >= 0) 13780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 13880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT((fx >> (16 + 14)) == 0); 13980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 14080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx; 14180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 14280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 14380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx; 14480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 14580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 146