180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru/* NEON optimized code (C) COPYRIGHT 2009 Motorola 280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * 380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * Use of this source code is governed by a BSD-style license that can be 480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * found in the LICENSE file. 580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru/* 880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * Modifications done in-house at Motorola 980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * 1080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * this is a clone of SkBitmapProcState_matrix.h 1180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * and has been tuned to work with the NEON unit. 1280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * 1380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * Still going back and forth between whether this approach 1480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * (clone the entire SkBitmapProcState_matrix.h file or 1580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * if I should put just the modified routines in here and 1680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * then use a construct like #define DONT_DO_THIS_FUNCTION or 1780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * something like that... 
1880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * 1980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * This is for the ClampX_ClampY instance 2080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * 2180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 2280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 2380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 2480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#include <arm_neon.h> 2580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 2680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru/* 2780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * This has been modified on the knowledge that (at the time) 2880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * we had the following macro definitions in the parent file 2980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * 3080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * #define MAKENAME(suffix) ClampX_ClampY ## suffix 3180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 3280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 3380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 3480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 3580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * #define CHECK_FOR_DECAL 3680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 3780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 3880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru/* SkClampMax(val,max) -- bound to 0..max */ 3980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 4058190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger#define SCALE_NOFILTER_NAME 
MAKENAME(_nofilter_scale) 4158190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger#define SCALE_FILTER_NAME MAKENAME(_filter_scale) 4258190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) 4358190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger#define AFFINE_FILTER_NAME MAKENAME(_filter_affine) 4458190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) 4558190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger#define PERSP_FILTER_NAME MAKENAME(_filter_persp) 4680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 4780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) 4880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) 4980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 5080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#ifndef PREAMBLE 5180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #define PREAMBLE(state) 5280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #define PREAMBLE_PARAM_X 5380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #define PREAMBLE_PARAM_Y 5480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #define PREAMBLE_ARG_X 5580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #define PREAMBLE_ARG_Y 5680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#endif 5780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 5880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, 5980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t xy[], int count, int x, int y) { 6080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 6180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkMatrix::kScale_Mask)) == 0); 
6280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 6380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru PREAMBLE(s); 6480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru // we store y, x, x, x, x, x 6580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 6680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const unsigned maxX = s.fBitmap->width() - 1; 6780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fx; 6880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 6980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkPoint pt; 7058190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 7158190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger SkIntToScalar(y) + SK_ScalarHalf, &pt); 7280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx = SkScalarToFixed(pt.fY); 7380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const unsigned maxY = s.fBitmap->height() - 1; 7480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *xy++ = TILEY_PROCF(fx, maxY); 7580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx = SkScalarToFixed(pt.fX); 7680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 7780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 7880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (0 == maxX) { 7980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru // all of the following X values must be 0 8080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru memset(xy, 0, count * sizeof(uint16_t)); 8180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru return; 8280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 8380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 8480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const SkFixed dx = s.fInvSx; 8580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 
8680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#ifdef CHECK_FOR_DECAL 8780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru // test if we don't need to apply the tile proc 8880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if ((unsigned)(fx >> 16) <= maxX && 8980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) { 9080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru decal_nofilter_scale_neon(xy, fx, dx, count); 9180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru return; 9280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 9380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#endif 9480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 9580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int i; 9680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 9780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* very much like done in decal_nofilter, but with 9880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * an extra clamping function applied. 
9980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * TILEX_PROCF(fx,max) SkClampMax((fx)>>16, max) 10080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 10180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (count >= 8) { 10280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* SkFixed is 16.16 fixed point */ 10380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx2 = dx+dx; 10480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx4 = dx2+dx2; 10580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx8 = dx4+dx4; 10680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 10780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* now build fx/fx+dx/fx+2dx/fx+3dx */ 10880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fx1, fx2, fx3; 10980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t lbase, hbase; 11080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16_t *dst16 = (int16_t *)xy; 11180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 11280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx1 = fx+dx; 11380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx2 = fx1+dx; 11480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx3 = fx2+dx; 11580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 11680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* build my template(s) */ 11780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* avoid the 'lbase unitialized' warning */ 11880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vdupq_n_s32(fx); 11980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vsetq_lane_s32(fx1, lbase, 1); 12080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vsetq_lane_s32(fx2, lbase, 2); 12180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vsetq_lane_s32(fx3, lbase, 3); 
12280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 12380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); 12480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 12580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* store & bump */ 12680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru do { 12780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t lout; 12880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t hout; 12980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16x8_t hi16; 13080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 13180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* get the hi 16s of all those 32s */ 13280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lout = lbase; 13380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hout = hbase; 13480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* this sets up all lout's then all hout's in hout */ 13580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); 13680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16 = vreinterpretq_s16_s32(hout); 13780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 13880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* clamp & output */ 13980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16 = vmaxq_s16(hi16, vdupq_n_s16(0)); 14080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16 = vminq_s16(hi16, vdupq_n_s16(maxX)); 14180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vst1q_s16(dst16, hi16); 14280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 14380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* but preserving base & on to the next */ 14480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); 
14580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); 14680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru dst16 += 8; 14780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count -= 8; 14880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx8; 14980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } while (count >= 8); 15080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xy = (uint32_t *) dst16; 15180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 15280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 15380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint16_t* xx = (uint16_t*)xy; 15480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru for (i = count; i > 0; --i) { 15580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *xx++ = TILEX_PROCF(fx, maxX); fx += dx; 15680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 15780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 15880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 15980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru// note: we could special-case on a matrix which is skewed in X but not Y. 
16080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru// this would require a more general setup thatn SCALE does, but could use 16180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru// SCALE's inner loop that only looks at dx 16280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 16380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, 16480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t xy[], int count, int x, int y) { 16580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 16680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 16780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkMatrix::kScale_Mask | 16880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkMatrix::kAffine_Mask)) == 0); 16980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 17080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru PREAMBLE(s); 17180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkPoint srcPt; 17258190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger s.fInvProc(s.fInvMatrix, 17380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkIntToScalar(x) + SK_ScalarHalf, 17480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 17580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 17680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fx = SkScalarToFixed(srcPt.fX); 17780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fy = SkScalarToFixed(srcPt.fY); 17880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx = s.fInvSx; 17980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dy = s.fInvKy; 18080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int maxX = s.fBitmap->width() - 1; 
18180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int maxY = s.fBitmap->height() - 1; 18280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 18380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* NEON lets us do an 8x unrolling */ 18480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (count >= 8) { 18580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* SkFixed is 16.16 fixed point */ 18680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx4 = dx * 4; 18780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dy4 = dy * 4; 18880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx8 = dx * 8; 18980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dy8 = dy * 8; 19080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 19180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t xbase, ybase; 19280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t x2base, y2base; 19380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16_t *dst16 = (int16_t *) xy; 19480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 19580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* my sets of maxx/maxy for clamping */ 19680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32_t maxpair = (maxX&0xffff) | ((maxY&0xffff)<<16); 19780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16x8_t maxXY = vreinterpretq_s16_s32(vdupq_n_s32(maxpair)); 19880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 19980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* now build fx/fx+dx/fx+2dx/fx+3dx */ 20080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* avoid the 'xbase unitialized' warning...*/ 20180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xbase = vdupq_n_s32(fx); 20280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xbase = vsetq_lane_s32(fx+dx, xbase, 1); 
20380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2); 20480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3); 20580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 20680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* same for fy */ 20780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* avoid the 'ybase unitialized' warning...*/ 20880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ybase = vdupq_n_s32(fy); 20980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ybase = vsetq_lane_s32(fy+dy, ybase, 1); 21080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2); 21180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3); 21280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 21380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru x2base = vaddq_s32(xbase, vdupq_n_s32(dx4)); 21480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru y2base = vaddq_s32(ybase, vdupq_n_s32(dy4)); 21580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 21680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* store & bump */ 21780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru do { 21880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t xout, yout; 21980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t x2out, y2out; 22080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16x8_t hi16, hi16_2; 22180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 22280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xout = xbase; 22380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru yout = ybase; 22480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 22580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* overlay y's low16 with hi16 from x */ 
22680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* so we properly shifted xyxyxyxy */ 22780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru yout = vsriq_n_s32(yout, xout, 16); 22880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16 = vreinterpretq_s16_s32 (yout); 22980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 23080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* do the clamping; both guys get 0's */ 23180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16 = vmaxq_s16 (hi16, vdupq_n_s16(0)); 23280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16 = vminq_s16 (hi16, maxXY); 23380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 23480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vst1q_s16 (dst16, hi16); 23580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 23680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* and for the other 4 pieces of this iteration */ 23780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru x2out = x2base; 23880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru y2out = y2base; 23980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 24080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* overlay y's low16 with hi16 from x */ 24180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* so we properly shifted xyxyxyxy */ 24280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru y2out = vsriq_n_s32(y2out, x2out, 16); 24380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16_2 = vreinterpretq_s16_s32 (y2out); 24480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 24580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* do the clamping; both guys get 0's */ 24680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16_2 = vmaxq_s16 (hi16_2, vdupq_n_s16(0)); 24780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru hi16_2 = vminq_s16 (hi16_2, maxXY); 
24880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 24980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* RBE: gcc regenerates dst16+8 all the time instead 25080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * of folding it into an addressing mode. *sigh* */ 25180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vst1q_s16 (dst16+8, hi16_2); 25280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 25380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* moving base and on to the next */ 25480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xbase = vaddq_s32 (xbase, vdupq_n_s32 (dx8)); 25580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ybase = vaddq_s32 (ybase, vdupq_n_s32 (dy8)); 25680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru x2base = vaddq_s32 (x2base, vdupq_n_s32 (dx8)); 25780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru y2base = vaddq_s32 (y2base, vdupq_n_s32 (dy8)); 25880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 25980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru dst16 += 16; /* 8x32 aka 16x16 */ 26080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count -= 8; 26180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx8; 26280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fy += dy8; 26380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } while (count >= 8); 26480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xy = (uint32_t *) dst16; 26580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 26680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 26780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru for (int i = count; i > 0; --i) { 26880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX); 26980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx; fy += dy; 
27080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 27180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 27280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 27380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#undef DEBUG_PERSP_NOFILTER 27480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 27580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, 27680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t* SK_RESTRICT xy, 27780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int count, int x, int y) { 27880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 27980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 28080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru PREAMBLE(s); 28180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* max{X,Y} are int here, but later shown/assumed to fit in 16 bits */ 28280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int maxX = s.fBitmap->width() - 1; 28380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int maxY = s.fBitmap->height() - 1; 28480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 28558190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger SkPerspIter iter(s.fInvMatrix, 28680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkIntToScalar(x) + SK_ScalarHalf, 28780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkIntToScalar(y) + SK_ScalarHalf, count); 28880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 28980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru while ((count = iter.next()) != 0) { 29080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 29180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 29280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#if 
defined(DEBUG_PERSP_NOFILTER) 29380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* debugging stuff */ 29480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const SkFixed *end_srcXY = srcXY + (count*2); 29580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t *end_xy = xy + (count); 29680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const SkFixed *base_srcXY = srcXY; 29780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t *base_xy = xy; 29880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int base_count = count; 29980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#endif 30080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 30180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#if 1 30280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition 30380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn 30480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 30580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1... 30680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * but we immediately discard the low 16 bits... 30780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * so what we're going to do is vld4, which will give us 30880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo' 30980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * parts.... 
31080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 31180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (count >= 8) { 31280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16_t *mysrc = (int16_t *) srcXY; 31380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16_t *mydst = (int16_t *) xy; 31480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16x4_t maxX4 = vdup_n_s16((int16_t)maxX); 31580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16x4_t maxY4 = vdup_n_s16((int16_t)maxY); 31680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int16x4_t zero4 = vdup_n_s16(0); 31780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 31880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* The constructs with local blocks for register assignments 31980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * and asm() instructions is to make keep any hard register 32080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * assignments to as small a scope as possible. and to avoid 32180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * burning call-preserved hard registers on the vld/vst 32280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * instructions. 
32380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 32480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 32580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru do { 326096defe64d408e54474fe19f418c95bf1a554fc7Derek Sollenberger int16x4_t xhi, yhi; 327096defe64d408e54474fe19f418c95bf1a554fc7Derek Sollenberger int16x4_t x2hi, y2hi; 32880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 32980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* vld4 does the de-interleaving for us */ 33080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 33180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t t_xlo asm("d0"); 33280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t t_xhi asm("d1"); 33380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t t_ylo asm("d2"); 33480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t t_yhi asm("d3"); 33580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 33680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru asm ("vld4.16 {d0-d3},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */" 33780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi) 33880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : "r" (mysrc) 33980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ); 34080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xhi = t_xhi; 34180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru yhi = t_yhi; 34280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 34380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 34480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* clamp X>>16 (aka xhi) to 0..maxX */ 34580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xhi = vmax_s16(xhi, zero4); /* now 0.. 
*/ 34680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xhi = vmin_s16(xhi, maxX4); /* now 0..maxX */ 34780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 34880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* clamp Y>>16 (aka yhi) to 0..maxY */ 34980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru yhi = vmax_s16(yhi, zero4); /* now 0.. */ 35080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru yhi = vmin_s16(yhi, maxY4); /* now 0..maxY */ 35180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 35280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* deal with the second set of numbers */ 35380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 35480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t t_xlo asm("d4"); 35580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t t_xhi asm("d5"); 35680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t t_ylo asm("d6"); 35780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t t_yhi asm("d7"); 35880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 35980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* offset == 256 bits == 32 bytes == 8 longs == 16 shorts */ 36080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru asm ("vld4.16 {d4-d7},[%4] /* xlo=%P0 xhi=%P1 ylo=%P2 yhi=%P3 */" 36180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : "=w" (t_xlo), "=w" (t_xhi), "=w" (t_ylo), "=w" (t_yhi) 36280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : "r" (mysrc+16) 36380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ); 36480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru x2hi = t_xhi; 36580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru y2hi = t_yhi; 36680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 36780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 
36880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* clamp the second 4 here */ 36980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 37080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (0) { extern void rbe(void); rbe(); } 37180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 37280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* clamp X>>16 (aka xhi) to 0..maxX */ 37380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru x2hi = vmax_s16(x2hi, zero4); /* now 0.. */ 37480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru x2hi = vmin_s16(x2hi, maxX4); /* now 0..maxX */ 37580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 37680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* clamp Y>>16 (aka yhi) to 0..maxY */ 37780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru y2hi = vmax_s16(y2hi, zero4); /* now 0.. */ 37880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru y2hi = vmin_s16(y2hi, maxY4); /* now 0..maxY */ 37980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 38080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* we're storing as {x,y}s: x is [0], y is [1] */ 38180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* we'll use vst2 to make this happen */ 38280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 38380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 38480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t out_x asm("d16") = xhi; 38580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t out_y asm("d17") = yhi; 38680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 38780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */" 38880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : 38980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : "w" (out_x), "w" (out_y), "r" (mydst) 
39080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ); 39180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 39280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 39380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t out_x asm("d18") = x2hi; 39480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int16x4_t out_y asm("d19") = y2hi; 39580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 39680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */" 39780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : 39880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : "w" (out_x), "w" (out_y), "r" (mydst+8) 39980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ); 40080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 40180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 40280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* XXX: gcc isn't interleaving these with the NEON ops 40380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * but i think that all the scoreboarding works out */ 40480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count -= 8; /* 8 iterations */ 40580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru mysrc += 32; /* 16 longs, aka 32 shorts */ 40680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru mydst += 16; /* 16 shorts, aka 8 longs */ 40780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } while (count >= 8); 40880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* get xy and srcXY fixed up */ 40980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru srcXY = (const SkFixed *) mysrc; 41080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xy = (uint32_t *) mydst; 41180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 41280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#endif 
41380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 41480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru while (--count >= 0) { 41580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | 41680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru TILEX_PROCF(srcXY[0], maxX); 41780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru srcXY += 2; 41880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 41980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 42080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#if defined(DEBUG_PERSP_NOFILTER) 42180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* for checking our NEON-produced results against vanilla code */ 42280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 42380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int bad = (-1); 42480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru for (int i = 0; i < base_count; i++) { 42580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t val; 42680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | 42780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); 42880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 42980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (val != base_xy[i]) { 43080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru bad = i; 43180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru break; 43280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 43380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 43480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (bad >= 0) { 43580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("clamp-nofilter-persp failed piece %d\n", bad); 43680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf(" 
maxX %08x maxY %08x\n", maxX, maxY); 43780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru bad -= (bad & 0x7); /* align */ 43880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru for (int i = bad; i < bad + 8; i++) { 43980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t val; 44080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | 44180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); 44280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 44380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n", 44480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru i, base_xy[i], val, base_srcXY[i * 2 + 0], 44580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru base_srcXY[i * 2 + 1]); 44680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 44780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf ("---\n"); 44880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 44980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 45080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (end_xy != xy) { 45180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy); 45280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 45380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (end_srcXY != srcXY) { 45480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY, 45580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru end_srcXY); 45680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 45780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 45880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#endif 45980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 
46080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 46180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 46280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#undef DEBUG_PERSP_NOFILTER 46380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 46480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru////////////////////////////////////////////////////////////////////////////// 46580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 46680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, 46780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed one PREAMBLE_PARAM_Y) { 46880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru unsigned i = TILEY_PROCF(f, max); 46980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru i = (i << 4) | TILEY_LOW_BITS(f, max); 47080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru return (i << 14) | (TILEY_PROCF((f + one), max)); 47180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 47280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 47380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, 47480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed one PREAMBLE_PARAM_X) { 47580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru unsigned i = TILEX_PROCF(f, max); 47680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru i = (i << 4) | TILEX_LOW_BITS(f, max); 47780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru return (i << 14) | (TILEX_PROCF((f + one), max)); 47880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 47980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 48080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void SCALE_FILTER_NAME(const SkBitmapProcState& s, 48180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t xy[], int count, 
int x, int y) { 48280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 48380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkMatrix::kScale_Mask)) == 0); 48480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT(s.fInvKy == 0); 48580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 48680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru PREAMBLE(s); 48780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 48880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const unsigned maxX = s.fBitmap->width() - 1; 48980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const SkFixed one = s.fFilterOneX; 49080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const SkFixed dx = s.fInvSx; 49180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fx; 49280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 49380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 49480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkPoint pt; 49558190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 49658190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger SkIntToScalar(y) + SK_ScalarHalf, &pt); 49780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); 49880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru const unsigned maxY = s.fBitmap->height() - 1; 49980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru // compute our two Y values up front 50080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); 50180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru // now initialize fx 50280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx = SkScalarToFixed(pt.fX) - (one >> 1); 
50380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 50480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 50580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#ifdef CHECK_FOR_DECAL 50680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru // test if we don't need to apply the tile proc 50780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (dx > 0 && 50880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru (unsigned)(fx >> 16) <= maxX && 50980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) { 51080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru decal_filter_scale_neon(xy, fx, dx, count); 51180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } else 51280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru#endif 51380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 51480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (count >= 4) { 515096defe64d408e54474fe19f418c95bf1a554fc7Derek Sollenberger int32x4_t wide_one, wide_fx, wide_fx1, wide_i, wide_lo; 51680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #if 0 51780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* verification hooks -- see below */ 51880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed debug_fx = fx; 51980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int count_done = 0; 52080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #endif 52180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 52280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vdupq_n_s32(fx); 52380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 52480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 52580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 
52680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 52780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_one = vdupq_n_s32(one); 52880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 52980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru while (count >= 4) { 53080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* original expands to: 53180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * unsigned i = SkClampMax((f) >> 16, max); 53280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * i = (i << 4) | (((f) >> 12) & 0xF); 53380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * return (i << 14) | (SkClampMax(((f + one)) >> 16, max)); 53480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 53580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 53680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i = SkClampMax(f>>16, maxX) */ 53780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0)); 53880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX)); 53980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 54080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i<<4 | TILEX_LOW_BITS(fx) */ 54180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_lo = vshrq_n_s32(wide_fx, 12); 54280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vsliq_n_s32(wide_lo, wide_i, 4); 54380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 54480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i<<14 */ 54580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vshlq_n_s32(wide_i, 14); 54680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 54780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* SkClampMax(((f + one)) >> 16, max) */ 54880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx1 = 
vaddq_s32(wide_fx, wide_one); 54980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0)); 55080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX)); 55180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 55280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* final combination */ 55380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vorrq_s32(wide_i, wide_fx1); 55480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 55580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru vst1q_u32(xy, vreinterpretq_u32_s32(wide_i)); 55680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 55780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #if 0 55880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* having a verification hook is a good idea */ 55980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* use debug_fx, debug_fx+dx, etc. 
*/ 56080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 56180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru for (int i=0;i<4;i++) { 56280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t want = PACK_FILTER_X_NAME(debug_fx, maxX, one PREAMBLE_ARG_X); 56380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (xy[i] != want) 56480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 56580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* print a nastygram */ 56680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("clamp-filter-scale fails\n"); 56780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("got %08x want %08x\n", xy[i], want); 56880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("fx %08x debug_fx %08x dx %08x done %d\n", 56980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx, debug_fx, dx, count_done); 57080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf(" maxX %08x one %08x\n", maxX, one); 57180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 57280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 57380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru debug_fx += dx; 57480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count_done++; 57580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 57680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #endif 57780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx += vdupq_n_s32(dx+dx+dx+dx); 57880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx+dx+dx+dx; 57980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xy += 4; 58080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count -= 4; 58180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 58280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 58380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 
58480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru while (--count >= 0) { 58580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru *xy++ = PACK_FILTER_X_NAME(fx, maxX, one PREAMBLE_ARG_X); 58680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx; 58780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 58880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru} 58980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 59080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Querustatic void AFFINE_FILTER_NAME(const SkBitmapProcState& s, 59180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t xy[], int count, int x, int y) { 59280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 59380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 59480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkMatrix::kScale_Mask | 59580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkMatrix::kAffine_Mask)) == 0); 59680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 59780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru PREAMBLE(s); 59880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkPoint srcPt; 59958190644c30e1c4aa8e527f3503c58f841e0fcf3Derek Sollenberger s.fInvProc(s.fInvMatrix, 60080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkIntToScalar(x) + SK_ScalarHalf, 60180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 60280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 60380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed oneX = s.fFilterOneX; 60480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed oneY = s.fFilterOneY; 60580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); 
60680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); 60780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dx = s.fInvSx; 60880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed dy = s.fInvKy; 60980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru unsigned maxX = s.fBitmap->width() - 1; 61080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru unsigned maxY = s.fBitmap->height() - 1; 61180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 61280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (count >= 4) { 613096defe64d408e54474fe19f418c95bf1a554fc7Derek Sollenberger int32x4_t wide_i, wide_lo; 614096defe64d408e54474fe19f418c95bf1a554fc7Derek Sollenberger int32x4_t wide_fx, wide_onex, wide_fx1; 615096defe64d408e54474fe19f418c95bf1a554fc7Derek Sollenberger int32x4_t wide_fy, wide_oney, wide_fy1; 61680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 61780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #undef AFFINE_DEBUG 61880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #if defined(AFFINE_DEBUG) 61980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fyp = fy; 62080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkFixed fxp = fx; 62180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t *xyp = xy; 62280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int count_done = 0; 62380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #endif 62480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 62580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vdupq_n_s32(fx); 62680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 62780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 62880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx = 
vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 62980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 63080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fy = vdupq_n_s32(fy); 63180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1); 63280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2); 63380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3); 63480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 63580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_onex = vdupq_n_s32(oneX); 63680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_oney = vdupq_n_s32(oneY); 63780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 63880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru while (count >= 4) { 63980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t wide_x; 64080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru int32x4_t wide_y; 64180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 64280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* do the X side, then the Y side, then interleave them */ 64380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 64480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* original expands to: 64580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * unsigned i = SkClampMax((f) >> 16, max); 64680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * i = (i << 4) | (((f) >> 12) & 0xF); 64780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru * return (i << 14) | (SkClampMax(((f + one)) >> 16, max)); 64880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru */ 64980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 65080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i = SkClampMax(f>>16, maxX) */ 
65180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vmaxq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(0)); 65280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vminq_s32(wide_i, vdupq_n_s32(maxX)); 65380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 65480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i<<4 | TILEX_LOW_BITS(fx) */ 65580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_lo = vshrq_n_s32(wide_fx, 12); 65680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vsliq_n_s32(wide_lo, wide_i, 4); 65780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 65880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i<<14 */ 65980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vshlq_n_s32(wide_i, 14); 66080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 66180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* SkClampMax(((f + one)) >> 16, max) */ 66280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx1 = vaddq_s32(wide_fx, wide_onex); 66380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx1 = vmaxq_s32(vshrq_n_s32(wide_fx1,16), vdupq_n_s32(0)); 66480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx1 = vminq_s32(wide_fx1, vdupq_n_s32(maxX)); 66580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 66680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* final combination */ 66780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_x = vorrq_s32(wide_i, wide_fx1); 66880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 66980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* And now the Y side */ 67080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 67180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i = SkClampMax(f>>16, maxX) */ 67280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vmaxq_s32(vshrq_n_s32(wide_fy,16), 
vdupq_n_s32(0)); 67380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vminq_s32(wide_i, vdupq_n_s32(maxY)); 67480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 67580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i<<4 | TILEX_LOW_BITS(fx) */ 67680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_lo = vshrq_n_s32(wide_fy, 12); 67780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vsliq_n_s32(wide_lo, wide_i, 4); 67880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 67980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* i<<14 */ 68080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_i = vshlq_n_s32(wide_i, 14); 68180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 68280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* SkClampMax(((f + one)) >> 16, max) */ 68380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fy1 = vaddq_s32(wide_fy, wide_oney); 68480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fy1 = vmaxq_s32(vshrq_n_s32(wide_fy1,16), vdupq_n_s32(0)); 68580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fy1 = vminq_s32(wide_fy1, vdupq_n_s32(maxY)); 68680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 68780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* final combination */ 68880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_y = vorrq_s32(wide_i, wide_fy1); 68980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 69080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* interleave as YXYXYXYX as part of the storing */ 69180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru { 69280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* vst2.32 needs side-by-side registers */ 69380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int32x4_t t_x asm("q1"); 69480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru register int32x4_t t_y 
asm("q0"); 69580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 69680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru t_x = wide_x; t_y = wide_y; 69780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */" 69880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : 69980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru : "w" (t_y), "w" (t_x), "r" (xy) 70080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru ); 70180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 70280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 70380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #if defined(AFFINE_DEBUG) 70480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* make sure we're good here -- check the 4 we just output */ 70580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru for (int i = 0; i<4;i++) { 70680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru uint32_t val; 70780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru val = PACK_FILTER_Y_NAME(fyp, maxY, oneY PREAMBLE_ARG_Y); 70880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (val != xy[i*2+0]) { 70980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* print a nastygram */ 71080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("clamp-filter-affine fails\n"); 71180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("[bad-y] got %08x want %08x\n", xy[i*2+0], val); 71280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("fy %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n", 71380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fy, fxp, fyp, dx, dy, count_done); 71480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf(" maxY %08x oneY %08x\n", maxY, oneY); 71580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 71680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru val = 
PACK_FILTER_X_NAME(fxp, maxX, oneX PREAMBLE_ARG_X); 71780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru if (val != xy[i*2+1]) { 71880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru /* print a nastygram */ 71980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("clamp-filter-affine fails\n"); 72080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("[bad-x] got %08x want %08x\n", xy[i*2+1], val); 72180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf("fx %08x fxp %08x fyp %08x dx %08x dy %08x done %d\n", 72280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx, fxp, fyp, dx, dy, count_done); 72380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru SkDebugf(" maxX %08x one %08x\n", maxX, oneX); 72480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 72580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fyp += dy; 72680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fxp += dx; 72780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count_done++; 72880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 72980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru #endif 73080bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 73180bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fx += vdupq_n_s32(dx+dx+dx+dx); 73280bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fx += dx+dx+dx+dx; 73380bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru wide_fy += vdupq_n_s32(dy+dy+dy+dy); 73480bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru fy += dy+dy+dy+dy; 73580bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru xy += 8; /* 4 x's, 4 y's */ 73680bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru count -= 4; 73780bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 73880bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru } 73980bacfeb4bda06541e8695bd502229727bccfeaJean-Baptiste Queru 
/* leftover pixels: emit one packed Y word, then one packed X word, per pixel */
    while (--count >= 0) {
        /* NB: writing Y/X */
        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
        fy += dy;
        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
        fx += dx;
    }
}

/*
 * Pack four clamp-mode filter coordinates at once.
 *
 * Per lane this computes the same value as the scalar sequence
 *
 *     unsigned i = SkClampMax(f >> 16, max);
 *     i = (i << 4) | ((f >> 12) & 0xF);
 *     return (i << 14) | SkClampMax((f + one) >> 16, max);
 *
 * wide_f  four 16.16 coordinates, already biased by -(one >> 1)
 * max     largest valid texel index on this axis
 * one     fixed-point distance to the neighbouring texel on this axis
 *
 * Every step gets its own temporary: the code this helper replaces noted
 * that re-assigning a single vector variable triggered an internal
 * compiler error in debug builds.
 */
static inline int32x4_t MAKENAME(_persp_filter_pack4)(int32x4_t wide_f,
                                                      unsigned max,
                                                      SkFixed one) {
    const int32x4_t wide_zero = vdupq_n_s32(0);
    const int32x4_t wide_max = vdupq_n_s32(static_cast<int32_t>(max));

    /* i = SkClampMax(f >> 16, max) */
    const int32x4_t wide_floor = vmaxq_s32(vshrq_n_s32(wide_f, 16), wide_zero);
    const int32x4_t wide_i = vminq_s32(wide_floor, wide_max);

    /* (i << 4) | ((f >> 12) & 0xF)
     * vsli shifts its second operand left by 4 and keeps only the low
     * 4 bits of the first, so no explicit mask is needed. */
    const int32x4_t wide_lo = vshrq_n_s32(wide_f, 12);
    const int32x4_t wide_hi = vsliq_n_s32(wide_lo, wide_i, 4);

    /* SkClampMax((f + one) >> 16, max) */
    const int32x4_t wide_sum = vaddq_s32(wide_f, vdupq_n_s32(one));
    const int32x4_t wide_next = vmaxq_s32(vshrq_n_s32(wide_sum, 16), wide_zero);
    const int32x4_t wide_i1 = vminq_s32(wide_next, wide_max);

    /* (hi << 14) | i1 */
    return vorrq_s32(vshlq_n_s32(wide_hi, 14), wide_i1);
}

/*
 * Perspective matrix proc with filtering, clamp/clamp tiling.
 *
 * Walks the span with SkPerspIter (sampling at pixel centres, i.e. x + 0.5
 * and y + 0.5) and, for every pixel the iterator yields, writes two packed
 * words to xy: the Y word first, then the X word.  The iterator hands back
 * coordinates in x/y order, so the pair is swapped on output.
 *
 * s      proc state; reads fInvType, fInvMatrix, fBitmap and fFilterOneX/Y
 * xy     output buffer, two uint32_t words per pixel
 * count  number of pixels requested from the iterator
 * x, y   integer coordinates of the first pixel
 *
 * Groups of four pixels go through NEON; the remaining 0..3 pixels of each
 * iterator batch go through the scalar PACK_FILTER_*_NAME routines.
 *
 * The NEON loads and stores use the vld2q/vst2q intrinsics instead of
 * hand-written asm on pinned q0/q1 registers.  The asm form touched memory
 * without a memory operand or "memory" clobber, so the compiler was free to
 * cache, hoist or reorder those accesses.
 */
static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
                              uint32_t* SK_RESTRICT xy, int count,
                              int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);

    PREAMBLE(s);
    unsigned maxX = s.fBitmap->width() - 1;
    unsigned maxY = s.fBitmap->height() - 1;
    SkFixed oneX = s.fFilterOneX;
    SkFixed oneY = s.fFilterOneY;

    SkPerspIter iter(s.fInvMatrix,
                     SkIntToScalar(x) + SK_ScalarHalf,
                     SkIntToScalar(y) + SK_ScalarHalf, count);

    while ((count = iter.next()) != 0) {
        const SkFixed* SK_RESTRICT srcXY = iter.getXY();

        while (count >= 4) {
            /* de-interleaving load of x-y-x-y-x-y-x-y:
             * val[0] receives the four x's, val[1] the four y's */
            const int32x4x2_t src = vld2q_s32(srcXY);

            /* bias by half a texel, then pack each axis */
            const int32x4_t wide_x = MAKENAME(_persp_filter_pack4)(
                    vsubq_s32(src.val[0], vdupq_n_s32(oneX >> 1)), maxX, oneX);
            const int32x4_t wide_y = MAKENAME(_persp_filter_pack4)(
                    vsubq_s32(src.val[1], vdupq_n_s32(oneY >> 1)), maxY, oneY);

            /* interleaving store as y-x-y-x-y-x-y-x (NB != read order) */
            uint32x4x2_t packed;
            packed.val[0] = vreinterpretq_u32_s32(wide_y);
            packed.val[1] = vreinterpretq_u32_s32(wide_x);
            vst2q_u32(xy, packed);

            /* on to the next group of four */
            srcXY += 2*4;
            xy += 2*4;
            count -= 4;
        }

        /* pre-tested loop: count can already be 0 here when the NEON loop
         * consumed the whole batch */
        while (--count >= 0) {
            /* NB: we read x/y, we write y/x */
            *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
                                       oneY PREAMBLE_ARG_Y);
            *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
                                       oneX PREAMBLE_ARG_X);
            srcXY += 2;
        }
    }
}

/*
 * The six matrix procs of this instantiation, grouped scale / affine /
 * perspective with the no-filter variant ahead of the filter variant.
 * NOTE(review): the order presumably has to match how the caller indexes
 * the table -- confirm before reordering.
 */
const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
    SCALE_NOFILTER_NAME,
    SCALE_FILTER_NAME,
    AFFINE_NOFILTER_NAME,
    AFFINE_FILTER_NAME,
    PERSP_NOFILTER_NAME,
    PERSP_FILTER_NAME
};

/* Drop every configuration macro used by this file. */

/* naming and tiling */
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#undef TILEX_LOW_BITS
#undef TILEY_LOW_BITS
/* #undef of a name that is not defined is a no-op, so no #ifdef guard */
#undef CHECK_FOR_DECAL

/* per-routine names */
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
#undef PERSP_NOFILTER_NAME
#undef PERSP_FILTER_NAME

/* preamble plumbing */
#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y