1 2#include <arm_neon.h> 3 4 5#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale) 6#define SCALE_FILTER_NAME MAKENAME(_filter_scale) 7#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) 8#define AFFINE_FILTER_NAME MAKENAME(_filter_affine) 9#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) 10#define PERSP_FILTER_NAME MAKENAME(_filter_persp) 11 12#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) 13#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) 14#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4) 15#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4) 16 17#ifndef PREAMBLE 18 #define PREAMBLE(state) 19 #define PREAMBLE_PARAM_X 20 #define PREAMBLE_PARAM_Y 21 #define PREAMBLE_ARG_X 22 #define PREAMBLE_ARG_Y 23#endif 24 25static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, 26 uint32_t xy[], int count, int x, int y) { 27 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 28 SkMatrix::kScale_Mask)) == 0); 29 30 PREAMBLE(s); 31 32 // we store y, x, x, x, x, x 33 const unsigned maxX = s.fBitmap->width() - 1; 34 SkFractionalInt fx; 35 { 36 SkPoint pt; 37 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 38 SkIntToScalar(y) + SK_ScalarHalf, &pt); 39 fx = SkScalarToFractionalInt(pt.fY); 40 const unsigned maxY = s.fBitmap->height() - 1; 41 *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY); 42 fx = SkScalarToFractionalInt(pt.fX); 43 } 44 45 if (0 == maxX) { 46 // all of the following X values must be 0 47 memset(xy, 0, count * sizeof(uint16_t)); 48 return; 49 } 50 51 const SkFractionalInt dx = s.fInvSxFractionalInt; 52 53#ifdef CHECK_FOR_DECAL 54 // test if we don't need to apply the tile proc 55 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { 56 decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx), 57 SkFractionalIntToFixed(dx), count); 58 return; 59 } 60#endif 61 62 if (count >= 8) { 63 SkFractionalInt dx2 = dx+dx; 64 SkFractionalInt dx4 = dx2+dx2; 65 SkFractionalInt dx8 = dx4+dx4; 66 67 // now build fx/fx+dx/fx+2dx/fx+3dx 68 SkFractionalInt fx1, fx2, fx3; 69 int32x4_t lbase, hbase; 70 int16_t *dst16 = (int16_t *)xy; 71 72 fx1 = fx+dx; 73 fx2 = fx1+dx; 74 fx3 = fx2+dx; 75 76 lbase = vdupq_n_s32(SkFractionalIntToFixed(fx)); 77 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1); 78 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2); 79 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3); 80 hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4))); 81 82 // store & bump 83 while (count >= 8) { 84 85 int16x8_t fx8; 86 87 fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX); 88 89 vst1q_s16(dst16, fx8); 90 91 // but preserving base & on to the next 92 lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); 93 hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); 94 dst16 += 8; 95 count -= 8; 96 fx += dx8; 97 }; 98 xy = (uint32_t *) dst16; 99 } 100 101 uint16_t* xx = (uint16_t*)xy; 102 for (int i = count; i > 0; --i) { 103 *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX); 104 fx += dx; 105 } 106} 107 108static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, 109 uint32_t xy[], int count, int x, int y) { 110 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 111 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 112 SkMatrix::kScale_Mask | 113 SkMatrix::kAffine_Mask)) == 0); 114 115 PREAMBLE(s); 116 SkPoint srcPt; 117 s.fInvProc(s.fInvMatrix, 118 SkIntToScalar(x) + SK_ScalarHalf, 119 SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 120 121 SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX); 122 SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY); 123 SkFractionalInt dx = s.fInvSxFractionalInt; 124 SkFractionalInt dy = s.fInvKyFractionalInt; 125 int maxX = s.fBitmap->width() - 1; 126 int maxY = s.fBitmap->height() - 1; 127 128 if (count >= 8) { 129 SkFractionalInt dx4 = dx * 4; 130 SkFractionalInt dy4 = dy * 4; 131 SkFractionalInt dx8 = dx * 8; 132 SkFractionalInt dy8 = dy * 8; 133 134 int32x4_t xbase, ybase; 135 int32x4_t x2base, y2base; 136 int16_t *dst16 = (int16_t *) xy; 137 138 // now build fx, fx+dx, fx+2dx, fx+3dx 139 xbase = vdupq_n_s32(SkFractionalIntToFixed(fx)); 140 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1); 141 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2); 142 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3); 143 144 // same for fy 145 ybase = vdupq_n_s32(SkFractionalIntToFixed(fy)); 146 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1); 147 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2); 148 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3); 149 150 x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4))); 151 y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4))); 152 153 // store & bump 154 do { 155 int16x8x2_t hi16; 156 157 hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX); 158 hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY); 159 160 vst2q_s16(dst16, hi16); 161 162 // moving base and on to the next 163 xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); 164 ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8))); 165 x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8))); 166 y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8))); 167 168 dst16 += 16; // 8x32 aka 16x16 169 count -= 8; 170 fx += dx8; 171 fy += dy8; 172 } while (count >= 8); 173 xy = (uint32_t *) dst16; 174 } 175 176 for (int i = count; i > 0; --i) { 177 *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) | 178 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX); 179 fx += dx; fy += dy; 180 } 181} 182 183static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, 184 uint32_t* SK_RESTRICT xy, 185 int count, int x, int y) { 186 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 187 188 PREAMBLE(s); 189 // max{X,Y} are int here, but later shown/assumed to fit in 16 bits 190 int maxX = s.fBitmap->width() - 1; 191 int maxY = s.fBitmap->height() - 1; 192 193 SkPerspIter iter(s.fInvMatrix, 194 SkIntToScalar(x) + SK_ScalarHalf, 195 SkIntToScalar(y) + SK_ScalarHalf, count); 196 197 while ((count = iter.next()) != 0) { 198 const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 199 200 if (count >= 8) { 201 int32_t *mysrc = (int32_t *) srcXY; 202 int16_t *mydst = (int16_t *) xy; 203 do { 204 int16x8x2_t hi16; 205 int32x4x2_t xy1, xy2; 206 207 xy1 = vld2q_s32(mysrc); 208 xy2 = vld2q_s32(mysrc+8); 209 210 hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX); 211 hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY); 212 213 vst2q_s16(mydst, hi16); 214 215 count -= 8; // 8 iterations 216 mysrc += 16; // 16 longs 217 mydst += 16; // 16 shorts, aka 8 longs 218 } while (count >= 8); 219 // get xy and srcXY fixed up 220 srcXY = (const SkFixed *) mysrc; 221 xy = (uint32_t *) mydst; 222 } 223 224 while (--count >= 0) { 225 *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | 226 TILEX_PROCF(srcXY[0], maxX); 227 srcXY += 2; 228 } 229 } 230} 231 232static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, 233 SkFixed one PREAMBLE_PARAM_Y) { 234 unsigned i = TILEY_PROCF(f, max); 235 i = (i << 4) | TILEY_LOW_BITS(f, max); 236 return (i << 14) | (TILEY_PROCF((f + one), max)); 237} 238 239static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, 240 SkFixed one PREAMBLE_PARAM_X) { 241 unsigned i = TILEX_PROCF(f, max); 242 i = (i << 4) | TILEX_LOW_BITS(f, max); 243 return (i << 14) | (TILEX_PROCF((f + one), max)); 244} 245 246static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max, 247 SkFixed one PREAMBLE_PARAM_X) { 248 int32x4_t ret, res, wide_one; 249 250 // Prepare constants 251 wide_one = vdupq_n_s32(one); 252 253 // Step 1 254 res = TILEX_PROCF_NEON4(f, max); 255 256 // Step 2 257 ret = TILEX_LOW_BITS_NEON4(f, max); 258 ret = vsliq_n_s32(ret, res, 4); 259 260 // Step 3 261 res = TILEX_PROCF_NEON4(f + wide_one, max); 262 ret = vorrq_s32(vshlq_n_s32(ret, 14), res); 263 264 return ret; 265} 266 267static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max, 268 SkFixed one PREAMBLE_PARAM_X) { 269 int32x4_t ret, res, wide_one; 270 271 // Prepare constants 272 wide_one = vdupq_n_s32(one); 273 274 // Step 1 275 res = TILEY_PROCF_NEON4(f, max); 276 277 // Step 2 278 ret = TILEY_LOW_BITS_NEON4(f, max); 279 ret = vsliq_n_s32(ret, res, 4); 280 281 // Step 3 282 res = TILEY_PROCF_NEON4(f + wide_one, max); 283 ret = vorrq_s32(vshlq_n_s32(ret, 14), res); 284 285 return ret; 286} 287 288static void SCALE_FILTER_NAME(const SkBitmapProcState& s, 289 uint32_t xy[], int count, int x, int y) { 290 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 291 SkMatrix::kScale_Mask)) == 0); 292 SkASSERT(s.fInvKy == 0); 293 294 PREAMBLE(s); 295 296 const unsigned maxX = s.fBitmap->width() - 1; 297 const SkFixed one = s.fFilterOneX; 298 const SkFractionalInt dx = s.fInvSxFractionalInt; 299 SkFractionalInt fx; 300 301 { 302 SkPoint pt; 303 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 304 SkIntToScalar(y) + SK_ScalarHalf, &pt); 305 const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1); 306 const unsigned maxY = s.fBitmap->height() - 1; 307 // compute our two Y values up front 308 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); 309 // now initialize fx 310 fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1); 311 } 312 313#ifdef CHECK_FOR_DECAL 314 // test if we don't need to apply the tile proc 315 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { 316 decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx), 317 SkFractionalIntToFixed(dx), count); 318 return; 319 } 320#endif 321 { 322 323 if (count >= 4) { 324 int32x4_t wide_fx; 325 326 wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx)); 327 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1); 328 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2); 329 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3); 330 331 while (count >= 4) { 332 int32x4_t res; 333 334 res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X); 335 336 vst1q_u32(xy, vreinterpretq_u32_s32(res)); 337 338 wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx)); 339 fx += dx+dx+dx+dx; 340 xy += 4; 341 count -= 4; 342 } 343 } 344 345 while (--count >= 0) { 346 *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X); 347 fx += dx; 348 } 349 350 } 351} 352 353static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, 354 uint32_t xy[], int count, int x, int y) { 355 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 356 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 357 SkMatrix::kScale_Mask | 358 SkMatrix::kAffine_Mask)) == 0); 359 360 PREAMBLE(s); 361 SkPoint srcPt; 362 s.fInvProc(s.fInvMatrix, 363 SkIntToScalar(x) + SK_ScalarHalf, 364 SkIntToScalar(y) + SK_ScalarHalf, &srcPt); 365 366 SkFixed oneX = s.fFilterOneX; 367 SkFixed oneY = s.fFilterOneY; 368 SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1); 369 SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1); 370 SkFixed dx = s.fInvSx; 371 SkFixed dy = s.fInvKy; 372 unsigned maxX = s.fBitmap->width() - 1; 373 unsigned maxY = s.fBitmap->height() - 1; 374 375 if (count >= 4) { 376 int32x4_t wide_fy, wide_fx; 377 378 wide_fx = vdupq_n_s32(fx); 379 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 380 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 381 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 382 383 wide_fy = vdupq_n_s32(fy); 384 wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1); 385 wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2); 386 wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3); 387 388 while (count >= 4) { 389 int32x4x2_t vxy; 390 391 // do the X side, then the Y side, then interleave them 392 vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y); 393 vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X); 394 395 // interleave as YXYXYXYX as part of the storing 396 vst2q_s32((int32_t*)xy, vxy); 397 398 // prepare next iteration 399 wide_fx += vdupq_n_s32(dx+dx+dx+dx); 400 fx += dx + dx + dx + dx; 401 wide_fy += vdupq_n_s32(dy+dy+dy+dy); 402 fy += dy+dy+dy+dy; 403 xy += 8; // 4 x's, 4 y's 404 count -= 4; 405 } 406 } 407 408 while (--count >= 0) { 409 // NB: writing Y/X 410 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); 411 fy += dy; 412 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); 413 fx += dx; 414 } 415} 416 417static void PERSP_FILTER_NAME(const SkBitmapProcState& s, 418 uint32_t* SK_RESTRICT xy, int count, 419 int x, int y) { 420 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 421 422 PREAMBLE(s); 423 unsigned maxX = s.fBitmap->width() - 1; 424 unsigned maxY = s.fBitmap->height() - 1; 425 SkFixed oneX = s.fFilterOneX; 426 SkFixed oneY = s.fFilterOneY; 427 428 SkPerspIter iter(s.fInvMatrix, 429 SkIntToScalar(x) + SK_ScalarHalf, 430 SkIntToScalar(y) + SK_ScalarHalf, count); 431 432 while ((count = iter.next()) != 0) { 433 const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 434 435 while (count >= 4) { 436 int32x4_t wide_x, wide_y; 437 int32x4x2_t vxy, vresyx; 438 439 // load src: x-y-x-y-x-y-x-y 440 vxy = vld2q_s32(srcXY); 441 442 // do the X side, then the Y side, then interleave them 443 wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1)); 444 wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1)); 445 446 vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y); 447 vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X); 448 449 // store interleaved as y-x-y-x-y-x-y-x (NB != read order) 450 vst2q_s32((int32_t*)xy, vresyx); 451 452 // on to the next iteration 453 srcXY += 2*4; 454 count -= 4; 455 xy += 2*4; 456 } 457 458 while (--count >= 0) { 459 // NB: we read x/y, we write y/x 460 *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, 461 oneY PREAMBLE_ARG_Y); 462 *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, 463 oneX PREAMBLE_ARG_X); 464 srcXY += 2; 465 } 466 } 467} 468 469const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { 470 SCALE_NOFILTER_NAME, 471 SCALE_FILTER_NAME, 472 AFFINE_NOFILTER_NAME, 473 AFFINE_FILTER_NAME, 474 PERSP_NOFILTER_NAME, 475 PERSP_FILTER_NAME 476}; 477 478#undef TILEX_PROCF_NEON8 479#undef TILEY_PROCF_NEON8 480#undef TILEX_PROCF_NEON4 481#undef TILEY_PROCF_NEON4 482#undef TILEX_LOW_BITS_NEON4 483#undef TILEY_LOW_BITS_NEON4 484 485#undef MAKENAME 486#undef TILEX_PROCF 487#undef TILEY_PROCF 488#ifdef CHECK_FOR_DECAL 489 #undef CHECK_FOR_DECAL 490#endif 491 492#undef SCALE_NOFILTER_NAME 493#undef SCALE_FILTER_NAME 494#undef AFFINE_NOFILTER_NAME 495#undef AFFINE_FILTER_NAME 496#undef PERSP_NOFILTER_NAME 497#undef PERSP_FILTER_NAME 498 499#undef PREAMBLE 500#undef PREAMBLE_PARAM_X 501#undef PREAMBLE_PARAM_Y 502#undef PREAMBLE_ARG_X 503#undef PREAMBLE_ARG_Y 504 505#undef TILEX_LOW_BITS 506#undef TILEY_LOW_BITS 507