1/* 2 * Copyright 2014 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include <arm_neon.h> 9 10#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale) 11#define SCALE_FILTER_NAME MAKENAME(_filter_scale) 12#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine) 13#define AFFINE_FILTER_NAME MAKENAME(_filter_affine) 14#define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp) 15#define PERSP_FILTER_NAME MAKENAME(_filter_persp) 16 17#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x) 18#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y) 19#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4) 20#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4) 21 22#ifndef PREAMBLE 23 #define PREAMBLE(state) 24 #define PREAMBLE_PARAM_X 25 #define PREAMBLE_PARAM_Y 26 #define PREAMBLE_ARG_X 27 #define PREAMBLE_ARG_Y 28#endif 29 30static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, 31 uint32_t xy[], int count, int x, int y) { 32 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 33 SkMatrix::kScale_Mask)) == 0); 34 35 PREAMBLE(s); 36 37 // we store y, x, x, x, x, x 38 const unsigned maxX = s.fPixmap.width() - 1; 39 SkFractionalInt fx; 40 { 41 const SkBitmapProcStateAutoMapper mapper(s, x, y); 42 const unsigned maxY = s.fPixmap.height() - 1; 43 *xy++ = TILEY_PROCF(mapper.fixedY(), maxY); 44 fx = mapper.fractionalIntX(); 45 } 46 47 if (0 == maxX) { 48 // all of the following X values must be 0 49 memset(xy, 0, count * sizeof(uint16_t)); 50 return; 51 } 52 53 const SkFractionalInt dx = s.fInvSxFractionalInt; 54 55#ifdef CHECK_FOR_DECAL 56 // test if we don't need to apply the tile proc 57 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { 58 decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx), 59 SkFractionalIntToFixed(dx), count); 60 return; 61 } 62#endif 63 64 if (count >= 8) { 65 SkFractionalInt dx2 = dx+dx; 66 SkFractionalInt dx4 = dx2+dx2; 67 SkFractionalInt dx8 = dx4+dx4; 68 69 // now build fx/fx+dx/fx+2dx/fx+3dx 70 SkFractionalInt fx1, fx2, fx3; 71 int32x4_t lbase, hbase; 72 int16_t *dst16 = (int16_t *)xy; 73 74 fx1 = fx+dx; 75 fx2 = fx1+dx; 76 fx3 = fx2+dx; 77 78 lbase = vdupq_n_s32(SkFractionalIntToFixed(fx)); 79 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1); 80 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2); 81 lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3); 82 hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4))); 83 84 // store & bump 85 while (count >= 8) { 86 87 int16x8_t fx8; 88 89 fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX); 90 91 vst1q_s16(dst16, fx8); 92 93 // but preserving base & on to the next 94 lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); 95 hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); 96 dst16 += 8; 97 count -= 8; 98 fx += dx8; 99 }; 100 xy = (uint32_t *) dst16; 101 } 102 103 uint16_t* xx = (uint16_t*)xy; 104 for (int i = count; i > 0; --i) { 105 *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX); 106 fx += dx; 107 } 108} 109 110static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, 111 uint32_t xy[], int count, int x, int y) { 112 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 113 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 114 SkMatrix::kScale_Mask | 115 SkMatrix::kAffine_Mask)) == 0); 116 117 PREAMBLE(s); 118 const SkBitmapProcStateAutoMapper mapper(s, x, y); 119 120 SkFractionalInt fx = mapper.fractionalIntX(); 121 SkFractionalInt fy = mapper.fractionalIntY(); 122 SkFractionalInt dx = s.fInvSxFractionalInt; 123 SkFractionalInt dy = s.fInvKyFractionalInt; 124 int maxX = s.fPixmap.width() - 1; 125 int maxY = s.fPixmap.height() - 1; 126 127 if (count >= 8) { 128 SkFractionalInt dx4 = dx * 4; 129 SkFractionalInt dy4 = dy * 4; 130 SkFractionalInt dx8 = dx * 8; 131 SkFractionalInt dy8 = dy * 8; 132 133 int32x4_t xbase, ybase; 134 int32x4_t x2base, y2base; 135 int16_t *dst16 = (int16_t *) xy; 136 137 // now build fx, fx+dx, fx+2dx, fx+3dx 138 xbase = vdupq_n_s32(SkFractionalIntToFixed(fx)); 139 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1); 140 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2); 141 xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3); 142 143 // same for fy 144 ybase = vdupq_n_s32(SkFractionalIntToFixed(fy)); 145 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1); 146 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2); 147 ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3); 148 149 x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4))); 150 y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4))); 151 152 // store & bump 153 do { 154 int16x8x2_t hi16; 155 156 hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX); 157 hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY); 158 159 vst2q_s16(dst16, hi16); 160 161 // moving base and on to the next 162 xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8))); 163 ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8))); 164 x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8))); 165 y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8))); 166 167 dst16 += 16; // 8x32 aka 16x16 168 count -= 8; 169 fx += dx8; 170 fy += dy8; 171 } while (count >= 8); 172 xy = (uint32_t *) dst16; 173 } 174 175 for (int i = count; i > 0; --i) { 176 *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) | 177 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX); 178 fx += dx; fy += dy; 179 } 180} 181 182static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, 183 uint32_t* SK_RESTRICT xy, 184 int count, int x, int y) { 185 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 186 187 PREAMBLE(s); 188 // max{X,Y} are int here, but later shown/assumed to fit in 16 bits 189 int maxX = s.fPixmap.width() - 1; 190 int maxY = s.fPixmap.height() - 1; 191 192 SkPerspIter iter(s.fInvMatrix, 193 SkIntToScalar(x) + SK_ScalarHalf, 194 SkIntToScalar(y) + SK_ScalarHalf, count); 195 196 while ((count = iter.next()) != 0) { 197 const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 198 199 if (count >= 8) { 200 int32_t *mysrc = (int32_t *) srcXY; 201 int16_t *mydst = (int16_t *) xy; 202 do { 203 int16x8x2_t hi16; 204 int32x4x2_t xy1, xy2; 205 206 xy1 = vld2q_s32(mysrc); 207 xy2 = vld2q_s32(mysrc+8); 208 209 hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX); 210 hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY); 211 212 vst2q_s16(mydst, hi16); 213 214 count -= 8; // 8 iterations 215 mysrc += 16; // 16 longs 216 mydst += 16; // 16 shorts, aka 8 longs 217 } while (count >= 8); 218 // get xy and srcXY fixed up 219 srcXY = (const SkFixed *) mysrc; 220 xy = (uint32_t *) mydst; 221 } 222 223 while (--count >= 0) { 224 *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) | 225 TILEX_PROCF(srcXY[0], maxX); 226 srcXY += 2; 227 } 228 } 229} 230 231static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, 232 SkFixed one PREAMBLE_PARAM_Y) { 233 unsigned i = TILEY_PROCF(f, max); 234 i = (i << 4) | TILEY_LOW_BITS(f, max); 235 return (i << 14) | (TILEY_PROCF((f + one), max)); 236} 237 238static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, 239 SkFixed one PREAMBLE_PARAM_X) { 240 unsigned i = TILEX_PROCF(f, max); 241 i = (i << 4) | TILEX_LOW_BITS(f, max); 242 return (i << 14) | (TILEX_PROCF((f + one), max)); 243} 244 245static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max, 246 SkFixed one PREAMBLE_PARAM_X) { 247 int32x4_t ret, res, wide_one; 248 249 // Prepare constants 250 wide_one = vdupq_n_s32(one); 251 252 // Step 1 253 res = TILEX_PROCF_NEON4(f, max); 254 255 // Step 2 256 ret = TILEX_LOW_BITS_NEON4(f, max); 257 ret = vsliq_n_s32(ret, res, 4); 258 259 // Step 3 260 res = TILEX_PROCF_NEON4(f + wide_one, max); 261 ret = vorrq_s32(vshlq_n_s32(ret, 14), res); 262 263 return ret; 264} 265 266static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max, 267 SkFixed one PREAMBLE_PARAM_X) { 268 int32x4_t ret, res, wide_one; 269 270 // Prepare constants 271 wide_one = vdupq_n_s32(one); 272 273 // Step 1 274 res = TILEY_PROCF_NEON4(f, max); 275 276 // Step 2 277 ret = TILEY_LOW_BITS_NEON4(f, max); 278 ret = vsliq_n_s32(ret, res, 4); 279 280 // Step 3 281 res = TILEY_PROCF_NEON4(f + wide_one, max); 282 ret = vorrq_s32(vshlq_n_s32(ret, 14), res); 283 284 return ret; 285} 286 287static void SCALE_FILTER_NAME(const SkBitmapProcState& s, 288 uint32_t xy[], int count, int x, int y) { 289 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 290 SkMatrix::kScale_Mask)) == 0); 291 SkASSERT(s.fInvKy == 0); 292 293 PREAMBLE(s); 294 295 const unsigned maxX = s.fPixmap.width() - 1; 296 const SkFixed one = s.fFilterOneX; 297 const SkFractionalInt dx = s.fInvSxFractionalInt; 298 SkFractionalInt fx; 299 300 { 301 const SkBitmapProcStateAutoMapper mapper(s, x, y); 302 const SkFixed fy = mapper.fixedY(); 303 const unsigned maxY = s.fPixmap.height() - 1; 304 // compute our two Y values up front 305 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y); 306 // now initialize fx 307 fx = mapper.fractionalIntX(); 308 } 309 310#ifdef CHECK_FOR_DECAL 311 // test if we don't need to apply the tile proc 312 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) { 313 decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx), 314 SkFractionalIntToFixed(dx), count); 315 return; 316 } 317#endif 318 { 319 320 if (count >= 4) { 321 int32x4_t wide_fx; 322 323 wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx)); 324 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1); 325 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2); 326 wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3); 327 328 while (count >= 4) { 329 int32x4_t res; 330 331 res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X); 332 333 vst1q_u32(xy, vreinterpretq_u32_s32(res)); 334 335 wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx)); 336 fx += dx+dx+dx+dx; 337 xy += 4; 338 count -= 4; 339 } 340 } 341 342 while (--count >= 0) { 343 *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X); 344 fx += dx; 345 } 346 347 } 348} 349 350static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, 351 uint32_t xy[], int count, int x, int y) { 352 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); 353 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | 354 SkMatrix::kScale_Mask | 355 SkMatrix::kAffine_Mask)) == 0); 356 357 PREAMBLE(s); 358 const SkBitmapProcStateAutoMapper mapper(s, x, y); 359 360 SkFixed oneX = s.fFilterOneX; 361 SkFixed oneY = s.fFilterOneY; 362 SkFixed fx = mapper.fixedX(); 363 SkFixed fy = mapper.fixedY(); 364 SkFixed dx = s.fInvSx; 365 SkFixed dy = s.fInvKy; 366 unsigned maxX = s.fPixmap.width() - 1; 367 unsigned maxY = s.fPixmap.height() - 1; 368 369 if (count >= 4) { 370 int32x4_t wide_fy, wide_fx; 371 372 wide_fx = vdupq_n_s32(fx); 373 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 374 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 375 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 376 377 wide_fy = vdupq_n_s32(fy); 378 wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1); 379 wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2); 380 wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3); 381 382 while (count >= 4) { 383 int32x4x2_t vxy; 384 385 // do the X side, then the Y side, then interleave them 386 vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y); 387 vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X); 388 389 // interleave as YXYXYXYX as part of the storing 390 vst2q_s32((int32_t*)xy, vxy); 391 392 // prepare next iteration 393 wide_fx += vdupq_n_s32(dx+dx+dx+dx); 394 fx += dx + dx + dx + dx; 395 wide_fy += vdupq_n_s32(dy+dy+dy+dy); 396 fy += dy+dy+dy+dy; 397 xy += 8; // 4 x's, 4 y's 398 count -= 4; 399 } 400 } 401 402 while (--count >= 0) { 403 // NB: writing Y/X 404 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y); 405 fy += dy; 406 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X); 407 fx += dx; 408 } 409} 410 411static void PERSP_FILTER_NAME(const SkBitmapProcState& s, 412 uint32_t* SK_RESTRICT xy, int count, 413 int x, int y) { 414 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); 415 416 PREAMBLE(s); 417 unsigned maxX = s.fPixmap.width() - 1; 418 unsigned maxY = s.fPixmap.height() - 1; 419 SkFixed oneX = s.fFilterOneX; 420 SkFixed oneY = s.fFilterOneY; 421 422 SkPerspIter iter(s.fInvMatrix, 423 SkIntToScalar(x) + SK_ScalarHalf, 424 SkIntToScalar(y) + SK_ScalarHalf, count); 425 426 while ((count = iter.next()) != 0) { 427 const SkFixed* SK_RESTRICT srcXY = iter.getXY(); 428 429 while (count >= 4) { 430 int32x4_t wide_x, wide_y; 431 int32x4x2_t vxy, vresyx; 432 433 // load src: x-y-x-y-x-y-x-y 434 vxy = vld2q_s32(srcXY); 435 436 // do the X side, then the Y side, then interleave them 437 wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1)); 438 wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1)); 439 440 vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y); 441 vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X); 442 443 // store interleaved as y-x-y-x-y-x-y-x (NB != read order) 444 vst2q_s32((int32_t*)xy, vresyx); 445 446 // on to the next iteration 447 srcXY += 2*4; 448 count -= 4; 449 xy += 2*4; 450 } 451 452 while (--count >= 0) { 453 // NB: we read x/y, we write y/x 454 *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY, 455 oneY PREAMBLE_ARG_Y); 456 *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX, 457 oneX PREAMBLE_ARG_X); 458 srcXY += 2; 459 } 460 } 461} 462 463const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { 464 SCALE_NOFILTER_NAME, 465 SCALE_FILTER_NAME, 466 AFFINE_NOFILTER_NAME, 467 AFFINE_FILTER_NAME, 468 PERSP_NOFILTER_NAME, 469 PERSP_FILTER_NAME 470}; 471 472#undef TILEX_PROCF_NEON8 473#undef TILEY_PROCF_NEON8 474#undef TILEX_PROCF_NEON4 475#undef TILEY_PROCF_NEON4 476#undef TILEX_LOW_BITS_NEON4 477#undef TILEY_LOW_BITS_NEON4 478 479#undef MAKENAME 480#undef TILEX_PROCF 481#undef TILEY_PROCF 482#ifdef CHECK_FOR_DECAL 483 #undef CHECK_FOR_DECAL 484#endif 485 486#undef SCALE_NOFILTER_NAME 487#undef SCALE_FILTER_NAME 488#undef AFFINE_NOFILTER_NAME 489#undef AFFINE_FILTER_NAME 490#undef PERSP_NOFILTER_NAME 491#undef PERSP_FILTER_NAME 492 493#undef PREAMBLE 494#undef PREAMBLE_PARAM_X 495#undef PREAMBLE_PARAM_Y 496#undef PREAMBLE_ARG_X 497#undef PREAMBLE_ARG_Y 498 499#undef TILEX_LOW_BITS 500#undef TILEY_LOW_BITS 501