// SkBitmapProcState_matrixProcs.cpp -- revision f444e8ccda8905a8ce16bac368e09f205786db31
1/* NEON optimized code (C) COPYRIGHT 2009 Motorola 2 * 3 * Use of this source code is governed by a BSD-style license that can be 4 * found in the LICENSE file. 5 */ 6 7#include "SkBitmapProcState.h" 8#include "SkPerspIter.h" 9#include "SkShader.h" 10#include "SkUtils.h" 11 12/* returns 0...(n-1) given any x (positive or negative). 13 14 As an example, if n (which is always positive) is 5... 15 16 x: -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 17 returns: 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 18 */ 19static inline int sk_int_mod(int x, int n) { 20 SkASSERT(n > 0); 21 if ((unsigned)x >= (unsigned)n) { 22 if (x < 0) { 23 x = n + ~(~x % n); 24 } else { 25 x = x % n; 26 } 27 } 28 return x; 29} 30 31void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 32void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 33 34#define MAKENAME(suffix) ClampX_ClampY ## suffix 35#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 36#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 37#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 38#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 39#define CHECK_FOR_DECAL 40#if defined(__ARM_HAVE_NEON) 41 #include "SkBitmapProcState_matrix_clamp.h" 42#else 43 #include "SkBitmapProcState_matrix.h" 44#endif 45 46#define MAKENAME(suffix) RepeatX_RepeatY ## suffix 47#define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) 48#define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16) 49#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 50#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 51#if defined(__ARM_HAVE_NEON) 52 #include "SkBitmapProcState_matrix_repeat.h" 53#else 54 #include "SkBitmapProcState_matrix.h" 55#endif 56 57#define MAKENAME(suffix) GeneralXY ## suffix 58#define PREAMBLE(state) SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \ 59 SkBitmapProcState::FixedTileProc tileProcY = 
(state).fTileProcY; \ 60 SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX = (state).fTileLowBitsProcX; \ 61 SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY = (state).fTileLowBitsProcY 62#define PREAMBLE_PARAM_X , SkBitmapProcState::FixedTileProc tileProcX, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX 63#define PREAMBLE_PARAM_Y , SkBitmapProcState::FixedTileProc tileProcY, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY 64#define PREAMBLE_ARG_X , tileProcX, tileLowBitsProcX 65#define PREAMBLE_ARG_Y , tileProcY, tileLowBitsProcY 66#define TILEX_PROCF(fx, max) (tileProcX(fx) * ((max) + 1) >> 16) 67#define TILEY_PROCF(fy, max) (tileProcY(fy) * ((max) + 1) >> 16) 68#define TILEX_LOW_BITS(fx, max) tileLowBitsProcX(fx, (max) + 1) 69#define TILEY_LOW_BITS(fy, max) tileLowBitsProcY(fy, (max) + 1) 70#include "SkBitmapProcState_matrix.h" 71 72static inline U16CPU fixed_clamp(SkFixed x) 73{ 74#ifdef SK_CPU_HAS_CONDITIONAL_INSTR 75 if (x >> 16) 76 x = 0xFFFF; 77 if (x < 0) 78 x = 0; 79#else 80 if (x >> 16) 81 { 82 if (x < 0) 83 x = 0; 84 else 85 x = 0xFFFF; 86 } 87#endif 88 return x; 89} 90 91static inline U16CPU fixed_repeat(SkFixed x) 92{ 93 return x & 0xFFFF; 94} 95 96static inline U16CPU fixed_mirror(SkFixed x) 97{ 98 SkFixed s = x << 15 >> 31; 99 // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval 100 return (x ^ s) & 0xFFFF; 101} 102 103static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m) 104{ 105 if (SkShader::kClamp_TileMode == m) 106 return fixed_clamp; 107 if (SkShader::kRepeat_TileMode == m) 108 return fixed_repeat; 109 SkASSERT(SkShader::kMirror_TileMode == m); 110 return fixed_mirror; 111} 112 113static inline U16CPU fixed_clamp_lowbits(SkFixed x, int) { 114 return (x >> 12) & 0xF; 115} 116 117static inline U16CPU fixed_repeat_or_mirrow_lowbits(SkFixed x, int scale) { 118 return ((x * scale) >> 12) & 0xF; 119} 120 121static SkBitmapProcState::FixedTileLowBitsProc 
choose_tile_lowbits_proc(unsigned m) { 122 if (SkShader::kClamp_TileMode == m) { 123 return fixed_clamp_lowbits; 124 } else { 125 SkASSERT(SkShader::kMirror_TileMode == m || 126 SkShader::kRepeat_TileMode == m); 127 // mirror and repeat have the same behavior for the low bits. 128 return fixed_repeat_or_mirrow_lowbits; 129 } 130} 131 132static inline U16CPU int_clamp(int x, int n) { 133#ifdef SK_CPU_HAS_CONDITIONAL_INSTR 134 if (x >= n) 135 x = n - 1; 136 if (x < 0) 137 x = 0; 138#else 139 if ((unsigned)x >= (unsigned)n) { 140 if (x < 0) { 141 x = 0; 142 } else { 143 x = n - 1; 144 } 145 } 146#endif 147 return x; 148} 149 150static inline U16CPU int_repeat(int x, int n) { 151 return sk_int_mod(x, n); 152} 153 154static inline U16CPU int_mirror(int x, int n) { 155 x = sk_int_mod(x, 2 * n); 156 if (x >= n) { 157 x = n + ~(x - n); 158 } 159 return x; 160} 161 162#if 0 163static void test_int_tileprocs() { 164 for (int i = -8; i <= 8; i++) { 165 SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3)); 166 } 167} 168#endif 169 170static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) { 171 if (SkShader::kClamp_TileMode == tm) 172 return int_clamp; 173 if (SkShader::kRepeat_TileMode == tm) 174 return int_repeat; 175 SkASSERT(SkShader::kMirror_TileMode == tm); 176 return int_mirror; 177} 178 179////////////////////////////////////////////////////////////////////////////// 180 181void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 182{ 183 int i; 184 185#if defined(__ARM_HAVE_NEON) 186 if (count >= 8) { 187 /* SkFixed is 16.16 fixed point */ 188 SkFixed dx2 = dx+dx; 189 SkFixed dx4 = dx2+dx2; 190 SkFixed dx8 = dx4+dx4; 191 192 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 193 SkFixed fx1, fx2, fx3; 194 int32x2_t lower, upper; 195 int32x4_t lbase, hbase; 196 uint16_t *dst16 = (uint16_t *)dst; 197 198 fx1 = fx+dx; 199 fx2 = fx1+dx; 200 fx3 = fx2+dx; 201 202 /* avoid an 'lbase unitialized' warning */ 203 lbase = vdupq_n_s32(fx); 204 
lbase = vsetq_lane_s32(fx1, lbase, 1); 205 lbase = vsetq_lane_s32(fx2, lbase, 2); 206 lbase = vsetq_lane_s32(fx3, lbase, 3); 207 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); 208 209 /* take upper 16 of each, store, and bump everything */ 210 do { 211 int32x4_t lout, hout; 212 uint16x8_t hi16; 213 214 lout = lbase; 215 hout = hbase; 216 /* gets hi's of all louts then hi's of all houts */ 217 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); 218 hi16 = vreinterpretq_u16_s32(hout); 219 vst1q_u16(dst16, hi16); 220 221 /* on to the next */ 222 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); 223 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); 224 dst16 += 8; 225 count -= 8; 226 fx += dx8; 227 } while (count >= 8); 228 dst = (uint32_t *) dst16; 229 } 230#else 231 for (i = (count >> 2); i > 0; --i) 232 { 233 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 234 fx += dx+dx; 235 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 236 fx += dx+dx; 237 } 238 count &= 3; 239#endif 240 241 uint16_t* xx = (uint16_t*)dst; 242 for (i = count; i > 0; --i) { 243 *xx++ = SkToU16(fx >> 16); fx += dx; 244 } 245} 246 247void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 248{ 249 250#if defined(__ARM_HAVE_NEON) 251 if (count >= 8) { 252 int32x4_t wide_fx; 253 int32x4_t wide_fx2; 254 int32x4_t wide_dx8 = vdupq_n_s32(dx*8); 255 256 wide_fx = vdupq_n_s32(fx); 257 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 258 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 259 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 260 261 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); 262 263 while (count >= 8) { 264 int32x4_t wide_out; 265 int32x4_t wide_out2; 266 267 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); 268 wide_out = vorrq_s32(wide_out, 269 vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1))); 270 271 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); 272 wide_out2 = vorrq_s32(wide_out2, 273 vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1))); 274 
275 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); 276 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); 277 278 dst += 8; 279 fx += dx*8; 280 wide_fx = vaddq_s32(wide_fx, wide_dx8); 281 wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); 282 count -= 8; 283 } 284 } 285#endif 286 287 if (count & 1) 288 { 289 SkASSERT((fx >> (16 + 14)) == 0); 290 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 291 fx += dx; 292 } 293 while ((count -= 2) >= 0) 294 { 295 SkASSERT((fx >> (16 + 14)) == 0); 296 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 297 fx += dx; 298 299 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 300 fx += dx; 301 } 302} 303 304/////////////////////////////////////////////////////////////////////////////// 305// stores the same as SCALE, but is cheaper to compute. Also since there is no 306// scale, we don't need/have a FILTER version 307 308static void fill_sequential(uint16_t xptr[], int start, int count) { 309#if 1 310 if (reinterpret_cast<intptr_t>(xptr) & 0x2) { 311 *xptr++ = start++; 312 count -= 1; 313 } 314 if (count > 3) { 315 uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr); 316 uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1); 317 uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3); 318 start += count & ~3; 319 int qcount = count >> 2; 320 do { 321 *xxptr++ = pattern0; 322 pattern0 += 0x40004; 323 *xxptr++ = pattern1; 324 pattern1 += 0x40004; 325 } while (--qcount != 0); 326 xptr = reinterpret_cast<uint16_t*>(xxptr); 327 count &= 3; 328 } 329 while (--count >= 0) { 330 *xptr++ = start++; 331 } 332#else 333 for (int i = 0; i < count; i++) { 334 *xptr++ = start++; 335 } 336#endif 337} 338 339static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy, 340 int x, int y) { 341 SkPoint pt; 342 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 343 SkIntToScalar(y) + SK_ScalarHalf, &pt); 344 **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16, 345 s.fBitmap->height()); 346 *xy += 1; // bump the ptr 347 // return our 
starting X position 348 return SkScalarToFixed(pt.fX) >> 16; 349} 350 351static void clampx_nofilter_trans(const SkBitmapProcState& s, 352 uint32_t xy[], int count, int x, int y) { 353 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 354 355 int xpos = nofilter_trans_preamble(s, &xy, x, y); 356 const int width = s.fBitmap->width(); 357 if (1 == width) { 358 // all of the following X values must be 0 359 memset(xy, 0, count * sizeof(uint16_t)); 360 return; 361 } 362 363 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 364 int n; 365 366 // fill before 0 as needed 367 if (xpos < 0) { 368 n = -xpos; 369 if (n > count) { 370 n = count; 371 } 372 memset(xptr, 0, n * sizeof(uint16_t)); 373 count -= n; 374 if (0 == count) { 375 return; 376 } 377 xptr += n; 378 xpos = 0; 379 } 380 381 // fill in 0..width-1 if needed 382 if (xpos < width) { 383 n = width - xpos; 384 if (n > count) { 385 n = count; 386 } 387 fill_sequential(xptr, xpos, n); 388 count -= n; 389 if (0 == count) { 390 return; 391 } 392 xptr += n; 393 } 394 395 // fill the remaining with the max value 396 sk_memset16(xptr, width - 1, count); 397} 398 399static void repeatx_nofilter_trans(const SkBitmapProcState& s, 400 uint32_t xy[], int count, int x, int y) { 401 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 402 403 int xpos = nofilter_trans_preamble(s, &xy, x, y); 404 const int width = s.fBitmap->width(); 405 if (1 == width) { 406 // all of the following X values must be 0 407 memset(xy, 0, count * sizeof(uint16_t)); 408 return; 409 } 410 411 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 412 int start = sk_int_mod(xpos, width); 413 int n = width - start; 414 if (n > count) { 415 n = count; 416 } 417 fill_sequential(xptr, start, n); 418 xptr += n; 419 count -= n; 420 421 while (count >= width) { 422 fill_sequential(xptr, 0, width); 423 xptr += width; 424 count -= width; 425 } 426 427 if (count > 0) { 428 fill_sequential(xptr, 0, count); 429 } 430} 431 432static void 
fill_backwards(uint16_t xptr[], int pos, int count) { 433 for (int i = 0; i < count; i++) { 434 SkASSERT(pos >= 0); 435 xptr[i] = pos--; 436 } 437} 438 439static void mirrorx_nofilter_trans(const SkBitmapProcState& s, 440 uint32_t xy[], int count, int x, int y) { 441 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 442 443 int xpos = nofilter_trans_preamble(s, &xy, x, y); 444 const int width = s.fBitmap->width(); 445 if (1 == width) { 446 // all of the following X values must be 0 447 memset(xy, 0, count * sizeof(uint16_t)); 448 return; 449 } 450 451 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 452 // need to know our start, and our initial phase (forward or backward) 453 bool forward; 454 int n; 455 int start = sk_int_mod(xpos, 2 * width); 456 if (start >= width) { 457 start = width + ~(start - width); 458 forward = false; 459 n = start + 1; // [start .. 0] 460 } else { 461 forward = true; 462 n = width - start; // [start .. width) 463 } 464 if (n > count) { 465 n = count; 466 } 467 if (forward) { 468 fill_sequential(xptr, start, n); 469 } else { 470 fill_backwards(xptr, start, n); 471 } 472 forward = !forward; 473 xptr += n; 474 count -= n; 475 476 while (count >= width) { 477 if (forward) { 478 fill_sequential(xptr, 0, width); 479 } else { 480 fill_backwards(xptr, width - 1, width); 481 } 482 forward = !forward; 483 xptr += width; 484 count -= width; 485 } 486 487 if (count > 0) { 488 if (forward) { 489 fill_sequential(xptr, 0, count); 490 } else { 491 fill_backwards(xptr, width - 1, count); 492 } 493 } 494} 495 496/////////////////////////////////////////////////////////////////////////////// 497 498SkBitmapProcState::MatrixProc 499SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) { 500// test_int_tileprocs(); 501 // check for our special case when there is no scale/affine/perspective 502 if (trivial_matrix) { 503 SkASSERT(!fDoFilter); 504 fIntTileProcY = choose_int_tile_proc(fTileModeY); 505 switch (fTileModeX) { 506 case 
SkShader::kClamp_TileMode: 507 return clampx_nofilter_trans; 508 case SkShader::kRepeat_TileMode: 509 return repeatx_nofilter_trans; 510 case SkShader::kMirror_TileMode: 511 return mirrorx_nofilter_trans; 512 } 513 } 514 515 int index = 0; 516 if (fDoFilter) { 517 index = 1; 518 } 519 if (fInvType & SkMatrix::kPerspective_Mask) { 520 index += 4; 521 } else if (fInvType & SkMatrix::kAffine_Mask) { 522 index += 2; 523 } 524 525 if (SkShader::kClamp_TileMode == fTileModeX && 526 SkShader::kClamp_TileMode == fTileModeY) 527 { 528 // clamp gets special version of filterOne 529 fFilterOneX = SK_Fixed1; 530 fFilterOneY = SK_Fixed1; 531 return ClampX_ClampY_Procs[index]; 532 } 533 534 // all remaining procs use this form for filterOne 535 fFilterOneX = SK_Fixed1 / fBitmap->width(); 536 fFilterOneY = SK_Fixed1 / fBitmap->height(); 537 538 if (SkShader::kRepeat_TileMode == fTileModeX && 539 SkShader::kRepeat_TileMode == fTileModeY) 540 { 541 return RepeatX_RepeatY_Procs[index]; 542 } 543 544 fTileProcX = choose_tile_proc(fTileModeX); 545 fTileProcY = choose_tile_proc(fTileModeY); 546 fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX); 547 fTileLowBitsProcY = choose_tile_lowbits_proc(fTileModeY); 548 return GeneralXY_Procs[index]; 549} 550 551