// SkBitmapProcState_matrixProcs.cpp revision 4d0078aa5115fab8ccd8ef59519a8937ea4e8854
1/* NEON optimized code (C) COPYRIGHT 2009 Motorola 2 * 3 * Use of this source code is governed by a BSD-style license that can be 4 * found in the LICENSE file. 5 */ 6 7#include "SkBitmapProcState.h" 8#include "SkPerspIter.h" 9#include "SkShader.h" 10#include "SkUtils.h" 11 12// Helper to ensure that when we shift down, we do it w/o sign-extension 13// so the caller doesn't have to manually mask off the top 16 bits 14// 15static unsigned SK_USHIFT16(unsigned x) { 16 return x >> 16; 17} 18 19/* returns 0...(n-1) given any x (positive or negative). 20 21 As an example, if n (which is always positive) is 5... 22 23 x: -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 24 returns: 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 25 */ 26static inline int sk_int_mod(int x, int n) { 27 SkASSERT(n > 0); 28 if ((unsigned)x >= (unsigned)n) { 29 if (x < 0) { 30 x = n + ~(~x % n); 31 } else { 32 x = x % n; 33 } 34 } 35 return x; 36} 37 38/* 39 * The decal_ functions require that 40 * 1. dx > 0 41 * 2. [fx, fx+dx, fx+2dx, fx+3dx, ... fx+(count-1)dx] are all <= maxX 42 * 43 * In addition, we use SkFractionalInt to keep more fractional precision than 44 * just SkFixed, so we will abort the decal_ call if dx is very small, since 45 * the decal_ function just operates on SkFixed. If that were changed, we could 46 * skip the very_small test here. 47 */ 48static inline bool can_truncate_to_fixed_for_decal(SkFractionalInt frX, 49 SkFractionalInt frDx, 50 int count, unsigned max) { 51 SkFixed dx = SkFractionalIntToFixed(frDx); 52 53 // if decal_ kept SkFractionalInt precision, this would just be dx <= 0 54 // I just made up the 1/256. Just don't want to perceive accumulated error 55 // if we truncate frDx and lose its low bits. 56 if (dx <= SK_Fixed1 / 256) { 57 return false; 58 } 59 60 // We cast to unsigned so we don't have to check for negative values, which 61 // will now appear as very large positive values, and thus fail our test! 
62 SkFixed fx = SkFractionalIntToFixed(frX); 63 return (unsigned)SkFixedFloorToInt(fx) <= max && 64 (unsigned)SkFixedFloorToInt(fx + dx * (count - 1)) < max; 65} 66 67void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 68void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 69 70#define MAKENAME(suffix) ClampX_ClampY ## suffix 71#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 72#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 73#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 74#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 75#define CHECK_FOR_DECAL 76#if defined(__ARM_HAVE_NEON) 77 #include "SkBitmapProcState_matrix_clamp.h" 78#else 79 #include "SkBitmapProcState_matrix.h" 80#endif 81 82#define MAKENAME(suffix) RepeatX_RepeatY ## suffix 83#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) 84#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) 85#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 86#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 87#if defined(__ARM_HAVE_NEON) 88 #include "SkBitmapProcState_matrix_repeat.h" 89#else 90 #include "SkBitmapProcState_matrix.h" 91#endif 92 93#define MAKENAME(suffix) GeneralXY ## suffix 94#define PREAMBLE(state) SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \ 95 SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY; \ 96 SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX = (state).fTileLowBitsProcX; \ 97 SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY = (state).fTileLowBitsProcY 98#define PREAMBLE_PARAM_X , SkBitmapProcState::FixedTileProc tileProcX, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX 99#define PREAMBLE_PARAM_Y , SkBitmapProcState::FixedTileProc tileProcY, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY 100#define PREAMBLE_ARG_X , tileProcX, tileLowBitsProcX 101#define 
PREAMBLE_ARG_Y , tileProcY, tileLowBitsProcY 102#define TILEX_PROCF(fx, max) SK_USHIFT16(tileProcX(fx) * ((max) + 1)) 103#define TILEY_PROCF(fy, max) SK_USHIFT16(tileProcY(fy) * ((max) + 1)) 104#define TILEX_LOW_BITS(fx, max) tileLowBitsProcX(fx, (max) + 1) 105#define TILEY_LOW_BITS(fy, max) tileLowBitsProcY(fy, (max) + 1) 106#include "SkBitmapProcState_matrix.h" 107 108static inline U16CPU fixed_clamp(SkFixed x) 109{ 110#ifdef SK_CPU_HAS_CONDITIONAL_INSTR 111 if (x >> 16) 112 x = 0xFFFF; 113 if (x < 0) 114 x = 0; 115#else 116 if (x >> 16) 117 { 118 if (x < 0) 119 x = 0; 120 else 121 x = 0xFFFF; 122 } 123#endif 124 return x; 125} 126 127static inline U16CPU fixed_repeat(SkFixed x) 128{ 129 return x & 0xFFFF; 130} 131 132// Visual Studio 2010 (MSC_VER=1600) optimizes bit-shift code incorrectly. 133// See http://code.google.com/p/skia/issues/detail?id=472 134#if defined(_MSC_VER) && (_MSC_VER >= 1600) 135#pragma optimize("", off) 136#endif 137 138static inline U16CPU fixed_mirror(SkFixed x) 139{ 140 SkFixed s = x << 15 >> 31; 141 // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval 142 return (x ^ s) & 0xFFFF; 143} 144 145#if defined(_MSC_VER) && (_MSC_VER >= 1600) 146#pragma optimize("", on) 147#endif 148 149static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m) 150{ 151 if (SkShader::kClamp_TileMode == m) 152 return fixed_clamp; 153 if (SkShader::kRepeat_TileMode == m) 154 return fixed_repeat; 155 SkASSERT(SkShader::kMirror_TileMode == m); 156 return fixed_mirror; 157} 158 159static inline U16CPU fixed_clamp_lowbits(SkFixed x, int) { 160 return (x >> 12) & 0xF; 161} 162 163static inline U16CPU fixed_repeat_or_mirrow_lowbits(SkFixed x, int scale) { 164 return ((x * scale) >> 12) & 0xF; 165} 166 167static SkBitmapProcState::FixedTileLowBitsProc choose_tile_lowbits_proc(unsigned m) { 168 if (SkShader::kClamp_TileMode == m) { 169 return fixed_clamp_lowbits; 170 } else { 171 SkASSERT(SkShader::kMirror_TileMode == m || 172 
SkShader::kRepeat_TileMode == m); 173 // mirror and repeat have the same behavior for the low bits. 174 return fixed_repeat_or_mirrow_lowbits; 175 } 176} 177 178static inline U16CPU int_clamp(int x, int n) { 179#ifdef SK_CPU_HAS_CONDITIONAL_INSTR 180 if (x >= n) 181 x = n - 1; 182 if (x < 0) 183 x = 0; 184#else 185 if ((unsigned)x >= (unsigned)n) { 186 if (x < 0) { 187 x = 0; 188 } else { 189 x = n - 1; 190 } 191 } 192#endif 193 return x; 194} 195 196static inline U16CPU int_repeat(int x, int n) { 197 return sk_int_mod(x, n); 198} 199 200static inline U16CPU int_mirror(int x, int n) { 201 x = sk_int_mod(x, 2 * n); 202 if (x >= n) { 203 x = n + ~(x - n); 204 } 205 return x; 206} 207 208#if 0 209static void test_int_tileprocs() { 210 for (int i = -8; i <= 8; i++) { 211 SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3)); 212 } 213} 214#endif 215 216static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) { 217 if (SkShader::kClamp_TileMode == tm) 218 return int_clamp; 219 if (SkShader::kRepeat_TileMode == tm) 220 return int_repeat; 221 SkASSERT(SkShader::kMirror_TileMode == tm); 222 return int_mirror; 223} 224 225////////////////////////////////////////////////////////////////////////////// 226 227void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 228{ 229 int i; 230 231#if defined(__ARM_HAVE_NEON) 232 if (count >= 8) { 233 /* SkFixed is 16.16 fixed point */ 234 SkFixed dx2 = dx+dx; 235 SkFixed dx4 = dx2+dx2; 236 SkFixed dx8 = dx4+dx4; 237 238 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 239 SkFixed fx1, fx2, fx3; 240 int32x2_t lower, upper; 241 int32x4_t lbase, hbase; 242 uint16_t *dst16 = (uint16_t *)dst; 243 244 fx1 = fx+dx; 245 fx2 = fx1+dx; 246 fx3 = fx2+dx; 247 248 /* avoid an 'lbase unitialized' warning */ 249 lbase = vdupq_n_s32(fx); 250 lbase = vsetq_lane_s32(fx1, lbase, 1); 251 lbase = vsetq_lane_s32(fx2, lbase, 2); 252 lbase = vsetq_lane_s32(fx3, lbase, 3); 253 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); 254 255 
/* take upper 16 of each, store, and bump everything */ 256 do { 257 int32x4_t lout, hout; 258 uint16x8_t hi16; 259 260 lout = lbase; 261 hout = hbase; 262 /* gets hi's of all louts then hi's of all houts */ 263 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); 264 hi16 = vreinterpretq_u16_s32(hout); 265 vst1q_u16(dst16, hi16); 266 267 /* on to the next */ 268 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); 269 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); 270 dst16 += 8; 271 count -= 8; 272 fx += dx8; 273 } while (count >= 8); 274 dst = (uint32_t *) dst16; 275 } 276#else 277 for (i = (count >> 2); i > 0; --i) 278 { 279 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 280 fx += dx+dx; 281 *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); 282 fx += dx+dx; 283 } 284 count &= 3; 285#endif 286 287 uint16_t* xx = (uint16_t*)dst; 288 for (i = count; i > 0; --i) { 289 *xx++ = SkToU16(fx >> 16); fx += dx; 290 } 291} 292 293void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 294{ 295 296#if defined(__ARM_HAVE_NEON) 297 if (count >= 8) { 298 int32x4_t wide_fx; 299 int32x4_t wide_fx2; 300 int32x4_t wide_dx8 = vdupq_n_s32(dx*8); 301 302 wide_fx = vdupq_n_s32(fx); 303 wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); 304 wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); 305 wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); 306 307 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); 308 309 while (count >= 8) { 310 int32x4_t wide_out; 311 int32x4_t wide_out2; 312 313 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); 314 wide_out = vorrq_s32(wide_out, 315 vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1))); 316 317 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); 318 wide_out2 = vorrq_s32(wide_out2, 319 vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1))); 320 321 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); 322 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); 323 324 dst += 8; 325 fx += dx*8; 326 wide_fx = vaddq_s32(wide_fx, 
wide_dx8); 327 wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); 328 count -= 8; 329 } 330 } 331#endif 332 333 if (count & 1) 334 { 335 SkASSERT((fx >> (16 + 14)) == 0); 336 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 337 fx += dx; 338 } 339 while ((count -= 2) >= 0) 340 { 341 SkASSERT((fx >> (16 + 14)) == 0); 342 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 343 fx += dx; 344 345 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 346 fx += dx; 347 } 348} 349 350/////////////////////////////////////////////////////////////////////////////// 351// stores the same as SCALE, but is cheaper to compute. Also since there is no 352// scale, we don't need/have a FILTER version 353 354static void fill_sequential(uint16_t xptr[], int start, int count) { 355#if 1 356 if (reinterpret_cast<intptr_t>(xptr) & 0x2) { 357 *xptr++ = start++; 358 count -= 1; 359 } 360 if (count > 3) { 361 uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr); 362 uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1); 363 uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3); 364 start += count & ~3; 365 int qcount = count >> 2; 366 do { 367 *xxptr++ = pattern0; 368 pattern0 += 0x40004; 369 *xxptr++ = pattern1; 370 pattern1 += 0x40004; 371 } while (--qcount != 0); 372 xptr = reinterpret_cast<uint16_t*>(xxptr); 373 count &= 3; 374 } 375 while (--count >= 0) { 376 *xptr++ = start++; 377 } 378#else 379 for (int i = 0; i < count; i++) { 380 *xptr++ = start++; 381 } 382#endif 383} 384 385static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy, 386 int x, int y) { 387 SkPoint pt; 388 s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf, 389 SkIntToScalar(y) + SK_ScalarHalf, &pt); 390 **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16, 391 s.fBitmap->height()); 392 *xy += 1; // bump the ptr 393 // return our starting X position 394 return SkScalarToFixed(pt.fX) >> 16; 395} 396 397static void clampx_nofilter_trans(const SkBitmapProcState& s, 398 uint32_t xy[], int count, int x, int y) { 
399 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 400 401 int xpos = nofilter_trans_preamble(s, &xy, x, y); 402 const int width = s.fBitmap->width(); 403 if (1 == width) { 404 // all of the following X values must be 0 405 memset(xy, 0, count * sizeof(uint16_t)); 406 return; 407 } 408 409 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 410 int n; 411 412 // fill before 0 as needed 413 if (xpos < 0) { 414 n = -xpos; 415 if (n > count) { 416 n = count; 417 } 418 memset(xptr, 0, n * sizeof(uint16_t)); 419 count -= n; 420 if (0 == count) { 421 return; 422 } 423 xptr += n; 424 xpos = 0; 425 } 426 427 // fill in 0..width-1 if needed 428 if (xpos < width) { 429 n = width - xpos; 430 if (n > count) { 431 n = count; 432 } 433 fill_sequential(xptr, xpos, n); 434 count -= n; 435 if (0 == count) { 436 return; 437 } 438 xptr += n; 439 } 440 441 // fill the remaining with the max value 442 sk_memset16(xptr, width - 1, count); 443} 444 445static void repeatx_nofilter_trans(const SkBitmapProcState& s, 446 uint32_t xy[], int count, int x, int y) { 447 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 448 449 int xpos = nofilter_trans_preamble(s, &xy, x, y); 450 const int width = s.fBitmap->width(); 451 if (1 == width) { 452 // all of the following X values must be 0 453 memset(xy, 0, count * sizeof(uint16_t)); 454 return; 455 } 456 457 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 458 int start = sk_int_mod(xpos, width); 459 int n = width - start; 460 if (n > count) { 461 n = count; 462 } 463 fill_sequential(xptr, start, n); 464 xptr += n; 465 count -= n; 466 467 while (count >= width) { 468 fill_sequential(xptr, 0, width); 469 xptr += width; 470 count -= width; 471 } 472 473 if (count > 0) { 474 fill_sequential(xptr, 0, count); 475 } 476} 477 478static void fill_backwards(uint16_t xptr[], int pos, int count) { 479 for (int i = 0; i < count; i++) { 480 SkASSERT(pos >= 0); 481 xptr[i] = pos--; 482 } 483} 484 485static void mirrorx_nofilter_trans(const 
SkBitmapProcState& s, 486 uint32_t xy[], int count, int x, int y) { 487 SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0); 488 489 int xpos = nofilter_trans_preamble(s, &xy, x, y); 490 const int width = s.fBitmap->width(); 491 if (1 == width) { 492 // all of the following X values must be 0 493 memset(xy, 0, count * sizeof(uint16_t)); 494 return; 495 } 496 497 uint16_t* xptr = reinterpret_cast<uint16_t*>(xy); 498 // need to know our start, and our initial phase (forward or backward) 499 bool forward; 500 int n; 501 int start = sk_int_mod(xpos, 2 * width); 502 if (start >= width) { 503 start = width + ~(start - width); 504 forward = false; 505 n = start + 1; // [start .. 0] 506 } else { 507 forward = true; 508 n = width - start; // [start .. width) 509 } 510 if (n > count) { 511 n = count; 512 } 513 if (forward) { 514 fill_sequential(xptr, start, n); 515 } else { 516 fill_backwards(xptr, start, n); 517 } 518 forward = !forward; 519 xptr += n; 520 count -= n; 521 522 while (count >= width) { 523 if (forward) { 524 fill_sequential(xptr, 0, width); 525 } else { 526 fill_backwards(xptr, width - 1, width); 527 } 528 forward = !forward; 529 xptr += width; 530 count -= width; 531 } 532 533 if (count > 0) { 534 if (forward) { 535 fill_sequential(xptr, 0, count); 536 } else { 537 fill_backwards(xptr, width - 1, count); 538 } 539 } 540} 541 542/////////////////////////////////////////////////////////////////////////////// 543 544SkBitmapProcState::MatrixProc 545SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) { 546// test_int_tileprocs(); 547 // check for our special case when there is no scale/affine/perspective 548 if (trivial_matrix) { 549 SkASSERT(!fDoFilter); 550 fIntTileProcY = choose_int_tile_proc(fTileModeY); 551 switch (fTileModeX) { 552 case SkShader::kClamp_TileMode: 553 return clampx_nofilter_trans; 554 case SkShader::kRepeat_TileMode: 555 return repeatx_nofilter_trans; 556 case SkShader::kMirror_TileMode: 557 return mirrorx_nofilter_trans; 558 } 
559 } 560 561 int index = 0; 562 if (fDoFilter) { 563 index = 1; 564 } 565 if (fInvType & SkMatrix::kPerspective_Mask) { 566 index += 4; 567 } else if (fInvType & SkMatrix::kAffine_Mask) { 568 index += 2; 569 } 570 571 if (SkShader::kClamp_TileMode == fTileModeX && 572 SkShader::kClamp_TileMode == fTileModeY) 573 { 574 // clamp gets special version of filterOne 575 fFilterOneX = SK_Fixed1; 576 fFilterOneY = SK_Fixed1; 577 return ClampX_ClampY_Procs[index]; 578 } 579 580 // all remaining procs use this form for filterOne 581 fFilterOneX = SK_Fixed1 / fBitmap->width(); 582 fFilterOneY = SK_Fixed1 / fBitmap->height(); 583 584 if (SkShader::kRepeat_TileMode == fTileModeX && 585 SkShader::kRepeat_TileMode == fTileModeY) 586 { 587 return RepeatX_RepeatY_Procs[index]; 588 } 589 590 fTileProcX = choose_tile_proc(fTileModeX); 591 fTileProcY = choose_tile_proc(fTileModeY); 592 fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX); 593 fTileLowBitsProcY = choose_tile_lowbits_proc(fTileModeY); 594 return GeneralXY_Procs[index]; 595} 596 597