// SkBitmapProcState_matrixProcs.cpp revision 99c114e0ac732ba01705e24d12f5e4dd7e144abd
/* NEON optimized code (C) COPYRIGHT 2009 Motorola
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBitmapProcState.h"
#include "SkPerspIter.h"
#include "SkShader.h"
#include "SkUtils.h"

// Helper to ensure that when we shift down, we do it w/o sign-extension
// so the caller doesn't have to manually mask off the top 16 bits
//
static unsigned SK_USHIFT16(unsigned x) {
    return x >> 16;
}

/*  returns 0...(n-1) given any x (positive or negative).

    As an example, if n (which is always positive) is 5...

          x: -8 -7 -6 -5 -4 -3 -2 -1  0  1  2  3  4  5  6  7  8
    returns:  2  3  4  0  1  2  3  4  0  1  2  3  4  0  1  2  3
 */
static inline int sk_int_mod(int x, int n) {
    SkASSERT(n > 0);
    // Only do the (slower) modulo work when x is outside [0, n);
    // the unsigned compare catches both x < 0 and x >= n in one test.
    if ((unsigned)x >= (unsigned)n) {
        if (x < 0) {
            // n + ~(~x % n) == n - 1 - ((-x - 1) % n): a true mathematical
            // mod for negative x, avoiding the sign of C's % operator.
            x = n + ~(~x % n);
        } else {
            x = x % n;
        }
    }
    return x;
}

// Defined later in this file; forward-declared so the macro-generated code
// included below can reference them for the "decal" (identity-tiling) cases.
void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count);

// The three #include blocks below are template instantiations: each defines
// MAKENAME/TILEX_PROCF/... and then includes a header that expands them into
// a full family of matrix procs (ClampX_ClampY_*, RepeatX_RepeatY_*, ...).
// NOTE(review): the included headers are presumably expected to #undef these
// macros at the end, since they are re-#defined before each include — confirm
// against SkBitmapProcState_matrix.h.

// Clamp tiling: fx is 16.16 fixed point; shift to integer and clamp to [0, max].
#define MAKENAME(suffix)        ClampX_ClampY ## suffix
#define TILEX_PROCF(fx, max)    SkClampMax(SK_USHIFT16(fx), max)
#define TILEY_PROCF(fy, max)    SkClampMax(SK_USHIFT16(fy), max)
#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
#define CHECK_FOR_DECAL
#if defined(__ARM_HAVE_NEON)
    #include "SkBitmapProcState_matrix_clamp.h"
#else
    #include "SkBitmapProcState_matrix.h"
#endif

// Repeat tiling: keep only the fractional 16 bits (wraps around), then scale
// up to the bitmap dimension.
#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix
#define TILEX_PROCF(fx, max)    SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
#define TILEY_PROCF(fy, max)    SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#if defined(__ARM_HAVE_NEON)
    #include "SkBitmapProcState_matrix_repeat.h"
#else
    #include "SkBitmapProcState_matrix.h"
#endif

// General tiling: X and Y tile modes may differ, so the procs are looked up
// at runtime from the state and threaded through as extra parameters.
#define MAKENAME(suffix)        GeneralXY ## suffix
#define PREAMBLE(state)         SkBitmapProcState::FixedTileProc tileProcX = (state).fTileProcX; \
                                SkBitmapProcState::FixedTileProc tileProcY = (state).fTileProcY; \
                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX = (state).fTileLowBitsProcX; \
                                SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY = (state).fTileLowBitsProcY
#define PREAMBLE_PARAM_X        , SkBitmapProcState::FixedTileProc tileProcX, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcX
#define PREAMBLE_PARAM_Y        , SkBitmapProcState::FixedTileProc tileProcY, SkBitmapProcState::FixedTileLowBitsProc tileLowBitsProcY
#define PREAMBLE_ARG_X          , tileProcX, tileLowBitsProcX
#define PREAMBLE_ARG_Y          , tileProcY, tileLowBitsProcY
#define TILEX_PROCF(fx, max)    SK_USHIFT16(tileProcX(fx) * ((max) + 1))
#define TILEY_PROCF(fy, max)    SK_USHIFT16(tileProcY(fy) * ((max) + 1))
#define TILEX_LOW_BITS(fx, max) tileLowBitsProcX(fx, (max) + 1)
#define TILEY_LOW_BITS(fy, max) tileLowBitsProcY(fy, (max) + 1)
#include "SkBitmapProcState_matrix.h"

// Clamp a 16.16 fixed value into the 0.16 fraction range [0, 0xFFFF].
static inline U16CPU fixed_clamp(SkFixed x)
{
#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
    // Branch-light form for CPUs with conditional instructions: first clamp
    // anything with high bits set to 0xFFFF, then fix up negatives to 0.
    if (x >> 16)
        x = 0xFFFF;
    if (x < 0)
        x = 0;
#else
    // If any of the top 16 bits are set, x is out of range: negative values
    // clamp to 0, values >= 1.0 clamp to 0xFFFF.
    if (x >> 16)
    {
        if (x < 0)
            x = 0;
        else
            x = 0xFFFF;
    }
#endif
    return x;
}

// Repeat tiling of a 16.16 fixed value: keep only the fractional bits.
static inline U16CPU fixed_repeat(SkFixed x)
{
    return x & 0xFFFF;
}

// Visual Studio 2010 (MSC_VER=1600) optimizes bit-shift code incorrectly.
104// See http://code.google.com/p/skia/issues/detail?id=472 105#if defined(_MSC_VER) && (_MSC_VER >= 1600) 106#pragma optimize("", off) 107#endif 108 109static inline U16CPU fixed_mirror(SkFixed x) 110{ 111 SkFixed s = x << 15 >> 31; 112 // s is FFFFFFFF if we're on an odd interval, or 0 if an even interval 113 return (x ^ s) & 0xFFFF; 114} 115 116#if defined(_MSC_VER) && (_MSC_VER >= 1600) 117#pragma optimize("", on) 118#endif 119 120static SkBitmapProcState::FixedTileProc choose_tile_proc(unsigned m) 121{ 122 if (SkShader::kClamp_TileMode == m) 123 return fixed_clamp; 124 if (SkShader::kRepeat_TileMode == m) 125 return fixed_repeat; 126 SkASSERT(SkShader::kMirror_TileMode == m); 127 return fixed_mirror; 128} 129 130static inline U16CPU fixed_clamp_lowbits(SkFixed x, int) { 131 return (x >> 12) & 0xF; 132} 133 134static inline U16CPU fixed_repeat_or_mirrow_lowbits(SkFixed x, int scale) { 135 return ((x * scale) >> 12) & 0xF; 136} 137 138static SkBitmapProcState::FixedTileLowBitsProc choose_tile_lowbits_proc(unsigned m) { 139 if (SkShader::kClamp_TileMode == m) { 140 return fixed_clamp_lowbits; 141 } else { 142 SkASSERT(SkShader::kMirror_TileMode == m || 143 SkShader::kRepeat_TileMode == m); 144 // mirror and repeat have the same behavior for the low bits. 
145 return fixed_repeat_or_mirrow_lowbits; 146 } 147} 148 149static inline U16CPU int_clamp(int x, int n) { 150#ifdef SK_CPU_HAS_CONDITIONAL_INSTR 151 if (x >= n) 152 x = n - 1; 153 if (x < 0) 154 x = 0; 155#else 156 if ((unsigned)x >= (unsigned)n) { 157 if (x < 0) { 158 x = 0; 159 } else { 160 x = n - 1; 161 } 162 } 163#endif 164 return x; 165} 166 167static inline U16CPU int_repeat(int x, int n) { 168 return sk_int_mod(x, n); 169} 170 171static inline U16CPU int_mirror(int x, int n) { 172 x = sk_int_mod(x, 2 * n); 173 if (x >= n) { 174 x = n + ~(x - n); 175 } 176 return x; 177} 178 179#if 0 180static void test_int_tileprocs() { 181 for (int i = -8; i <= 8; i++) { 182 SkDebugf(" int_mirror(%2d, 3) = %d\n", i, int_mirror(i, 3)); 183 } 184} 185#endif 186 187static SkBitmapProcState::IntTileProc choose_int_tile_proc(unsigned tm) { 188 if (SkShader::kClamp_TileMode == tm) 189 return int_clamp; 190 if (SkShader::kRepeat_TileMode == tm) 191 return int_repeat; 192 SkASSERT(SkShader::kMirror_TileMode == tm); 193 return int_mirror; 194} 195 196////////////////////////////////////////////////////////////////////////////// 197 198void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) 199{ 200 int i; 201 202#if defined(__ARM_HAVE_NEON) 203 if (count >= 8) { 204 /* SkFixed is 16.16 fixed point */ 205 SkFixed dx2 = dx+dx; 206 SkFixed dx4 = dx2+dx2; 207 SkFixed dx8 = dx4+dx4; 208 209 /* now build fx/fx+dx/fx+2dx/fx+3dx */ 210 SkFixed fx1, fx2, fx3; 211 int32x2_t lower, upper; 212 int32x4_t lbase, hbase; 213 uint16_t *dst16 = (uint16_t *)dst; 214 215 fx1 = fx+dx; 216 fx2 = fx1+dx; 217 fx3 = fx2+dx; 218 219 /* avoid an 'lbase unitialized' warning */ 220 lbase = vdupq_n_s32(fx); 221 lbase = vsetq_lane_s32(fx1, lbase, 1); 222 lbase = vsetq_lane_s32(fx2, lbase, 2); 223 lbase = vsetq_lane_s32(fx3, lbase, 3); 224 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); 225 226 /* take upper 16 of each, store, and bump everything */ 227 do { 228 int32x4_t lout, hout; 229 
            uint16x8_t hi16;

            lout = lbase;
            hout = hbase;
            /* gets hi's of all louts then hi's of all houts */
            // vuzp de-interleaves the 16-bit halves: after this, hout holds
            // the high (integer) 16 bits of all eight fixed-point values.
            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
            hi16 = vreinterpretq_u16_s32(hout);
            vst1q_u16(dst16, hi16);

            /* on to the next */
            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
            dst16 += 8;
            count -= 8;
            fx += dx8;   // keep the scalar fx in sync for the tail loop below
        } while (count >= 8);
        dst = (uint32_t *) dst16;
    }
#else
    // Scalar path: pack two 16-bit indices per 32-bit store, four per trip.
    for (i = (count >> 2); i > 0; --i)
    {
        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
        fx += dx+dx;
        *dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
        fx += dx+dx;
    }
    count &= 3;
#endif

    // Tail: write the remaining (< 8 NEON / < 4 scalar) indices one at a time.
    uint16_t* xx = (uint16_t*)dst;
    for (i = count; i > 0; --i) {
        *xx++ = SkToU16(fx >> 16); fx += dx;
    }
}

// "Decal" filter scale: for each step, pack one 32-bit filter entry:
// bits [31:18] = x, bits [17:14] = 4-bit sub-pixel weight, bits [13:0] = x+1
// (derived from the (fx >> 12 << 14) | ((fx >> 16) + 1) expression below).
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
{

#if defined(__ARM_HAVE_NEON)
    if (count >= 8) {
        int32x4_t wide_fx;
        int32x4_t wide_fx2;
        int32x4_t wide_dx8 = vdupq_n_s32(dx*8);

        // wide_fx holds fx..fx+3dx, wide_fx2 holds fx+4dx..fx+7dx.
        wide_fx = vdupq_n_s32(fx);
        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);

        wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));

        while (count >= 8) {
            int32x4_t wide_out;
            int32x4_t wide_out2;

            // Vector form of (fx >> 12 << 14) | ((fx >> 16) + 1).
            wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
            wide_out = vorrq_s32(wide_out,
                       vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));

            wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
            wide_out2 = vorrq_s32(wide_out2,
                        vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));

            vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
            vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));

            dst += 8;
            fx += dx*8;   // keep the scalar fx in sync for the tail below
            wide_fx = vaddq_s32(wide_fx, wide_dx8);
            wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
            count -= 8;
        }
    }
#endif

    // Scalar tail (also the whole body on non-NEON builds), unrolled 2x
    // with an initial odd-count fixup.
    if (count & 1)
    {
        SkASSERT((fx >> (16 + 14)) == 0);   // x must fit in 14 bits
        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
        fx += dx;
    }
    while ((count -= 2) >= 0)
    {
        SkASSERT((fx >> (16 + 14)) == 0);
        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
        fx += dx;

        *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
        fx += dx;
    }
}

///////////////////////////////////////////////////////////////////////////////
// stores the same as SCALE, but is cheaper to compute. Also since there is no
// scale, we don't need/have a FILTER version

// Write start, start+1, ..., start+count-1 into xptr as uint16_t values,
// using aligned 32-bit stores for the bulk of the run.
static void fill_sequential(uint16_t xptr[], int start, int count) {
#if 1
    // Align to a 4-byte boundary with a single 16-bit store if needed.
    if (reinterpret_cast<intptr_t>(xptr) & 0x2) {
        *xptr++ = start++;
        count -= 1;
    }
    if (count > 3) {
        uint32_t* xxptr = reinterpret_cast<uint32_t*>(xptr);
        uint32_t pattern0 = PACK_TWO_SHORTS(start + 0, start + 1);
        uint32_t pattern1 = PACK_TWO_SHORTS(start + 2, start + 3);
        start += count & ~3;
        int qcount = count >> 2;
        do {
            *xxptr++ = pattern0;
            pattern0 += 0x40004;   // bump both packed shorts by 4
            *xxptr++ = pattern1;
            pattern1 += 0x40004;
        } while (--qcount != 0);
        xptr = reinterpret_cast<uint16_t*>(xxptr);
        count &= 3;
    }
    // Remaining 0..3 values.
    while (--count >= 0) {
        *xptr++ = start++;
    }
#else
    // Reference implementation (disabled).
    for (int i = 0; i < count; i++) {
        *xptr++ = start++;
    }
#endif
}

// Shared setup for the translate-only procs: maps (x, y) through the inverse
// matrix, stores the tiled Y index into *xy (advancing the pointer), and
// returns the untiled integer X start position.
static int nofilter_trans_preamble(const SkBitmapProcState& s, uint32_t** xy,
                                   int x, int y) {
    SkPoint pt;
    // Sample at pixel centers (+0.5).
    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
               SkIntToScalar(y) + SK_ScalarHalf, &pt);
    **xy = s.fIntTileProcY(SkScalarToFixed(pt.fY) >> 16,
                           s.fBitmap->height());
    *xy += 1;   // bump the ptr
    // return our starting X position
    return SkScalarToFixed(pt.fX) >> 16;
}

// Translate-only X-clamp proc: emits count uint16_t x indices after the
// shared Y preamble, clamping runs that fall before 0 or past width-1.
static void clampx_nofilter_trans(const SkBitmapProcState& s,
                                  uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);

    int xpos =
               nofilter_trans_preamble(s, &xy, x, y);
    const int width = s.fBitmap->width();
    if (1 == width) {
        // all of the following X values must be 0
        // (xy is written as packed 16-bit entries, hence sizeof(uint16_t))
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
    int n;

    // fill before 0 as needed
    if (xpos < 0) {
        n = -xpos;
        if (n > count) {
            n = count;
        }
        memset(xptr, 0, n * sizeof(uint16_t));
        count -= n;
        if (0 == count) {
            return;
        }
        xptr += n;
        xpos = 0;
    }

    // fill in 0..width-1 if needed
    if (xpos < width) {
        n = width - xpos;
        if (n > count) {
            n = count;
        }
        fill_sequential(xptr, xpos, n);
        count -= n;
        if (0 == count) {
            return;
        }
        xptr += n;
    }

    // fill the remaining with the max value
    sk_memset16(xptr, width - 1, count);
}

// Translate-only X-repeat proc: emits count uint16_t x indices, wrapping
// back to 0 each time a run reaches width.
static void repeatx_nofilter_trans(const SkBitmapProcState& s,
                                   uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);

    int xpos = nofilter_trans_preamble(s, &xy, x, y);
    const int width = s.fBitmap->width();
    if (1 == width) {
        // all of the following X values must be 0
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
    // First (possibly partial) run: from the tiled start to the end of the row.
    int start = sk_int_mod(xpos, width);
    int n = width - start;
    if (n > count) {
        n = count;
    }
    fill_sequential(xptr, start, n);
    xptr += n;
    count -= n;

    // Whole rows.
    while (count >= width) {
        fill_sequential(xptr, 0, width);
        xptr += width;
        count -= width;
    }

    // Final partial run.
    if (count > 0) {
        fill_sequential(xptr, 0, count);
    }
}

// Write pos, pos-1, ..., pos-count+1 into xptr (descending run for mirror).
static void fill_backwards(uint16_t xptr[], int pos, int count) {
    for (int i = 0; i < count; i++) {
        SkASSERT(pos >= 0);
        xptr[i] = pos--;
    }
}

// Translate-only X-mirror proc: emits count uint16_t x indices, alternating
// forward and backward runs of length width.
static void mirrorx_nofilter_trans(const SkBitmapProcState& s,
                                   uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~SkMatrix::kTranslate_Mask) == 0);

    int xpos = nofilter_trans_preamble(s, &xy, x, y);
    const int width = s.fBitmap->width();
    if (1 == width) {
        // all of the following X values must be 0
        // (xy is written as packed 16-bit entries, hence sizeof(uint16_t))
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    uint16_t* xptr = reinterpret_cast<uint16_t*>(xy);
    // need to know our start, and our initial phase (forward or backward)
    bool forward;
    int n;
    int start = sk_int_mod(xpos, 2 * width);
    if (start >= width) {
        // We begin inside a reflected interval: map back into [0, width)
        // (width + ~(start - width) == 2*width - 1 - start) and run backward.
        start = width + ~(start - width);
        forward = false;
        n = start + 1;          // [start .. 0]
    } else {
        forward = true;
        n = width - start;      // [start .. width)
    }
    if (n > count) {
        n = count;
    }
    // First (possibly partial) run in the starting phase.
    if (forward) {
        fill_sequential(xptr, start, n);
    } else {
        fill_backwards(xptr, start, n);
    }
    forward = !forward;
    xptr += n;
    count -= n;

    // Whole runs, flipping direction each time.
    while (count >= width) {
        if (forward) {
            fill_sequential(xptr, 0, width);
        } else {
            fill_backwards(xptr, width - 1, width);
        }
        forward = !forward;
        xptr += width;
        count -= width;
    }

    // Final partial run.
    if (count > 0) {
        if (forward) {
            fill_sequential(xptr, 0, count);
        } else {
            fill_backwards(xptr, width - 1, count);
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

// Select the matrix proc for the current state: a specialized translate-only
// proc when the inverse matrix is a pure translation, otherwise one of the
// macro-generated proc tables indexed by filter/affine/perspective flags.
SkBitmapProcState::MatrixProc
SkBitmapProcState::chooseMatrixProc(bool trivial_matrix) {
//    test_int_tileprocs();
    // check for our special case when there is no scale/affine/perspective
    if (trivial_matrix) {
        SkASSERT(!fDoFilter);
        fIntTileProcY = choose_int_tile_proc(fTileModeY);
        switch (fTileModeX) {
            case SkShader::kClamp_TileMode:
                return clampx_nofilter_trans;
            case SkShader::kRepeat_TileMode:
                return repeatx_nofilter_trans;
            case SkShader::kMirror_TileMode:
                return mirrorx_nofilter_trans;
        }
    }

    // Table index: bit 0 = filter, +2 = affine, +4 = perspective.
    int index = 0;
    if (fDoFilter) {
        index = 1;
    }
    if (fInvType & SkMatrix::kPerspective_Mask) {
        index += 4;
    } else if (fInvType & SkMatrix::kAffine_Mask) {
        index += 2;
    }

    if (SkShader::kClamp_TileMode == fTileModeX &&
        SkShader::kClamp_TileMode == fTileModeY)
    {
        // clamp gets special version of filterOne
        fFilterOneX = SK_Fixed1;
        fFilterOneY = SK_Fixed1;
        return ClampX_ClampY_Procs[index];
    }

    // all remaining procs use this form for filterOne
    fFilterOneX = SK_Fixed1 / fBitmap->width();
    fFilterOneY = SK_Fixed1 / fBitmap->height();

    if (SkShader::kRepeat_TileMode == fTileModeX &&
        SkShader::kRepeat_TileMode == fTileModeY)
    {
        return RepeatX_RepeatY_Procs[index];
    }

    // Mixed tile modes: fall back to the general procs, which read the
    // per-axis tile procs stored here out of the state.
    fTileProcX = choose_tile_proc(fTileModeX);
    fTileProcY = choose_tile_proc(fTileModeY);
    fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX);
    fTileLowBitsProcY = choose_tile_lowbits_proc(fTileModeY);
    return GeneralXY_Procs[index];
}