/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src, int count,
                         U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]!"
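                // d3 (alpha) is loaded here but never consumed below; it is still
                // listed as an output operand so the compiler knows the register
                // has been clobbered.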
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);          // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue);   // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);        // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);      // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
            "ands       ip, %[count], #7            \n\t"
            "vmov.u8    d31, #1<<7                  \n\t"
            "vld1.16    {q12}, [%[dst]]             \n\t"
            "vld4.8     {d0-d3}, [%[src]]           \n\t"
            // Thumb does not support the standard ARM conditional
            // instructions but instead requires the 'it' instruction
            // to signal conditional execution
            "it eq                                  \n\t"
            "moveq      ip, #8                      \n\t"
            "mov        %[keep_dst], %[dst]         \n\t"

            "add        %[src], %[src], ip, LSL#2   \n\t"
            "add        %[dst], %[dst], ip, LSL#1   \n\t"
            "subs       %[count], %[count], ip      \n\t"
            "b          9f                          \n\t"
            // LOOP
            "2:                                     \n\t"

            "vld1.16    {q12}, [%[dst]]!            \n\t"
            "vld4.8     {d0-d3}, [%[src]]!          \n\t"
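            // The store below writes out the eight pixels computed on the previous
            // pass (still held in q10) while the loads above fetch the next eight,
            // i.e. the loop is software-pipelined through keep_dst.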
\n\t" 220 "vst1.16 {q10}, [%[keep_dst]] \n\t" 221 "sub %[keep_dst], %[dst], #8*2 \n\t" 222 "subs %[count], %[count], #8 \n\t" 223 "9: \n\t" 224 "pld [%[dst],#32] \n\t" 225 // expand 0565 q12 to 8888 {d4-d7} 226 "vmovn.u16 d4, q12 \n\t" 227 "vshr.u16 q11, q12, #5 \n\t" 228 "vshr.u16 q10, q12, #6+5 \n\t" 229 "vmovn.u16 d5, q11 \n\t" 230 "vmovn.u16 d6, q10 \n\t" 231 "vshl.u8 d4, d4, #3 \n\t" 232 "vshl.u8 d5, d5, #2 \n\t" 233 "vshl.u8 d6, d6, #3 \n\t" 234 235 "vmovl.u8 q14, d31 \n\t" 236 "vmovl.u8 q13, d31 \n\t" 237 "vmovl.u8 q12, d31 \n\t" 238 239 // duplicate in 4/2/1 & 8pix vsns 240 "vmvn.8 d30, d3 \n\t" 241 "vmlal.u8 q14, d30, d6 \n\t" 242 "vmlal.u8 q13, d30, d5 \n\t" 243 "vmlal.u8 q12, d30, d4 \n\t" 244 "vshr.u16 q8, q14, #5 \n\t" 245 "vshr.u16 q9, q13, #6 \n\t" 246 "vaddhn.u16 d6, q14, q8 \n\t" 247 "vshr.u16 q8, q12, #5 \n\t" 248 "vaddhn.u16 d5, q13, q9 \n\t" 249 "vqadd.u8 d6, d6, d0 \n\t" // moved up 250 "vaddhn.u16 d4, q12, q8 \n\t" 251 // intentionally don't calculate alpha 252 // result in d4-d6 253 254 "vqadd.u8 d5, d5, d1 \n\t" 255 "vqadd.u8 d4, d4, d2 \n\t" 256 257 // pack 8888 {d4-d6} to 0565 q10 258 "vshll.u8 q10, d6, #8 \n\t" 259 "vshll.u8 q3, d5, #8 \n\t" 260 "vshll.u8 q2, d4, #8 \n\t" 261 "vsri.u16 q10, q3, #5 \n\t" 262 "vsri.u16 q10, q2, #11 \n\t" 263 264 "bne 2b \n\t" 265 266 "1: \n\t" 267 "vst1.16 {q10}, [%[keep_dst]] \n\t" 268 : [count] "+r" (count) 269 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 270 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 271 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 272 "d30","d31" 273 ); 274 } 275 else 276 { // handle count < 8 277 uint16_t* SK_RESTRICT keep_dst = 0; 278 279 asm volatile ( 280 "vmov.u8 d31, #1<<7 \n\t" 281 "mov %[keep_dst], %[dst] \n\t" 282 283 "tst %[count], #4 \n\t" 284 "beq 14f \n\t" 285 "vld1.16 {d25}, [%[dst]]! \n\t" 286 "vld1.32 {q1}, [%[src]]! \n\t" 287 288 "14: \n\t" 289 "tst %[count], #2 \n\t" 290 "beq 12f \n\t" 291 "vld1.32 {d24[1]}, [%[dst]]! \n\t" 292 "vld1.32 {d1}, [%[src]]! \n\t" 293 294 "12: \n\t" 295 "tst %[count], #1 \n\t" 296 "beq 11f \n\t" 297 "vld1.16 {d24[1]}, [%[dst]]! \n\t" 298 "vld1.32 {d0[1]}, [%[src]]! 
\n\t" 299 300 "11: \n\t" 301 // unzips achieve the same as a vld4 operation 302 "vuzpq.u16 q0, q1 \n\t" 303 "vuzp.u8 d0, d1 \n\t" 304 "vuzp.u8 d2, d3 \n\t" 305 // expand 0565 q12 to 8888 {d4-d7} 306 "vmovn.u16 d4, q12 \n\t" 307 "vshr.u16 q11, q12, #5 \n\t" 308 "vshr.u16 q10, q12, #6+5 \n\t" 309 "vmovn.u16 d5, q11 \n\t" 310 "vmovn.u16 d6, q10 \n\t" 311 "vshl.u8 d4, d4, #3 \n\t" 312 "vshl.u8 d5, d5, #2 \n\t" 313 "vshl.u8 d6, d6, #3 \n\t" 314 315 "vmovl.u8 q14, d31 \n\t" 316 "vmovl.u8 q13, d31 \n\t" 317 "vmovl.u8 q12, d31 \n\t" 318 319 // duplicate in 4/2/1 & 8pix vsns 320 "vmvn.8 d30, d3 \n\t" 321 "vmlal.u8 q14, d30, d6 \n\t" 322 "vmlal.u8 q13, d30, d5 \n\t" 323 "vmlal.u8 q12, d30, d4 \n\t" 324 "vshr.u16 q8, q14, #5 \n\t" 325 "vshr.u16 q9, q13, #6 \n\t" 326 "vaddhn.u16 d6, q14, q8 \n\t" 327 "vshr.u16 q8, q12, #5 \n\t" 328 "vaddhn.u16 d5, q13, q9 \n\t" 329 "vqadd.u8 d6, d6, d0 \n\t" // moved up 330 "vaddhn.u16 d4, q12, q8 \n\t" 331 // intentionally don't calculate alpha 332 // result in d4-d6 333 334 "vqadd.u8 d5, d5, d1 \n\t" 335 "vqadd.u8 d4, d4, d2 \n\t" 336 337 // pack 8888 {d4-d6} to 0565 q10 338 "vshll.u8 q10, d6, #8 \n\t" 339 "vshll.u8 q3, d5, #8 \n\t" 340 "vshll.u8 q2, d4, #8 \n\t" 341 "vsri.u16 q10, q3, #5 \n\t" 342 "vsri.u16 q10, q2, #11 \n\t" 343 344 // store 345 "tst %[count], #4 \n\t" 346 "beq 24f \n\t" 347 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 348 349 "24: \n\t" 350 "tst %[count], #2 \n\t" 351 "beq 22f \n\t" 352 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 353 354 "22: \n\t" 355 "tst %[count], #1 \n\t" 356 "beq 21f \n\t" 357 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 358 359 "21: \n\t" 360 : [count] "+r" (count) 361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 364 "d30","d31" 365 ); 366 } 367} 368#endif 369 370static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { 371 prod += vdupq_n_u16(128); 372 prod += vshrq_n_u16(prod, 8); 373 return vshrq_n_u16(prod, 8); 374} 375 376void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 377 const SkPMColor* SK_RESTRICT src, int count, 378 U8CPU alpha, int /*x*/, int /*y*/) { 379 SkASSERT(255 > alpha); 380 381 /* This code implements a Neon version of S32A_D565_Blend. The results have 382 * a few mismatches compared to the original code. These mismatches never 383 * exceed 1. 384 */ 385 386 if (count >= 8) { 387 uint16x8_t valpha_max, vmask_blue; 388 uint8x8_t valpha; 389 390 // prepare constants 391 valpha_max = vmovq_n_u16(255); 392 valpha = vdup_n_u8(alpha); 393 vmask_blue = vmovq_n_u16(SK_B16_MASK); 394 395 do { 396 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; 397 uint16x8_t vres_a, vres_r, vres_g, vres_b; 398 uint8x8x4_t vsrc; 399 400 // load pixels 401 vdst = vld1q_u16(dst); 402#ifdef SK_CPU_ARM64 403 vsrc = sk_vld4_u8_arm64_4(src); 404#else 405#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 406 asm ( 407 "vld4.u8 %h[vsrc], [%[src]]!" 
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8 {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64


            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);                  // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                               // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);                 // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS);  // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a;  // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT);  // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT);  // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
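 *
 * For example, with y&3 == 0 and x&3 == 2 the 8-byte load starts at offset 2 of
 * row 0 and yields 1,5,0,4,1,5,0,4, i.e. the row values starting at column x
 * and wrapping around.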
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);          // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1);  // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);       // set up blue mask

        do {

            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8    {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6);  // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5);  // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5);  // calc. blue >> 5
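            // The next three steps compute, per lane,
            //   r' = (r + d   - (r >> 5)) >> 3
            //   g' = (g + d/2 - (g >> 6)) >> 2
            //   b' = (b + d   - (b >> 5)) >> 3
            // mirroring the scalar SkDITHER_*32To565() macros used by the leftover
            // loop at the end of this function.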

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g);  // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);    // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);    // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                    // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5);  // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);                // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);    // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5);  // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL 4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
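        /* e.g. a = 0x80: alpha_wide = 256 - 128 = 128, so the dst term below
           contributes (dst * 128) >> 8 = dst / 2 before src is added on top */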
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
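    //At most three opaque pixels can be left here: the unrolled copy above only
    //stops when one of the next four pixels is not fully opaque.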
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
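        // i.e. each channel becomes (src*alpha256 + dst*dst_scale) >> 8, the same
        // blend the two-pixel loop below performs with vector scales.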

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));  /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        uint8x8_t dbase;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8x4_t vsrc;
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            // calculate 8 elements worth into a temp buffer
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i = 0; i < UNROLL; i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;
                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
                vsrc.val[3] = d3;
            }
#endif
            sa = vsrc.val[NEON_A];
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* calculate 'd', which will be 0..7
             * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
             */
            alpha8 = vmovl_u8(dbase);
            alpha8 = vmlal_u8(alpha8, sa, dbase);
            d = vshrn_n_u16(alpha8, 8);  // narrowing too

            // sr = sr - (sr>>5) + d
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d)
             */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            // need to pick up 8 dst's -- at 16 bits each, 128 bits
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
            dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
            dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);  // clearing hi bits

            // blend
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);

            // combine the addq and mul, save 3 insns
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

            // repack to store
            dst8 = vshrq_n_u16(dst_b, 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            // verify my 8 elements match the temp buffer
            {
                int i, bad=0;
                static int invocation;

                for (i = 0; i < UNROLL; i++) {
                    if (tmpbuf[i] != dst[i]) {
                        bad=1;
                    }
                }
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i = 0; i < UNROLL; i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                                 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif
            dst += UNROLL;
            count -= UNROLL;
            // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;
            uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8    {d0-d3},[%[src]]! "
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc.val[0] = d0;
                vsrc.val[1] = d1;
                vsrc.val[2] = d2;
            }
#endif
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL;  // probably superfluous
        }
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
        return;
    }

    unsigned scale = 256 - SkAlpha255To256(colorA);

    if (count >= 8) {
        uint32x4_t vcolor;
        uint8x8_t vscale;

        vcolor = vdupq_n_u32(color);

        // scale numerical interval [0-255], so load as 8 bits
        vscale = vdup_n_u8(scale);

        do {
            // load src color, 8 pixels, 4 64 bit registers
            // (and increment src).
            uint32x2x4_t vsrc;
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vld1.32    %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+r" (src)
                : :
            );
#else // 64bit targets and Clang
            vsrc.val[0] = vld1_u32(src);
            vsrc.val[1] = vld1_u32(src+2);
            vsrc.val[2] = vld1_u32(src+4);
            vsrc.val[3] = vld1_u32(src+6);
            src += 8;
#endif

            // multiply long by scale, 64 bits at a time,
            // destination into a 128 bit register.
            uint16x8x4_t vtmp;
            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);

            // shift the 128 bit registers, containing the 16
            // bit scaled values back to 8 bits, narrowing the
            // results to 64 bit registers.
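            // vshrn_n_u16(x, 8) is the divide-by-256 half of the blend: together
            // with the add of vcolor below, each byte lane becomes
            // color + (src * scale) / 256, the vector form of the scalar tail loop
            // at the end of this function.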
            uint8x16x2_t vres;
            vres.val[0] = vcombine_u8(
                vshrn_n_u16(vtmp.val[0], 8),
                vshrn_n_u16(vtmp.val[1], 8));
            vres.val[1] = vcombine_u8(
                vshrn_n_u16(vtmp.val[2], 8),
                vshrn_n_u16(vtmp.val[3], 8));

            // adding back the color, using 128 bit registers.
            uint32x4x2_t vdst;
            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
                                               vreinterpretq_u8_u32(vcolor));
            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
                                               vreinterpretq_u8_u32(vcolor));

            // store back the 8 calculated pixels (2 128 bit
            // registers), and increment dst.
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vst1.32    %h[vdst], [%[dst]]!"
                : [dst] "+r" (dst)
                : [vdst] "w" (vdst)
                : "memory"
            );
#else // 64bit targets and Clang
            vst1q_u32(dst, vdst.val[0]);
            vst1q_u32(dst+4, vdst.val[1]);
            dst += 8;
#endif
            count -= 8;

        } while (count >= 8);
    }

    while (count > 0) {
        *dst = color + SkAlphaMulQ(*src, scale);
        src += 1;
        dst += 1;
        count--;
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
#ifdef SK_CPU_ARM32
    S32A_D565_Opaque_neon,
#else
    NULL,
#endif
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,               // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,             // S32A_Opaque,
#endif
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon               // S32A_Blend
#else
    NULL
#endif
};