SkBlitRow_opts_arm_neon.cpp revision 95c2e5532b094add82b007bdfcd4c64050b6b366
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>

// Opaque (alpha == 255) blit of 32-bit premultiplied source pixels to a
// 565 destination. Converts 8 pixels per NEON iteration, then finishes the
// remaining (count % 8) pixels with the scalar conversion path.
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load 8 pixels; vld4 de-interleaves the four color channels into
        // one d-register each.
        vsrc = vld4_u8((uint8_t*)src);

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        src += 8;
        count -= 8;
    };

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    };
}

// Blend of 32-bit source pixels onto a 565 destination with a constant
// global alpha (0 < alpha < 255): dst += ((src565 - dst) * scale) >> 8
// per channel, 8 pixels per NEON iteration, scalar SkAlphaBlend tail.
void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src, int count,
                         U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;
        uint8x8_t vsrc_r, vsrc_g, vsrc_b;

        // Load src
        {
            // Pin the load to d0-d3 so the vld4.8 post-increment form can
            // be used; inline asm also updates 'src' as a side effect.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8 {d0-d3},[%[src]]!"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            vsrc_r = d0; vsrc_b = d2;
#endif
        }

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);          // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue);   // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);        // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);      // extract green

        // Shift src to 565
        vsrc_r = vshr_n_u8(vsrc_r, 3);          // shift red to 565 range
        vsrc_g = vshr_n_u8(vsrc_g, 2);          // shift green to 565 range
        vsrc_b = vshr_n_u8(vsrc_b, 3);          // shift blue to 565 range

        // Scale src - dst
        vres_r = vmovl_u8(vsrc_r) - vdst_r;
        vres_g = vmovl_u8(vsrc_g) - vdst_g;
        vres_b = vmovl_u8(vsrc_b) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

// Source-over blit of 32-bit premultiplied pixels (per-pixel alpha) onto a
// 565 destination. The 8-pixel main loop and the count < 8 tail are both
// implemented entirely in inline ARM assembly below.
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        // Software-pipelined loop: loads for the next 8 pixels are issued
        // before the previous result (q10) is stored through keep_dst.
        // 'ip' holds count % 7 (or 8 when count is a multiple of 8) so the
        // first partial group is processed before the steady-state loop.
        // NOTE(review): %[src]/%[dst]/%[keep_dst] are modified inside the
        // asm but are declared as plain "r" inputs (only the "memory"
        // clobber covers the stores) -- confirm against GCC inline-asm
        // constraint rules before touching this.
        asm volatile (
            "ands ip, %[count], #7 \n\t"
            "vmov.u8 d31, #1<<7 \n\t"
            "vld1.16 {q12}, [%[dst]] \n\t"
            "vld4.8 {d0-d3}, [%[src]] \n\t"
            // Thumb does not support the standard ARM conditional
            // instructions but instead requires the 'it' instruction
            // to signal conditional execution
            "it eq \n\t"
            "moveq ip, #8 \n\t"
            "mov %[keep_dst], %[dst] \n\t"

            "add %[src], %[src], ip, LSL#2 \n\t"
            "add %[dst], %[dst], ip, LSL#1 \n\t"
            "subs %[count], %[count], ip \n\t"
            "b 9f \n\t"
            // LOOP
            "2: \n\t"

            "vld1.16 {q12}, [%[dst]]! \n\t"
            "vld4.8 {d0-d3}, [%[src]]! \n\t"
            "vst1.16 {q10}, [%[keep_dst]] \n\t"
            "sub %[keep_dst], %[dst], #8*2 \n\t"
            "subs %[count], %[count], #8 \n\t"
            "9: \n\t"
            "pld [%[dst],#32] \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16 d4, q12 \n\t"
            "vshr.u16 q11, q12, #5 \n\t"
            "vshr.u16 q10, q12, #6+5 \n\t"
            "vmovn.u16 d5, q11 \n\t"
            "vmovn.u16 d6, q10 \n\t"
            "vshl.u8 d4, d4, #3 \n\t"
            "vshl.u8 d5, d5, #2 \n\t"
            "vshl.u8 d6, d6, #3 \n\t"

            "vmovl.u8 q14, d31 \n\t"
            "vmovl.u8 q13, d31 \n\t"
            "vmovl.u8 q12, d31 \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8 d30, d3 \n\t"
            "vmlal.u8 q14, d30, d6 \n\t"
            "vmlal.u8 q13, d30, d5 \n\t"
            "vmlal.u8 q12, d30, d4 \n\t"
            "vshr.u16 q8, q14, #5 \n\t"
            "vshr.u16 q9, q13, #6 \n\t"
            "vaddhn.u16 d6, q14, q8 \n\t"
            "vshr.u16 q8, q12, #5 \n\t"
            "vaddhn.u16 d5, q13, q9 \n\t"
            "vqadd.u8 d6, d6, d0 \n\t" // moved up
            "vaddhn.u16 d4, q12, q8 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d2 \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8 q10, d6, #8 \n\t"
            "vshll.u8 q3, d5, #8 \n\t"
            "vshll.u8 q2, d4, #8 \n\t"
            "vsri.u16 q10, q3, #5 \n\t"
            "vsri.u16 q10, q2, #11 \n\t"

            "bne 2b \n\t"

            "1: \n\t"
            "vst1.16 {q10}, [%[keep_dst]] \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        // Tail path: load up to 7 src/dst pixels lane-by-lane (4/2/1 based
        // on the bits of count), run the same blend as above once, then
        // store back exactly the lanes that were loaded.
        asm volatile (
            "vmov.u8 d31, #1<<7 \n\t"
            "mov %[keep_dst], %[dst] \n\t"

            "tst %[count], #4 \n\t"
            "beq 14f \n\t"
            "vld1.16 {d25}, [%[dst]]! \n\t"
            "vld1.32 {q1}, [%[src]]! \n\t"

            "14: \n\t"
            "tst %[count], #2 \n\t"
            "beq 12f \n\t"
            "vld1.32 {d24[1]}, [%[dst]]! \n\t"
            "vld1.32 {d1}, [%[src]]! \n\t"

            "12: \n\t"
            "tst %[count], #1 \n\t"
            "beq 11f \n\t"
            "vld1.16 {d24[1]}, [%[dst]]! \n\t"
            "vld1.32 {d0[1]}, [%[src]]! \n\t"

            "11: \n\t"
            // unzips achieve the same as a vld4 operation
            "vuzpq.u16 q0, q1 \n\t"
            "vuzp.u8 d0, d1 \n\t"
            "vuzp.u8 d2, d3 \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16 d4, q12 \n\t"
            "vshr.u16 q11, q12, #5 \n\t"
            "vshr.u16 q10, q12, #6+5 \n\t"
            "vmovn.u16 d5, q11 \n\t"
            "vmovn.u16 d6, q10 \n\t"
            "vshl.u8 d4, d4, #3 \n\t"
            "vshl.u8 d5, d5, #2 \n\t"
            "vshl.u8 d6, d6, #3 \n\t"

            "vmovl.u8 q14, d31 \n\t"
            "vmovl.u8 q13, d31 \n\t"
            "vmovl.u8 q12, d31 \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8 d30, d3 \n\t"
            "vmlal.u8 q14, d30, d6 \n\t"
            "vmlal.u8 q13, d30, d5 \n\t"
            "vmlal.u8 q12, d30, d4 \n\t"
            "vshr.u16 q8, q14, #5 \n\t"
            "vshr.u16 q9, q13, #6 \n\t"
            "vaddhn.u16 d6, q14, q8 \n\t"
            "vshr.u16 q8, q12, #5 \n\t"
            "vaddhn.u16 d5, q13, q9 \n\t"
            "vqadd.u8 d6, d6, d0 \n\t" // moved up
            "vaddhn.u16 d4, q12, q8 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d2 \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8 q10, d6, #8 \n\t"
            "vshll.u8 q3, d5, #8 \n\t"
            "vshll.u8 q2, d4, #8 \n\t"
            "vsri.u16 q10, q3, #5 \n\t"
            "vsri.u16 q10, q2, #11 \n\t"

            // store
            "tst %[count], #4 \n\t"
            "beq 24f \n\t"
            "vst1.16 {d21}, [%[keep_dst]]! \n\t"

            "24: \n\t"
            "tst %[count], #2 \n\t"
            "beq 22f \n\t"
            "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"

            "22: \n\t"
            "tst %[count], #1 \n\t"
            "beq 21f \n\t"
            "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"

            "21: \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
}

// 8-lane rounded divide by 255: (prod + 128 + ((prod + 128) >> 8)) >> 8.
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

// Blend of 32-bit premultiplied source pixels (per-pixel alpha) onto a 565
// destination with an additional constant alpha (0 < alpha < 255).
void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            // Newer GCC accepts a uint8x8x4_t operand directly (%h modifier)
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            // Older GCC: pin d0-d3 explicitly and copy into the x4 struct.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8 {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif


            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);                 // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                              // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);                // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            // rounding shift approximates the /255 (off by at most 1)
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};

// Blend of 32-bit source pixels onto a 565 destination with constant alpha
// and ordered dithering; 8 pixels per NEON iteration, scalar dither tail.
void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc.
        // green dither values (green has one extra bit, so half the dither)

        int16x8_t vscale = vdupq_n_s16(scale);  // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask

        do {

            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
            {
                // Pin the vld4.8 destination registers so the post-increment
                // load can update 'src' in place.
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                vsrc_r = d0; vsrc_b = d2;
#endif
            }

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

// Source-over blit of 32-bit premultiplied pixels onto a 32-bit destination;
// NEON path handles 4 pixels per iteration (two 2-pixel halves), scalar
// SkPMSrcOver finishes the remainder.
void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


        uint8x8_t alpha_mask;

        // table to replicate each pixel's alpha byte across its 4 lanes
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                /* 256 - alpha per lane, i.e. the dst scale factor */
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

// Source-over blit that classifies runs of source pixels by alpha and jumps
// between three specialized loops: ALPHA_0 (skip transparent pixels),
// ALPHA_255 (straight copy of opaque pixels), and ALPHA_1_TO_254 (the
// 4-pixel NEON blend). A scalar TAIL loop handles the last few pixels.
void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL 4
    // stop early so the unrolled loops can always read UNROLL pixels
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src; //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    // AND of the four alphas stays >= 0xFF000000 only if all are opaque
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1; //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load a single pixel into lane 0 of each vector
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

// Blend of 32-bit premultiplied source pixels (per-pixel alpha) onto a
// 32-bit destination with an additional global alpha. Handles one leading
// odd pixel, then two pixels per NEON iteration.
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale = 256 - (src_alpha * alpha256) >> 8
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        // replicate each pixel's alpha byte across its 4 lanes
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
// Debug helper: hex-dump 'len' bytes at 'p', labeled with 'str'.
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
strcat(buf, tbuf); 1030 } 1031 SkDebugf("%s\n", buf); 1032} 1033static void showme16(char *str, void *p, int len) 1034{ 1035 static char buf[256]; 1036 char tbuf[32]; 1037 int i; 1038 uint16_t *pc = (uint16_t*) p; 1039 sprintf(buf,"%8s:", str); 1040 len = (len / sizeof(uint16_t)); /* passed as bytes */ 1041 for(i=0;i<len;i++) { 1042 sprintf(tbuf, " %04x", pc[i]); 1043 strcat(buf, tbuf); 1044 } 1045 SkDebugf("%s\n", buf); 1046} 1047#endif 1048 1049void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 1050 const SkPMColor* SK_RESTRICT src, 1051 int count, U8CPU alpha, int x, int y) { 1052 SkASSERT(255 == alpha); 1053 1054#define UNROLL 8 1055 1056 if (count >= UNROLL) { 1057 1058#if defined(DEBUG_OPAQUE_DITHER) 1059 uint16_t tmpbuf[UNROLL]; 1060 int td[UNROLL]; 1061 int tdv[UNROLL]; 1062 int ta[UNROLL]; 1063 int tap[UNROLL]; 1064 uint16_t in_dst[UNROLL]; 1065 int offset = 0; 1066 int noisy = 0; 1067#endif 1068 1069 uint8x8_t dbase; 1070 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1071 dbase = vld1_u8(dstart); 1072 1073 do { 1074 uint8x8_t sr, sg, sb, sa, d; 1075 uint16x8_t dst8, scale8, alpha8; 1076 uint16x8_t dst_r, dst_g, dst_b; 1077 1078#if defined(DEBUG_OPAQUE_DITHER) 1079 // calculate 8 elements worth into a temp buffer 1080 { 1081 int my_y = y; 1082 int my_x = x; 1083 SkPMColor* my_src = (SkPMColor*)src; 1084 uint16_t* my_dst = dst; 1085 int i; 1086 1087 DITHER_565_SCAN(my_y); 1088 for(i = 0; i < UNROLL; i++) { 1089 SkPMColor c = *my_src++; 1090 SkPMColorAssert(c); 1091 if (c) { 1092 unsigned a = SkGetPackedA32(c); 1093 1094 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 1095 tdv[i] = DITHER_VALUE(my_x); 1096 ta[i] = a; 1097 tap[i] = SkAlpha255To256(a); 1098 td[i] = d; 1099 1100 unsigned sr = SkGetPackedR32(c); 1101 unsigned sg = SkGetPackedG32(c); 1102 unsigned sb = SkGetPackedB32(c); 1103 sr = SkDITHER_R32_FOR_565(sr, d); 1104 sg = SkDITHER_G32_FOR_565(sg, d); 1105 sb = SkDITHER_B32_FOR_565(sb, d); 1106 1107 uint32_t 
            src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
            uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
            // Weight dst by the inverse src alpha (reduced to 5 bits).
            dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
            // now src and dst expanded are in g:11 r:10 x:1 b:10
            tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            td[i] = d;
        } else {
            // Fully transparent src: dst is unchanged; poison the trace
            // values so they stand out in the debug dump.
            tmpbuf[i] = *my_dst;
            ta[i] = tdv[i] = td[i] = 0xbeef;
        }
        in_dst[i] = *my_dst;
        my_dst += 1;
        DITHER_INC_X(my_x);
      }
    }
#endif


    {
    // Bind the vld4.8 outputs to fixed NEON registers d0-d3 so the inline
    // asm's register list matches the variables.
    register uint8x8_t d0 asm("d0");
    register uint8x8_t d1 asm("d1");
    register uint8x8_t d2 asm("d2");
    register uint8x8_t d3 asm("d3");

    // De-interleave 8 pixels into per-channel vectors; '!' post-increments
    // src by the 32 bytes read.
    // NOTE(review): other vld4 blocks in this file use "+&r" (earlyclobber)
    // for src; here it is plain "+r" -- confirm this is intentional.
    asm ("vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
        : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
        :
    );
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
    sr = d2; sg = d1; sb = d0; sa = d3;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
    sr = d0; sg = d1; sb = d2; sa = d3;
#endif
    }

    /* calculate 'd', which will be 0..7
     * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
     */
    // alpha8 = dbase * (sa + 1), i.e. dbase scaled by alpha in 0..256.
    alpha8 = vmovl_u8(dbase);
    alpha8 = vmlal_u8(alpha8, sa, dbase);
    d = vshrn_n_u16(alpha8, 8);    // narrowing too

    // sr = sr - (sr>>5) + d
    /* watching for 8-bit overflow.  d is 0..7; risky range of
     * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
     * safe as long as we do ((sr-sr>>5) + d)
     */
    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
    sr = vadd_u8(sr, d);

    // sb = sb - (sb>>5) + d
    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
    sb = vadd_u8(sb, d);

    // sg = sg - (sg>>6) + d>>1; similar logic for overflows
    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
    sg = vadd_u8(sg, vshr_n_u8(d, 1));

    // need to pick up 8 dst's -- at 16 bits each, 128 bits
    dst8 = vld1q_u16(dst);
    dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
    dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
    dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

    // blend: dst scale factor is (256 - alpha) per pixel.
    scale8 = vsubw_u8(vdupq_n_u16(256), sa);

    // combine the addq and mul, save 3 insns
    scale8 = vshrq_n_u16(scale8, 3);
    // result = (src_channel << shift) + dst_channel * scale, fused via vmla.
    dst_b = vmlaq_u16(vshll_n_u8(sb, 2), dst_b, scale8);
    dst_g = vmlaq_u16(vshll_n_u8(sg, 3), dst_g, scale8);
    dst_r = vmlaq_u16(vshll_n_u8(sr, 2), dst_r, scale8);

    // repack to store: b in low 5 bits, then shift-insert g at bit 5 and
    // r at bit 11 (vsli keeps the low bits already in place).
    dst8 = vshrq_n_u16(dst_b, 5);
    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r, 5), 11);

    vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
    // verify my 8 elements match the temp buffer computed scalar-side above
    {
    int i, bad = 0;
    static int invocation;

    for (i = 0; i < UNROLL; i++) {
        if (tmpbuf[i] != dst[i]) {
            bad = 1;
        }
    }
    if (bad) {
        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                 invocation, offset);
        SkDebugf("  alpha 0x%x\n", alpha);
        for (i = 0; i < UNROLL; i++)
            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                     i, ((tmpbuf[i] != dst[i]) ? "BAD" : "got"), dst[i], tmpbuf[i],
                     in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

        showme16("alpha8", &alpha8, sizeof(alpha8));
        showme16("scale8", &scale8, sizeof(scale8));
        showme8("d", &d, sizeof(d));
        showme16("dst8", &dst8, sizeof(dst8));
        showme16("dst_b", &dst_b, sizeof(dst_b));
        showme16("dst_g", &dst_g, sizeof(dst_g));
        showme16("dst_r", &dst_r, sizeof(dst_r));
        showme8("sb", &sb, sizeof(sb));
        showme8("sg", &sg, sizeof(sg));
        showme8("sr", &sr, sizeof(sr));

        // Bail out of the whole blit on the first mismatching batch.
        return;
    }
    offset += UNROLL;
    invocation++;
    }
#endif
        // src was already advanced by the '!' in the vld4 above.
        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
    } while (count >= UNROLL);
    }
#undef UNROLL

    // residuals: scalar fallback for the last (count % 8) pixels.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_S32_OPAQUE_DITHER

// Blit a row of fully-opaque 32-bit src pixels onto a 565 dst with ordered
// dithering. 'alpha' must be 255 (asserted); (x, y) select the starting
// cell of the 4x4 dither matrix.
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if
    (count >= UNROLL) {
    uint8x8_t d;
    // 8 dither coefficients for this scanline; gDitherMatrix_Neon is
    // defined elsewhere (rows presumably 12 bytes wide -- see S32A variant).
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;

        {
        // Fixed NEON registers for the vld4.8 de-interleaving load.
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        // Load 8 pixels split into channel planes; '!' post-increments src.
        asm (
            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        sg = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
        sr = d2; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
        sr = d0; sb = d2;
#endif
        }
        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        // sr = sr - (sr>>5) + d
        // (vaddl widens to 16 bits, so no 8-bit overflow concern here)
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

        // pack high bits of each into 565 format (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
        // always good to know if we generated good results
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i = 0; i < UNROLL; i++) {
            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
            SkPMColor c = src[i-8];
            unsigned dither = DITHER_VALUE(myx);
            // Scalar reference result for pixel i; log on mismatch.
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
                SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                         c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef UNROLL

    // residuals: scalar fallback for the last (count % 8) pixels.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

// Blend a constant premultiplied 'color' over a row of src pixels into dst:
// dst[i] = color + src[i] * (256 - alpha(color)) >> 8, with fast paths for
// a fully transparent color (plain copy) and a fully opaque color (memset).
// src and dst may alias (the src == dst case is handled explicitly).
void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        // Transparent color: output is just the src row.
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        // Opaque color covers src entirely.
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 +x coming out.
            asm volatile (
                PLD128(src, 0)

                // Broadcast the constant color into q0.
                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
            );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        // Scalar tail (and the whole path when count < 8): same blend as
        // the assembly above, one pixel at a time.
        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

// Dispatch table for 32->565 blits; indexed by SkBlitRow flags
// (blend/alpha/dither combinations). NULL entries fall back to the
// portable implementation.
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

// Dispatch table for 32->32 blits.
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. The one reads the src alpha
     * value and attempts to optimize accordingly.  The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse.  However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon       // S32A_Blend
};