SkBlitRow_opts_arm_neon.cpp revision dfff2737f8ad3e945a4dcbe175380d4b2a91a260
1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include "SkBlitRow_opts_arm_neon.h" 9 10#include "SkBlitMask.h" 11#include "SkBlitRow.h" 12#include "SkColorPriv.h" 13#include "SkDither.h" 14#include "SkMathPriv.h" 15#include "SkUtils.h" 16 17#include "SkCachePreload_arm.h" 18#include "SkColor_opts_neon.h" 19#include <arm_neon.h> 20 21void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 22 const SkPMColor* SK_RESTRICT src, int count, 23 U8CPU alpha, int /*x*/, int /*y*/) { 24 SkASSERT(255 == alpha); 25 26 while (count >= 8) { 27 uint8x8x4_t vsrc; 28 uint16x8_t vdst; 29 30 // Load 31 vsrc = vld4_u8((uint8_t*)src); 32 33 // Convert src to 565 34 vdst = SkPixel32ToPixel16_neon8(vsrc); 35 36 // Store 37 vst1q_u16(dst, vdst); 38 39 // Prepare next iteration 40 dst += 8; 41 src += 8; 42 count -= 8; 43 }; 44 45 // Leftovers 46 while (count > 0) { 47 SkPMColor c = *src++; 48 SkPMColorAssert(c); 49 *dst = SkPixel32ToPixel16_ToU16(c); 50 dst++; 51 count--; 52 }; 53} 54 55void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 56 const SkPMColor* SK_RESTRICT src, int count, 57 U8CPU alpha, int /*x*/, int /*y*/) { 58 SkASSERT(255 == alpha); 59 60 if (count >= 8) { 61 uint16_t* SK_RESTRICT keep_dst = 0; 62 63 asm volatile ( 64 "ands ip, %[count], #7 \n\t" 65 "vmov.u8 d31, #1<<7 \n\t" 66 "vld1.16 {q12}, [%[dst]] \n\t" 67 "vld4.8 {d0-d3}, [%[src]] \n\t" 68 // Thumb does not support the standard ARM conditional 69 // instructions but instead requires the 'it' instruction 70 // to signal conditional execution 71 "it eq \n\t" 72 "moveq ip, #8 \n\t" 73 "mov %[keep_dst], %[dst] \n\t" 74 75 "add %[src], %[src], ip, LSL#2 \n\t" 76 "add %[dst], %[dst], ip, LSL#1 \n\t" 77 "subs %[count], %[count], ip \n\t" 78 "b 9f \n\t" 79 // LOOP 80 "2: \n\t" 81 82 "vld1.16 {q12}, [%[dst]]! \n\t" 83 "vld4.8 {d0-d3}, [%[src]]! \n\t" 84 "vst1.16 {q10}, [%[keep_dst]] \n\t" 85 "sub %[keep_dst], %[dst], #8*2 \n\t" 86 "subs %[count], %[count], #8 \n\t" 87 "9: \n\t" 88 "pld [%[dst],#32] \n\t" 89 // expand 0565 q12 to 8888 {d4-d7} 90 "vmovn.u16 d4, q12 \n\t" 91 "vshr.u16 q11, q12, #5 \n\t" 92 "vshr.u16 q10, q12, #6+5 \n\t" 93 "vmovn.u16 d5, q11 \n\t" 94 "vmovn.u16 d6, q10 \n\t" 95 "vshl.u8 d4, d4, #3 \n\t" 96 "vshl.u8 d5, d5, #2 \n\t" 97 "vshl.u8 d6, d6, #3 \n\t" 98 99 "vmovl.u8 q14, d31 \n\t" 100 "vmovl.u8 q13, d31 \n\t" 101 "vmovl.u8 q12, d31 \n\t" 102 103 // duplicate in 4/2/1 & 8pix vsns 104 "vmvn.8 d30, d3 \n\t" 105 "vmlal.u8 q14, d30, d6 \n\t" 106 "vmlal.u8 q13, d30, d5 \n\t" 107 "vmlal.u8 q12, d30, d4 \n\t" 108 "vshr.u16 q8, q14, #5 \n\t" 109 "vshr.u16 q9, q13, #6 \n\t" 110 "vaddhn.u16 d6, q14, q8 \n\t" 111 "vshr.u16 q8, q12, #5 \n\t" 112 "vaddhn.u16 d5, q13, q9 \n\t" 113 "vqadd.u8 d6, d6, d0 \n\t" // moved up 114 "vaddhn.u16 d4, q12, q8 \n\t" 115 // intentionally don't calculate alpha 116 // result in d4-d6 117 118 "vqadd.u8 d5, d5, d1 \n\t" 119 "vqadd.u8 d4, d4, d2 \n\t" 120 121 // pack 8888 {d4-d6} to 0565 q10 122 "vshll.u8 q10, d6, #8 \n\t" 123 "vshll.u8 q3, d5, #8 \n\t" 124 "vshll.u8 q2, d4, #8 \n\t" 125 "vsri.u16 q10, q3, #5 \n\t" 126 "vsri.u16 q10, q2, #11 \n\t" 127 128 "bne 2b \n\t" 129 130 "1: \n\t" 131 "vst1.16 {q10}, [%[keep_dst]] \n\t" 132 : [count] "+r" (count) 133 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 134 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 135 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 136 "d30","d31" 137 ); 138 } 139 else 140 { // handle count < 8 141 uint16_t* SK_RESTRICT keep_dst = 0; 142 143 asm volatile ( 144 "vmov.u8 d31, #1<<7 \n\t" 145 "mov %[keep_dst], %[dst] \n\t" 146 147 "tst %[count], #4 \n\t" 148 "beq 14f \n\t" 149 "vld1.16 {d25}, [%[dst]]! \n\t" 150 "vld1.32 {q1}, [%[src]]! \n\t" 151 152 "14: \n\t" 153 "tst %[count], #2 \n\t" 154 "beq 12f \n\t" 155 "vld1.32 {d24[1]}, [%[dst]]! \n\t" 156 "vld1.32 {d1}, [%[src]]! \n\t" 157 158 "12: \n\t" 159 "tst %[count], #1 \n\t" 160 "beq 11f \n\t" 161 "vld1.16 {d24[1]}, [%[dst]]! \n\t" 162 "vld1.32 {d0[1]}, [%[src]]! \n\t" 163 164 "11: \n\t" 165 // unzips achieve the same as a vld4 operation 166 "vuzpq.u16 q0, q1 \n\t" 167 "vuzp.u8 d0, d1 \n\t" 168 "vuzp.u8 d2, d3 \n\t" 169 // expand 0565 q12 to 8888 {d4-d7} 170 "vmovn.u16 d4, q12 \n\t" 171 "vshr.u16 q11, q12, #5 \n\t" 172 "vshr.u16 q10, q12, #6+5 \n\t" 173 "vmovn.u16 d5, q11 \n\t" 174 "vmovn.u16 d6, q10 \n\t" 175 "vshl.u8 d4, d4, #3 \n\t" 176 "vshl.u8 d5, d5, #2 \n\t" 177 "vshl.u8 d6, d6, #3 \n\t" 178 179 "vmovl.u8 q14, d31 \n\t" 180 "vmovl.u8 q13, d31 \n\t" 181 "vmovl.u8 q12, d31 \n\t" 182 183 // duplicate in 4/2/1 & 8pix vsns 184 "vmvn.8 d30, d3 \n\t" 185 "vmlal.u8 q14, d30, d6 \n\t" 186 "vmlal.u8 q13, d30, d5 \n\t" 187 "vmlal.u8 q12, d30, d4 \n\t" 188 "vshr.u16 q8, q14, #5 \n\t" 189 "vshr.u16 q9, q13, #6 \n\t" 190 "vaddhn.u16 d6, q14, q8 \n\t" 191 "vshr.u16 q8, q12, #5 \n\t" 192 "vaddhn.u16 d5, q13, q9 \n\t" 193 "vqadd.u8 d6, d6, d0 \n\t" // moved up 194 "vaddhn.u16 d4, q12, q8 \n\t" 195 // intentionally don't calculate alpha 196 // result in d4-d6 197 198 "vqadd.u8 d5, d5, d1 \n\t" 199 "vqadd.u8 d4, d4, d2 \n\t" 200 201 // pack 8888 {d4-d6} to 0565 q10 202 "vshll.u8 q10, d6, #8 \n\t" 203 "vshll.u8 q3, d5, #8 \n\t" 204 "vshll.u8 q2, d4, #8 \n\t" 205 "vsri.u16 q10, q3, #5 \n\t" 206 "vsri.u16 q10, q2, #11 \n\t" 207 208 // store 209 "tst %[count], #4 \n\t" 210 "beq 24f \n\t" 211 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 212 213 "24: \n\t" 214 "tst %[count], #2 \n\t" 215 "beq 22f \n\t" 216 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 217 218 "22: \n\t" 219 "tst %[count], #1 \n\t" 220 "beq 21f \n\t" 221 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 222 223 "21: \n\t" 224 : [count] "+r" (count) 225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 228 "d30","d31" 229 ); 230 } 231} 232 233static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { 234 prod += vdupq_n_u16(128); 235 prod += vshrq_n_u16(prod, 8); 236 return vshrq_n_u16(prod, 8); 237} 238 239void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 240 const SkPMColor* SK_RESTRICT src, int count, 241 U8CPU alpha, int /*x*/, int /*y*/) { 242 SkASSERT(255 > alpha); 243 244 /* This code implements a Neon version of S32A_D565_Blend. The results have 245 * a few mismatches compared to the original code. These mismatches never 246 * exceed 1. 247 */ 248 249 if (count >= 8) { 250 uint16x8_t valpha_max, vmask_blue; 251 uint8x8_t valpha; 252 253 // prepare constants 254 valpha_max = vmovq_n_u16(255); 255 valpha = vdup_n_u8(alpha); 256 vmask_blue = vmovq_n_u16(SK_B16_MASK); 257 258 do { 259 uint16x8_t vdst, vdst_r, vdst_g, vdst_b; 260 uint16x8_t vres_a, vres_r, vres_g, vres_b; 261 uint8x8x4_t vsrc; 262 263 // load pixels 264 vdst = vld1q_u16(dst); 265#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 266 asm ( 267 "vld4.u8 %h[vsrc], [%[src]]!" 268 : [vsrc] "=w" (vsrc), [src] "+&r" (src) 269 : : 270 ); 271#else 272 register uint8x8_t d0 asm("d0"); 273 register uint8x8_t d1 asm("d1"); 274 register uint8x8_t d2 asm("d2"); 275 register uint8x8_t d3 asm("d3"); 276 277 asm volatile ( 278 "vld4.u8 {d0-d3},[%[src]]!;" 279 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), 280 [src] "+&r" (src) 281 : : 282 ); 283 vsrc.val[0] = d0; 284 vsrc.val[1] = d1; 285 vsrc.val[2] = d2; 286 vsrc.val[3] = d3; 287#endif 288 289 290 // deinterleave dst 291 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes 292 vdst_b = vdst & vmask_blue; // extract blue 293 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red 294 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green 295 296 // shift src to 565 297 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); 298 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS); 299 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS); 300 301 // calc src * src_scale 302 vres_a = vmull_u8(vsrc.val[NEON_A], valpha); 303 vres_r = vmull_u8(vsrc.val[NEON_R], valpha); 304 vres_g = vmull_u8(vsrc.val[NEON_G], valpha); 305 vres_b = vmull_u8(vsrc.val[NEON_B], valpha); 306 307 // prepare dst_scale 308 vres_a = SkDiv255Round_neon8(vres_a); 309 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255 310 311 // add dst * dst_scale to previous result 312 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a); 313 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a); 314 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a); 315 316#ifdef S32A_D565_BLEND_EXACT 317 // It is possible to get exact results with this but it is slow, 318 // even slower than C code in some cases 319 vres_r = SkDiv255Round_neon8(vres_r); 320 vres_g = SkDiv255Round_neon8(vres_g); 321 vres_b = SkDiv255Round_neon8(vres_b); 322#else 323 vres_r = vrshrq_n_u16(vres_r, 8); 324 vres_g = vrshrq_n_u16(vres_g, 8); 325 vres_b = vrshrq_n_u16(vres_b, 8); 326#endif 327 // pack result 328 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue 329 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue 330 331 // store 332 vst1q_u16(dst, vres_b); 333 dst += 8; 334 count -= 8; 335 } while (count >= 8); 336 } 337 338 // leftovers 339 while (count-- > 0) { 340 SkPMColor sc = *src++; 341 if (sc) { 342 uint16_t dc = *dst; 343 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 344 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 345 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 346 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 347 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 348 } 349 dst += 1; 350 } 351} 352 353/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 354 * each dither value is spaced out into byte lanes, and repeated 355 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 356 * start of each row. 357 */ 358static const uint8_t gDitherMatrix_Neon[48] = { 359 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 360 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 361 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 362 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 363 364}; 365 366void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, 367 int count, U8CPU alpha, int x, int y) 368{ 369 370 SkASSERT(255 > alpha); 371 372 // rescale alpha to range 1 - 256 373 int scale = SkAlpha255To256(alpha); 374 375 if (count >= 8) { 376 /* select row and offset for dither array */ 377 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 378 379 uint8x8_t vdither = vld1_u8(dstart); // load dither values 380 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values 381 382 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg 383 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask 384 385 do { 386 387 uint8x8_t vsrc_r, vsrc_g, vsrc_b; 388 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; 389 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; 390 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; 391 uint16x8_t vdst; 392 uint16x8_t vdst_r, vdst_g, vdst_b; 393 int16x8_t vres_r, vres_g, vres_b; 394 int8x8_t vres8_r, vres8_g, vres8_b; 395 396 // Load source and add dither 397 { 398 register uint8x8_t d0 asm("d0"); 399 register uint8x8_t d1 asm("d1"); 400 register uint8x8_t d2 asm("d2"); 401 register uint8x8_t d3 asm("d3"); 402 403 asm ( 404 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 405 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 406 : 407 ); 408 vsrc_g = d1; 409#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 410 vsrc_r = d2; vsrc_b = d0; 411#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 412 vsrc_r = d0; vsrc_b = d2; 413#endif 414 } 415 416 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 417 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 418 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 419 420 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen 421 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen 422 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen 423 424 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result 425 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result 426 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result 427 428 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3); 429 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2); 430 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3); 431 432 // Load dst and unpack 433 vdst = vld1q_u16(dst); 434 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green 435 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red 436 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue 437 438 // subtract dst from src and widen 439 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r)); 440 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g)); 441 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b)); 442 443 // multiply diffs by scale and shift 444 vres_r = vmulq_s16(vres_r, vscale); 445 vres_g = vmulq_s16(vres_g, vscale); 446 vres_b = vmulq_s16(vres_b, vscale); 447 448 vres8_r = vshrn_n_s16(vres_r, 8); 449 vres8_g = vshrn_n_s16(vres_g, 8); 450 vres8_b = vshrn_n_s16(vres_b, 8); 451 452 // add dst to result 453 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r); 454 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g); 455 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b); 456 457 // put result into 565 format 458 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue 459 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue 460 461 // Store result 462 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b)); 463 464 // Next iteration 465 dst += 8; 466 count -= 8; 467 468 } while (count >= 8); 469 } 470 471 // Leftovers 472 if (count > 0) { 473 int scale = SkAlpha255To256(alpha); 474 DITHER_565_SCAN(y); 475 do { 476 SkPMColor c = *src++; 477 SkPMColorAssert(c); 478 479 int dither = DITHER_VALUE(x); 480 int sr = SkGetPackedR32(c); 481 int sg = SkGetPackedG32(c); 482 int sb = SkGetPackedB32(c); 483 sr = SkDITHER_R32To565(sr, dither); 484 sg = SkDITHER_G32To565(sg, dither); 485 sb = SkDITHER_B32To565(sb, dither); 486 487 uint16_t d = *dst; 488 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 489 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 490 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 491 DITHER_INC_X(x); 492 } while (--count != 0); 493 } 494} 495 496void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 497 const SkPMColor* SK_RESTRICT src, 498 int count, U8CPU alpha) { 499 500 SkASSERT(255 == alpha); 501 if (count > 0) { 502 503 504 uint8x8_t alpha_mask; 505 506 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 507 alpha_mask = vld1_u8(alpha_mask_setup); 508 509 /* do the NEON unrolled code */ 510#define UNROLL 4 511 while (count >= UNROLL) { 512 uint8x8_t src_raw, dst_raw, dst_final; 513 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 514 515 /* The two prefetches below may make the code slighlty 516 * slower for small values of count but are worth having 517 * in the general case. 518 */ 519 __builtin_prefetch(src+32); 520 __builtin_prefetch(dst+32); 521 522 /* get the source */ 523 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 524#if UNROLL > 2 525 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 526#endif 527 528 /* get and hold the dst too */ 529 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 530#if UNROLL > 2 531 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 532#endif 533 534 /* 1st and 2nd bits of the unrolling */ 535 { 536 uint8x8_t dst_cooked; 537 uint16x8_t dst_wide; 538 uint8x8_t alpha_narrow; 539 uint16x8_t alpha_wide; 540 541 /* get the alphas spread out properly */ 542 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 543 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 544 545 /* spread the dest */ 546 dst_wide = vmovl_u8(dst_raw); 547 548 /* alpha mul the dest */ 549 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 550 dst_cooked = vshrn_n_u16(dst_wide, 8); 551 552 /* sum -- ignoring any byte lane overflows */ 553 dst_final = vadd_u8(src_raw, dst_cooked); 554 } 555 556#if UNROLL > 2 557 /* the 3rd and 4th bits of our unrolling */ 558 { 559 uint8x8_t dst_cooked; 560 uint16x8_t dst_wide; 561 uint8x8_t alpha_narrow; 562 uint16x8_t alpha_wide; 563 564 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 565 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 566 567 /* spread the dest */ 568 dst_wide = vmovl_u8(dst_raw_2); 569 570 /* alpha mul the dest */ 571 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 572 dst_cooked = vshrn_n_u16(dst_wide, 8); 573 574 /* sum -- ignoring any byte lane overflows */ 575 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 576 } 577#endif 578 579 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 580#if UNROLL > 2 581 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 582#endif 583 584 src += UNROLL; 585 dst += UNROLL; 586 count -= UNROLL; 587 } 588#undef UNROLL 589 590 /* do any residual iterations */ 591 while (--count >= 0) { 592 *dst = SkPMSrcOver(*src, *dst); 593 src += 1; 594 dst += 1; 595 } 596 } 597} 598 599void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, 600 const SkPMColor* SK_RESTRICT src, 601 int count, U8CPU alpha) { 602 SkASSERT(255 == alpha); 603 604 if (count <= 0) 605 return; 606 607 /* Use these to check if src is transparent or opaque */ 608 const unsigned int ALPHA_OPAQ = 0xFF000000; 609 const unsigned int ALPHA_TRANS = 0x00FFFFFF; 610 611#define UNROLL 4 612 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); 613 const SkPMColor* SK_RESTRICT src_temp = src; 614 615 /* set up the NEON variables */ 616 uint8x8_t alpha_mask; 617 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 618 alpha_mask = vld1_u8(alpha_mask_setup); 619 620 uint8x8_t src_raw, dst_raw, dst_final; 621 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 622 uint8x8_t dst_cooked; 623 uint16x8_t dst_wide; 624 uint8x8_t alpha_narrow; 625 uint16x8_t alpha_wide; 626 627 /* choose the first processing type */ 628 if( src >= src_end) 629 goto TAIL; 630 if(*src <= ALPHA_TRANS) 631 goto ALPHA_0; 632 if(*src >= ALPHA_OPAQ) 633 goto ALPHA_255; 634 /* fall-thru */ 635 636ALPHA_1_TO_254: 637 do { 638 639 /* get the source */ 640 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 641 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 642 643 /* get and hold the dst too */ 644 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 645 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 646 647 648 /* get the alphas spread out properly */ 649 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 650 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 651 /* we collapsed (255-a)+1 ... */ 652 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 653 654 /* spread the dest */ 655 dst_wide = vmovl_u8(dst_raw); 656 657 /* alpha mul the dest */ 658 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 659 dst_cooked = vshrn_n_u16(dst_wide, 8); 660 661 /* sum -- ignoring any byte lane overflows */ 662 dst_final = vadd_u8(src_raw, dst_cooked); 663 664 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 665 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 666 /* we collapsed (255-a)+1 ... */ 667 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 668 669 /* spread the dest */ 670 dst_wide = vmovl_u8(dst_raw_2); 671 672 /* alpha mul the dest */ 673 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 674 dst_cooked = vshrn_n_u16(dst_wide, 8); 675 676 /* sum -- ignoring any byte lane overflows */ 677 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 678 679 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 680 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 681 682 src += UNROLL; 683 dst += UNROLL; 684 685 /* if 2 of the next pixels aren't between 1 and 254 686 it might make sense to go to the optimized loops */ 687 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) 688 break; 689 690 } while(src < src_end); 691 692 if (src >= src_end) 693 goto TAIL; 694 695 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) 696 goto ALPHA_255; 697 698 /*fall-thru*/ 699 700ALPHA_0: 701 702 /*In this state, we know the current alpha is 0 and 703 we optimize for the next alpha also being zero. */ 704 src_temp = src; //so we don't have to increment dst every time 705 do { 706 if(*(++src) > ALPHA_TRANS) 707 break; 708 if(*(++src) > ALPHA_TRANS) 709 break; 710 if(*(++src) > ALPHA_TRANS) 711 break; 712 if(*(++src) > ALPHA_TRANS) 713 break; 714 } while(src < src_end); 715 716 dst += (src - src_temp); 717 718 /* no longer alpha 0, so determine where to go next. */ 719 if( src >= src_end) 720 goto TAIL; 721 if(*src >= ALPHA_OPAQ) 722 goto ALPHA_255; 723 else 724 goto ALPHA_1_TO_254; 725 726ALPHA_255: 727 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { 728 dst[0]=src[0]; 729 dst[1]=src[1]; 730 dst[2]=src[2]; 731 dst[3]=src[3]; 732 src+=UNROLL; 733 dst+=UNROLL; 734 if(src >= src_end) 735 goto TAIL; 736 } 737 738 //Handle remainder. 739 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 740 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 741 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } 742 } 743 } 744 745 if( src >= src_end) 746 goto TAIL; 747 if(*src <= ALPHA_TRANS) 748 goto ALPHA_0; 749 else 750 goto ALPHA_1_TO_254; 751 752TAIL: 753 /* do any residual iterations */ 754 src_end += UNROLL + 1; //goto the real end 755 while(src != src_end) { 756 if( *src != 0 ) { 757 if( *src >= ALPHA_OPAQ ) { 758 *dst = *src; 759 } 760 else { 761 *dst = SkPMSrcOver(*src, *dst); 762 } 763 } 764 src++; 765 dst++; 766 } 767 768#undef UNROLL 769 return; 770} 771 772/* Neon version of S32_Blend_BlitRow32() 773 * portable version is in src/core/SkBlitRow_D32.cpp 774 */ 775void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 776 const SkPMColor* SK_RESTRICT src, 777 int count, U8CPU alpha) { 778 SkASSERT(alpha <= 255); 779 780 if (count <= 0) { 781 return; 782 } 783 784 uint16_t src_scale = SkAlpha255To256(alpha); 785 uint16_t dst_scale = 256 - src_scale; 786 787 while (count >= 2) { 788 uint8x8_t vsrc, vdst, vres; 789 uint16x8_t vsrc_wide, vdst_wide; 790 791 /* These commented prefetches are a big win for count 792 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4. 793 * They also hurt a little (<5%) on an A15 794 */ 795 //__builtin_prefetch(src+32); 796 //__builtin_prefetch(dst+32); 797 798 // Load 799 vsrc = vreinterpret_u8_u32(vld1_u32(src)); 800 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 801 802 // Process src 803 vsrc_wide = vmovl_u8(vsrc); 804 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 805 806 // Process dst 807 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 808 809 // Combine 810 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 811 812 // Store 813 vst1_u32(dst, vreinterpret_u32_u8(vres)); 814 815 src += 2; 816 dst += 2; 817 count -= 2; 818 } 819 820 if (count == 1) { 821 uint8x8_t vsrc, vdst, vres; 822 uint16x8_t vsrc_wide, vdst_wide; 823 824 // Load 825 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 826 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 827 828 // Process 829 vsrc_wide = vmovl_u8(vsrc); 830 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 831 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 832 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 833 834 // Store 835 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 836 } 837} 838 839void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 840 const SkPMColor* SK_RESTRICT src, 841 int count, U8CPU alpha) { 842 843 SkASSERT(255 >= alpha); 844 845 if (count <= 0) { 846 return; 847 } 848 849 unsigned alpha256 = SkAlpha255To256(alpha); 850 851 // First deal with odd counts 852 if (count & 1) { 853 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 854 uint16x8_t vdst_wide, vsrc_wide; 855 unsigned dst_scale; 856 857 // Load 858 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 859 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 860 861 // Calc dst_scale 862 dst_scale = vget_lane_u8(vsrc, 3); 863 dst_scale *= alpha256; 864 dst_scale >>= 8; 865 dst_scale = 256 - dst_scale; 866 867 // Process src 868 vsrc_wide = vmovl_u8(vsrc); 869 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); 870 871 // Process dst 872 vdst_wide = vmovl_u8(vdst); 873 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); 874 875 // Combine 876 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 877 878 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 879 dst++; 880 src++; 881 count--; 882 } 883 884 if (count) { 885 uint8x8_t alpha_mask; 886 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 887 alpha_mask = vld1_u8(alpha_mask_setup); 888 889 do { 890 891 uint8x8_t vsrc, vdst, vres, vsrc_alphas; 892 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; 893 894 __builtin_prefetch(src+32); 895 __builtin_prefetch(dst+32); 896 897 // Load 898 vsrc = vreinterpret_u8_u32(vld1_u32(src)); 899 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 900 901 // Prepare src_scale 902 vsrc_scale = vdupq_n_u16(alpha256); 903 904 // Calc dst_scale 905 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); 906 vdst_scale = vmovl_u8(vsrc_alphas); 907 vdst_scale *= vsrc_scale; 908 vdst_scale = vshrq_n_u16(vdst_scale, 8); 909 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); 910 911 // Process src 912 vsrc_wide = vmovl_u8(vsrc); 913 vsrc_wide *= vsrc_scale; 914 915 // Process dst 916 vdst_wide = vmovl_u8(vdst); 917 vdst_wide *= vdst_scale; 918 919 // Combine 920 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 921 922 vst1_u32(dst, vreinterpret_u32_u8(vres)); 923 924 src += 2; 925 dst += 2; 926 count -= 2; 927 } while(count); 928 } 929} 930 931/////////////////////////////////////////////////////////////////////////////// 932 933#undef DEBUG_OPAQUE_DITHER 934 935#if defined(DEBUG_OPAQUE_DITHER) 936static void showme8(char *str, void *p, int len) 937{ 938 static char buf[256]; 939 char tbuf[32]; 940 int i; 941 char *pc = (char*) p; 942 sprintf(buf,"%8s:", str); 943 for(i=0;i<len;i++) { 944 sprintf(tbuf, " %02x", pc[i]); 945 strcat(buf, tbuf); 946 } 947 SkDebugf("%s\n", buf); 948} 949static void showme16(char *str, void *p, int len) 950{ 951 static char buf[256]; 952 char tbuf[32]; 953 int i; 954 uint16_t *pc = (uint16_t*) p; 955 sprintf(buf,"%8s:", str); 956 len = (len / sizeof(uint16_t)); /* passed as bytes */ 957 for(i=0;i<len;i++) { 958 sprintf(tbuf, " %04x", pc[i]); 959 strcat(buf, tbuf); 960 } 961 SkDebugf("%s\n", buf); 962} 963#endif 964 965void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 966 const SkPMColor* SK_RESTRICT src, 967 int count, U8CPU alpha, int x, int y) { 968 SkASSERT(255 == alpha); 969 970#define UNROLL 8 971 972 if (count >= UNROLL) { 973 uint8x8_t dbase; 974 975#if defined(DEBUG_OPAQUE_DITHER) 976 uint16_t tmpbuf[UNROLL]; 977 int td[UNROLL]; 978 int tdv[UNROLL]; 979 int ta[UNROLL]; 980 int tap[UNROLL]; 981 uint16_t in_dst[UNROLL]; 982 int offset = 0; 983 int noisy = 0; 984#endif 985 986 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 987 dbase = vld1_u8(dstart); 988 989 do { 990 uint8x8_t sr, sg, sb, sa, d; 991 uint16x8_t dst8, scale8, alpha8; 992 uint16x8_t dst_r, dst_g, dst_b; 993 994#if defined(DEBUG_OPAQUE_DITHER) 995 /* calculate 8 elements worth into a temp buffer */ 996 { 997 int my_y = y; 998 int my_x = x; 999 SkPMColor* my_src = (SkPMColor*)src; 1000 uint16_t* my_dst = dst; 1001 int i; 1002 1003 DITHER_565_SCAN(my_y); 1004 for(i=0;i<UNROLL;i++) { 1005 SkPMColor c = *my_src++; 1006 SkPMColorAssert(c); 1007 if (c) { 1008 unsigned a = SkGetPackedA32(c); 1009 1010 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 1011 tdv[i] = DITHER_VALUE(my_x); 1012 ta[i] = a; 1013 tap[i] = SkAlpha255To256(a); 1014 td[i] = d; 1015 1016 unsigned sr = SkGetPackedR32(c); 1017 unsigned sg = SkGetPackedG32(c); 1018 unsigned sb = SkGetPackedB32(c); 1019 sr = SkDITHER_R32_FOR_565(sr, d); 1020 sg = SkDITHER_G32_FOR_565(sg, d); 1021 sb = SkDITHER_B32_FOR_565(sb, d); 1022 1023 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1024 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 1025 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1026 // now src and dst expanded are in g:11 r:10 x:1 b:10 1027 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1028 td[i] = d; 1029 1030 } else { 1031 tmpbuf[i] = *my_dst; 1032 ta[i] = tdv[i] = td[i] = 0xbeef; 1033 } 1034 in_dst[i] = *my_dst; 1035 my_dst += 1; 1036 DITHER_INC_X(my_x); 1037 } 1038 } 1039#endif 1040 1041 /* source is in ABGR */ 1042 { 1043 register uint8x8_t d0 asm("d0"); 1044 register uint8x8_t d1 asm("d1"); 1045 register uint8x8_t d2 asm("d2"); 1046 register uint8x8_t d3 asm("d3"); 1047 1048 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1049 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 1050 : "r" (src) 1051 ); 1052 sr = d0; sg = d1; sb = d2; sa = d3; 1053 } 1054 1055 /* calculate 'd', which will be 0..7 */ 1056 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ 1057#if defined(SK_BUILD_FOR_ANDROID) 1058 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1059 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 1060#else 1061 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); 1062#endif 1063 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); 1064 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ 1065 1066 /* sr = sr - (sr>>5) + d */ 1067 /* watching for 8-bit overflow. d is 0..7; risky range of 1068 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 1069 * safe as long as we do ((sr-sr>>5) + d) */ 1070 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1071 sr = vadd_u8(sr, d); 1072 1073 /* sb = sb - (sb>>5) + d */ 1074 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1075 sb = vadd_u8(sb, d); 1076 1077 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 1078 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1079 sg = vadd_u8(sg, vshr_n_u8(d,1)); 1080 1081 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 1082 dst8 = vld1q_u16(dst); 1083 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 1084 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 1085 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 1086 1087 /* blend */ 1088#if 1 1089 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1090 /* originally 255-sa + 1 */ 1091 scale8 = vsubw_u8(vdupq_n_u16(256), sa); 1092#else 1093 scale8 = vsubw_u8(vdupq_n_u16(255), sa); 1094 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); 1095#endif 1096 1097#if 1 1098 /* combine the addq and mul, save 3 insns */ 1099 scale8 = vshrq_n_u16(scale8, 3); 1100 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 1101 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 1102 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 1103#else 1104 /* known correct, but +3 insns over above */ 1105 scale8 = vshrq_n_u16(scale8, 3); 1106 dst_b = vmulq_u16(dst_b, scale8); 1107 dst_g = vmulq_u16(dst_g, scale8); 1108 dst_r = vmulq_u16(dst_r, scale8); 1109 1110 /* combine */ 1111 /* NB: vshll widens, need to preserve those bits */ 1112 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); 1113 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); 1114 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); 1115#endif 1116 1117 /* repack to store */ 1118 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); 1119 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 1120 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 1121 1122 vst1q_u16(dst, dst8); 1123 1124#if defined(DEBUG_OPAQUE_DITHER) 1125 /* verify my 8 elements match the temp buffer */ 1126 { 1127 int i, bad=0; 1128 static int invocation; 1129 1130 for (i=0;i<UNROLL;i++) 1131 if (tmpbuf[i] != dst[i]) bad=1; 1132 if (bad) { 1133 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 1134 invocation, offset); 1135 SkDebugf(" alpha 0x%x\n", alpha); 1136 for (i=0;i<UNROLL;i++) 1137 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 1138 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 1139 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 1140 1141 showme16("alpha8", &alpha8, sizeof(alpha8)); 1142 showme16("scale8", &scale8, sizeof(scale8)); 1143 showme8("d", &d, sizeof(d)); 1144 showme16("dst8", &dst8, sizeof(dst8)); 1145 showme16("dst_b", &dst_b, sizeof(dst_b)); 1146 showme16("dst_g", &dst_g, sizeof(dst_g)); 1147 showme16("dst_r", &dst_r, sizeof(dst_r)); 1148 showme8("sb", &sb, sizeof(sb)); 1149 showme8("sg", &sg, sizeof(sg)); 1150 showme8("sr", &sr, sizeof(sr)); 1151 1152 /* cop out */ 1153 return; 1154 } 1155 offset += UNROLL; 1156 invocation++; 1157 } 1158#endif 1159 1160 dst += UNROLL; 1161 src += UNROLL; 1162 count -= UNROLL; 1163 /* skip x += UNROLL, since it's unchanged mod-4 */ 1164 } while (count >= UNROLL); 1165 } 1166#undef UNROLL 1167 1168 /* residuals */ 1169 if (count > 0) { 1170 DITHER_565_SCAN(y); 1171 do { 1172 SkPMColor c = *src++; 1173 SkPMColorAssert(c); 1174 if (c) { 1175 unsigned a = SkGetPackedA32(c); 1176 1177 // dither and alpha are just temporary variables to work-around 1178 // an ICE in debug. 1179 unsigned dither = DITHER_VALUE(x); 1180 unsigned alpha = SkAlpha255To256(a); 1181 int d = SkAlphaMul(dither, alpha); 1182 1183 unsigned sr = SkGetPackedR32(c); 1184 unsigned sg = SkGetPackedG32(c); 1185 unsigned sb = SkGetPackedB32(c); 1186 sr = SkDITHER_R32_FOR_565(sr, d); 1187 sg = SkDITHER_G32_FOR_565(sg, d); 1188 sb = SkDITHER_B32_FOR_565(sb, d); 1189 1190 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1191 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1192 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1193 // now src and dst expanded are in g:11 r:10 x:1 b:10 1194 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1195 } 1196 dst += 1; 1197 DITHER_INC_X(x); 1198 } while (--count != 0); 1199 } 1200} 1201 1202/////////////////////////////////////////////////////////////////////////////// 1203 1204#undef DEBUG_S32_OPAQUE_DITHER 1205 1206void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 1207 const SkPMColor* SK_RESTRICT src, 1208 int count, U8CPU alpha, int x, int y) { 1209 SkASSERT(255 == alpha); 1210 1211#define UNROLL 8 1212 if (count >= UNROLL) { 1213 uint8x8_t d; 1214 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1215 d = vld1_u8(dstart); 1216 1217 while (count >= UNROLL) { 1218 uint8x8_t sr, sg, sb; 1219 uint16x8_t dr, dg, db; 1220 uint16x8_t dst8; 1221 1222 { 1223 register uint8x8_t d0 asm("d0"); 1224 register uint8x8_t d1 asm("d1"); 1225 register uint8x8_t d2 asm("d2"); 1226 register uint8x8_t d3 asm("d3"); 1227 1228 asm ( 1229 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1230 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 1231 : 1232 ); 1233 sg = d1; 1234#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1235 sr = d2; sb = d0; 1236#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1237 sr = d0; sb = d2; 1238#endif 1239 } 1240 /* XXX: if we want to prefetch, hide it in the above asm() 1241 * using the gcc __builtin_prefetch(), the prefetch will 1242 * fall to the bottom of the loop -- it won't stick up 1243 * at the top of the loop, just after the vld4. 1244 */ 1245 1246 // sr = sr - (sr>>5) + d 1247 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1248 dr = vaddl_u8(sr, d); 1249 1250 // sb = sb - (sb>>5) + d 1251 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1252 db = vaddl_u8(sb, d); 1253 1254 // sg = sg - (sg>>6) + d>>1; similar logic for overflows 1255 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1256 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); 1257 1258 // pack high bits of each into 565 format (rgb, b is lsb) 1259 dst8 = vshrq_n_u16(db, 3); 1260 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 1261 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); 1262 1263 // store it 1264 vst1q_u16(dst, dst8); 1265 1266#if defined(DEBUG_S32_OPAQUE_DITHER) 1267 // always good to know if we generated good results 1268 { 1269 int i, myx = x, myy = y; 1270 DITHER_565_SCAN(myy); 1271 for (i=0;i<UNROLL;i++) { 1272 // the '!' in the asm block above post-incremented src by the 8 pixels it reads. 1273 SkPMColor c = src[i-8]; 1274 unsigned dither = DITHER_VALUE(myx); 1275 uint16_t val = SkDitherRGB32To565(c, dither); 1276 if (val != dst[i]) { 1277 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n", 1278 c, dither, val, dst[i], dstart[i]); 1279 } 1280 DITHER_INC_X(myx); 1281 } 1282 } 1283#endif 1284 1285 dst += UNROLL; 1286 // we don't need to increment src as the asm above has already done it 1287 count -= UNROLL; 1288 x += UNROLL; // probably superfluous 1289 } 1290 } 1291#undef UNROLL 1292 1293 // residuals 1294 if (count > 0) { 1295 DITHER_565_SCAN(y); 1296 do { 1297 SkPMColor c = *src++; 1298 SkPMColorAssert(c); 1299 SkASSERT(SkGetPackedA32(c) == 255); 1300 1301 unsigned dither = DITHER_VALUE(x); 1302 *dst++ = SkDitherRGB32To565(c, dither); 1303 DITHER_INC_X(x); 1304 } while (--count != 0); 1305 } 1306} 1307 1308void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, 1309 SkPMColor color) { 1310 if (count <= 0) { 1311 return; 1312 } 1313 1314 if (0 == color) { 1315 if (src != dst) { 1316 memcpy(dst, src, count * sizeof(SkPMColor)); 1317 } 1318 return; 1319 } 1320 1321 unsigned colorA = SkGetPackedA32(color); 1322 if (255 == colorA) { 1323 sk_memset32(dst, color, count); 1324 } else { 1325 unsigned scale = 256 - SkAlpha255To256(colorA); 1326 1327 if (count >= 8) { 1328 // at the end of this assembly, count will have been decremented 1329 // to a negative value. That is, if count mod 8 = x, it will be 1330 // -8 +x coming out. 1331 asm volatile ( 1332 PLD128(src, 0) 1333 1334 "vdup.32 q0, %[color] \n\t" 1335 1336 PLD128(src, 128) 1337 1338 // scale numerical interval [0-255], so load as 8 bits 1339 "vdup.8 d2, %[scale] \n\t" 1340 1341 PLD128(src, 256) 1342 1343 "subs %[count], %[count], #8 \n\t" 1344 1345 PLD128(src, 384) 1346 1347 "Loop_Color32: \n\t" 1348 1349 // load src color, 8 pixels, 4 64 bit registers 1350 // (and increment src). 1351 "vld1.32 {d4-d7}, [%[src]]! \n\t" 1352 1353 PLD128(src, 384) 1354 1355 // multiply long by scale, 64 bits at a time, 1356 // destination into a 128 bit register. 1357 "vmull.u8 q4, d4, d2 \n\t" 1358 "vmull.u8 q5, d5, d2 \n\t" 1359 "vmull.u8 q6, d6, d2 \n\t" 1360 "vmull.u8 q7, d7, d2 \n\t" 1361 1362 // shift the 128 bit registers, containing the 16 1363 // bit scaled values back to 8 bits, narrowing the 1364 // results to 64 bit registers. 1365 "vshrn.i16 d8, q4, #8 \n\t" 1366 "vshrn.i16 d9, q5, #8 \n\t" 1367 "vshrn.i16 d10, q6, #8 \n\t" 1368 "vshrn.i16 d11, q7, #8 \n\t" 1369 1370 // adding back the color, using 128 bit registers. 1371 "vadd.i8 q6, q4, q0 \n\t" 1372 "vadd.i8 q7, q5, q0 \n\t" 1373 1374 // store back the 8 calculated pixels (2 128 bit 1375 // registers), and increment dst. 1376 "vst1.32 {d12-d15}, [%[dst]]! \n\t" 1377 1378 "subs %[count], %[count], #8 \n\t" 1379 "bge Loop_Color32 \n\t" 1380 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 1381 : [color] "r" (color), [scale] "r" (scale) 1382 : "cc", "memory", 1383 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1384 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" 1385 ); 1386 // At this point, if we went through the inline assembly, count is 1387 // a negative value: 1388 // if the value is -8, there is no pixel left to process. 1389 // if the value is -7, there is one pixel left to process 1390 // ... 1391 // And'ing it with 7 will give us the number of pixels 1392 // left to process. 1393 count = count & 0x7; 1394 } 1395 1396 while (count > 0) { 1397 *dst = color + SkAlphaMulQ(*src, scale); 1398 src += 1; 1399 dst += 1; 1400 count--; 1401 } 1402 } 1403} 1404 1405/////////////////////////////////////////////////////////////////////////////// 1406 1407const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1408 // no dither 1409 // NOTE: For the S32_D565_Blend function below, we don't have a special 1410 // version that assumes that each source pixel is opaque. But our 1411 // S32A is still faster than the default, so use it. 1412 S32_D565_Opaque_neon, 1413 S32A_D565_Blend_neon, // really S32_D565_Blend 1414 S32A_D565_Opaque_neon, 1415 S32A_D565_Blend_neon, 1416 1417 // dither 1418 S32_D565_Opaque_Dither_neon, 1419 S32_D565_Blend_Dither_neon, 1420 S32A_D565_Opaque_Dither_neon, 1421 NULL, // S32A_D565_Blend_Dither 1422}; 1423 1424const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1425 NULL, // S32_Opaque, 1426 S32_Blend_BlitRow32_neon, // S32_Blend, 1427 /* 1428 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1429 * value and attempts to optimize accordingly. The optimization is 1430 * sensitive to the source content and is not a win in all cases. For 1431 * example, if there are a lot of transitions between the alpha states, 1432 * the performance will almost certainly be worse. However, for many 1433 * common cases the performance is equivalent or better than the standard 1434 * case where we do not inspect the src alpha. 1435 */ 1436#if SK_A32_SHIFT == 24 1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1439#else 1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1441#endif 1442 S32A_Blend_BlitRow32_neon // S32A_Blend 1443}; 1444