SkBlitRow_opts_arm_neon.cpp revision 4cc26324e3be5258fae9dc102aa6a3af7d1c96ea
1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include "SkBlitRow_opts_arm_neon.h" 9 10#include "SkBlitMask.h" 11#include "SkBlitRow.h" 12#include "SkColorPriv.h" 13#include "SkDither.h" 14#include "SkMathPriv.h" 15#include "SkUtils.h" 16 17#include "SkCachePreload_arm.h" 18#include "SkColor_opts_neon.h" 19#include <arm_neon.h> 20 21void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 22 const SkPMColor* SK_RESTRICT src, int count, 23 U8CPU alpha, int /*x*/, int /*y*/) { 24 SkASSERT(255 == alpha); 25 26 while (count >= 8) { 27 uint8x8x4_t vsrc; 28 uint16x8_t vdst; 29 30 // Load 31 vsrc = vld4_u8((uint8_t*)src); 32 33 // Convert src to 565 34 vdst = vshll_n_u8(vsrc.val[NEON_R], 8); 35 vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5); 36 vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6); 37 38 // Store 39 vst1q_u16(dst, vdst); 40 41 // Prepare next iteration 42 dst += 8; 43 src += 8; 44 count -= 8; 45 }; 46 47 // Leftovers 48 while (count > 0) { 49 SkPMColor c = *src++; 50 SkPMColorAssert(c); 51 *dst = SkPixel32ToPixel16_ToU16(c); 52 dst++; 53 count--; 54 }; 55} 56 57void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 58 const SkPMColor* SK_RESTRICT src, int count, 59 U8CPU alpha, int /*x*/, int /*y*/) { 60 SkASSERT(255 == alpha); 61 62 if (count >= 8) { 63 uint16_t* SK_RESTRICT keep_dst = 0; 64 65 asm volatile ( 66 "ands ip, %[count], #7 \n\t" 67 "vmov.u8 d31, #1<<7 \n\t" 68 "vld1.16 {q12}, [%[dst]] \n\t" 69 "vld4.8 {d0-d3}, [%[src]] \n\t" 70 // Thumb does not support the standard ARM conditional 71 // instructions but instead requires the 'it' instruction 72 // to signal conditional execution 73 "it eq \n\t" 74 "moveq ip, #8 \n\t" 75 "mov %[keep_dst], %[dst] \n\t" 76 77 "add %[src], %[src], ip, LSL#2 \n\t" 78 "add %[dst], %[dst], ip, LSL#1 \n\t" 79 "subs %[count], %[count], ip \n\t" 80 "b 9f \n\t" 81 // LOOP 82 "2: \n\t" 83 84 "vld1.16 {q12}, [%[dst]]! \n\t" 85 "vld4.8 {d0-d3}, [%[src]]! \n\t" 86 "vst1.16 {q10}, [%[keep_dst]] \n\t" 87 "sub %[keep_dst], %[dst], #8*2 \n\t" 88 "subs %[count], %[count], #8 \n\t" 89 "9: \n\t" 90 "pld [%[dst],#32] \n\t" 91 // expand 0565 q12 to 8888 {d4-d7} 92 "vmovn.u16 d4, q12 \n\t" 93 "vshr.u16 q11, q12, #5 \n\t" 94 "vshr.u16 q10, q12, #6+5 \n\t" 95 "vmovn.u16 d5, q11 \n\t" 96 "vmovn.u16 d6, q10 \n\t" 97 "vshl.u8 d4, d4, #3 \n\t" 98 "vshl.u8 d5, d5, #2 \n\t" 99 "vshl.u8 d6, d6, #3 \n\t" 100 101 "vmovl.u8 q14, d31 \n\t" 102 "vmovl.u8 q13, d31 \n\t" 103 "vmovl.u8 q12, d31 \n\t" 104 105 // duplicate in 4/2/1 & 8pix vsns 106 "vmvn.8 d30, d3 \n\t" 107 "vmlal.u8 q14, d30, d6 \n\t" 108 "vmlal.u8 q13, d30, d5 \n\t" 109 "vmlal.u8 q12, d30, d4 \n\t" 110 "vshr.u16 q8, q14, #5 \n\t" 111 "vshr.u16 q9, q13, #6 \n\t" 112 "vaddhn.u16 d6, q14, q8 \n\t" 113 "vshr.u16 q8, q12, #5 \n\t" 114 "vaddhn.u16 d5, q13, q9 \n\t" 115 "vqadd.u8 d6, d6, d0 \n\t" // moved up 116 "vaddhn.u16 d4, q12, q8 \n\t" 117 // intentionally don't calculate alpha 118 // result in d4-d6 119 120 "vqadd.u8 d5, d5, d1 \n\t" 121 "vqadd.u8 d4, d4, d2 \n\t" 122 123 // pack 8888 {d4-d6} to 0565 q10 124 "vshll.u8 q10, d6, #8 \n\t" 125 "vshll.u8 q3, d5, #8 \n\t" 126 "vshll.u8 q2, d4, #8 \n\t" 127 "vsri.u16 q10, q3, #5 \n\t" 128 "vsri.u16 q10, q2, #11 \n\t" 129 130 "bne 2b \n\t" 131 132 "1: \n\t" 133 "vst1.16 {q10}, [%[keep_dst]] \n\t" 134 : [count] "+r" (count) 135 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 136 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 137 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 138 "d30","d31" 139 ); 140 } 141 else 142 { // handle count < 8 143 uint16_t* SK_RESTRICT keep_dst = 0; 144 145 asm volatile ( 146 "vmov.u8 d31, #1<<7 \n\t" 147 "mov %[keep_dst], %[dst] \n\t" 148 149 "tst %[count], #4 \n\t" 150 "beq 14f \n\t" 151 "vld1.16 {d25}, [%[dst]]! \n\t" 152 "vld1.32 {q1}, [%[src]]! \n\t" 153 154 "14: \n\t" 155 "tst %[count], #2 \n\t" 156 "beq 12f \n\t" 157 "vld1.32 {d24[1]}, [%[dst]]! \n\t" 158 "vld1.32 {d1}, [%[src]]! \n\t" 159 160 "12: \n\t" 161 "tst %[count], #1 \n\t" 162 "beq 11f \n\t" 163 "vld1.16 {d24[1]}, [%[dst]]! \n\t" 164 "vld1.32 {d0[1]}, [%[src]]! \n\t" 165 166 "11: \n\t" 167 // unzips achieve the same as a vld4 operation 168 "vuzpq.u16 q0, q1 \n\t" 169 "vuzp.u8 d0, d1 \n\t" 170 "vuzp.u8 d2, d3 \n\t" 171 // expand 0565 q12 to 8888 {d4-d7} 172 "vmovn.u16 d4, q12 \n\t" 173 "vshr.u16 q11, q12, #5 \n\t" 174 "vshr.u16 q10, q12, #6+5 \n\t" 175 "vmovn.u16 d5, q11 \n\t" 176 "vmovn.u16 d6, q10 \n\t" 177 "vshl.u8 d4, d4, #3 \n\t" 178 "vshl.u8 d5, d5, #2 \n\t" 179 "vshl.u8 d6, d6, #3 \n\t" 180 181 "vmovl.u8 q14, d31 \n\t" 182 "vmovl.u8 q13, d31 \n\t" 183 "vmovl.u8 q12, d31 \n\t" 184 185 // duplicate in 4/2/1 & 8pix vsns 186 "vmvn.8 d30, d3 \n\t" 187 "vmlal.u8 q14, d30, d6 \n\t" 188 "vmlal.u8 q13, d30, d5 \n\t" 189 "vmlal.u8 q12, d30, d4 \n\t" 190 "vshr.u16 q8, q14, #5 \n\t" 191 "vshr.u16 q9, q13, #6 \n\t" 192 "vaddhn.u16 d6, q14, q8 \n\t" 193 "vshr.u16 q8, q12, #5 \n\t" 194 "vaddhn.u16 d5, q13, q9 \n\t" 195 "vqadd.u8 d6, d6, d0 \n\t" // moved up 196 "vaddhn.u16 d4, q12, q8 \n\t" 197 // intentionally don't calculate alpha 198 // result in d4-d6 199 200 "vqadd.u8 d5, d5, d1 \n\t" 201 "vqadd.u8 d4, d4, d2 \n\t" 202 203 // pack 8888 {d4-d6} to 0565 q10 204 "vshll.u8 q10, d6, #8 \n\t" 205 "vshll.u8 q3, d5, #8 \n\t" 206 "vshll.u8 q2, d4, #8 \n\t" 207 "vsri.u16 q10, q3, #5 \n\t" 208 "vsri.u16 q10, q2, #11 \n\t" 209 210 // store 211 "tst %[count], #4 \n\t" 212 "beq 24f \n\t" 213 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 214 215 "24: \n\t" 216 "tst %[count], #2 \n\t" 217 "beq 22f \n\t" 218 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 219 220 "22: \n\t" 221 "tst %[count], #1 \n\t" 222 "beq 21f \n\t" 223 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 224 225 "21: \n\t" 226 : [count] "+r" (count) 227 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 228 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 229 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 230 "d30","d31" 231 ); 232 } 233} 234 235void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 236 const SkPMColor* SK_RESTRICT src, int count, 237 U8CPU alpha, int /*x*/, int /*y*/) { 238 239 U8CPU alpha_for_asm = alpha; 240 241 asm volatile ( 242 /* This code implements a Neon version of S32A_D565_Blend. The output differs from 243 * the original in two respects: 244 * 1. The results have a few mismatches compared to the original code. These mismatches 245 * never exceed 1. It's possible to improve accuracy vs. a floating point 246 * implementation by introducing rounding right shifts (vrshr) for the final stage. 247 * Rounding is not present in the code below, because although results would be closer 248 * to a floating point implementation, the number of mismatches compared to the 249 * original code would be far greater. 250 * 2. On certain inputs, the original code can overflow, causing colour channels to 251 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel 252 * to affect another. 253 */ 254 255#if 1 256 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 257 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 258#else 259 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 260#endif 261 "vmov.u16 q3, #255 \n\t" // set up constant 262 "movs r4, %[count], lsr #3 \n\t" // calc. count>>3 263 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon 264 "beq 2f \n\t" // if count8 == 0, exit 265 "vmov.u16 q15, #0x1f \n\t" // set up blue mask 266 267 "1: \n\t" 268 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels 269 "subs r4, r4, #1 \n\t" // decrement loop counter 270 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels 271 // and deinterleave 272 273 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes 274 "vand q10, q0, q15 \n\t" // extract blue 275 "vshr.u16 q8, q0, #11 \n\t" // extract red 276 "vshr.u16 q9, q9, #10 \n\t" // extract green 277 // dstrgb = {q8, q9, q10} 278 279 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range 280 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range 281 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range 282 283 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits 284 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits 285 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits 286 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits 287 // srcrgba = {q11, q12, q13, q14} 288 289 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale 290 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale 291 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale 292 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale 293 294 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 295 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) 296 // dst_scale = q2 297 298 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale 299 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale 300 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale 301 302#if 1 303 // trying for a better match with SkDiv255Round(a) 304 // C alg is: a+=128; (a+a>>8)>>8 305 // we'll use just a rounding shift [q2 is available for scratch] 306 "vrshr.u16 q11, q11, #8 \n\t" // shift down red 307 "vrshr.u16 q12, q12, #8 \n\t" // shift down green 308 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue 309#else 310 // arm's original "truncating divide by 256" 311 "vshr.u16 q11, q11, #8 \n\t" // shift down red 312 "vshr.u16 q12, q12, #8 \n\t" // shift down green 313 "vshr.u16 q13, q13, #8 \n\t" // shift down blue 314#endif 315 316 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue 317 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue 318 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr 319 320 "bne 1b \n\t" // if counter != 0, loop 321 "2: \n\t" // exit 322 323 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) 324 : 325 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 326 ); 327 328 count &= 7; 329 if (count > 0) { 330 do { 331 SkPMColor sc = *src++; 332 if (sc) { 333 uint16_t dc = *dst; 334 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 335 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 336 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 337 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 338 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 339 } 340 dst += 1; 341 } while (--count != 0); 342 } 343} 344 345/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 346 * each dither value is spaced out into byte lanes, and repeated 347 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 348 * start of each row. 349 */ 350static const uint8_t gDitherMatrix_Neon[48] = { 351 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 352 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 353 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 354 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 355 356}; 357 358void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, 359 int count, U8CPU alpha, int x, int y) 360{ 361 362 SkASSERT(255 > alpha); 363 364 // rescale alpha to range 1 - 256 365 int scale = SkAlpha255To256(alpha); 366 367 if (count >= 8) { 368 /* select row and offset for dither array */ 369 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 370 371 uint8x8_t vdither = vld1_u8(dstart); // load dither values 372 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values 373 374 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg 375 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask 376 377 do { 378 379 uint8x8_t vsrc_r, vsrc_g, vsrc_b; 380 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; 381 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; 382 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; 383 uint16x8_t vdst; 384 uint16x8_t vdst_r, vdst_g, vdst_b; 385 int16x8_t vres_r, vres_g, vres_b; 386 int8x8_t vres8_r, vres8_g, vres8_b; 387 388 // Load source and add dither 389 { 390 register uint8x8_t d0 asm("d0"); 391 register uint8x8_t d1 asm("d1"); 392 register uint8x8_t d2 asm("d2"); 393 register uint8x8_t d3 asm("d3"); 394 395 asm ( 396 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 397 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 398 : 399 ); 400 vsrc_g = d1; 401#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 402 vsrc_r = d2; vsrc_b = d0; 403#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 404 vsrc_r = d0; vsrc_b = d2; 405#endif 406 } 407 408 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 409 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 410 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5 411 412 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen 413 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen 414 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen 415 416 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result 417 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result 418 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result 419 420 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3); 421 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2); 422 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3); 423 424 // Load dst and unpack 425 vdst = vld1q_u16(dst); 426 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green 427 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red 428 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue 429 430 // subtract dst from src and widen 431 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r)); 432 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g)); 433 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b)); 434 435 // multiply diffs by scale and shift 436 vres_r = vmulq_s16(vres_r, vscale); 437 vres_g = vmulq_s16(vres_g, vscale); 438 vres_b = vmulq_s16(vres_b, vscale); 439 440 vres8_r = vshrn_n_s16(vres_r, 8); 441 vres8_g = vshrn_n_s16(vres_g, 8); 442 vres8_b = vshrn_n_s16(vres_b, 8); 443 444 // add dst to result 445 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r); 446 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g); 447 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b); 448 449 // put result into 565 format 450 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue 451 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue 452 453 // Store result 454 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b)); 455 456 // Next iteration 457 dst += 8; 458 count -= 8; 459 460 } while (count >= 8); 461 } 462 463 // Leftovers 464 if (count > 0) { 465 int scale = SkAlpha255To256(alpha); 466 DITHER_565_SCAN(y); 467 do { 468 SkPMColor c = *src++; 469 SkPMColorAssert(c); 470 471 int dither = DITHER_VALUE(x); 472 int sr = SkGetPackedR32(c); 473 int sg = SkGetPackedG32(c); 474 int sb = SkGetPackedB32(c); 475 sr = SkDITHER_R32To565(sr, dither); 476 sg = SkDITHER_G32To565(sg, dither); 477 sb = SkDITHER_B32To565(sb, dither); 478 479 uint16_t d = *dst; 480 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 481 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 482 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 483 DITHER_INC_X(x); 484 } while (--count != 0); 485 } 486} 487 488void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 489 const SkPMColor* SK_RESTRICT src, 490 int count, U8CPU alpha) { 491 492 SkASSERT(255 == alpha); 493 if (count > 0) { 494 495 496 uint8x8_t alpha_mask; 497 498 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 499 alpha_mask = vld1_u8(alpha_mask_setup); 500 501 /* do the NEON unrolled code */ 502#define UNROLL 4 503 while (count >= UNROLL) { 504 uint8x8_t src_raw, dst_raw, dst_final; 505 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 506 507 /* The two prefetches below may make the code slighlty 508 * slower for small values of count but are worth having 509 * in the general case. 510 */ 511 __builtin_prefetch(src+32); 512 __builtin_prefetch(dst+32); 513 514 /* get the source */ 515 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 516#if UNROLL > 2 517 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 518#endif 519 520 /* get and hold the dst too */ 521 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 522#if UNROLL > 2 523 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 524#endif 525 526 /* 1st and 2nd bits of the unrolling */ 527 { 528 uint8x8_t dst_cooked; 529 uint16x8_t dst_wide; 530 uint8x8_t alpha_narrow; 531 uint16x8_t alpha_wide; 532 533 /* get the alphas spread out properly */ 534 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 535 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 536 537 /* spread the dest */ 538 dst_wide = vmovl_u8(dst_raw); 539 540 /* alpha mul the dest */ 541 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 542 dst_cooked = vshrn_n_u16(dst_wide, 8); 543 544 /* sum -- ignoring any byte lane overflows */ 545 dst_final = vadd_u8(src_raw, dst_cooked); 546 } 547 548#if UNROLL > 2 549 /* the 3rd and 4th bits of our unrolling */ 550 { 551 uint8x8_t dst_cooked; 552 uint16x8_t dst_wide; 553 uint8x8_t alpha_narrow; 554 uint16x8_t alpha_wide; 555 556 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 557 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 558 559 /* spread the dest */ 560 dst_wide = vmovl_u8(dst_raw_2); 561 562 /* alpha mul the dest */ 563 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 564 dst_cooked = vshrn_n_u16(dst_wide, 8); 565 566 /* sum -- ignoring any byte lane overflows */ 567 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 568 } 569#endif 570 571 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 572#if UNROLL > 2 573 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 574#endif 575 576 src += UNROLL; 577 dst += UNROLL; 578 count -= UNROLL; 579 } 580#undef UNROLL 581 582 /* do any residual iterations */ 583 while (--count >= 0) { 584 *dst = SkPMSrcOver(*src, *dst); 585 src += 1; 586 dst += 1; 587 } 588 } 589} 590 591void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, 592 const SkPMColor* SK_RESTRICT src, 593 int count, U8CPU alpha) { 594 SkASSERT(255 == alpha); 595 596 if (count <= 0) 597 return; 598 599 /* Use these to check if src is transparent or opaque */ 600 const unsigned int ALPHA_OPAQ = 0xFF000000; 601 const unsigned int ALPHA_TRANS = 0x00FFFFFF; 602 603#define UNROLL 4 604 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); 605 const SkPMColor* SK_RESTRICT src_temp = src; 606 607 /* set up the NEON variables */ 608 uint8x8_t alpha_mask; 609 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 610 alpha_mask = vld1_u8(alpha_mask_setup); 611 612 uint8x8_t src_raw, dst_raw, dst_final; 613 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 614 uint8x8_t dst_cooked; 615 uint16x8_t dst_wide; 616 uint8x8_t alpha_narrow; 617 uint16x8_t alpha_wide; 618 619 /* choose the first processing type */ 620 if( src >= src_end) 621 goto TAIL; 622 if(*src <= ALPHA_TRANS) 623 goto ALPHA_0; 624 if(*src >= ALPHA_OPAQ) 625 goto ALPHA_255; 626 /* fall-thru */ 627 628ALPHA_1_TO_254: 629 do { 630 631 /* get the source */ 632 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 633 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 634 635 /* get and hold the dst too */ 636 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 637 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 638 639 640 /* get the alphas spread out properly */ 641 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 642 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 643 /* we collapsed (255-a)+1 ... */ 644 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 645 646 /* spread the dest */ 647 dst_wide = vmovl_u8(dst_raw); 648 649 /* alpha mul the dest */ 650 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 651 dst_cooked = vshrn_n_u16(dst_wide, 8); 652 653 /* sum -- ignoring any byte lane overflows */ 654 dst_final = vadd_u8(src_raw, dst_cooked); 655 656 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 657 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 658 /* we collapsed (255-a)+1 ... */ 659 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 660 661 /* spread the dest */ 662 dst_wide = vmovl_u8(dst_raw_2); 663 664 /* alpha mul the dest */ 665 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 666 dst_cooked = vshrn_n_u16(dst_wide, 8); 667 668 /* sum -- ignoring any byte lane overflows */ 669 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 670 671 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 672 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 673 674 src += UNROLL; 675 dst += UNROLL; 676 677 /* if 2 of the next pixels aren't between 1 and 254 678 it might make sense to go to the optimized loops */ 679 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) 680 break; 681 682 } while(src < src_end); 683 684 if (src >= src_end) 685 goto TAIL; 686 687 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) 688 goto ALPHA_255; 689 690 /*fall-thru*/ 691 692ALPHA_0: 693 694 /*In this state, we know the current alpha is 0 and 695 we optimize for the next alpha also being zero. */ 696 src_temp = src; //so we don't have to increment dst every time 697 do { 698 if(*(++src) > ALPHA_TRANS) 699 break; 700 if(*(++src) > ALPHA_TRANS) 701 break; 702 if(*(++src) > ALPHA_TRANS) 703 break; 704 if(*(++src) > ALPHA_TRANS) 705 break; 706 } while(src < src_end); 707 708 dst += (src - src_temp); 709 710 /* no longer alpha 0, so determine where to go next. */ 711 if( src >= src_end) 712 goto TAIL; 713 if(*src >= ALPHA_OPAQ) 714 goto ALPHA_255; 715 else 716 goto ALPHA_1_TO_254; 717 718ALPHA_255: 719 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { 720 dst[0]=src[0]; 721 dst[1]=src[1]; 722 dst[2]=src[2]; 723 dst[3]=src[3]; 724 src+=UNROLL; 725 dst+=UNROLL; 726 if(src >= src_end) 727 goto TAIL; 728 } 729 730 //Handle remainder. 731 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 732 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 733 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } 734 } 735 } 736 737 if( src >= src_end) 738 goto TAIL; 739 if(*src <= ALPHA_TRANS) 740 goto ALPHA_0; 741 else 742 goto ALPHA_1_TO_254; 743 744TAIL: 745 /* do any residual iterations */ 746 src_end += UNROLL + 1; //goto the real end 747 while(src != src_end) { 748 if( *src != 0 ) { 749 if( *src >= ALPHA_OPAQ ) { 750 *dst = *src; 751 } 752 else { 753 *dst = SkPMSrcOver(*src, *dst); 754 } 755 } 756 src++; 757 dst++; 758 } 759 760#undef UNROLL 761 return; 762} 763 764/* Neon version of S32_Blend_BlitRow32() 765 * portable version is in src/core/SkBlitRow_D32.cpp 766 */ 767void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 768 const SkPMColor* SK_RESTRICT src, 769 int count, U8CPU alpha) { 770 SkASSERT(alpha <= 255); 771 if (count > 0) { 772 uint16_t src_scale = SkAlpha255To256(alpha); 773 uint16_t dst_scale = 256 - src_scale; 774 775 /* run them N at a time through the NEON unit */ 776 /* note that each 1 is 4 bytes, each treated exactly the same, 777 * so we can work under that guise. We *do* know that the src&dst 778 * will be 32-bit aligned quantities, so we can specify that on 779 * the load/store ops and do a neon 'reinterpret' to get us to 780 * byte-sized (pun intended) pieces that we widen/multiply/shift 781 * we're limited at 128 bits in the wide ops, which is 8x16bits 782 * or a pair of 32 bit src/dsts. 783 */ 784 /* we *could* manually unroll this loop so that we load 128 bits 785 * (as a pair of 64s) from each of src and dst, processing them 786 * in pieces. This might give us a little better management of 787 * the memory latency, but my initial attempts here did not 788 * produce an instruction stream that looked all that nice. 789 */ 790#define UNROLL 2 791 while (count >= UNROLL) { 792 uint8x8_t src_raw, dst_raw, dst_final; 793 uint16x8_t src_wide, dst_wide; 794 795 /* get 64 bits of src, widen it, multiply by src_scale */ 796 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 797 src_wide = vmovl_u8(src_raw); 798 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ 799 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); 800 801 /* ditto with dst */ 802 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 803 dst_wide = vmovl_u8(dst_raw); 804 805 /* combine add with dst multiply into mul-accumulate */ 806 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); 807 808 dst_final = vshrn_n_u16(dst_wide, 8); 809 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 810 811 src += UNROLL; 812 dst += UNROLL; 813 count -= UNROLL; 814 } 815 /* RBE: well, i don't like how gcc manages src/dst across the above 816 * loop it's constantly calculating src+bias, dst+bias and it only 817 * adjusts the real ones when we leave the loop. Not sure why 818 * it's "hoisting down" (hoisting implies above in my lexicon ;)) 819 * the adjustments to src/dst/count, but it does... 820 * (might be SSA-style internal logic... 821 */ 822 823#if UNROLL == 2 824 if (count == 1) { 825 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 826 } 827#else 828 if (count > 0) { 829 do { 830 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 831 src += 1; 832 dst += 1; 833 } while (--count > 0); 834 } 835#endif 836 837#undef UNROLL 838 } 839} 840 841void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 842 const SkPMColor* SK_RESTRICT src, 843 int count, U8CPU alpha) { 844 845 SkASSERT(255 >= alpha); 846 847 if (count <= 0) { 848 return; 849 } 850 851 unsigned alpha256 = SkAlpha255To256(alpha); 852 853 // First deal with odd counts 854 if (count & 1) { 855 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 856 uint16x8_t vdst_wide, vsrc_wide; 857 unsigned dst_scale; 858 859 // Load 860 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 861 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 862 863 // Calc dst_scale 864 dst_scale = vget_lane_u8(vsrc, 3); 865 dst_scale *= alpha256; 866 dst_scale >>= 8; 867 dst_scale = 256 - dst_scale; 868 869 // Process src 870 vsrc_wide = vmovl_u8(vsrc); 871 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); 872 873 // Process dst 874 vdst_wide = vmovl_u8(vdst); 875 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); 876 877 // Combine 878 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 879 880 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 881 dst++; 882 src++; 883 count--; 884 } 885 886 if (count) { 887 uint8x8_t alpha_mask; 888 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 889 alpha_mask = vld1_u8(alpha_mask_setup); 890 891 do { 892 893 uint8x8_t vsrc, vdst, vres, vsrc_alphas; 894 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; 895 896 __builtin_prefetch(src+32); 897 __builtin_prefetch(dst+32); 898 899 // Load 900 vsrc = vreinterpret_u8_u32(vld1_u32(src)); 901 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 902 903 // Prepare src_scale 904 vsrc_scale = vdupq_n_u16(alpha256); 905 906 // Calc dst_scale 907 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); 908 vdst_scale = vmovl_u8(vsrc_alphas); 909 vdst_scale *= vsrc_scale; 910 vdst_scale = vshrq_n_u16(vdst_scale, 8); 911 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); 912 913 // Process src 914 vsrc_wide = vmovl_u8(vsrc); 915 vsrc_wide *= vsrc_scale; 916 917 // Process dst 918 vdst_wide = vmovl_u8(vdst); 919 vdst_wide *= vdst_scale; 920 921 // Combine 922 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 923 924 vst1_u32(dst, vreinterpret_u32_u8(vres)); 925 926 src += 2; 927 dst += 2; 928 count -= 2; 929 } while(count); 930 } 931} 932 933/////////////////////////////////////////////////////////////////////////////// 934 935#undef DEBUG_OPAQUE_DITHER 936 937#if defined(DEBUG_OPAQUE_DITHER) 938static void showme8(char *str, void *p, int len) 939{ 940 static char buf[256]; 941 char tbuf[32]; 942 int i; 943 char *pc = (char*) p; 944 sprintf(buf,"%8s:", str); 945 for(i=0;i<len;i++) { 946 sprintf(tbuf, " %02x", pc[i]); 947 strcat(buf, tbuf); 948 } 949 SkDebugf("%s\n", buf); 950} 951static void showme16(char *str, void *p, int len) 952{ 953 static char buf[256]; 954 char tbuf[32]; 955 int i; 956 uint16_t *pc = (uint16_t*) p; 957 sprintf(buf,"%8s:", str); 958 len = (len / sizeof(uint16_t)); /* passed as bytes */ 959 for(i=0;i<len;i++) { 960 sprintf(tbuf, " %04x", pc[i]); 961 strcat(buf, tbuf); 962 } 963 SkDebugf("%s\n", buf); 964} 965#endif 966 967void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 968 const SkPMColor* SK_RESTRICT src, 969 int count, U8CPU alpha, int x, int y) { 970 SkASSERT(255 == alpha); 971 972#define UNROLL 8 973 974 if (count >= UNROLL) { 975 uint8x8_t dbase; 976 977#if defined(DEBUG_OPAQUE_DITHER) 978 uint16_t tmpbuf[UNROLL]; 979 int td[UNROLL]; 980 int tdv[UNROLL]; 981 int ta[UNROLL]; 982 int tap[UNROLL]; 983 uint16_t in_dst[UNROLL]; 984 int offset = 0; 985 int noisy = 0; 986#endif 987 988 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 989 dbase = vld1_u8(dstart); 990 991 do { 992 uint8x8_t sr, sg, sb, sa, d; 993 uint16x8_t dst8, scale8, alpha8; 994 uint16x8_t dst_r, dst_g, dst_b; 995 996#if defined(DEBUG_OPAQUE_DITHER) 997 /* calculate 8 elements worth into a temp buffer */ 998 { 999 int my_y = y; 1000 int my_x = x; 1001 SkPMColor* my_src = (SkPMColor*)src; 1002 uint16_t* my_dst = dst; 1003 int i; 1004 1005 DITHER_565_SCAN(my_y); 1006 for(i=0;i<UNROLL;i++) { 1007 SkPMColor c = *my_src++; 1008 SkPMColorAssert(c); 1009 if (c) { 1010 unsigned a = SkGetPackedA32(c); 1011 1012 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 1013 tdv[i] = DITHER_VALUE(my_x); 1014 ta[i] = a; 1015 tap[i] = SkAlpha255To256(a); 1016 td[i] = d; 1017 1018 unsigned sr = SkGetPackedR32(c); 1019 unsigned sg = SkGetPackedG32(c); 1020 unsigned sb = SkGetPackedB32(c); 1021 sr = SkDITHER_R32_FOR_565(sr, d); 1022 sg = SkDITHER_G32_FOR_565(sg, d); 1023 sb = SkDITHER_B32_FOR_565(sb, d); 1024 1025 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1026 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 1027 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1028 // now src and dst expanded are in g:11 r:10 x:1 b:10 1029 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1030 td[i] = d; 1031 1032 } else { 1033 tmpbuf[i] = *my_dst; 1034 ta[i] = tdv[i] = td[i] = 0xbeef; 1035 } 1036 in_dst[i] = *my_dst; 1037 my_dst += 1; 1038 DITHER_INC_X(my_x); 1039 } 1040 } 1041#endif 1042 1043 /* source is in ABGR */ 1044 { 1045 register uint8x8_t d0 asm("d0"); 1046 register uint8x8_t d1 asm("d1"); 1047 register uint8x8_t d2 asm("d2"); 1048 register uint8x8_t d3 asm("d3"); 1049 1050 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1051 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 1052 : "r" (src) 1053 ); 1054 sr = d0; sg = d1; sb = d2; sa = d3; 1055 } 1056 1057 /* calculate 'd', which will be 0..7 */ 1058 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ 1059#if defined(SK_BUILD_FOR_ANDROID) 1060 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1061 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 1062#else 1063 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); 1064#endif 1065 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); 1066 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ 1067 1068 /* sr = sr - (sr>>5) + d */ 1069 /* watching for 8-bit overflow. d is 0..7; risky range of 1070 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 1071 * safe as long as we do ((sr-sr>>5) + d) */ 1072 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1073 sr = vadd_u8(sr, d); 1074 1075 /* sb = sb - (sb>>5) + d */ 1076 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1077 sb = vadd_u8(sb, d); 1078 1079 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 1080 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1081 sg = vadd_u8(sg, vshr_n_u8(d,1)); 1082 1083 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 1084 dst8 = vld1q_u16(dst); 1085 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 1086 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 1087 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 1088 1089 /* blend */ 1090#if 1 1091 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1092 /* originally 255-sa + 1 */ 1093 scale8 = vsubw_u8(vdupq_n_u16(256), sa); 1094#else 1095 scale8 = vsubw_u8(vdupq_n_u16(255), sa); 1096 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); 1097#endif 1098 1099#if 1 1100 /* combine the addq and mul, save 3 insns */ 1101 scale8 = vshrq_n_u16(scale8, 3); 1102 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 1103 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 1104 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 1105#else 1106 /* known correct, but +3 insns over above */ 1107 scale8 = vshrq_n_u16(scale8, 3); 1108 dst_b = vmulq_u16(dst_b, scale8); 1109 dst_g = vmulq_u16(dst_g, scale8); 1110 dst_r = vmulq_u16(dst_r, scale8); 1111 1112 /* combine */ 1113 /* NB: vshll widens, need to preserve those bits */ 1114 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); 1115 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); 1116 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); 1117#endif 1118 1119 /* repack to store */ 1120 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); 1121 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 1122 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 1123 1124 vst1q_u16(dst, dst8); 1125 1126#if defined(DEBUG_OPAQUE_DITHER) 1127 /* verify my 8 elements match the temp buffer */ 1128 { 1129 int i, bad=0; 1130 static int invocation; 1131 1132 for (i=0;i<UNROLL;i++) 1133 if (tmpbuf[i] != dst[i]) bad=1; 1134 if (bad) { 1135 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 1136 invocation, offset); 1137 SkDebugf(" alpha 0x%x\n", alpha); 1138 for (i=0;i<UNROLL;i++) 1139 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 1140 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 1141 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 1142 1143 showme16("alpha8", &alpha8, sizeof(alpha8)); 1144 showme16("scale8", &scale8, sizeof(scale8)); 1145 showme8("d", &d, sizeof(d)); 1146 showme16("dst8", &dst8, sizeof(dst8)); 1147 showme16("dst_b", &dst_b, sizeof(dst_b)); 1148 showme16("dst_g", &dst_g, sizeof(dst_g)); 1149 showme16("dst_r", &dst_r, sizeof(dst_r)); 1150 showme8("sb", &sb, sizeof(sb)); 1151 showme8("sg", &sg, sizeof(sg)); 1152 showme8("sr", &sr, sizeof(sr)); 1153 1154 /* cop out */ 1155 return; 1156 } 1157 offset += UNROLL; 1158 invocation++; 1159 } 1160#endif 1161 1162 dst += UNROLL; 1163 src += UNROLL; 1164 count -= UNROLL; 1165 /* skip x += UNROLL, since it's unchanged mod-4 */ 1166 } while (count >= UNROLL); 1167 } 1168#undef UNROLL 1169 1170 /* residuals */ 1171 if (count > 0) { 1172 DITHER_565_SCAN(y); 1173 do { 1174 SkPMColor c = *src++; 1175 SkPMColorAssert(c); 1176 if (c) { 1177 unsigned a = SkGetPackedA32(c); 1178 1179 // dither and alpha are just temporary variables to work-around 1180 // an ICE in debug. 1181 unsigned dither = DITHER_VALUE(x); 1182 unsigned alpha = SkAlpha255To256(a); 1183 int d = SkAlphaMul(dither, alpha); 1184 1185 unsigned sr = SkGetPackedR32(c); 1186 unsigned sg = SkGetPackedG32(c); 1187 unsigned sb = SkGetPackedB32(c); 1188 sr = SkDITHER_R32_FOR_565(sr, d); 1189 sg = SkDITHER_G32_FOR_565(sg, d); 1190 sb = SkDITHER_B32_FOR_565(sb, d); 1191 1192 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1193 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1194 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1195 // now src and dst expanded are in g:11 r:10 x:1 b:10 1196 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1197 } 1198 dst += 1; 1199 DITHER_INC_X(x); 1200 } while (--count != 0); 1201 } 1202} 1203 1204/////////////////////////////////////////////////////////////////////////////// 1205 1206#undef DEBUG_S32_OPAQUE_DITHER 1207 1208void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 1209 const SkPMColor* SK_RESTRICT src, 1210 int count, U8CPU alpha, int x, int y) { 1211 SkASSERT(255 == alpha); 1212 1213#define UNROLL 8 1214 if (count >= UNROLL) { 1215 uint8x8_t d; 1216 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1217 d = vld1_u8(dstart); 1218 1219 while (count >= UNROLL) { 1220 uint8x8_t sr, sg, sb; 1221 uint16x8_t dr, dg, db; 1222 uint16x8_t dst8; 1223 1224 { 1225 register uint8x8_t d0 asm("d0"); 1226 register uint8x8_t d1 asm("d1"); 1227 register uint8x8_t d2 asm("d2"); 1228 register uint8x8_t d3 asm("d3"); 1229 1230 asm ( 1231 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1232 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 1233 : 1234 ); 1235 sg = d1; 1236#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1237 sr = d2; sb = d0; 1238#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1239 sr = d0; sb = d2; 1240#endif 1241 } 1242 /* XXX: if we want to prefetch, hide it in the above asm() 1243 * using the gcc __builtin_prefetch(), the prefetch will 1244 * fall to the bottom of the loop -- it won't stick up 1245 * at the top of the loop, just after the vld4. 1246 */ 1247 1248 // sr = sr - (sr>>5) + d 1249 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1250 dr = vaddl_u8(sr, d); 1251 1252 // sb = sb - (sb>>5) + d 1253 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1254 db = vaddl_u8(sb, d); 1255 1256 // sg = sg - (sg>>6) + d>>1; similar logic for overflows 1257 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1258 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); 1259 1260 // pack high bits of each into 565 format (rgb, b is lsb) 1261 dst8 = vshrq_n_u16(db, 3); 1262 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 1263 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); 1264 1265 // store it 1266 vst1q_u16(dst, dst8); 1267 1268#if defined(DEBUG_S32_OPAQUE_DITHER) 1269 // always good to know if we generated good results 1270 { 1271 int i, myx = x, myy = y; 1272 DITHER_565_SCAN(myy); 1273 for (i=0;i<UNROLL;i++) { 1274 // the '!' in the asm block above post-incremented src by the 8 pixels it reads. 1275 SkPMColor c = src[i-8]; 1276 unsigned dither = DITHER_VALUE(myx); 1277 uint16_t val = SkDitherRGB32To565(c, dither); 1278 if (val != dst[i]) { 1279 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n", 1280 c, dither, val, dst[i], dstart[i]); 1281 } 1282 DITHER_INC_X(myx); 1283 } 1284 } 1285#endif 1286 1287 dst += UNROLL; 1288 // we don't need to increment src as the asm above has already done it 1289 count -= UNROLL; 1290 x += UNROLL; // probably superfluous 1291 } 1292 } 1293#undef UNROLL 1294 1295 // residuals 1296 if (count > 0) { 1297 DITHER_565_SCAN(y); 1298 do { 1299 SkPMColor c = *src++; 1300 SkPMColorAssert(c); 1301 SkASSERT(SkGetPackedA32(c) == 255); 1302 1303 unsigned dither = DITHER_VALUE(x); 1304 *dst++ = SkDitherRGB32To565(c, dither); 1305 DITHER_INC_X(x); 1306 } while (--count != 0); 1307 } 1308} 1309 1310void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, 1311 SkPMColor color) { 1312 if (count <= 0) { 1313 return; 1314 } 1315 1316 if (0 == color) { 1317 if (src != dst) { 1318 memcpy(dst, src, count * sizeof(SkPMColor)); 1319 } 1320 return; 1321 } 1322 1323 unsigned colorA = SkGetPackedA32(color); 1324 if (255 == colorA) { 1325 sk_memset32(dst, color, count); 1326 } else { 1327 unsigned scale = 256 - SkAlpha255To256(colorA); 1328 1329 if (count >= 8) { 1330 // at the end of this assembly, count will have been decremented 1331 // to a negative value. That is, if count mod 8 = x, it will be 1332 // -8 +x coming out. 1333 asm volatile ( 1334 PLD128(src, 0) 1335 1336 "vdup.32 q0, %[color] \n\t" 1337 1338 PLD128(src, 128) 1339 1340 // scale numerical interval [0-255], so load as 8 bits 1341 "vdup.8 d2, %[scale] \n\t" 1342 1343 PLD128(src, 256) 1344 1345 "subs %[count], %[count], #8 \n\t" 1346 1347 PLD128(src, 384) 1348 1349 "Loop_Color32: \n\t" 1350 1351 // load src color, 8 pixels, 4 64 bit registers 1352 // (and increment src). 1353 "vld1.32 {d4-d7}, [%[src]]! \n\t" 1354 1355 PLD128(src, 384) 1356 1357 // multiply long by scale, 64 bits at a time, 1358 // destination into a 128 bit register. 1359 "vmull.u8 q4, d4, d2 \n\t" 1360 "vmull.u8 q5, d5, d2 \n\t" 1361 "vmull.u8 q6, d6, d2 \n\t" 1362 "vmull.u8 q7, d7, d2 \n\t" 1363 1364 // shift the 128 bit registers, containing the 16 1365 // bit scaled values back to 8 bits, narrowing the 1366 // results to 64 bit registers. 1367 "vshrn.i16 d8, q4, #8 \n\t" 1368 "vshrn.i16 d9, q5, #8 \n\t" 1369 "vshrn.i16 d10, q6, #8 \n\t" 1370 "vshrn.i16 d11, q7, #8 \n\t" 1371 1372 // adding back the color, using 128 bit registers. 1373 "vadd.i8 q6, q4, q0 \n\t" 1374 "vadd.i8 q7, q5, q0 \n\t" 1375 1376 // store back the 8 calculated pixels (2 128 bit 1377 // registers), and increment dst. 1378 "vst1.32 {d12-d15}, [%[dst]]! \n\t" 1379 1380 "subs %[count], %[count], #8 \n\t" 1381 "bge Loop_Color32 \n\t" 1382 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 1383 : [color] "r" (color), [scale] "r" (scale) 1384 : "cc", "memory", 1385 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1386 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" 1387 ); 1388 // At this point, if we went through the inline assembly, count is 1389 // a negative value: 1390 // if the value is -8, there is no pixel left to process. 1391 // if the value is -7, there is one pixel left to process 1392 // ... 1393 // And'ing it with 7 will give us the number of pixels 1394 // left to process. 1395 count = count & 0x7; 1396 } 1397 1398 while (count > 0) { 1399 *dst = color + SkAlphaMulQ(*src, scale); 1400 src += 1; 1401 dst += 1; 1402 count--; 1403 } 1404 } 1405} 1406 1407/////////////////////////////////////////////////////////////////////////////// 1408 1409const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1410 // no dither 1411 // NOTE: For the S32_D565_Blend function below, we don't have a special 1412 // version that assumes that each source pixel is opaque. But our 1413 // S32A is still faster than the default, so use it. 1414 S32_D565_Opaque_neon, 1415 S32A_D565_Blend_neon, // really S32_D565_Blend 1416 S32A_D565_Opaque_neon, 1417 S32A_D565_Blend_neon, 1418 1419 // dither 1420 S32_D565_Opaque_Dither_neon, 1421 S32_D565_Blend_Dither_neon, 1422 S32A_D565_Opaque_Dither_neon, 1423 NULL, // S32A_D565_Blend_Dither 1424}; 1425 1426const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1427 NULL, // S32_Opaque, 1428 S32_Blend_BlitRow32_neon, // S32_Blend, 1429 /* 1430 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1431 * value and attempts to optimize accordingly. The optimization is 1432 * sensitive to the source content and is not a win in all cases. For 1433 * example, if there are a lot of transitions between the alpha states, 1434 * the performance will almost certainly be worse. However, for many 1435 * common cases the performance is equivalent or better than the standard 1436 * case where we do not inspect the src alpha. 1437 */ 1438#if SK_A32_SHIFT == 24 1439 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1440 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1441#else 1442 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1443#endif 1444 S32A_Blend_BlitRow32_neon // S32A_Blend 1445}; 1446