SkBlitRow_opts_arm_neon.cpp revision 0060159457453ca45a47828648c8f29d5695983c
1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include "SkBlitRow_opts_arm_neon.h" 9 10#include "SkBlitMask.h" 11#include "SkBlitRow.h" 12#include "SkColorPriv.h" 13#include "SkDither.h" 14#include "SkMathPriv.h" 15#include "SkUtils.h" 16 17#include "SkCachePreload_arm.h" 18#include "SkColor_opts_neon.h" 19#include <arm_neon.h> 20 21void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 22 const SkPMColor* SK_RESTRICT src, int count, 23 U8CPU alpha, int /*x*/, int /*y*/) { 24 SkASSERT(255 == alpha); 25 26 while (count >= 8) { 27 uint8x8x4_t vsrc; 28 uint16x8_t vdst; 29 30 // Load 31 vsrc = vld4_u8((uint8_t*)src); 32 33 // Convert src to 565 34 vdst = vshll_n_u8(vsrc.val[NEON_R], 8); 35 vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5); 36 vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6); 37 38 // Store 39 vst1q_u16(dst, vdst); 40 41 // Prepare next iteration 42 dst += 8; 43 src += 8; 44 count -= 8; 45 }; 46 47 // Leftovers 48 while (count > 0) { 49 SkPMColor c = *src++; 50 SkPMColorAssert(c); 51 *dst = SkPixel32ToPixel16_ToU16(c); 52 dst++; 53 count--; 54 }; 55} 56 57void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 58 const SkPMColor* SK_RESTRICT src, int count, 59 U8CPU alpha, int /*x*/, int /*y*/) { 60 SkASSERT(255 == alpha); 61 62 if (count >= 8) { 63 uint16_t* SK_RESTRICT keep_dst = 0; 64 65 asm volatile ( 66 "ands ip, %[count], #7 \n\t" 67 "vmov.u8 d31, #1<<7 \n\t" 68 "vld1.16 {q12}, [%[dst]] \n\t" 69 "vld4.8 {d0-d3}, [%[src]] \n\t" 70 // Thumb does not support the standard ARM conditional 71 // instructions but instead requires the 'it' instruction 72 // to signal conditional execution 73 "it eq \n\t" 74 "moveq ip, #8 \n\t" 75 "mov %[keep_dst], %[dst] \n\t" 76 77 "add %[src], %[src], ip, LSL#2 \n\t" 78 "add %[dst], %[dst], ip, LSL#1 \n\t" 79 "subs %[count], %[count], ip \n\t" 80 "b 9f \n\t" 81 // LOOP 82 "2: \n\t" 83 84 "vld1.16 {q12}, [%[dst]]! \n\t" 85 "vld4.8 {d0-d3}, [%[src]]! \n\t" 86 "vst1.16 {q10}, [%[keep_dst]] \n\t" 87 "sub %[keep_dst], %[dst], #8*2 \n\t" 88 "subs %[count], %[count], #8 \n\t" 89 "9: \n\t" 90 "pld [%[dst],#32] \n\t" 91 // expand 0565 q12 to 8888 {d4-d7} 92 "vmovn.u16 d4, q12 \n\t" 93 "vshr.u16 q11, q12, #5 \n\t" 94 "vshr.u16 q10, q12, #6+5 \n\t" 95 "vmovn.u16 d5, q11 \n\t" 96 "vmovn.u16 d6, q10 \n\t" 97 "vshl.u8 d4, d4, #3 \n\t" 98 "vshl.u8 d5, d5, #2 \n\t" 99 "vshl.u8 d6, d6, #3 \n\t" 100 101 "vmovl.u8 q14, d31 \n\t" 102 "vmovl.u8 q13, d31 \n\t" 103 "vmovl.u8 q12, d31 \n\t" 104 105 // duplicate in 4/2/1 & 8pix vsns 106 "vmvn.8 d30, d3 \n\t" 107 "vmlal.u8 q14, d30, d6 \n\t" 108 "vmlal.u8 q13, d30, d5 \n\t" 109 "vmlal.u8 q12, d30, d4 \n\t" 110 "vshr.u16 q8, q14, #5 \n\t" 111 "vshr.u16 q9, q13, #6 \n\t" 112 "vaddhn.u16 d6, q14, q8 \n\t" 113 "vshr.u16 q8, q12, #5 \n\t" 114 "vaddhn.u16 d5, q13, q9 \n\t" 115 "vqadd.u8 d6, d6, d0 \n\t" // moved up 116 "vaddhn.u16 d4, q12, q8 \n\t" 117 // intentionally don't calculate alpha 118 // result in d4-d6 119 120 "vqadd.u8 d5, d5, d1 \n\t" 121 "vqadd.u8 d4, d4, d2 \n\t" 122 123 // pack 8888 {d4-d6} to 0565 q10 124 "vshll.u8 q10, d6, #8 \n\t" 125 "vshll.u8 q3, d5, #8 \n\t" 126 "vshll.u8 q2, d4, #8 \n\t" 127 "vsri.u16 q10, q3, #5 \n\t" 128 "vsri.u16 q10, q2, #11 \n\t" 129 130 "bne 2b \n\t" 131 132 "1: \n\t" 133 "vst1.16 {q10}, [%[keep_dst]] \n\t" 134 : [count] "+r" (count) 135 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 136 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 137 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 138 "d30","d31" 139 ); 140 } 141 else 142 { // handle count < 8 143 uint16_t* SK_RESTRICT keep_dst = 0; 144 145 asm volatile ( 146 "vmov.u8 d31, #1<<7 \n\t" 147 "mov %[keep_dst], %[dst] \n\t" 148 149 "tst %[count], #4 \n\t" 150 "beq 14f \n\t" 151 "vld1.16 {d25}, [%[dst]]! \n\t" 152 "vld1.32 {q1}, [%[src]]! \n\t" 153 154 "14: \n\t" 155 "tst %[count], #2 \n\t" 156 "beq 12f \n\t" 157 "vld1.32 {d24[1]}, [%[dst]]! \n\t" 158 "vld1.32 {d1}, [%[src]]! \n\t" 159 160 "12: \n\t" 161 "tst %[count], #1 \n\t" 162 "beq 11f \n\t" 163 "vld1.16 {d24[1]}, [%[dst]]! \n\t" 164 "vld1.32 {d0[1]}, [%[src]]! \n\t" 165 166 "11: \n\t" 167 // unzips achieve the same as a vld4 operation 168 "vuzpq.u16 q0, q1 \n\t" 169 "vuzp.u8 d0, d1 \n\t" 170 "vuzp.u8 d2, d3 \n\t" 171 // expand 0565 q12 to 8888 {d4-d7} 172 "vmovn.u16 d4, q12 \n\t" 173 "vshr.u16 q11, q12, #5 \n\t" 174 "vshr.u16 q10, q12, #6+5 \n\t" 175 "vmovn.u16 d5, q11 \n\t" 176 "vmovn.u16 d6, q10 \n\t" 177 "vshl.u8 d4, d4, #3 \n\t" 178 "vshl.u8 d5, d5, #2 \n\t" 179 "vshl.u8 d6, d6, #3 \n\t" 180 181 "vmovl.u8 q14, d31 \n\t" 182 "vmovl.u8 q13, d31 \n\t" 183 "vmovl.u8 q12, d31 \n\t" 184 185 // duplicate in 4/2/1 & 8pix vsns 186 "vmvn.8 d30, d3 \n\t" 187 "vmlal.u8 q14, d30, d6 \n\t" 188 "vmlal.u8 q13, d30, d5 \n\t" 189 "vmlal.u8 q12, d30, d4 \n\t" 190 "vshr.u16 q8, q14, #5 \n\t" 191 "vshr.u16 q9, q13, #6 \n\t" 192 "vaddhn.u16 d6, q14, q8 \n\t" 193 "vshr.u16 q8, q12, #5 \n\t" 194 "vaddhn.u16 d5, q13, q9 \n\t" 195 "vqadd.u8 d6, d6, d0 \n\t" // moved up 196 "vaddhn.u16 d4, q12, q8 \n\t" 197 // intentionally don't calculate alpha 198 // result in d4-d6 199 200 "vqadd.u8 d5, d5, d1 \n\t" 201 "vqadd.u8 d4, d4, d2 \n\t" 202 203 // pack 8888 {d4-d6} to 0565 q10 204 "vshll.u8 q10, d6, #8 \n\t" 205 "vshll.u8 q3, d5, #8 \n\t" 206 "vshll.u8 q2, d4, #8 \n\t" 207 "vsri.u16 q10, q3, #5 \n\t" 208 "vsri.u16 q10, q2, #11 \n\t" 209 210 // store 211 "tst %[count], #4 \n\t" 212 "beq 24f \n\t" 213 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 214 215 "24: \n\t" 216 "tst %[count], #2 \n\t" 217 "beq 22f \n\t" 218 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 219 220 "22: \n\t" 221 "tst %[count], #1 \n\t" 222 "beq 21f \n\t" 223 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 224 225 "21: \n\t" 226 : [count] "+r" (count) 227 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 228 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 229 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 230 "d30","d31" 231 ); 232 } 233} 234 235void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 236 const SkPMColor* SK_RESTRICT src, int count, 237 U8CPU alpha, int /*x*/, int /*y*/) { 238 239 U8CPU alpha_for_asm = alpha; 240 241 asm volatile ( 242 /* This code implements a Neon version of S32A_D565_Blend. The output differs from 243 * the original in two respects: 244 * 1. The results have a few mismatches compared to the original code. These mismatches 245 * never exceed 1. It's possible to improve accuracy vs. a floating point 246 * implementation by introducing rounding right shifts (vrshr) for the final stage. 247 * Rounding is not present in the code below, because although results would be closer 248 * to a floating point implementation, the number of mismatches compared to the 249 * original code would be far greater. 250 * 2. On certain inputs, the original code can overflow, causing colour channels to 251 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel 252 * to affect another. 253 */ 254 255#if 1 256 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 257 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 258#else 259 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 260#endif 261 "vmov.u16 q3, #255 \n\t" // set up constant 262 "movs r4, %[count], lsr #3 \n\t" // calc. count>>3 263 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon 264 "beq 2f \n\t" // if count8 == 0, exit 265 "vmov.u16 q15, #0x1f \n\t" // set up blue mask 266 267 "1: \n\t" 268 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels 269 "subs r4, r4, #1 \n\t" // decrement loop counter 270 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels 271 // and deinterleave 272 273 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes 274 "vand q10, q0, q15 \n\t" // extract blue 275 "vshr.u16 q8, q0, #11 \n\t" // extract red 276 "vshr.u16 q9, q9, #10 \n\t" // extract green 277 // dstrgb = {q8, q9, q10} 278 279 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range 280 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range 281 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range 282 283 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits 284 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits 285 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits 286 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits 287 // srcrgba = {q11, q12, q13, q14} 288 289 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale 290 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale 291 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale 292 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale 293 294 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 295 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) 296 // dst_scale = q2 297 298 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale 299 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale 300 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale 301 302#if 1 303 // trying for a better match with SkDiv255Round(a) 304 // C alg is: a+=128; (a+a>>8)>>8 305 // we'll use just a rounding shift [q2 is available for scratch] 306 "vrshr.u16 q11, q11, #8 \n\t" // shift down red 307 "vrshr.u16 q12, q12, #8 \n\t" // shift down green 308 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue 309#else 310 // arm's original "truncating divide by 256" 311 "vshr.u16 q11, q11, #8 \n\t" // shift down red 312 "vshr.u16 q12, q12, #8 \n\t" // shift down green 313 "vshr.u16 q13, q13, #8 \n\t" // shift down blue 314#endif 315 316 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue 317 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue 318 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr 319 320 "bne 1b \n\t" // if counter != 0, loop 321 "2: \n\t" // exit 322 323 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) 324 : 325 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 326 ); 327 328 count &= 7; 329 if (count > 0) { 330 do { 331 SkPMColor sc = *src++; 332 if (sc) { 333 uint16_t dc = *dst; 334 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 335 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 336 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 337 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 338 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 339 } 340 dst += 1; 341 } while (--count != 0); 342 } 343} 344 345/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 346 * each dither value is spaced out into byte lanes, and repeated 347 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 348 * start of each row. 349 */ 350static const uint8_t gDitherMatrix_Neon[48] = { 351 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 352 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 353 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 354 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 355 356}; 357 358void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, 359 int count, U8CPU alpha, int x, int y) 360{ 361 /* select row and offset for dither array */ 362 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 363 364 /* rescale alpha to range 0 - 256 */ 365 int scale = SkAlpha255To256(alpha); 366 367 asm volatile ( 368 "vld1.8 {d31}, [%[dstart]] \n\t" // load dither values 369 "vshr.u8 d30, d31, #1 \n\t" // calc. green dither values 370 "vdup.16 d6, %[scale] \n\t" // duplicate scale into neon reg 371 "vmov.i8 d29, #0x3f \n\t" // set up green mask 372 "vmov.i8 d28, #0x1f \n\t" // set up blue mask 373 "1: \n\t" 374 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // load 8 pixels and split into argb 375 "vshr.u8 d22, d0, #5 \n\t" // calc. red >> 5 376 "vshr.u8 d23, d1, #6 \n\t" // calc. green >> 6 377 "vshr.u8 d24, d2, #5 \n\t" // calc. blue >> 5 378 "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen 379 "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen 380 "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen 381 "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result 382 "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result 383 "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result 384 "vshrn.i16 d22, q8, #3 \n\t" // shift right and narrow to 5 bits 385 "vshrn.i16 d23, q9, #2 \n\t" // shift right and narrow to 6 bits 386 "vshrn.i16 d24, q10, #3 \n\t" // shift right and narrow to 5 bits 387 // load 8 pixels from dst, extract rgb 388 "vld1.16 {d0, d1}, [%[dst]] \n\t" // load 8 pixels 389 "vshrn.i16 d17, q0, #5 \n\t" // shift green down to bottom 6 bits 390 "vmovn.i16 d18, q0 \n\t" // narrow to get blue as bytes 391 "vshr.u16 q0, q0, #11 \n\t" // shift down to extract red 392 "vand d17, d17, d29 \n\t" // and green with green mask 393 "vand d18, d18, d28 \n\t" // and blue with blue mask 394 "vmovn.i16 d16, q0 \n\t" // narrow to get red as bytes 395 // src = {d22 (r), d23 (g), d24 (b)} 396 // dst = {d16 (r), d17 (g), d18 (b)} 397 // subtract dst from src and widen 398 "vsubl.s8 q0, d22, d16 \n\t" // subtract red src from dst 399 "vsubl.s8 q1, d23, d17 \n\t" // subtract green src from dst 400 "vsubl.s8 q2, d24, d18 \n\t" // subtract blue src from dst 401 // multiply diffs by scale and shift 402 "vmul.i16 q0, q0, d6[0] \n\t" // multiply red by scale 403 "vmul.i16 q1, q1, d6[0] \n\t" // multiply blue by scale 404 "vmul.i16 q2, q2, d6[0] \n\t" // multiply green by scale 405 "subs %[count], %[count], #8 \n\t" // decrement loop counter 406 "vshrn.i16 d0, q0, #8 \n\t" // shift down red by 8 and narrow 407 "vshrn.i16 d2, q1, #8 \n\t" // shift down green by 8 and narrow 408 "vshrn.i16 d4, q2, #8 \n\t" // shift down blue by 8 and narrow 409 // add dst to result 410 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red 411 "vaddl.s8 q1, d2, d17 \n\t" // add dst to green 412 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue 413 // put result into 565 format 414 "vsli.i16 q2, q1, #5 \n\t" // shift up green and insert into blue 415 "vsli.i16 q2, q0, #11 \n\t" // shift up red and insert into blue 416 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // store result 417 "bgt 1b \n\t" // loop if count > 0 418 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 419 : [dstart] "r" (dstart), [scale] "r" (scale) 420 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31" 421 ); 422 423 DITHER_565_SCAN(y); 424 425 while((count & 7) > 0) 426 { 427 SkPMColor c = *src++; 428 429 int dither = DITHER_VALUE(x); 430 int sr = SkGetPackedR32(c); 431 int sg = SkGetPackedG32(c); 432 int sb = SkGetPackedB32(c); 433 sr = SkDITHER_R32To565(sr, dither); 434 sg = SkDITHER_G32To565(sg, dither); 435 sb = SkDITHER_B32To565(sb, dither); 436 437 uint16_t d = *dst; 438 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 439 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 440 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 441 DITHER_INC_X(x); 442 count--; 443 } 444} 445 446void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 447 const SkPMColor* SK_RESTRICT src, 448 int count, U8CPU alpha) { 449 450 SkASSERT(255 == alpha); 451 if (count > 0) { 452 453 454 uint8x8_t alpha_mask; 455 456 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 457 alpha_mask = vld1_u8(alpha_mask_setup); 458 459 /* do the NEON unrolled code */ 460#define UNROLL 4 461 while (count >= UNROLL) { 462 uint8x8_t src_raw, dst_raw, dst_final; 463 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 464 465 /* The two prefetches below may make the code slighlty 466 * slower for small values of count but are worth having 467 * in the general case. 468 */ 469 __builtin_prefetch(src+32); 470 __builtin_prefetch(dst+32); 471 472 /* get the source */ 473 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 474#if UNROLL > 2 475 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 476#endif 477 478 /* get and hold the dst too */ 479 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 480#if UNROLL > 2 481 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 482#endif 483 484 /* 1st and 2nd bits of the unrolling */ 485 { 486 uint8x8_t dst_cooked; 487 uint16x8_t dst_wide; 488 uint8x8_t alpha_narrow; 489 uint16x8_t alpha_wide; 490 491 /* get the alphas spread out properly */ 492 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 493 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 494 495 /* spread the dest */ 496 dst_wide = vmovl_u8(dst_raw); 497 498 /* alpha mul the dest */ 499 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 500 dst_cooked = vshrn_n_u16(dst_wide, 8); 501 502 /* sum -- ignoring any byte lane overflows */ 503 dst_final = vadd_u8(src_raw, dst_cooked); 504 } 505 506#if UNROLL > 2 507 /* the 3rd and 4th bits of our unrolling */ 508 { 509 uint8x8_t dst_cooked; 510 uint16x8_t dst_wide; 511 uint8x8_t alpha_narrow; 512 uint16x8_t alpha_wide; 513 514 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 515 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 516 517 /* spread the dest */ 518 dst_wide = vmovl_u8(dst_raw_2); 519 520 /* alpha mul the dest */ 521 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 522 dst_cooked = vshrn_n_u16(dst_wide, 8); 523 524 /* sum -- ignoring any byte lane overflows */ 525 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 526 } 527#endif 528 529 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 530#if UNROLL > 2 531 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 532#endif 533 534 src += UNROLL; 535 dst += UNROLL; 536 count -= UNROLL; 537 } 538#undef UNROLL 539 540 /* do any residual iterations */ 541 while (--count >= 0) { 542 *dst = SkPMSrcOver(*src, *dst); 543 src += 1; 544 dst += 1; 545 } 546 } 547} 548 549void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, 550 const SkPMColor* SK_RESTRICT src, 551 int count, U8CPU alpha) { 552 SkASSERT(255 == alpha); 553 554 if (count <= 0) 555 return; 556 557 /* Use these to check if src is transparent or opaque */ 558 const unsigned int ALPHA_OPAQ = 0xFF000000; 559 const unsigned int ALPHA_TRANS = 0x00FFFFFF; 560 561#define UNROLL 4 562 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); 563 const SkPMColor* SK_RESTRICT src_temp = src; 564 565 /* set up the NEON variables */ 566 uint8x8_t alpha_mask; 567 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 568 alpha_mask = vld1_u8(alpha_mask_setup); 569 570 uint8x8_t src_raw, dst_raw, dst_final; 571 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 572 uint8x8_t dst_cooked; 573 uint16x8_t dst_wide; 574 uint8x8_t alpha_narrow; 575 uint16x8_t alpha_wide; 576 577 /* choose the first processing type */ 578 if( src >= src_end) 579 goto TAIL; 580 if(*src <= ALPHA_TRANS) 581 goto ALPHA_0; 582 if(*src >= ALPHA_OPAQ) 583 goto ALPHA_255; 584 /* fall-thru */ 585 586ALPHA_1_TO_254: 587 do { 588 589 /* get the source */ 590 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 591 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 592 593 /* get and hold the dst too */ 594 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 595 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 596 597 598 /* get the alphas spread out properly */ 599 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 600 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 601 /* we collapsed (255-a)+1 ... */ 602 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 603 604 /* spread the dest */ 605 dst_wide = vmovl_u8(dst_raw); 606 607 /* alpha mul the dest */ 608 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 609 dst_cooked = vshrn_n_u16(dst_wide, 8); 610 611 /* sum -- ignoring any byte lane overflows */ 612 dst_final = vadd_u8(src_raw, dst_cooked); 613 614 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 615 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 616 /* we collapsed (255-a)+1 ... */ 617 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 618 619 /* spread the dest */ 620 dst_wide = vmovl_u8(dst_raw_2); 621 622 /* alpha mul the dest */ 623 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 624 dst_cooked = vshrn_n_u16(dst_wide, 8); 625 626 /* sum -- ignoring any byte lane overflows */ 627 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 628 629 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 630 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 631 632 src += UNROLL; 633 dst += UNROLL; 634 635 /* if 2 of the next pixels aren't between 1 and 254 636 it might make sense to go to the optimized loops */ 637 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) 638 break; 639 640 } while(src < src_end); 641 642 if (src >= src_end) 643 goto TAIL; 644 645 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) 646 goto ALPHA_255; 647 648 /*fall-thru*/ 649 650ALPHA_0: 651 652 /*In this state, we know the current alpha is 0 and 653 we optimize for the next alpha also being zero. */ 654 src_temp = src; //so we don't have to increment dst every time 655 do { 656 if(*(++src) > ALPHA_TRANS) 657 break; 658 if(*(++src) > ALPHA_TRANS) 659 break; 660 if(*(++src) > ALPHA_TRANS) 661 break; 662 if(*(++src) > ALPHA_TRANS) 663 break; 664 } while(src < src_end); 665 666 dst += (src - src_temp); 667 668 /* no longer alpha 0, so determine where to go next. */ 669 if( src >= src_end) 670 goto TAIL; 671 if(*src >= ALPHA_OPAQ) 672 goto ALPHA_255; 673 else 674 goto ALPHA_1_TO_254; 675 676ALPHA_255: 677 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { 678 dst[0]=src[0]; 679 dst[1]=src[1]; 680 dst[2]=src[2]; 681 dst[3]=src[3]; 682 src+=UNROLL; 683 dst+=UNROLL; 684 if(src >= src_end) 685 goto TAIL; 686 } 687 688 //Handle remainder. 689 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 690 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 691 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } 692 } 693 } 694 695 if( src >= src_end) 696 goto TAIL; 697 if(*src <= ALPHA_TRANS) 698 goto ALPHA_0; 699 else 700 goto ALPHA_1_TO_254; 701 702TAIL: 703 /* do any residual iterations */ 704 src_end += UNROLL + 1; //goto the real end 705 while(src != src_end) { 706 if( *src != 0 ) { 707 if( *src >= ALPHA_OPAQ ) { 708 *dst = *src; 709 } 710 else { 711 *dst = SkPMSrcOver(*src, *dst); 712 } 713 } 714 src++; 715 dst++; 716 } 717 718#undef UNROLL 719 return; 720} 721 722/* Neon version of S32_Blend_BlitRow32() 723 * portable version is in src/core/SkBlitRow_D32.cpp 724 */ 725void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 726 const SkPMColor* SK_RESTRICT src, 727 int count, U8CPU alpha) { 728 SkASSERT(alpha <= 255); 729 if (count > 0) { 730 uint16_t src_scale = SkAlpha255To256(alpha); 731 uint16_t dst_scale = 256 - src_scale; 732 733 /* run them N at a time through the NEON unit */ 734 /* note that each 1 is 4 bytes, each treated exactly the same, 735 * so we can work under that guise. We *do* know that the src&dst 736 * will be 32-bit aligned quantities, so we can specify that on 737 * the load/store ops and do a neon 'reinterpret' to get us to 738 * byte-sized (pun intended) pieces that we widen/multiply/shift 739 * we're limited at 128 bits in the wide ops, which is 8x16bits 740 * or a pair of 32 bit src/dsts. 741 */ 742 /* we *could* manually unroll this loop so that we load 128 bits 743 * (as a pair of 64s) from each of src and dst, processing them 744 * in pieces. This might give us a little better management of 745 * the memory latency, but my initial attempts here did not 746 * produce an instruction stream that looked all that nice. 747 */ 748#define UNROLL 2 749 while (count >= UNROLL) { 750 uint8x8_t src_raw, dst_raw, dst_final; 751 uint16x8_t src_wide, dst_wide; 752 753 /* get 64 bits of src, widen it, multiply by src_scale */ 754 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 755 src_wide = vmovl_u8(src_raw); 756 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ 757 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); 758 759 /* ditto with dst */ 760 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 761 dst_wide = vmovl_u8(dst_raw); 762 763 /* combine add with dst multiply into mul-accumulate */ 764 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); 765 766 dst_final = vshrn_n_u16(dst_wide, 8); 767 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 768 769 src += UNROLL; 770 dst += UNROLL; 771 count -= UNROLL; 772 } 773 /* RBE: well, i don't like how gcc manages src/dst across the above 774 * loop it's constantly calculating src+bias, dst+bias and it only 775 * adjusts the real ones when we leave the loop. Not sure why 776 * it's "hoisting down" (hoisting implies above in my lexicon ;)) 777 * the adjustments to src/dst/count, but it does... 778 * (might be SSA-style internal logic... 779 */ 780 781#if UNROLL == 2 782 if (count == 1) { 783 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 784 } 785#else 786 if (count > 0) { 787 do { 788 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 789 src += 1; 790 dst += 1; 791 } while (--count > 0); 792 } 793#endif 794 795#undef UNROLL 796 } 797} 798 799void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 800 const SkPMColor* SK_RESTRICT src, 801 int count, U8CPU alpha) { 802 803 SkASSERT(255 >= alpha); 804 805 if (count <= 0) { 806 return; 807 } 808 809 unsigned alpha256 = SkAlpha255To256(alpha); 810 811 // First deal with odd counts 812 if (count & 1) { 813 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 814 uint16x8_t vdst_wide, vsrc_wide; 815 unsigned dst_scale; 816 817 // Load 818 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 819 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 820 821 // Calc dst_scale 822 dst_scale = vget_lane_u8(vsrc, 3); 823 dst_scale *= alpha256; 824 dst_scale >>= 8; 825 dst_scale = 256 - dst_scale; 826 827 // Process src 828 vsrc_wide = vmovl_u8(vsrc); 829 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); 830 831 // Process dst 832 vdst_wide = vmovl_u8(vdst); 833 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); 834 835 // Combine 836 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 837 838 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 839 dst++; 840 src++; 841 count--; 842 } 843 844 if (count) { 845 uint8x8_t alpha_mask; 846 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 847 alpha_mask = vld1_u8(alpha_mask_setup); 848 849 do { 850 851 uint8x8_t vsrc, vdst, vres, vsrc_alphas; 852 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; 853 854 __builtin_prefetch(src+32); 855 __builtin_prefetch(dst+32); 856 857 // Load 858 vsrc = vreinterpret_u8_u32(vld1_u32(src)); 859 vdst = vreinterpret_u8_u32(vld1_u32(dst)); 860 861 // Prepare src_scale 862 vsrc_scale = vdupq_n_u16(alpha256); 863 864 // Calc dst_scale 865 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); 866 vdst_scale = vmovl_u8(vsrc_alphas); 867 vdst_scale *= vsrc_scale; 868 vdst_scale = vshrq_n_u16(vdst_scale, 8); 869 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); 870 871 // Process src 872 vsrc_wide = vmovl_u8(vsrc); 873 vsrc_wide *= vsrc_scale; 874 875 // Process dst 876 vdst_wide = vmovl_u8(vdst); 877 vdst_wide *= vdst_scale; 878 879 // Combine 880 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 881 882 vst1_u32(dst, vreinterpret_u32_u8(vres)); 883 884 src += 2; 885 dst += 2; 886 count -= 2; 887 } while(count); 888 } 889} 890 891/////////////////////////////////////////////////////////////////////////////// 892 893#undef DEBUG_OPAQUE_DITHER 894 895#if defined(DEBUG_OPAQUE_DITHER) 896static void showme8(char *str, void *p, int len) 897{ 898 static char buf[256]; 899 char tbuf[32]; 900 int i; 901 char *pc = (char*) p; 902 sprintf(buf,"%8s:", str); 903 for(i=0;i<len;i++) { 904 sprintf(tbuf, " %02x", pc[i]); 905 strcat(buf, tbuf); 906 } 907 SkDebugf("%s\n", buf); 908} 909static void showme16(char *str, void *p, int len) 910{ 911 static char buf[256]; 912 char tbuf[32]; 913 int i; 914 uint16_t *pc = (uint16_t*) p; 915 sprintf(buf,"%8s:", str); 916 len = (len / sizeof(uint16_t)); /* passed as bytes */ 917 for(i=0;i<len;i++) { 918 sprintf(tbuf, " %04x", pc[i]); 919 strcat(buf, tbuf); 920 } 921 SkDebugf("%s\n", buf); 922} 923#endif 924 925void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 926 const SkPMColor* SK_RESTRICT src, 927 int count, U8CPU alpha, int x, int y) { 928 SkASSERT(255 == alpha); 929 930#define UNROLL 8 931 932 if (count >= UNROLL) { 933 uint8x8_t dbase; 934 935#if defined(DEBUG_OPAQUE_DITHER) 936 uint16_t tmpbuf[UNROLL]; 937 int td[UNROLL]; 938 int tdv[UNROLL]; 939 int ta[UNROLL]; 940 int tap[UNROLL]; 941 uint16_t in_dst[UNROLL]; 942 int offset = 0; 943 int noisy = 0; 944#endif 945 946 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 947 dbase = vld1_u8(dstart); 948 949 do { 950 uint8x8_t sr, sg, sb, sa, d; 951 uint16x8_t dst8, scale8, alpha8; 952 uint16x8_t dst_r, dst_g, dst_b; 953 954#if defined(DEBUG_OPAQUE_DITHER) 955 /* calculate 8 elements worth into a temp buffer */ 956 { 957 int my_y = y; 958 int my_x = x; 959 SkPMColor* my_src = (SkPMColor*)src; 960 uint16_t* my_dst = dst; 961 int i; 962 963 DITHER_565_SCAN(my_y); 964 for(i=0;i<UNROLL;i++) { 965 SkPMColor c = *my_src++; 966 SkPMColorAssert(c); 967 if (c) { 968 unsigned a = SkGetPackedA32(c); 969 970 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 971 tdv[i] = DITHER_VALUE(my_x); 972 ta[i] = a; 973 tap[i] = SkAlpha255To256(a); 974 td[i] = d; 975 976 unsigned sr = SkGetPackedR32(c); 977 unsigned sg = SkGetPackedG32(c); 978 unsigned sb = SkGetPackedB32(c); 979 sr = SkDITHER_R32_FOR_565(sr, d); 980 sg = SkDITHER_G32_FOR_565(sg, d); 981 sb = SkDITHER_B32_FOR_565(sb, d); 982 983 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 984 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 985 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 986 // now src and dst expanded are in g:11 r:10 x:1 b:10 987 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 988 td[i] = d; 989 990 } else { 991 tmpbuf[i] = *my_dst; 992 ta[i] = tdv[i] = td[i] = 0xbeef; 993 } 994 in_dst[i] = *my_dst; 995 my_dst += 1; 996 DITHER_INC_X(my_x); 997 } 998 } 999#endif 1000 1001 /* source is in ABGR */ 1002 { 1003 register uint8x8_t d0 asm("d0"); 1004 register uint8x8_t d1 asm("d1"); 1005 register uint8x8_t d2 asm("d2"); 1006 register uint8x8_t d3 asm("d3"); 1007 1008 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1009 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 1010 : "r" (src) 1011 ); 1012 sr = d0; sg = d1; sb = d2; sa = d3; 1013 } 1014 1015 /* calculate 'd', which will be 0..7 */ 1016 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ 1017#if defined(SK_BUILD_FOR_ANDROID) 1018 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1019 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 1020#else 1021 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); 1022#endif 1023 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); 1024 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ 1025 1026 /* sr = sr - (sr>>5) + d */ 1027 /* watching for 8-bit overflow. d is 0..7; risky range of 1028 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 1029 * safe as long as we do ((sr-sr>>5) + d) */ 1030 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1031 sr = vadd_u8(sr, d); 1032 1033 /* sb = sb - (sb>>5) + d */ 1034 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1035 sb = vadd_u8(sb, d); 1036 1037 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 1038 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1039 sg = vadd_u8(sg, vshr_n_u8(d,1)); 1040 1041 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 1042 dst8 = vld1q_u16(dst); 1043 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 1044 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 1045 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 1046 1047 /* blend */ 1048#if 1 1049 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1050 /* originally 255-sa + 1 */ 1051 scale8 = vsubw_u8(vdupq_n_u16(256), sa); 1052#else 1053 scale8 = vsubw_u8(vdupq_n_u16(255), sa); 1054 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); 1055#endif 1056 1057#if 1 1058 /* combine the addq and mul, save 3 insns */ 1059 scale8 = vshrq_n_u16(scale8, 3); 1060 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 1061 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 1062 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 1063#else 1064 /* known correct, but +3 insns over above */ 1065 scale8 = vshrq_n_u16(scale8, 3); 1066 dst_b = vmulq_u16(dst_b, scale8); 1067 dst_g = vmulq_u16(dst_g, scale8); 1068 dst_r = vmulq_u16(dst_r, scale8); 1069 1070 /* combine */ 1071 /* NB: vshll widens, need to preserve those bits */ 1072 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); 1073 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); 1074 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); 1075#endif 1076 1077 /* repack to store */ 1078 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); 1079 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 1080 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 1081 1082 vst1q_u16(dst, dst8); 1083 1084#if defined(DEBUG_OPAQUE_DITHER) 1085 /* verify my 8 elements match the temp buffer */ 1086 { 1087 int i, bad=0; 1088 static int invocation; 1089 1090 for (i=0;i<UNROLL;i++) 1091 if (tmpbuf[i] != dst[i]) bad=1; 1092 if (bad) { 1093 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 1094 invocation, offset); 1095 SkDebugf(" alpha 0x%x\n", alpha); 1096 for (i=0;i<UNROLL;i++) 1097 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 1098 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 1099 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 1100 1101 showme16("alpha8", &alpha8, sizeof(alpha8)); 1102 showme16("scale8", &scale8, sizeof(scale8)); 1103 showme8("d", &d, sizeof(d)); 1104 showme16("dst8", &dst8, sizeof(dst8)); 1105 showme16("dst_b", &dst_b, sizeof(dst_b)); 1106 showme16("dst_g", &dst_g, sizeof(dst_g)); 1107 showme16("dst_r", &dst_r, sizeof(dst_r)); 1108 showme8("sb", &sb, sizeof(sb)); 1109 showme8("sg", &sg, sizeof(sg)); 1110 showme8("sr", &sr, sizeof(sr)); 1111 1112 /* cop out */ 1113 return; 1114 } 1115 offset += UNROLL; 1116 invocation++; 1117 } 1118#endif 1119 1120 dst += UNROLL; 1121 src += UNROLL; 1122 count -= UNROLL; 1123 /* skip x += UNROLL, since it's unchanged mod-4 */ 1124 } while (count >= UNROLL); 1125 } 1126#undef UNROLL 1127 1128 /* residuals */ 1129 if (count > 0) { 1130 DITHER_565_SCAN(y); 1131 do { 1132 SkPMColor c = *src++; 1133 SkPMColorAssert(c); 1134 if (c) { 1135 unsigned a = SkGetPackedA32(c); 1136 1137 // dither and alpha are just temporary variables to work-around 1138 // an ICE in debug. 1139 unsigned dither = DITHER_VALUE(x); 1140 unsigned alpha = SkAlpha255To256(a); 1141 int d = SkAlphaMul(dither, alpha); 1142 1143 unsigned sr = SkGetPackedR32(c); 1144 unsigned sg = SkGetPackedG32(c); 1145 unsigned sb = SkGetPackedB32(c); 1146 sr = SkDITHER_R32_FOR_565(sr, d); 1147 sg = SkDITHER_G32_FOR_565(sg, d); 1148 sb = SkDITHER_B32_FOR_565(sb, d); 1149 1150 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1151 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1152 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1153 // now src and dst expanded are in g:11 r:10 x:1 b:10 1154 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1155 } 1156 dst += 1; 1157 DITHER_INC_X(x); 1158 } while (--count != 0); 1159 } 1160} 1161 1162/////////////////////////////////////////////////////////////////////////////// 1163 1164#undef DEBUG_S32_OPAQUE_DITHER 1165 1166void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 1167 const SkPMColor* SK_RESTRICT src, 1168 int count, U8CPU alpha, int x, int y) { 1169 SkASSERT(255 == alpha); 1170 1171#define UNROLL 8 1172 if (count >= UNROLL) { 1173 uint8x8_t d; 1174 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1175 d = vld1_u8(dstart); 1176 1177 while (count >= UNROLL) { 1178 uint8x8_t sr, sg, sb; 1179 uint16x8_t dr, dg, db; 1180 uint16x8_t dst8; 1181 1182 { 1183 register uint8x8_t d0 asm("d0"); 1184 register uint8x8_t d1 asm("d1"); 1185 register uint8x8_t d2 asm("d2"); 1186 register uint8x8_t d3 asm("d3"); 1187 1188 asm ( 1189 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1190 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 1191 : 1192 ); 1193 sg = d1; 1194#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1195 sr = d2; sb = d0; 1196#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1197 sr = d0; sb = d2; 1198#endif 1199 } 1200 /* XXX: if we want to prefetch, hide it in the above asm() 1201 * using the gcc __builtin_prefetch(), the prefetch will 1202 * fall to the bottom of the loop -- it won't stick up 1203 * at the top of the loop, just after the vld4. 1204 */ 1205 1206 // sr = sr - (sr>>5) + d 1207 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1208 dr = vaddl_u8(sr, d); 1209 1210 // sb = sb - (sb>>5) + d 1211 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1212 db = vaddl_u8(sb, d); 1213 1214 // sg = sg - (sg>>6) + d>>1; similar logic for overflows 1215 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1216 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); 1217 1218 // pack high bits of each into 565 format (rgb, b is lsb) 1219 dst8 = vshrq_n_u16(db, 3); 1220 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 1221 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); 1222 1223 // store it 1224 vst1q_u16(dst, dst8); 1225 1226#if defined(DEBUG_S32_OPAQUE_DITHER) 1227 // always good to know if we generated good results 1228 { 1229 int i, myx = x, myy = y; 1230 DITHER_565_SCAN(myy); 1231 for (i=0;i<UNROLL;i++) { 1232 // the '!' in the asm block above post-incremented src by the 8 pixels it reads. 1233 SkPMColor c = src[i-8]; 1234 unsigned dither = DITHER_VALUE(myx); 1235 uint16_t val = SkDitherRGB32To565(c, dither); 1236 if (val != dst[i]) { 1237 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n", 1238 c, dither, val, dst[i], dstart[i]); 1239 } 1240 DITHER_INC_X(myx); 1241 } 1242 } 1243#endif 1244 1245 dst += UNROLL; 1246 // we don't need to increment src as the asm above has already done it 1247 count -= UNROLL; 1248 x += UNROLL; // probably superfluous 1249 } 1250 } 1251#undef UNROLL 1252 1253 // residuals 1254 if (count > 0) { 1255 DITHER_565_SCAN(y); 1256 do { 1257 SkPMColor c = *src++; 1258 SkPMColorAssert(c); 1259 SkASSERT(SkGetPackedA32(c) == 255); 1260 1261 unsigned dither = DITHER_VALUE(x); 1262 *dst++ = SkDitherRGB32To565(c, dither); 1263 DITHER_INC_X(x); 1264 } while (--count != 0); 1265 } 1266} 1267 1268void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, 1269 SkPMColor color) { 1270 if (count <= 0) { 1271 return; 1272 } 1273 1274 if (0 == color) { 1275 if (src != dst) { 1276 memcpy(dst, src, count * sizeof(SkPMColor)); 1277 } 1278 return; 1279 } 1280 1281 unsigned colorA = SkGetPackedA32(color); 1282 if (255 == colorA) { 1283 sk_memset32(dst, color, count); 1284 } else { 1285 unsigned scale = 256 - SkAlpha255To256(colorA); 1286 1287 if (count >= 8) { 1288 // at the end of this assembly, count will have been decremented 1289 // to a negative value. That is, if count mod 8 = x, it will be 1290 // -8 +x coming out. 1291 asm volatile ( 1292 PLD128(src, 0) 1293 1294 "vdup.32 q0, %[color] \n\t" 1295 1296 PLD128(src, 128) 1297 1298 // scale numerical interval [0-255], so load as 8 bits 1299 "vdup.8 d2, %[scale] \n\t" 1300 1301 PLD128(src, 256) 1302 1303 "subs %[count], %[count], #8 \n\t" 1304 1305 PLD128(src, 384) 1306 1307 "Loop_Color32: \n\t" 1308 1309 // load src color, 8 pixels, 4 64 bit registers 1310 // (and increment src). 1311 "vld1.32 {d4-d7}, [%[src]]! \n\t" 1312 1313 PLD128(src, 384) 1314 1315 // multiply long by scale, 64 bits at a time, 1316 // destination into a 128 bit register. 1317 "vmull.u8 q4, d4, d2 \n\t" 1318 "vmull.u8 q5, d5, d2 \n\t" 1319 "vmull.u8 q6, d6, d2 \n\t" 1320 "vmull.u8 q7, d7, d2 \n\t" 1321 1322 // shift the 128 bit registers, containing the 16 1323 // bit scaled values back to 8 bits, narrowing the 1324 // results to 64 bit registers. 1325 "vshrn.i16 d8, q4, #8 \n\t" 1326 "vshrn.i16 d9, q5, #8 \n\t" 1327 "vshrn.i16 d10, q6, #8 \n\t" 1328 "vshrn.i16 d11, q7, #8 \n\t" 1329 1330 // adding back the color, using 128 bit registers. 1331 "vadd.i8 q6, q4, q0 \n\t" 1332 "vadd.i8 q7, q5, q0 \n\t" 1333 1334 // store back the 8 calculated pixels (2 128 bit 1335 // registers), and increment dst. 1336 "vst1.32 {d12-d15}, [%[dst]]! \n\t" 1337 1338 "subs %[count], %[count], #8 \n\t" 1339 "bge Loop_Color32 \n\t" 1340 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 1341 : [color] "r" (color), [scale] "r" (scale) 1342 : "cc", "memory", 1343 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1344 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" 1345 ); 1346 // At this point, if we went through the inline assembly, count is 1347 // a negative value: 1348 // if the value is -8, there is no pixel left to process. 1349 // if the value is -7, there is one pixel left to process 1350 // ... 1351 // And'ing it with 7 will give us the number of pixels 1352 // left to process. 1353 count = count & 0x7; 1354 } 1355 1356 while (count > 0) { 1357 *dst = color + SkAlphaMulQ(*src, scale); 1358 src += 1; 1359 dst += 1; 1360 count--; 1361 } 1362 } 1363} 1364 1365/////////////////////////////////////////////////////////////////////////////// 1366 1367const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1368 // no dither 1369 // NOTE: For the S32_D565_Blend function below, we don't have a special 1370 // version that assumes that each source pixel is opaque. But our 1371 // S32A is still faster than the default, so use it. 1372 S32_D565_Opaque_neon, 1373 S32A_D565_Blend_neon, // really S32_D565_Blend 1374 S32A_D565_Opaque_neon, 1375 S32A_D565_Blend_neon, 1376 1377 // dither 1378 S32_D565_Opaque_Dither_neon, 1379 S32_D565_Blend_Dither_neon, 1380 S32A_D565_Opaque_Dither_neon, 1381 NULL, // S32A_D565_Blend_Dither 1382}; 1383 1384const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1385 NULL, // S32_Opaque, 1386 S32_Blend_BlitRow32_neon, // S32_Blend, 1387 /* 1388 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1389 * value and attempts to optimize accordingly. The optimization is 1390 * sensitive to the source content and is not a win in all cases. For 1391 * example, if there are a lot of transitions between the alpha states, 1392 * the performance will almost certainly be worse. However, for many 1393 * common cases the performance is equivalent or better than the standard 1394 * case where we do not inspect the src alpha. 1395 */ 1396#if SK_A32_SHIFT == 24 1397 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1398 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1399#else 1400 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1401#endif 1402 S32A_Blend_BlitRow32_neon // S32A_Blend 1403}; 1404