// SkBlitRow_opts_arm_neon.cpp, revision a111e492b84c312a6bd5d5d9ef100dca48f4941d
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"

#include <arm_neon.h>

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                        "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                        "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                        "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                        "d30","d31"
                      );
    }
}
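/* For reference, a scalar sketch of what the vector code above computes per
 * pixel: unpack the 565 dst channels to 8 bits, scale them by (255 - srcA),
 * add the src channels with saturation, and repack. The sketch below is
 * illustrative only (names are hypothetical, and the NEON path uses a
 * slightly different divide-by-255 correction and saturating adds, so it is
 * not bit-exact):
 */
#if 0
static void S32A_D565_Opaque_scalar_sketch(uint16_t* dst,
                                           const SkPMColor* src, int count) {
    while (count-- > 0) {
        SkPMColor c = *src++;
        if (c) {
            unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(c));
            // widen the dst 565 channels to 8 bits, blend, repack (no clamp here)
            unsigned dr = SkGetPackedR32(c) + (((SkGetPackedR16(*dst) << 3) * dst_scale) >> 8);
            unsigned dg = SkGetPackedG32(c) + (((SkGetPackedG16(*dst) << 2) * dst_scale) >> 8);
            unsigned db = SkGetPackedB32(c) + (((SkGetPackedB16(*dst) << 3) * dst_scale) >> 8);
            *dst = SkPackRGB16(dr >> 3, dg >> 2, db >> 3);
        }
        dst += 1;
    }
}
#endif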
\n\t" 129 130 "11: \n\t" 131 // unzips achieve the same as a vld4 operation 132 "vuzpq.u16 q0, q1 \n\t" 133 "vuzp.u8 d0, d1 \n\t" 134 "vuzp.u8 d2, d3 \n\t" 135 // expand 0565 q12 to 8888 {d4-d7} 136 "vmovn.u16 d4, q12 \n\t" 137 "vshr.u16 q11, q12, #5 \n\t" 138 "vshr.u16 q10, q12, #6+5 \n\t" 139 "vmovn.u16 d5, q11 \n\t" 140 "vmovn.u16 d6, q10 \n\t" 141 "vshl.u8 d4, d4, #3 \n\t" 142 "vshl.u8 d5, d5, #2 \n\t" 143 "vshl.u8 d6, d6, #3 \n\t" 144 145 "vmovl.u8 q14, d31 \n\t" 146 "vmovl.u8 q13, d31 \n\t" 147 "vmovl.u8 q12, d31 \n\t" 148 149 // duplicate in 4/2/1 & 8pix vsns 150 "vmvn.8 d30, d3 \n\t" 151 "vmlal.u8 q14, d30, d6 \n\t" 152 "vmlal.u8 q13, d30, d5 \n\t" 153 "vmlal.u8 q12, d30, d4 \n\t" 154 "vshr.u16 q8, q14, #5 \n\t" 155 "vshr.u16 q9, q13, #6 \n\t" 156 "vaddhn.u16 d6, q14, q8 \n\t" 157 "vshr.u16 q8, q12, #5 \n\t" 158 "vaddhn.u16 d5, q13, q9 \n\t" 159 "vqadd.u8 d6, d6, d0 \n\t" // moved up 160 "vaddhn.u16 d4, q12, q8 \n\t" 161 // intentionally don't calculate alpha 162 // result in d4-d6 163 164 "vqadd.u8 d5, d5, d1 \n\t" 165 "vqadd.u8 d4, d4, d2 \n\t" 166 167 // pack 8888 {d4-d6} to 0565 q10 168 "vshll.u8 q10, d6, #8 \n\t" 169 "vshll.u8 q3, d5, #8 \n\t" 170 "vshll.u8 q2, d4, #8 \n\t" 171 "vsri.u16 q10, q3, #5 \n\t" 172 "vsri.u16 q10, q2, #11 \n\t" 173 174 // store 175 "tst %[count], #4 \n\t" 176 "beq 24f \n\t" 177 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 178 179 "24: \n\t" 180 "tst %[count], #2 \n\t" 181 "beq 22f \n\t" 182 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 183 184 "22: \n\t" 185 "tst %[count], #1 \n\t" 186 "beq 21f \n\t" 187 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 188 189 "21: \n\t" 190 : [count] "+r" (count) 191 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 192 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 193 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 194 "d30","d31" 195 ); 196 } 197} 198 199void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 200 const SkPMColor* SK_RESTRICT src, int count, 201 U8CPU alpha, int /*x*/, int /*y*/) { 202 203 U8CPU alpha_for_asm = alpha; 204 205 asm volatile ( 206 /* This code implements a Neon version of S32A_D565_Blend. The output differs from 207 * the original in two respects: 208 * 1. The results have a few mismatches compared to the original code. These mismatches 209 * never exceed 1. It's possible to improve accuracy vs. a floating point 210 * implementation by introducing rounding right shifts (vrshr) for the final stage. 211 * Rounding is not present in the code below, because although results would be closer 212 * to a floating point implementation, the number of mismatches compared to the 213 * original code would be far greater. 214 * 2. On certain inputs, the original code can overflow, causing colour channels to 215 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel 216 * to affect another. 217 */ 218 219#if 1 220 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 221 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 222#else 223 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 224#endif 225 "vmov.u16 q3, #255 \n\t" // set up constant 226 "movs r4, %[count], lsr #3 \n\t" // calc. 
/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};
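/* How the matrix is addressed below: the row comes from y&3 (12 bytes per
 * row), the column phase from x&3, so a single 8-byte vld1 yields the dither
 * values for eight consecutive x positions. Scalar sketch (illustrative):
 */
#if 0
static inline const uint8_t* dither_row_for_sketch(int x, int y) {
    return &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)];
}
#endif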
void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "vld1.8     {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8    d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16    d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8    d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8    d28, #0x1f                      \n\t"   // set up blue mask
                  "1:                                             \n\t"
                  "vld4.8     {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8    d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8    d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8    d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8   q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8   q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8   q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8   q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8   q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8   q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16  d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16  d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16  d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16    {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16  d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16  d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16   q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand       d17, d17, d29                   \n\t"   // and green with green mask
                  "vand       d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16  d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // subtract dst from src and widen
                  "vsubl.s8   q0, d22, d16                    \n\t"   // red:   src - dst
                  "vsubl.s8   q1, d23, d17                    \n\t"   // green: src - dst
                  "vsubl.s8   q2, d24, d18                    \n\t"   // blue:  src - dst
                  // multiply diffs by scale and shift
                  "vmul.i16   q0, q0, d6[0]                   \n\t"   // multiply red by scale
                  "vmul.i16   q1, q1, d6[0]                   \n\t"   // multiply green by scale
                  "vmul.i16   q2, q2, d6[0]                   \n\t"   // multiply blue by scale
                  "subs       %[count], %[count], #8          \n\t"   // decrement loop counter
                  "vshrn.i16  d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16  d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16  d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8   q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8   q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8   q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16   q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16   q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16    {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bgt        1b                              \n\t"   // loop if count > 0
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    while((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}
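/* The vsubl/vmul/vshrn/vaddl sequence above is the lerp form of
 * SkAlphaBlend() applied per channel, along the lines of the scalar sketch
 * below (illustrative; the real SkAlphaBlend lives in SkColorPriv.h):
 */
#if 0
static inline int alpha_blend_sketch(int src, int dst, int scale256) {
    // dst + ((src - dst) * scale) >> 8, with scale in 0..256
    return dst + (((src - dst) * scale256) >> 8);
}
#endif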
\n\t" // store result 381 "bgt 1b \n\t" // loop if count > 0 382 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 383 : [dstart] "r" (dstart), [scale] "r" (scale) 384 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31" 385 ); 386 387 DITHER_565_SCAN(y); 388 389 while((count & 7) > 0) 390 { 391 SkPMColor c = *src++; 392 393 int dither = DITHER_VALUE(x); 394 int sr = SkGetPackedR32(c); 395 int sg = SkGetPackedG32(c); 396 int sb = SkGetPackedB32(c); 397 sr = SkDITHER_R32To565(sr, dither); 398 sg = SkDITHER_G32To565(sg, dither); 399 sb = SkDITHER_B32To565(sb, dither); 400 401 uint16_t d = *dst; 402 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 403 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 404 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 405 DITHER_INC_X(x); 406 count--; 407 } 408} 409 410void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 411 const SkPMColor* SK_RESTRICT src, 412 int count, U8CPU alpha) { 413 414 SkASSERT(255 == alpha); 415 if (count > 0) { 416 417 418 uint8x8_t alpha_mask; 419 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 421 alpha_mask = vld1_u8(alpha_mask_setup); 422 423 /* do the NEON unrolled code */ 424#define UNROLL 4 425 while (count >= UNROLL) { 426 uint8x8_t src_raw, dst_raw, dst_final; 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 428 429 /* The two prefetches below may make the code slighlty 430 * slower for small values of count but are worth having 431 * in the general case. 432 */ 433 __builtin_prefetch(src+32); 434 __builtin_prefetch(dst+32); 435 436 /* get the source */ 437 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 438#if UNROLL > 2 439 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 440#endif 441 442 /* get and hold the dst too */ 443 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 444#if UNROLL > 2 445 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 446#endif 447 448 /* 1st and 2nd bits of the unrolling */ 449 { 450 uint8x8_t dst_cooked; 451 uint16x8_t dst_wide; 452 uint8x8_t alpha_narrow; 453 uint16x8_t alpha_wide; 454 455 /* get the alphas spread out properly */ 456 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 457 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 458 459 /* spread the dest */ 460 dst_wide = vmovl_u8(dst_raw); 461 462 /* alpha mul the dest */ 463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 464 dst_cooked = vshrn_n_u16(dst_wide, 8); 465 466 /* sum -- ignoring any byte lane overflows */ 467 dst_final = vadd_u8(src_raw, dst_cooked); 468 } 469 470#if UNROLL > 2 471 /* the 3rd and 4th bits of our unrolling */ 472 { 473 uint8x8_t dst_cooked; 474 uint16x8_t dst_wide; 475 uint8x8_t alpha_narrow; 476 uint16x8_t alpha_wide; 477 478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 479 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 480 481 /* spread the dest */ 482 dst_wide = vmovl_u8(dst_raw_2); 483 484 /* alpha mul the dest */ 485 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 486 dst_cooked = vshrn_n_u16(dst_wide, 8); 487 488 /* sum -- ignoring any byte lane overflows */ 489 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 490 } 491#endif 492 493 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 494#if UNROLL > 2 495 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 496#endif 497 498 src += UNROLL; 499 dst += UNROLL; 500 count -= UNROLL; 501 } 502#undef UNROLL 503 504 /* do any residual iterations */ 505 while (--count >= 0) { 506 *dst = SkPMSrcOver(*src, *dst); 507 src += 1; 508 
void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL 4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /* In this state, we know the current alpha is 0 and
       we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}
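/* The ALPHA_TRANS/ALPHA_OPAQ comparisons above rely on alpha living in the
 * top byte (SK_A32_SHIFT == 24): a whole SkPMColor is <= 0x00FFFFFF exactly
 * when its alpha is 0, and >= 0xFF000000 exactly when its alpha is 255.
 * Sketch of the classifier the three states implement (illustrative):
 */
#if 0
enum AlphaClassSketch { kTransparent, kOpaque, kGeneral };
static inline AlphaClassSketch classify_sketch(SkPMColor c) {
    if (c <= 0x00FFFFFF) return kTransparent;  // alpha == 0
    if (c >= 0xFF000000) return kOpaque;       // alpha == 255
    return kGeneral;                           // alpha in 1..254
}
#endif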
/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

        /* run them N at a time through the NEON unit */
        /* note that each 1 is 4 bytes, each treated exactly the same,
         * so we can work under that guise. We *do* know that the src&dst
         * will be 32-bit aligned quantities, so we can specify that on
         * the load/store ops and do a neon 'reinterpret' to get us to
         * byte-sized (pun intended) pieces that we widen/multiply/shift
         * we're limited at 128 bits in the wide ops, which is 8x16bits
         * or a pair of 32 bit src/dsts.
         */
        /* we *could* manually unroll this loop so that we load 128 bits
         * (as a pair of 64s) from each of src and dst, processing them
         * in pieces. This might give us a little better management of
         * the memory latency, but my initial attempts here did not
         * produce an instruction stream that looked all that nice.
         */
#define UNROLL 2
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint16x8_t src_wide, dst_wide;

            /* get 64 bits of src, widen it, multiply by src_scale */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
            src_wide = vmovl_u8(src_raw);
            /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
            src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

            /* ditto with dst */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
            dst_wide = vmovl_u8(dst_raw);

            /* combine add with dst multiply into mul-accumulate */
            dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

            dst_final = vshrn_n_u16(dst_wide, 8);
            vst1_u32(dst, vreinterpret_u32_u8(dst_final));

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
        /* RBE: well, i don't like how gcc manages src/dst across the above
         * loop; it's constantly calculating src+bias, dst+bias and it only
         * adjusts the real ones when we leave the loop. Not sure why
         * it's "hoisting down" (hoisting implies above in my lexicon ;))
         * the adjustments to src/dst/count, but it does...
         * (might be SSA-style internal logic...
         */

#if UNROLL == 2
        if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        }
#else
        if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
        }
#endif

#undef UNROLL
    }
}
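/* Per byte lane, the loop above computes a fixed-point lerp with
 * src_scale + dst_scale == 256:
 *     result = (src * src_scale + dst * dst_scale) >> 8
 * Scalar sketch of one lane (illustrative helper):
 */
#if 0
static inline uint8_t lerp_lane_sketch(uint8_t s, uint8_t d,
                                       unsigned src_scale, unsigned dst_scale) {
    return (uint8_t)((s * src_scale + d * dst_scale) >> 8);
}
#endif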
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}
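/* Per byte lane, both paths above compute
 *     result = (src * alpha256 >> 8) + (dst * dst_scale >> 8)
 * where dst_scale = 256 - ((srcA * alpha256) >> 8). Scalar sketch of one
 * lane (illustrative helper):
 */
#if 0
static inline uint8_t blend_lane_sketch(uint8_t s, uint8_t d,
                                        unsigned alpha256, unsigned srcA) {
    unsigned dst_scale = 256 - ((srcA * alpha256) >> 8);
    return (uint8_t)(((s * alpha256) >> 8) + ((d * dst_scale) >> 8));
}
#endif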
///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {
        uint8x8_t dbase;

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            /* calculate 8 elements worth into a temp buffer */
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i=0;i<UNROLL;i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;

                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

            /* source is in ABGR */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                    );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }
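            /* The asm block pins d0-d3 so a single vld4.8 deinterleaves the
             * four channels. With a compiler whose vld4 codegen is trusted,
             * the equivalent intrinsic form would be roughly (a sketch, not
             * the shipped code):
             *
             *     uint8x8x4_t p = vld4_u8((const uint8_t*)src);
             *     sr = p.val[0]; sg = p.val[1]; sb = p.val[2]; sa = p.val[3];
             */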
            /* calculate 'd', which will be 0..7 */
            /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if defined(SK_BUILD_FOR_ANDROID)
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
            alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
            alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
            d = vshrn_n_u16(alpha8, 8);    /* narrowing too */

            /* sr = sr - (sr>>5) + d */
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d) */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
            dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
            dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */

            /* blend */
#if 1
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            /* originally 255-sa + 1 */
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
            scale8 = vsubw_u8(vdupq_n_u16(255), sa);
            scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
            /* combine the addq and mul, save 3 insns */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
            /* known correct, but +3 insns over above */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmulq_u16(dst_b, scale8);
            dst_g = vmulq_u16(dst_g, scale8);
            dst_r = vmulq_u16(dst_r, scale8);

            /* combine */
            /* NB: vshll widens, need to preserve those bits */
            dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
            dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
            dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

            /* repack to store */
            dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            /* verify my 8 elements match the temp buffer */
            {
                int i, bad=0;
                static int invocation;

                for (i=0;i<UNROLL;i++)
                    if (tmpbuf[i] != dst[i]) bad=1;
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i=0;i<UNROLL;i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
                                 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    /* cop out */
                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
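/* The SkDITHER_*32_FOR_565 step used above and in the vector loop biases a
 * channel as channel - (channel >> bits) + d, so the later truncation to
 * 5 or 6 bits rounds via the dither value. Scalar sketch for the red channel
 * (illustrative helper; d is 0..7, so the result stays within 8 bits):
 */
#if 0
static inline unsigned dither_r32_for_565_sketch(unsigned r, int d) {
    return r + d - (r >> 5);
}
#endif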
///////////////////////////////////////////////////////////////////////////////

/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but ARM version is 26 insns/iteration and
 * this NEON version is 21 insns/iteration-of-8 (2.62 insns/element),
 * roughly a tenth of the native version's count; that's pure
 * instruction counts, not accounting for any instruction or memory latencies.
 */

#undef DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;

            /* source is in ABGR ordering (R == lsb) */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                    );
                sr = d0; sg = d1; sb = d2;
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            /* sr = sr - (sr>>5) + d */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d,1));
            /* XXX: check that the "d>>1" here is hoisted */

            /* pack high bits of each into 565 format  (rgb, b is lsb) */
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

            /* store it */
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            /* always good to know if we generated good results */
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    SkPMColor c = src[i];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            x += UNROLL;    /* probably superfluous */
        }
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
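/* The vector loop mirrors SkDitherRGB32To565(): dither each 8-bit channel,
 * then keep the high 5/6/5 bits. Scalar sketch (illustrative, not bit-exact
 * documentation of the macro):
 */
#if 0
static inline uint16_t dither_32_to_565_sketch(SkPMColor c, unsigned d) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    r = (r + d - (r >> 5)) >> 3;          // 5 bits
    g = (g + (d >> 1) - (g >> 6)) >> 2;   // 6 bits, half-strength dither
    b = (b + d - (b >> 5)) >> 3;          // 5 bits
    return SkPackRGB16(r, g, b);
}
#endif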
void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 +x coming out.
            asm volatile (
                          PLD128(src, 0)

                          "vdup.32    q0, %[color]                \n\t"

                          PLD128(src, 128)

                          // scale numerical interval [0-255], so load as 8 bits
                          "vdup.8     d2, %[scale]                \n\t"

                          PLD128(src, 256)

                          "subs       %[count], %[count], #8      \n\t"

                          PLD128(src, 384)

                          "Loop_Color32:                          \n\t"

                          // load src color, 8 pixels, 4 64 bit registers
                          // (and increment src).
                          "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                          PLD128(src, 384)

                          // multiply long by scale, 64 bits at a time,
                          // destination into a 128 bit register.
                          "vmull.u8   q4, d4, d2                  \n\t"
                          "vmull.u8   q5, d5, d2                  \n\t"
                          "vmull.u8   q6, d6, d2                  \n\t"
                          "vmull.u8   q7, d7, d2                  \n\t"

                          // shift the 128 bit registers, containing the 16
                          // bit scaled values back to 8 bits, narrowing the
                          // results to 64 bit registers.
                          "vshrn.i16  d8, q4, #8                  \n\t"
                          "vshrn.i16  d9, q5, #8                  \n\t"
                          "vshrn.i16  d10, q6, #8                 \n\t"
                          "vshrn.i16  d11, q7, #8                 \n\t"

                          // adding back the color, using 128 bit registers.
                          "vadd.i8    q6, q4, q0                  \n\t"
                          "vadd.i8    q7, q5, q0                  \n\t"

                          // store back the 8 calculated pixels (2 128 bit
                          // registers), and increment dst.
1302 "vst1.32 {d12-d15}, [%[dst]]! \n\t" 1303 1304 "subs %[count], %[count], #8 \n\t" 1305 "bge Loop_Color32 \n\t" 1306 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 1307 : [color] "r" (color), [scale] "r" (scale) 1308 : "cc", "memory", 1309 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1310 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" 1311 ); 1312 // At this point, if we went through the inline assembly, count is 1313 // a negative value: 1314 // if the value is -8, there is no pixel left to process. 1315 // if the value is -7, there is one pixel left to process 1316 // ... 1317 // And'ing it with 7 will give us the number of pixels 1318 // left to process. 1319 count = count & 0x7; 1320 } 1321 1322 while (count > 0) { 1323 *dst = color + SkAlphaMulQ(*src, scale); 1324 src += 1; 1325 dst += 1; 1326 count--; 1327 } 1328 } 1329} 1330 1331/////////////////////////////////////////////////////////////////////////////// 1332 1333const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1334 // no dither 1335 // NOTE: For the two functions below, we don't have a special version 1336 // that assumes that each source pixel is opaque. But our S32A is 1337 // still faster than the default, so use it. 1338 S32A_D565_Opaque_neon, // really S32_D565_Opaque 1339 S32A_D565_Blend_neon, // really S32_D565_Blend 1340 S32A_D565_Opaque_neon, 1341 S32A_D565_Blend_neon, 1342 1343 // dither 1344 S32_D565_Opaque_Dither_neon, 1345 S32_D565_Blend_Dither_neon, 1346 S32A_D565_Opaque_Dither_neon, 1347 NULL, // S32A_D565_Blend_Dither 1348}; 1349 1350const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1351 NULL, // S32_Opaque, 1352 S32_Blend_BlitRow32_neon, // S32_Blend, 1353 /* 1354 * We have two choices for S32A_Opaque procs. The one reads the src alpha 1355 * value and attempts to optimize accordingly. The optimization is 1356 * sensitive to the source content and is not a win in all cases. For 1357 * example, if there are a lot of transitions between the alpha states, 1358 * the performance will almost certainly be worse. However, for many 1359 * common cases the performance is equivalent or better than the standard 1360 * case where we do not inspect the src alpha. 1361 */ 1362#if SK_A32_SHIFT == 24 1363 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1364 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1365#else 1366 S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1367#endif 1368 S32A_Blend_BlitRow32_neon // S32A_Blend 1369}; 1370