SkBlitRow_opts_arm_neon.cpp revision c2050e3a3ecfb8738b36e2add15c526e8e0f21fe
1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include "SkBlitRow_opts_arm.h" 9 10#include "SkBlitMask.h" 11#include "SkBlitRow.h" 12#include "SkColorPriv.h" 13#include "SkDither.h" 14#include "SkMathPriv.h" 15#include "SkUtils.h" 16 17#include "SkCachePreload_arm.h" 18 19#include <arm_neon.h> 20 21void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 22 const SkPMColor* SK_RESTRICT src, int count, 23 U8CPU alpha, int /*x*/, int /*y*/) { 24 SkASSERT(255 == alpha); 25 26 if (count >= 8) { 27 uint16_t* SK_RESTRICT keep_dst = 0; 28 29 asm volatile ( 30 "ands ip, %[count], #7 \n\t" 31 "vmov.u8 d31, #1<<7 \n\t" 32 "vld1.16 {q12}, [%[dst]] \n\t" 33 "vld4.8 {d0-d3}, [%[src]] \n\t" 34 // Thumb does not support the standard ARM conditional 35 // instructions but instead requires the 'it' instruction 36 // to signal conditional execution 37 "it eq \n\t" 38 "moveq ip, #8 \n\t" 39 "mov %[keep_dst], %[dst] \n\t" 40 41 "add %[src], %[src], ip, LSL#2 \n\t" 42 "add %[dst], %[dst], ip, LSL#1 \n\t" 43 "subs %[count], %[count], ip \n\t" 44 "b 9f \n\t" 45 // LOOP 46 "2: \n\t" 47 48 "vld1.16 {q12}, [%[dst]]! \n\t" 49 "vld4.8 {d0-d3}, [%[src]]! 
\n\t" 50 "vst1.16 {q10}, [%[keep_dst]] \n\t" 51 "sub %[keep_dst], %[dst], #8*2 \n\t" 52 "subs %[count], %[count], #8 \n\t" 53 "9: \n\t" 54 "pld [%[dst],#32] \n\t" 55 // expand 0565 q12 to 8888 {d4-d7} 56 "vmovn.u16 d4, q12 \n\t" 57 "vshr.u16 q11, q12, #5 \n\t" 58 "vshr.u16 q10, q12, #6+5 \n\t" 59 "vmovn.u16 d5, q11 \n\t" 60 "vmovn.u16 d6, q10 \n\t" 61 "vshl.u8 d4, d4, #3 \n\t" 62 "vshl.u8 d5, d5, #2 \n\t" 63 "vshl.u8 d6, d6, #3 \n\t" 64 65 "vmovl.u8 q14, d31 \n\t" 66 "vmovl.u8 q13, d31 \n\t" 67 "vmovl.u8 q12, d31 \n\t" 68 69 // duplicate in 4/2/1 & 8pix vsns 70 "vmvn.8 d30, d3 \n\t" 71 "vmlal.u8 q14, d30, d6 \n\t" 72 "vmlal.u8 q13, d30, d5 \n\t" 73 "vmlal.u8 q12, d30, d4 \n\t" 74 "vshr.u16 q8, q14, #5 \n\t" 75 "vshr.u16 q9, q13, #6 \n\t" 76 "vaddhn.u16 d6, q14, q8 \n\t" 77 "vshr.u16 q8, q12, #5 \n\t" 78 "vaddhn.u16 d5, q13, q9 \n\t" 79 "vqadd.u8 d6, d6, d0 \n\t" // moved up 80 "vaddhn.u16 d4, q12, q8 \n\t" 81 // intentionally don't calculate alpha 82 // result in d4-d6 83 84 "vqadd.u8 d5, d5, d1 \n\t" 85 "vqadd.u8 d4, d4, d2 \n\t" 86 87 // pack 8888 {d4-d6} to 0565 q10 88 "vshll.u8 q10, d6, #8 \n\t" 89 "vshll.u8 q3, d5, #8 \n\t" 90 "vshll.u8 q2, d4, #8 \n\t" 91 "vsri.u16 q10, q3, #5 \n\t" 92 "vsri.u16 q10, q2, #11 \n\t" 93 94 "bne 2b \n\t" 95 96 "1: \n\t" 97 "vst1.16 {q10}, [%[keep_dst]] \n\t" 98 : [count] "+r" (count) 99 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 100 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 101 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 102 "d30","d31" 103 ); 104 } 105 else 106 { // handle count < 8 107 uint16_t* SK_RESTRICT keep_dst = 0; 108 109 asm volatile ( 110 "vmov.u8 d31, #1<<7 \n\t" 111 "mov %[keep_dst], %[dst] \n\t" 112 113 "tst %[count], #4 \n\t" 114 "beq 14f \n\t" 115 "vld1.16 {d25}, [%[dst]]! \n\t" 116 "vld1.32 {q1}, [%[src]]! \n\t" 117 118 "14: \n\t" 119 "tst %[count], #2 \n\t" 120 "beq 12f \n\t" 121 "vld1.32 {d24[1]}, [%[dst]]! 
\n\t" 122 "vld1.32 {d1}, [%[src]]! \n\t" 123 124 "12: \n\t" 125 "tst %[count], #1 \n\t" 126 "beq 11f \n\t" 127 "vld1.16 {d24[1]}, [%[dst]]! \n\t" 128 "vld1.32 {d0[1]}, [%[src]]! \n\t" 129 130 "11: \n\t" 131 // unzips achieve the same as a vld4 operation 132 "vuzpq.u16 q0, q1 \n\t" 133 "vuzp.u8 d0, d1 \n\t" 134 "vuzp.u8 d2, d3 \n\t" 135 // expand 0565 q12 to 8888 {d4-d7} 136 "vmovn.u16 d4, q12 \n\t" 137 "vshr.u16 q11, q12, #5 \n\t" 138 "vshr.u16 q10, q12, #6+5 \n\t" 139 "vmovn.u16 d5, q11 \n\t" 140 "vmovn.u16 d6, q10 \n\t" 141 "vshl.u8 d4, d4, #3 \n\t" 142 "vshl.u8 d5, d5, #2 \n\t" 143 "vshl.u8 d6, d6, #3 \n\t" 144 145 "vmovl.u8 q14, d31 \n\t" 146 "vmovl.u8 q13, d31 \n\t" 147 "vmovl.u8 q12, d31 \n\t" 148 149 // duplicate in 4/2/1 & 8pix vsns 150 "vmvn.8 d30, d3 \n\t" 151 "vmlal.u8 q14, d30, d6 \n\t" 152 "vmlal.u8 q13, d30, d5 \n\t" 153 "vmlal.u8 q12, d30, d4 \n\t" 154 "vshr.u16 q8, q14, #5 \n\t" 155 "vshr.u16 q9, q13, #6 \n\t" 156 "vaddhn.u16 d6, q14, q8 \n\t" 157 "vshr.u16 q8, q12, #5 \n\t" 158 "vaddhn.u16 d5, q13, q9 \n\t" 159 "vqadd.u8 d6, d6, d0 \n\t" // moved up 160 "vaddhn.u16 d4, q12, q8 \n\t" 161 // intentionally don't calculate alpha 162 // result in d4-d6 163 164 "vqadd.u8 d5, d5, d1 \n\t" 165 "vqadd.u8 d4, d4, d2 \n\t" 166 167 // pack 8888 {d4-d6} to 0565 q10 168 "vshll.u8 q10, d6, #8 \n\t" 169 "vshll.u8 q3, d5, #8 \n\t" 170 "vshll.u8 q2, d4, #8 \n\t" 171 "vsri.u16 q10, q3, #5 \n\t" 172 "vsri.u16 q10, q2, #11 \n\t" 173 174 // store 175 "tst %[count], #4 \n\t" 176 "beq 24f \n\t" 177 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 178 179 "24: \n\t" 180 "tst %[count], #2 \n\t" 181 "beq 22f \n\t" 182 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 183 184 "22: \n\t" 185 "tst %[count], #1 \n\t" 186 "beq 21f \n\t" 187 "vst1.16 {d20[1]}, [%[keep_dst]]! 
\n\t" 188 189 "21: \n\t" 190 : [count] "+r" (count) 191 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 192 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 193 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 194 "d30","d31" 195 ); 196 } 197} 198 199void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 200 const SkPMColor* SK_RESTRICT src, int count, 201 U8CPU alpha, int /*x*/, int /*y*/) { 202 203 U8CPU alpha_for_asm = alpha; 204 205 asm volatile ( 206 /* This code implements a Neon version of S32A_D565_Blend. The output differs from 207 * the original in two respects: 208 * 1. The results have a few mismatches compared to the original code. These mismatches 209 * never exceed 1. It's possible to improve accuracy vs. a floating point 210 * implementation by introducing rounding right shifts (vrshr) for the final stage. 211 * Rounding is not present in the code below, because although results would be closer 212 * to a floating point implementation, the number of mismatches compared to the 213 * original code would be far greater. 214 * 2. On certain inputs, the original code can overflow, causing colour channels to 215 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel 216 * to affect another. 217 */ 218 219#if 1 220 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 221 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 222#else 223 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 224#endif 225 "vmov.u16 q3, #255 \n\t" // set up constant 226 "movs r4, %[count], lsr #3 \n\t" // calc. 
count>>3 227 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon 228 "beq 2f \n\t" // if count8 == 0, exit 229 "vmov.u16 q15, #0x1f \n\t" // set up blue mask 230 231 "1: \n\t" 232 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels 233 "subs r4, r4, #1 \n\t" // decrement loop counter 234 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels 235 // and deinterleave 236 237 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes 238 "vand q10, q0, q15 \n\t" // extract blue 239 "vshr.u16 q8, q0, #11 \n\t" // extract red 240 "vshr.u16 q9, q9, #10 \n\t" // extract green 241 // dstrgb = {q8, q9, q10} 242 243 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range 244 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range 245 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range 246 247 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits 248 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits 249 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits 250 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits 251 // srcrgba = {q11, q12, q13, q14} 252 253 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale 254 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale 255 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale 256 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale 257 258 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 259 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) 260 // dst_scale = q2 261 262 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale 263 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale 264 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale 265 266#if 1 267 // trying for a better match with SkDiv255Round(a) 268 // C alg is: a+=128; (a+a>>8)>>8 269 // we'll use just a rounding shift [q2 is available for scratch] 270 "vrshr.u16 q11, q11, #8 \n\t" // shift down red 271 "vrshr.u16 q12, q12, #8 \n\t" // shift down green 272 
"vrshr.u16 q13, q13, #8 \n\t" // shift down blue 273#else 274 // arm's original "truncating divide by 256" 275 "vshr.u16 q11, q11, #8 \n\t" // shift down red 276 "vshr.u16 q12, q12, #8 \n\t" // shift down green 277 "vshr.u16 q13, q13, #8 \n\t" // shift down blue 278#endif 279 280 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue 281 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue 282 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr 283 284 "bne 1b \n\t" // if counter != 0, loop 285 "2: \n\t" // exit 286 287 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) 288 : 289 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 290 ); 291 292 count &= 7; 293 if (count > 0) { 294 do { 295 SkPMColor sc = *src++; 296 if (sc) { 297 uint16_t dc = *dst; 298 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 299 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 300 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 301 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 302 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 303 } 304 dst += 1; 305 } while (--count != 0); 306 } 307} 308 309/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 310 * each dither value is spaced out into byte lanes, and repeated 311 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 312 * start of each row. 
313 */ 314static const uint8_t gDitherMatrix_Neon[48] = { 315 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 316 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 317 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 318 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 319 320}; 321 322void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, 323 int count, U8CPU alpha, int x, int y) 324{ 325 /* select row and offset for dither array */ 326 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 327 328 /* rescale alpha to range 0 - 256 */ 329 int scale = SkAlpha255To256(alpha); 330 331 asm volatile ( 332 "vld1.8 {d31}, [%[dstart]] \n\t" // load dither values 333 "vshr.u8 d30, d31, #1 \n\t" // calc. green dither values 334 "vdup.16 d6, %[scale] \n\t" // duplicate scale into neon reg 335 "vmov.i8 d29, #0x3f \n\t" // set up green mask 336 "vmov.i8 d28, #0x1f \n\t" // set up blue mask 337 "1: \n\t" 338 "vld4.8 {d0, d1, d2, d3}, [%[src]]! \n\t" // load 8 pixels and split into argb 339 "vshr.u8 d22, d0, #5 \n\t" // calc. red >> 5 340 "vshr.u8 d23, d1, #6 \n\t" // calc. green >> 6 341 "vshr.u8 d24, d2, #5 \n\t" // calc. 
blue >> 5 342 "vaddl.u8 q8, d0, d31 \n\t" // add in dither to red and widen 343 "vaddl.u8 q9, d1, d30 \n\t" // add in dither to green and widen 344 "vaddl.u8 q10, d2, d31 \n\t" // add in dither to blue and widen 345 "vsubw.u8 q8, q8, d22 \n\t" // sub shifted red from result 346 "vsubw.u8 q9, q9, d23 \n\t" // sub shifted green from result 347 "vsubw.u8 q10, q10, d24 \n\t" // sub shifted blue from result 348 "vshrn.i16 d22, q8, #3 \n\t" // shift right and narrow to 5 bits 349 "vshrn.i16 d23, q9, #2 \n\t" // shift right and narrow to 6 bits 350 "vshrn.i16 d24, q10, #3 \n\t" // shift right and narrow to 5 bits 351 // load 8 pixels from dst, extract rgb 352 "vld1.16 {d0, d1}, [%[dst]] \n\t" // load 8 pixels 353 "vshrn.i16 d17, q0, #5 \n\t" // shift green down to bottom 6 bits 354 "vmovn.i16 d18, q0 \n\t" // narrow to get blue as bytes 355 "vshr.u16 q0, q0, #11 \n\t" // shift down to extract red 356 "vand d17, d17, d29 \n\t" // and green with green mask 357 "vand d18, d18, d28 \n\t" // and blue with blue mask 358 "vmovn.i16 d16, q0 \n\t" // narrow to get red as bytes 359 // src = {d22 (r), d23 (g), d24 (b)} 360 // dst = {d16 (r), d17 (g), d18 (b)} 361 // subtract dst from src and widen 362 "vsubl.s8 q0, d22, d16 \n\t" // subtract red src from dst 363 "vsubl.s8 q1, d23, d17 \n\t" // subtract green src from dst 364 "vsubl.s8 q2, d24, d18 \n\t" // subtract blue src from dst 365 // multiply diffs by scale and shift 366 "vmul.i16 q0, q0, d6[0] \n\t" // multiply red by scale 367 "vmul.i16 q1, q1, d6[0] \n\t" // multiply blue by scale 368 "vmul.i16 q2, q2, d6[0] \n\t" // multiply green by scale 369 "subs %[count], %[count], #8 \n\t" // decrement loop counter 370 "vshrn.i16 d0, q0, #8 \n\t" // shift down red by 8 and narrow 371 "vshrn.i16 d2, q1, #8 \n\t" // shift down green by 8 and narrow 372 "vshrn.i16 d4, q2, #8 \n\t" // shift down blue by 8 and narrow 373 // add dst to result 374 "vaddl.s8 q0, d0, d16 \n\t" // add dst to red 375 "vaddl.s8 q1, d2, d17 \n\t" // add dst to 
green 376 "vaddl.s8 q2, d4, d18 \n\t" // add dst to blue 377 // put result into 565 format 378 "vsli.i16 q2, q1, #5 \n\t" // shift up green and insert into blue 379 "vsli.i16 q2, q0, #11 \n\t" // shift up red and insert into blue 380 "vst1.16 {d4, d5}, [%[dst]]! \n\t" // store result 381 "bgt 1b \n\t" // loop if count > 0 382 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 383 : [dstart] "r" (dstart), [scale] "r" (scale) 384 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31" 385 ); 386 387 DITHER_565_SCAN(y); 388 389 while((count & 7) > 0) 390 { 391 SkPMColor c = *src++; 392 393 int dither = DITHER_VALUE(x); 394 int sr = SkGetPackedR32(c); 395 int sg = SkGetPackedG32(c); 396 int sb = SkGetPackedB32(c); 397 sr = SkDITHER_R32To565(sr, dither); 398 sg = SkDITHER_G32To565(sg, dither); 399 sb = SkDITHER_B32To565(sb, dither); 400 401 uint16_t d = *dst; 402 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 403 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 404 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 405 DITHER_INC_X(x); 406 count--; 407 } 408} 409 410void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 411 const SkPMColor* SK_RESTRICT src, 412 int count, U8CPU alpha) { 413 414 SkASSERT(255 == alpha); 415 if (count > 0) { 416 417 418 uint8x8_t alpha_mask; 419 420 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 421 alpha_mask = vld1_u8(alpha_mask_setup); 422 423 /* do the NEON unrolled code */ 424#define UNROLL 4 425 while (count >= UNROLL) { 426 uint8x8_t src_raw, dst_raw, dst_final; 427 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 428 429 /* The two prefetches below may make the code slighlty 430 * slower for small values of count but are worth having 431 * in the general case. 
432 */ 433 __builtin_prefetch(src+32); 434 __builtin_prefetch(dst+32); 435 436 /* get the source */ 437 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 438#if UNROLL > 2 439 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 440#endif 441 442 /* get and hold the dst too */ 443 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 444#if UNROLL > 2 445 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 446#endif 447 448 /* 1st and 2nd bits of the unrolling */ 449 { 450 uint8x8_t dst_cooked; 451 uint16x8_t dst_wide; 452 uint8x8_t alpha_narrow; 453 uint16x8_t alpha_wide; 454 455 /* get the alphas spread out properly */ 456 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 457 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 458 459 /* spread the dest */ 460 dst_wide = vmovl_u8(dst_raw); 461 462 /* alpha mul the dest */ 463 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 464 dst_cooked = vshrn_n_u16(dst_wide, 8); 465 466 /* sum -- ignoring any byte lane overflows */ 467 dst_final = vadd_u8(src_raw, dst_cooked); 468 } 469 470#if UNROLL > 2 471 /* the 3rd and 4th bits of our unrolling */ 472 { 473 uint8x8_t dst_cooked; 474 uint16x8_t dst_wide; 475 uint8x8_t alpha_narrow; 476 uint16x8_t alpha_wide; 477 478 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 479 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 480 481 /* spread the dest */ 482 dst_wide = vmovl_u8(dst_raw_2); 483 484 /* alpha mul the dest */ 485 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 486 dst_cooked = vshrn_n_u16(dst_wide, 8); 487 488 /* sum -- ignoring any byte lane overflows */ 489 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 490 } 491#endif 492 493 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 494#if UNROLL > 2 495 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 496#endif 497 498 src += UNROLL; 499 dst += UNROLL; 500 count -= UNROLL; 501 } 502#undef UNROLL 503 504 /* do any residual iterations */ 505 while (--count >= 0) { 506 *dst = SkPMSrcOver(*src, *dst); 507 src += 1; 508 dst += 1; 509 } 510 } 511} 
512 513void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, 514 const SkPMColor* SK_RESTRICT src, 515 int count, U8CPU alpha) { 516 SkASSERT(255 == alpha); 517 518 if (count <= 0) 519 return; 520 521 /* Use these to check if src is transparent or opaque */ 522 const unsigned int ALPHA_OPAQ = 0xFF000000; 523 const unsigned int ALPHA_TRANS = 0x00FFFFFF; 524 525#define UNROLL 4 526 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); 527 const SkPMColor* SK_RESTRICT src_temp = src; 528 529 /* set up the NEON variables */ 530 uint8x8_t alpha_mask; 531 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 532 alpha_mask = vld1_u8(alpha_mask_setup); 533 534 uint8x8_t src_raw, dst_raw, dst_final; 535 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 536 uint8x8_t dst_cooked; 537 uint16x8_t dst_wide; 538 uint8x8_t alpha_narrow; 539 uint16x8_t alpha_wide; 540 541 /* choose the first processing type */ 542 if( src >= src_end) 543 goto TAIL; 544 if(*src <= ALPHA_TRANS) 545 goto ALPHA_0; 546 if(*src >= ALPHA_OPAQ) 547 goto ALPHA_255; 548 /* fall-thru */ 549 550ALPHA_1_TO_254: 551 do { 552 553 /* get the source */ 554 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 555 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 556 557 /* get and hold the dst too */ 558 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 559 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 560 561 562 /* get the alphas spread out properly */ 563 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 564 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 565 /* we collapsed (255-a)+1 ... 
*/ 566 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 567 568 /* spread the dest */ 569 dst_wide = vmovl_u8(dst_raw); 570 571 /* alpha mul the dest */ 572 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 573 dst_cooked = vshrn_n_u16(dst_wide, 8); 574 575 /* sum -- ignoring any byte lane overflows */ 576 dst_final = vadd_u8(src_raw, dst_cooked); 577 578 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 579 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 580 /* we collapsed (255-a)+1 ... */ 581 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 582 583 /* spread the dest */ 584 dst_wide = vmovl_u8(dst_raw_2); 585 586 /* alpha mul the dest */ 587 dst_wide = vmulq_u16 (dst_wide, alpha_wide); 588 dst_cooked = vshrn_n_u16(dst_wide, 8); 589 590 /* sum -- ignoring any byte lane overflows */ 591 dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 592 593 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 594 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 595 596 src += UNROLL; 597 dst += UNROLL; 598 599 /* if 2 of the next pixels aren't between 1 and 254 600 it might make sense to go to the optimized loops */ 601 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) 602 break; 603 604 } while(src < src_end); 605 606 if (src >= src_end) 607 goto TAIL; 608 609 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) 610 goto ALPHA_255; 611 612 /*fall-thru*/ 613 614ALPHA_0: 615 616 /*In this state, we know the current alpha is 0 and 617 we optimize for the next alpha also being zero. */ 618 src_temp = src; //so we don't have to increment dst every time 619 do { 620 if(*(++src) > ALPHA_TRANS) 621 break; 622 if(*(++src) > ALPHA_TRANS) 623 break; 624 if(*(++src) > ALPHA_TRANS) 625 break; 626 if(*(++src) > ALPHA_TRANS) 627 break; 628 } while(src < src_end); 629 630 dst += (src - src_temp); 631 632 /* no longer alpha 0, so determine where to go next. 
*/ 633 if( src >= src_end) 634 goto TAIL; 635 if(*src >= ALPHA_OPAQ) 636 goto ALPHA_255; 637 else 638 goto ALPHA_1_TO_254; 639 640ALPHA_255: 641 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { 642 dst[0]=src[0]; 643 dst[1]=src[1]; 644 dst[2]=src[2]; 645 dst[3]=src[3]; 646 src+=UNROLL; 647 dst+=UNROLL; 648 if(src >= src_end) 649 goto TAIL; 650 } 651 652 //Handle remainder. 653 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 654 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 655 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } 656 } 657 } 658 659 if( src >= src_end) 660 goto TAIL; 661 if(*src <= ALPHA_TRANS) 662 goto ALPHA_0; 663 else 664 goto ALPHA_1_TO_254; 665 666TAIL: 667 /* do any residual iterations */ 668 src_end += UNROLL + 1; //goto the real end 669 while(src != src_end) { 670 if( *src != 0 ) { 671 if( *src >= ALPHA_OPAQ ) { 672 *dst = *src; 673 } 674 else { 675 *dst = SkPMSrcOver(*src, *dst); 676 } 677 } 678 src++; 679 dst++; 680 } 681 682#undef UNROLL 683 return; 684} 685 686/* Neon version of S32_Blend_BlitRow32() 687 * portable version is in src/core/SkBlitRow_D32.cpp 688 */ 689void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 690 const SkPMColor* SK_RESTRICT src, 691 int count, U8CPU alpha) { 692 SkASSERT(alpha <= 255); 693 if (count > 0) { 694 uint16_t src_scale = SkAlpha255To256(alpha); 695 uint16_t dst_scale = 256 - src_scale; 696 697 /* run them N at a time through the NEON unit */ 698 /* note that each 1 is 4 bytes, each treated exactly the same, 699 * so we can work under that guise. We *do* know that the src&dst 700 * will be 32-bit aligned quantities, so we can specify that on 701 * the load/store ops and do a neon 'reinterpret' to get us to 702 * byte-sized (pun intended) pieces that we widen/multiply/shift 703 * we're limited at 128 bits in the wide ops, which is 8x16bits 704 * or a pair of 32 bit src/dsts. 
705 */ 706 /* we *could* manually unroll this loop so that we load 128 bits 707 * (as a pair of 64s) from each of src and dst, processing them 708 * in pieces. This might give us a little better management of 709 * the memory latency, but my initial attempts here did not 710 * produce an instruction stream that looked all that nice. 711 */ 712#define UNROLL 2 713 while (count >= UNROLL) { 714 uint8x8_t src_raw, dst_raw, dst_final; 715 uint16x8_t src_wide, dst_wide; 716 717 /* get 64 bits of src, widen it, multiply by src_scale */ 718 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 719 src_wide = vmovl_u8(src_raw); 720 /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ 721 src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); 722 723 /* ditto with dst */ 724 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 725 dst_wide = vmovl_u8(dst_raw); 726 727 /* combine add with dst multiply into mul-accumulate */ 728 dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); 729 730 dst_final = vshrn_n_u16(dst_wide, 8); 731 vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 732 733 src += UNROLL; 734 dst += UNROLL; 735 count -= UNROLL; 736 } 737 /* RBE: well, i don't like how gcc manages src/dst across the above 738 * loop it's constantly calculating src+bias, dst+bias and it only 739 * adjusts the real ones when we leave the loop. Not sure why 740 * it's "hoisting down" (hoisting implies above in my lexicon ;)) 741 * the adjustments to src/dst/count, but it does... 742 * (might be SSA-style internal logic... 
743 */ 744 745#if UNROLL == 2 746 if (count == 1) { 747 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 748 } 749#else 750 if (count > 0) { 751 do { 752 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 753 src += 1; 754 dst += 1; 755 } while (--count > 0); 756 } 757#endif 758 759#undef UNROLL 760 } 761} 762 763/////////////////////////////////////////////////////////////////////////////// 764 765#undef DEBUG_OPAQUE_DITHER 766 767#if defined(DEBUG_OPAQUE_DITHER) 768static void showme8(char *str, void *p, int len) 769{ 770 static char buf[256]; 771 char tbuf[32]; 772 int i; 773 char *pc = (char*) p; 774 sprintf(buf,"%8s:", str); 775 for(i=0;i<len;i++) { 776 sprintf(tbuf, " %02x", pc[i]); 777 strcat(buf, tbuf); 778 } 779 SkDebugf("%s\n", buf); 780} 781static void showme16(char *str, void *p, int len) 782{ 783 static char buf[256]; 784 char tbuf[32]; 785 int i; 786 uint16_t *pc = (uint16_t*) p; 787 sprintf(buf,"%8s:", str); 788 len = (len / sizeof(uint16_t)); /* passed as bytes */ 789 for(i=0;i<len;i++) { 790 sprintf(tbuf, " %04x", pc[i]); 791 strcat(buf, tbuf); 792 } 793 SkDebugf("%s\n", buf); 794} 795#endif 796 797void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 798 const SkPMColor* SK_RESTRICT src, 799 int count, U8CPU alpha, int x, int y) { 800 SkASSERT(255 == alpha); 801 802#define UNROLL 8 803 804 if (count >= UNROLL) { 805 uint8x8_t dbase; 806 807#if defined(DEBUG_OPAQUE_DITHER) 808 uint16_t tmpbuf[UNROLL]; 809 int td[UNROLL]; 810 int tdv[UNROLL]; 811 int ta[UNROLL]; 812 int tap[UNROLL]; 813 uint16_t in_dst[UNROLL]; 814 int offset = 0; 815 int noisy = 0; 816#endif 817 818 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 819 dbase = vld1_u8(dstart); 820 821 do { 822 uint8x8_t sr, sg, sb, sa, d; 823 uint16x8_t dst8, scale8, alpha8; 824 uint16x8_t dst_r, dst_g, dst_b; 825 826#if defined(DEBUG_OPAQUE_DITHER) 827 /* calculate 8 elements worth into a temp buffer */ 828 { 829 int my_y = y; 830 int my_x 
= x; 831 SkPMColor* my_src = (SkPMColor*)src; 832 uint16_t* my_dst = dst; 833 int i; 834 835 DITHER_565_SCAN(my_y); 836 for(i=0;i<UNROLL;i++) { 837 SkPMColor c = *my_src++; 838 SkPMColorAssert(c); 839 if (c) { 840 unsigned a = SkGetPackedA32(c); 841 842 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 843 tdv[i] = DITHER_VALUE(my_x); 844 ta[i] = a; 845 tap[i] = SkAlpha255To256(a); 846 td[i] = d; 847 848 unsigned sr = SkGetPackedR32(c); 849 unsigned sg = SkGetPackedG32(c); 850 unsigned sb = SkGetPackedB32(c); 851 sr = SkDITHER_R32_FOR_565(sr, d); 852 sg = SkDITHER_G32_FOR_565(sg, d); 853 sb = SkDITHER_B32_FOR_565(sb, d); 854 855 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 856 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 857 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 858 // now src and dst expanded are in g:11 r:10 x:1 b:10 859 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 860 td[i] = d; 861 862 } else { 863 tmpbuf[i] = *my_dst; 864 ta[i] = tdv[i] = td[i] = 0xbeef; 865 } 866 in_dst[i] = *my_dst; 867 my_dst += 1; 868 DITHER_INC_X(my_x); 869 } 870 } 871#endif 872 873 /* source is in ABGR */ 874 { 875 register uint8x8_t d0 asm("d0"); 876 register uint8x8_t d1 asm("d1"); 877 register uint8x8_t d2 asm("d2"); 878 register uint8x8_t d3 asm("d3"); 879 880 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 881 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 882 : "r" (src) 883 ); 884 sr = d0; sg = d1; sb = d2; sa = d3; 885 } 886 887 /* calculate 'd', which will be 0..7 */ 888 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ 889#if defined(SK_BUILD_FOR_ANDROID) 890 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 891 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 892#else 893 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); 894#endif 895 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); 896 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ 897 898 /* sr = sr - (sr>>5) + d */ 899 /* watching for 8-bit 
overflow. d is 0..7; risky range of 900 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 901 * safe as long as we do ((sr-sr>>5) + d) */ 902 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 903 sr = vadd_u8(sr, d); 904 905 /* sb = sb - (sb>>5) + d */ 906 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 907 sb = vadd_u8(sb, d); 908 909 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 910 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 911 sg = vadd_u8(sg, vshr_n_u8(d,1)); 912 913 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 914 dst8 = vld1q_u16(dst); 915 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 916 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 917 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 918 919 /* blend */ 920#if 1 921 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 922 /* originally 255-sa + 1 */ 923 scale8 = vsubw_u8(vdupq_n_u16(256), sa); 924#else 925 scale8 = vsubw_u8(vdupq_n_u16(255), sa); 926 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); 927#endif 928 929#if 1 930 /* combine the addq and mul, save 3 insns */ 931 scale8 = vshrq_n_u16(scale8, 3); 932 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 933 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 934 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 935#else 936 /* known correct, but +3 insns over above */ 937 scale8 = vshrq_n_u16(scale8, 3); 938 dst_b = vmulq_u16(dst_b, scale8); 939 dst_g = vmulq_u16(dst_g, scale8); 940 dst_r = vmulq_u16(dst_r, scale8); 941 942 /* combine */ 943 /* NB: vshll widens, need to preserve those bits */ 944 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); 945 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); 946 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); 947#endif 948 949 /* repack to store */ 950 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); 951 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 952 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 953 954 vst1q_u16(dst, dst8); 955 956#if defined(DEBUG_OPAQUE_DITHER) 957 /* verify 
my 8 elements match the temp buffer */ 958 { 959 int i, bad=0; 960 static int invocation; 961 962 for (i=0;i<UNROLL;i++) 963 if (tmpbuf[i] != dst[i]) bad=1; 964 if (bad) { 965 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 966 invocation, offset); 967 SkDebugf(" alpha 0x%x\n", alpha); 968 for (i=0;i<UNROLL;i++) 969 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 970 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 971 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 972 973 showme16("alpha8", &alpha8, sizeof(alpha8)); 974 showme16("scale8", &scale8, sizeof(scale8)); 975 showme8("d", &d, sizeof(d)); 976 showme16("dst8", &dst8, sizeof(dst8)); 977 showme16("dst_b", &dst_b, sizeof(dst_b)); 978 showme16("dst_g", &dst_g, sizeof(dst_g)); 979 showme16("dst_r", &dst_r, sizeof(dst_r)); 980 showme8("sb", &sb, sizeof(sb)); 981 showme8("sg", &sg, sizeof(sg)); 982 showme8("sr", &sr, sizeof(sr)); 983 984 /* cop out */ 985 return; 986 } 987 offset += UNROLL; 988 invocation++; 989 } 990#endif 991 992 dst += UNROLL; 993 src += UNROLL; 994 count -= UNROLL; 995 /* skip x += UNROLL, since it's unchanged mod-4 */ 996 } while (count >= UNROLL); 997 } 998#undef UNROLL 999 1000 /* residuals */ 1001 if (count > 0) { 1002 DITHER_565_SCAN(y); 1003 do { 1004 SkPMColor c = *src++; 1005 SkPMColorAssert(c); 1006 if (c) { 1007 unsigned a = SkGetPackedA32(c); 1008 1009 // dither and alpha are just temporary variables to work-around 1010 // an ICE in debug. 
1011 unsigned dither = DITHER_VALUE(x); 1012 unsigned alpha = SkAlpha255To256(a); 1013 int d = SkAlphaMul(dither, alpha); 1014 1015 unsigned sr = SkGetPackedR32(c); 1016 unsigned sg = SkGetPackedG32(c); 1017 unsigned sb = SkGetPackedB32(c); 1018 sr = SkDITHER_R32_FOR_565(sr, d); 1019 sg = SkDITHER_G32_FOR_565(sg, d); 1020 sb = SkDITHER_B32_FOR_565(sb, d); 1021 1022 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1023 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1024 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1025 // now src and dst expanded are in g:11 r:10 x:1 b:10 1026 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1027 } 1028 dst += 1; 1029 DITHER_INC_X(x); 1030 } while (--count != 0); 1031 } 1032} 1033 1034/////////////////////////////////////////////////////////////////////////////// 1035 1036/* 2009/10/27: RBE says "a work in progress"; debugging says ok; 1037 * speedup untested, but ARM version is 26 insns/iteration and 1038 * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) 1039 * which is 10x the native version; that's pure instruction counts, 1040 * not accounting for any instruction or memory latencies. 
 */

#undef DEBUG_S32_OPAQUE_DITHER

// Blit a row of fully-opaque 32-bit SkPMColor pixels into a 565 destination,
// applying the 4x4 ordered-dither matrix. (x, y) locate the row start so the
// dither phase lines up with neighboring rows (matrix row = y&3, col = x&3).
// Processes 8 pixels per NEON iteration, then falls back to the scalar
// dither path for the residual (count % 8) pixels.
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        // One row of the NEON-layout dither matrix; 12-byte stride per row,
        // offset by x&3 so all 8 lanes get the right dither values.
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;

            /* source is in ABGR ordering (R == lsb) */
            {
                // Pin the vld4.8 destination registers so the de-interleaved
                // channels land directly in variables the intrinsics can use.
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                // NOTE(review): this asm reads *src but only passes the
                // pointer as an "r" input (no "memory"/volatile); confirm the
                // compiler cannot sink stores past it before relying on that.
                asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                    );
                sr = d0; sg = d1; sb = d2;   // alpha (d3) is ignored: row is opaque
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            /* sr = sr - (sr>>5) + d */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);            // widen to 16 bits: sum can exceed 255

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d,1));
            /* XXX: check that the "d>>1" here is hoisted */

            /* pack high bits of each into 565 format  (rgb, b is lsb) */
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

            /* store it */
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            /* always good to know if we generated good results */
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    // Recompute each pixel with the scalar reference path and
                    // compare against what the NEON code just stored.
                    SkPMColor c = src[i];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                            c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            x += UNROLL;		/* probably superfluous */
        }
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

// Blend a constant color over a row of pixels:
//   dst[i] = color + SkAlphaMulQ(src[i], 256 - SkAlpha255To256(alpha(color)))
// (see the scalar tail loop below, which is the reference behavior).
// Fast paths: a fully-transparent color is a plain copy; a fully-opaque
// color is a solid fill. The NEON path handles 8 pixels per iteration.
void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        // Transparent color: the blend is the identity, so just copy.
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        // Opaque color completely replaces src.
        sk_memset32(dst, color, count);
    } else {
        // scale is in [1, 255], so it fits in the 8 bits vdup.8 reads below.
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 +x coming out.
            //
            // NOTE(review): Loop_Color32 is a *named* asm label; if the
            // compiler ever emits this asm twice (e.g. after inlining or
            // function cloning) the assembler will see a duplicate symbol.
            // A local numeric label (1: / 1b) would be safer -- confirm
            // before allowing this function to be inlined.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                // d8-d15 are callee-saved VFP registers, so they must be
                // (and are) listed here for the compiler to spill them.
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
            );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        // Scalar reference path; also handles the post-NEON residuals.
        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

// 565 blit-row proc table. Entry order is a contract with SkBlitRow --
// presumably indexed by the SkBlitRow flag combinations (opaque/blend x
// no-dither/dither); verify against SkBlitRow.h before reordering.
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the two functions below, we don't have a special version
    //       that assumes that each source pixel is opaque. But our S32A is
    //       still faster than the default, so use it.
    S32A_D565_Opaque_neon,          // really S32_D565_Opaque
    S32A_D565_Blend_neon,           // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

// 32-bit blit-row proc table; NULL entries fall back to the portable
// implementation chosen by SkBlitRow.
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,           // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. The one reads the src alpha
     * value and attempts to optimize accordingly.  The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse.  However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_arm        // S32A_Blend
};