SkBlitRow_opts_arm.cpp revision 568468094af358ce981f7319b2bc2b4996ac4bce
/*
 * Copyright 2009 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */


#include "SkBlitRow.h"
#include "SkBlitMask.h"
#include "SkColorPriv.h"
#include "SkDither.h"

#if defined(__ARM_HAVE_NEON)
#include <arm_neon.h>
#endif

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src, int count,
                                  U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
        "ands       ip, %[count], #7            \n\t"
        "vmov.u8    d31, #1<<7                  \n\t"
        "vld1.16    {q12}, [%[dst]]             \n\t"
        "vld4.8     {d0-d3}, [%[src]]           \n\t"
        "moveq      ip, #8                      \n\t"
        "mov        %[keep_dst], %[dst]         \n\t"

        "add        %[src], %[src], ip, LSL#2   \n\t"
        "add        %[dst], %[dst], ip, LSL#1   \n\t"
        "subs       %[count], %[count], ip      \n\t"
        "b          9f                          \n\t"
        // LOOP
        "2:                                     \n\t"

        "vld1.16    {q12}, [%[dst]]!            \n\t"
        "vld4.8     {d0-d3}, [%[src]]!          \n\t"
        "vst1.16    {q10}, [%[keep_dst]]        \n\t"
        "sub        %[keep_dst], %[dst], #8*2   \n\t"
        "subs       %[count], %[count], #8      \n\t"
        "9:                                     \n\t"
        "pld        [%[dst],#32]                \n\t"
        // expand 0565 q12 to 8888 {d4-d7}
        "vmovn.u16  d4, q12                     \n\t"
        "vshr.u16   q11, q12, #5                \n\t"
        "vshr.u16   q10, q12, #6+5              \n\t"
        "vmovn.u16  d5, q11                     \n\t"
        "vmovn.u16  d6, q10                     \n\t"
        "vshl.u8    d4, d4, #3                  \n\t"
        "vshl.u8    d5, d5, #2                  \n\t"
        "vshl.u8    d6, d6, #3                  \n\t"

        "vmovl.u8   q14, d31                    \n\t"
        "vmovl.u8   q13, d31                    \n\t"
        "vmovl.u8   q12, d31                    \n\t"

        // duplicate in 4/2/1 & 8pix vsns
        "vmvn.8     d30, d3                     \n\t"
        "vmlal.u8   q14, d30, d6                \n\t"
        "vmlal.u8   q13, d30, d5                \n\t"
        "vmlal.u8   q12, d30, d4                \n\t"
        "vshr.u16   q8, q14, #5                 \n\t"
        "vshr.u16   q9, q13, #6                 \n\t"
        "vaddhn.u16 d6, q14, q8                 \n\t"
        "vshr.u16   q8, q12, #5                 \n\t"
        "vaddhn.u16 d5, q13, q9                 \n\t"
        "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
        "vaddhn.u16 d4, q12, q8                 \n\t"
        // intentionally don't calculate alpha
        // result in d4-d6

        "vqadd.u8   d5, d5, d1                  \n\t"
        "vqadd.u8   d4, d4, d2                  \n\t"

        // pack 8888 {d4-d6} to 0565 q10
        "vshll.u8   q10, d6, #8                 \n\t"
        "vshll.u8   q3, d5, #8                  \n\t"
        "vshll.u8   q2, d4, #8                  \n\t"
        "vsri.u16   q10, q3, #5                 \n\t"
        "vsri.u16   q10, q2, #11                \n\t"

        "bne        2b                          \n\t"

        "1:                                     \n\t"
        "vst1.16    {q10}, [%[keep_dst]]        \n\t"
        : [count] "+r" (count)
        : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
        : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
          "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
          "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
        "vmov.u8    d31, #1<<7                  \n\t"
        "mov        %[keep_dst], %[dst]         \n\t"

        "tst        %[count], #4                \n\t"
        "beq        14f                         \n\t"
        "vld1.16    {d25}, [%[dst]]!            \n\t"
        "vld1.32    {q1}, [%[src]]!             \n\t"

        "14:                                    \n\t"
        "tst        %[count], #2                \n\t"
        "beq        12f                         \n\t"
        "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
        "vld1.32    {d1}, [%[src]]!             \n\t"

        "12:                                    \n\t"
        "tst        %[count], #1                \n\t"
        "beq        11f                         \n\t"
        "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
        "vld1.32    {d0[1]}, [%[src]]!          \n\t"

        "11:                                    \n\t"
        // unzips achieve the same as a vld4 operation
        "vuzpq.u16  q0, q1                      \n\t"
        "vuzp.u8    d0, d1                      \n\t"
        "vuzp.u8    d2, d3                      \n\t"
        // expand 0565 q12 to 8888 {d4-d7}
        "vmovn.u16  d4, q12                     \n\t"
        "vshr.u16   q11, q12, #5                \n\t"
        "vshr.u16   q10, q12, #6+5              \n\t"
        "vmovn.u16  d5, q11                     \n\t"
        "vmovn.u16  d6, q10                     \n\t"
        "vshl.u8    d4, d4, #3                  \n\t"
        "vshl.u8    d5, d5, #2                  \n\t"
        "vshl.u8    d6, d6, #3                  \n\t"

        "vmovl.u8   q14, d31                    \n\t"
        "vmovl.u8   q13, d31                    \n\t"
        "vmovl.u8   q12, d31                    \n\t"

        // duplicate in 4/2/1 & 8pix vsns
        "vmvn.8     d30, d3                     \n\t"
        "vmlal.u8   q14, d30, d6                \n\t"
        "vmlal.u8   q13, d30, d5                \n\t"
        "vmlal.u8   q12, d30, d4                \n\t"
        "vshr.u16   q8, q14, #5                 \n\t"
        "vshr.u16   q9, q13, #6                 \n\t"
        "vaddhn.u16 d6, q14, q8                 \n\t"
        "vshr.u16   q8, q12, #5                 \n\t"
        "vaddhn.u16 d5, q13, q9                 \n\t"
        "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
        "vaddhn.u16 d4, q12, q8                 \n\t"
        // intentionally don't calculate alpha
        // result in d4-d6

        "vqadd.u8   d5, d5, d1                  \n\t"
        "vqadd.u8   d4, d4, d2                  \n\t"

        // pack 8888 {d4-d6} to 0565 q10
        "vshll.u8   q10, d6, #8                 \n\t"
        "vshll.u8   q3, d5, #8                  \n\t"
        "vshll.u8   q2, d4, #8                  \n\t"
        "vsri.u16   q10, q3, #5                 \n\t"
        "vsri.u16   q10, q2, #11                \n\t"

        // store
        "tst        %[count], #4                \n\t"
        "beq        24f                         \n\t"
        "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

        "24:                                    \n\t"
        "tst        %[count], #2                \n\t"
        "beq        22f                         \n\t"
        "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

        "22:                                    \n\t"
        "tst        %[count], #1                \n\t"
        "beq        21f                         \n\t"
        "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

        "21:                                    \n\t"
        : [count] "+r" (count)
        : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
        : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
          "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
          "d30","d31"
        );
    }
}
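
#if 0
/* Reference sketch (not compiled): a scalar model of the per-pixel work the
 * NEON blocks above perform.  This is an editor's reconstruction, not the
 * authoritative algorithm: the assembly approximates the divide-by-255 with
 * shift-and-add sequences (the vaddhn steps) rather than the exact (x+128)>>8
 * used here, so individual channels may differ by +/-1.  The 128 bias mirrors
 * d31 (#1<<7); the clamps mirror the saturating vqadd.
 */
static inline uint16_t S32A_D565_Opaque_model(SkPMColor c, uint16_t d) {
    unsigned invA = 255 - SkGetPackedA32(c);
    /* expand dst 565 -> 888 the same way the asm does (no bit replication) */
    unsigned dr = SkGetPackedR16(d) << 3;
    unsigned dg = SkGetPackedG16(d) << 2;
    unsigned db = SkGetPackedB16(d) << 3;
    /* scale dst by (255 - srcA) with rounding, then add src with saturation */
    unsigned r = SkGetPackedR32(c) + ((dr * invA + 128) >> 8);
    unsigned g = SkGetPackedG32(c) + ((dg * invA + 128) >> 8);
    unsigned b = SkGetPackedB32(c) + ((db * invA + 128) >> 8);
    if (r > 255) r = 255;
    if (g > 255) g = 255;
    if (b > 255) b = 255;
    /* pack the top bits back into 565 */
    return SkPackRGB16(r >> 3, g >> 2, b >> 3);
}
#endif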
\n\t" 123 124 "11: \n\t" 125 // unzips achieve the same as a vld4 operation 126 "vuzpq.u16 q0, q1 \n\t" 127 "vuzp.u8 d0, d1 \n\t" 128 "vuzp.u8 d2, d3 \n\t" 129 // expand 0565 q12 to 8888 {d4-d7} 130 "vmovn.u16 d4, q12 \n\t" 131 "vshr.u16 q11, q12, #5 \n\t" 132 "vshr.u16 q10, q12, #6+5 \n\t" 133 "vmovn.u16 d5, q11 \n\t" 134 "vmovn.u16 d6, q10 \n\t" 135 "vshl.u8 d4, d4, #3 \n\t" 136 "vshl.u8 d5, d5, #2 \n\t" 137 "vshl.u8 d6, d6, #3 \n\t" 138 139 "vmovl.u8 q14, d31 \n\t" 140 "vmovl.u8 q13, d31 \n\t" 141 "vmovl.u8 q12, d31 \n\t" 142 143 // duplicate in 4/2/1 & 8pix vsns 144 "vmvn.8 d30, d3 \n\t" 145 "vmlal.u8 q14, d30, d6 \n\t" 146 "vmlal.u8 q13, d30, d5 \n\t" 147 "vmlal.u8 q12, d30, d4 \n\t" 148 "vshr.u16 q8, q14, #5 \n\t" 149 "vshr.u16 q9, q13, #6 \n\t" 150 "vaddhn.u16 d6, q14, q8 \n\t" 151 "vshr.u16 q8, q12, #5 \n\t" 152 "vaddhn.u16 d5, q13, q9 \n\t" 153 "vqadd.u8 d6, d6, d0 \n\t" // moved up 154 "vaddhn.u16 d4, q12, q8 \n\t" 155 // intentionally don't calculate alpha 156 // result in d4-d6 157 158 "vqadd.u8 d5, d5, d1 \n\t" 159 "vqadd.u8 d4, d4, d2 \n\t" 160 161 // pack 8888 {d4-d6} to 0565 q10 162 "vshll.u8 q10, d6, #8 \n\t" 163 "vshll.u8 q3, d5, #8 \n\t" 164 "vshll.u8 q2, d4, #8 \n\t" 165 "vsri.u16 q10, q3, #5 \n\t" 166 "vsri.u16 q10, q2, #11 \n\t" 167 168 // store 169 "tst %[count], #4 \n\t" 170 "beq 24f \n\t" 171 "vst1.16 {d21}, [%[keep_dst]]! \n\t" 172 173 "24: \n\t" 174 "tst %[count], #2 \n\t" 175 "beq 22f \n\t" 176 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 177 178 "22: \n\t" 179 "tst %[count], #1 \n\t" 180 "beq 21f \n\t" 181 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 182 183 "21: \n\t" 184 : [count] "+r" (count) 185 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 186 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 187 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 188 "d30","d31" 189 ); 190 } 191} 192 193static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 194 const SkPMColor* SK_RESTRICT src, int count, 195 U8CPU alpha, int /*x*/, int /*y*/) { 196 197 U8CPU alpha_for_asm = alpha; 198 199 asm volatile ( 200 /* This code implements a Neon version of S32A_D565_Blend. The output differs from 201 * the original in two respects: 202 * 1. The results have a few mismatches compared to the original code. These mismatches 203 * never exceed 1. It's possible to improve accuracy vs. a floating point 204 * implementation by introducing rounding right shifts (vrshr) for the final stage. 205 * Rounding is not present in the code below, because although results would be closer 206 * to a floating point implementation, the number of mismatches compared to the 207 * original code would be far greater. 208 * 2. On certain inputs, the original code can overflow, causing colour channels to 209 * mix. Although the Neon code can also overflow, it doesn't allow one colour channel 210 * to affect another. 211 */ 212 213#if 1 214 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ 215 "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 216#else 217 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 218#endif 219 "vmov.u16 q3, #255 \n\t" // set up constant 220 "movs r4, %[count], lsr #3 \n\t" // calc. 
count>>3 221 "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon 222 "beq 2f \n\t" // if count8 == 0, exit 223 "vmov.u16 q15, #0x1f \n\t" // set up blue mask 224 225 "1: \n\t" 226 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels 227 "subs r4, r4, #1 \n\t" // decrement loop counter 228 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels 229 // and deinterleave 230 231 "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes 232 "vand q10, q0, q15 \n\t" // extract blue 233 "vshr.u16 q8, q0, #11 \n\t" // extract red 234 "vshr.u16 q9, q9, #10 \n\t" // extract green 235 // dstrgb = {q8, q9, q10} 236 237 "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range 238 "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range 239 "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range 240 241 "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits 242 "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits 243 "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits 244 "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits 245 // srcrgba = {q11, q12, q13, q14} 246 247 "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale 248 "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale 249 "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale 250 "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale 251 252 "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 253 "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) 254 // dst_scale = q2 255 256 "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale 257 "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale 258 "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale 259 260#if 1 261 // trying for a better match with SkDiv255Round(a) 262 // C alg is: a+=128; (a+a>>8)>>8 263 // we'll use just a rounding shift [q2 is available for scratch] 264 "vrshr.u16 q11, q11, #8 \n\t" // shift down red 265 "vrshr.u16 q12, q12, #8 \n\t" // shift down green 266 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue 267#else 268 // arm's original "truncating divide by 256" 269 "vshr.u16 q11, q11, #8 \n\t" // shift down red 270 "vshr.u16 q12, q12, #8 \n\t" // shift down green 271 "vshr.u16 q13, q13, #8 \n\t" // shift down blue 272#endif 273 274 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue 275 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue 276 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write pixel back to dst, update ptr 277 278 "bne 1b \n\t" // if counter != 0, loop 279 "2: \n\t" // exit 280 281 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) 282 : 283 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 284 ); 285 286 count &= 7; 287 if (count > 0) { 288 do { 289 SkPMColor sc = *src++; 290 if (sc) { 291 uint16_t dc = *dst; 292 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 293 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 294 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 295 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 296 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 297 } 298 dst += 1; 299 } while (--count != 0); 300 } 301} 302 303/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 

static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                       int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
    "vld1.8     {d31}, [%[dstart]]              \n\t"   // load dither values
    "vshr.u8    d30, d31, #1                    \n\t"   // calc. green dither values
    "vdup.16    d6, %[scale]                    \n\t"   // duplicate scale into neon reg
    "vmov.i8    d29, #0x3f                      \n\t"   // set up green mask
    "vmov.i8    d28, #0x1f                      \n\t"   // set up blue mask
    "1:                                         \n\t"
    "vld4.8     {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
    "vshr.u8    d22, d0, #5                     \n\t"   // calc. red >> 5
    "vshr.u8    d23, d1, #6                     \n\t"   // calc. green >> 6
    "vshr.u8    d24, d2, #5                     \n\t"   // calc. blue >> 5
    "vaddl.u8   q8, d0, d31                     \n\t"   // add in dither to red and widen
    "vaddl.u8   q9, d1, d30                     \n\t"   // add in dither to green and widen
    "vaddl.u8   q10, d2, d31                    \n\t"   // add in dither to blue and widen
    "vsubw.u8   q8, q8, d22                     \n\t"   // sub shifted red from result
    "vsubw.u8   q9, q9, d23                     \n\t"   // sub shifted green from result
    "vsubw.u8   q10, q10, d24                   \n\t"   // sub shifted blue from result
    "vshrn.i16  d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
    "vshrn.i16  d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
    "vshrn.i16  d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
    // load 8 pixels from dst, extract rgb
    "vld1.16    {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
    "vshrn.i16  d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
    "vmovn.i16  d18, q0                         \n\t"   // narrow to get blue as bytes
    "vshr.u16   q0, q0, #11                     \n\t"   // shift down to extract red
    "vand       d17, d17, d29                   \n\t"   // and green with green mask
    "vand       d18, d18, d28                   \n\t"   // and blue with blue mask
    "vmovn.i16  d16, q0                         \n\t"   // narrow to get red as bytes
    // src = {d22 (r), d23 (g), d24 (b)}
    // dst = {d16 (r), d17 (g), d18 (b)}
    // subtract dst from src and widen
    "vsubl.s8   q0, d22, d16                    \n\t"   // subtract dst red from src
    "vsubl.s8   q1, d23, d17                    \n\t"   // subtract dst green from src
    "vsubl.s8   q2, d24, d18                    \n\t"   // subtract dst blue from src
    // multiply diffs by scale and shift
    "vmul.i16   q0, q0, d6[0]                   \n\t"   // multiply red by scale
    "vmul.i16   q1, q1, d6[0]                   \n\t"   // multiply green by scale
    "vmul.i16   q2, q2, d6[0]                   \n\t"   // multiply blue by scale
    "subs       %[count], %[count], #8          \n\t"   // decrement loop counter
    "vshrn.i16  d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
    "vshrn.i16  d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
    "vshrn.i16  d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
    // add dst to result
    "vaddl.s8   q0, d0, d16                     \n\t"   // add dst to red
    "vaddl.s8   q1, d2, d17                     \n\t"   // add dst to green
    "vaddl.s8   q2, d4, d18                     \n\t"   // add dst to blue
    // put result into 565 format
    "vsli.i16   q2, q1, #5                      \n\t"   // shift up green and insert into blue
    "vsli.i16   q2, q0, #11                     \n\t"   // shift up red and insert into blue
    "vst1.16    {d4, d5}, [%[dst]]!             \n\t"   // store result
    "bgt        1b                              \n\t"   // loop if count > 0
    : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
    : [dstart] "r" (dstart), [scale] "r" (scale)
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24",
      "d28", "d29", "d30", "d31"
    );

    DITHER_565_SCAN(y);

    while ((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}
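
#if 0
/* Reference sketch (not compiled): the per-channel dither step the vector
 * code above performs, matching SkDITHER_R32To565() / SkDITHER_B32To565()
 * (5-bit channels) and SkDITHER_G32To565() (6-bit, half-strength dither):
 * add the dither value, subtract the channel's top bits so the 8-bit sum
 * cannot overflow, then truncate down to 565 width.
 */
static inline unsigned dither_8_to_5(unsigned c, unsigned d) {
    return (c + d - (c >> 5)) >> 3;
}
static inline unsigned dither_8_to_6(unsigned c, unsigned d) {
    return (c + (d >> 1) - (c >> 6)) >> 2;
}
#endif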
\n\t" // store result 375 "bgt 1b \n\t" // loop if count > 0 376 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 377 : [dstart] "r" (dstart), [scale] "r" (scale) 378 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31" 379 ); 380 381 DITHER_565_SCAN(y); 382 383 while((count & 7) > 0) 384 { 385 SkPMColor c = *src++; 386 387 int dither = DITHER_VALUE(x); 388 int sr = SkGetPackedR32(c); 389 int sg = SkGetPackedG32(c); 390 int sb = SkGetPackedB32(c); 391 sr = SkDITHER_R32To565(sr, dither); 392 sg = SkDITHER_G32To565(sg, dither); 393 sb = SkDITHER_B32To565(sb, dither); 394 395 uint16_t d = *dst; 396 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 397 SkAlphaBlend(sg, SkGetPackedG16(d), scale), 398 SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 399 DITHER_INC_X(x); 400 count--; 401 } 402} 403 404#define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon 405#define S32A_D565_Blend_PROC S32A_D565_Blend_neon 406#define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon 407#else 408#define S32A_D565_Opaque_PROC NULL 409#define S32A_D565_Blend_PROC NULL 410#define S32_D565_Blend_Dither_PROC NULL 411#endif 412 413/* Don't have a special version that assumes each src is opaque, but our S32A 414 is still faster than the default, so use it here 415 */ 416#define S32_D565_Opaque_PROC S32A_D565_Opaque_PROC 417#define S32_D565_Blend_PROC S32A_D565_Blend_PROC 418 419/////////////////////////////////////////////////////////////////////////////// 420 421#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) 422 423static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 424 const SkPMColor* SK_RESTRICT src, 425 int count, U8CPU alpha) { 426 427 SkASSERT(255 == alpha); 428 if (count > 0) { 429 430 431 uint8x8_t alpha_mask; 432 433 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 434 alpha_mask = vld1_u8(alpha_mask_setup); 435 436 /* do the NEON unrolled code */ 437#define UNROLL 4 438 while (count >= UNROLL) { 439 uint8x8_t src_raw, dst_raw, dst_final; 440 uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 441 442 /* get the source */ 443 src_raw = vreinterpret_u8_u32(vld1_u32(src)); 444#if UNROLL > 2 445 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 446#endif 447 448 /* get and hold the dst too */ 449 dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 450#if UNROLL > 2 451 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 452#endif 453 454 /* 1st and 2nd bits of the unrolling */ 455 { 456 uint8x8_t dst_cooked; 457 uint16x8_t dst_wide; 458 uint8x8_t alpha_narrow; 459 uint16x8_t alpha_wide; 460 461 /* get the alphas spread out properly */ 462 alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 463#if 1 464 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 465 /* we collapsed (255-a)+1 ... 

static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                       const SkPMColor* SK_RESTRICT src,
                                       int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        /* do the NEON unrolled code */
#define UNROLL  4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
                /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
                /* we collapsed (255-a)+1 ... */
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
                alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
                alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16(dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
                /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
                /* we collapsed (255-a)+1 ... */
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
                alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
                alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16(dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}

#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_neon
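
#if 0
/* Sketch (not compiled) of the vtbl1_u8() alpha-splat above.  src_raw holds
 * two little-endian SkPMColors as eight bytes, with each pixel's alpha in
 * its top byte (indices 3 and 7), per the "r g b a" lane order the vld4
 * comments in this file describe.  Indexing with {3,3,3,3,7,7,7,7}
 * therefore replicates each pixel's alpha across its four channel lanes:
 *
 *   src_raw      = { r0, g0, b0, a0,  r1, g1, b1, a1 }
 *   alpha_narrow = { a0, a0, a0, a0,  a1, a1, a1, a1 }
 */
uint8x8_t alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#endif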

#else

#ifdef TEST_SRC_ALPHA
#error The ARM asm version of S32A_Opaque_BlitRow32 does not support TEST_SRC_ALPHA
#endif

static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
                                      const SkPMColor* SK_RESTRICT src,
                                      int count, U8CPU alpha) {

    SkASSERT(255 == alpha);

    /* Does not support the TEST_SRC_ALPHA case */
    asm volatile (
        "cmp    %[count], #0                \n\t" /* comparing count with 0 */
        "beq    3f                          \n\t" /* if zero exit */

        "mov    ip, #0xff                   \n\t" /* load the 0xff mask in ip */
        "orr    ip, ip, ip, lsl #16         \n\t" /* convert it to 0xff00ff in ip */

        "cmp    %[count], #2                \n\t" /* compare count with 2 */
        "blt    2f                          \n\t" /* if less than 2 -> single loop */

        /* Double Loop */
        "1:                                 \n\t" /* <double loop> */
        "ldm    %[src]!, {r5,r6}            \n\t" /* load the src(s) at r5-r6 */
        "ldm    %[dst], {r7,r8}             \n\t" /* loading dst(s) into r7-r8 */
        "lsr    r4, r5, #24                 \n\t" /* extracting the alpha from source and storing it to r4 */

        /* ----------- */
        "and    r9, ip, r7                  \n\t" /* r9 = br masked by ip */
        "rsb    r4, r4, #256                \n\t" /* subtracting the alpha from 256 -> r4=scale */
        "and    r10, ip, r7, lsr #8         \n\t" /* r10 = ag masked by ip */

        "mul    r9, r9, r4                  \n\t" /* br = br * scale */
        "mul    r10, r10, r4                \n\t" /* ag = ag * scale */
        "and    r9, ip, r9, lsr #8          \n\t" /* lsr br by 8 and mask it */

        "and    r10, r10, ip, lsl #8        \n\t" /* mask ag with reverse mask */
        "lsr    r4, r6, #24                 \n\t" /* extracting the alpha from source and storing it to r4 */
        "orr    r7, r9, r10                 \n\t" /* br | ag */

        "add    r7, r5, r7                  \n\t" /* dst = src + calc dest(r7) */
        "rsb    r4, r4, #256                \n\t" /* subtracting the alpha from 256 -> r4=scale */

        /* ----------- */
        "and    r9, ip, r8                  \n\t" /* r9 = br masked by ip */

        "and    r10, ip, r8, lsr #8         \n\t" /* r10 = ag masked by ip */
        "mul    r9, r9, r4                  \n\t" /* br = br * scale */
        "sub    %[count], %[count], #2      \n\t"
        "mul    r10, r10, r4                \n\t" /* ag = ag * scale */

        "and    r9, ip, r9, lsr #8          \n\t" /* lsr br by 8 and mask it */
        "and    r10, r10, ip, lsl #8        \n\t" /* mask ag with reverse mask */
        "cmp    %[count], #1                \n\t" /* comparing count with 1 */
        "orr    r8, r9, r10                 \n\t" /* br | ag */

        "add    r8, r6, r8                  \n\t" /* dst = src + calc dest(r8) */

        /* ----------------- */
        "stm    %[dst]!, {r7,r8}            \n\t" /* *dst = r7,r8, increment dst by two (each times 4) */
        /* ----------------- */

        "bgt    1b                          \n\t" /* if greater than 1 -> reloop */
        "blt    3f                          \n\t" /* if less than 1 -> exit */

        /* Single Loop */
        "2:                                 \n\t" /* <single loop> */
        "ldr    r5, [%[src]], #4            \n\t" /* load the src pointer into r5    r5=src */
        "ldr    r7, [%[dst]]                \n\t" /* loading dst into r7 */
        "lsr    r4, r5, #24                 \n\t" /* extracting the alpha from source and storing it to r4 */

        /* ----------- */
        "and    r9, ip, r7                  \n\t" /* r9 = br masked by ip */
        "rsb    r4, r4, #256                \n\t" /* subtracting the alpha from 256 -> r4=scale */

        "and    r10, ip, r7, lsr #8         \n\t" /* r10 = ag masked by ip */
        "mul    r9, r9, r4                  \n\t" /* br = br * scale */
        "mul    r10, r10, r4                \n\t" /* ag = ag * scale */
        "and    r9, ip, r9, lsr #8          \n\t" /* lsr br by 8 and mask it */

        "and    r10, r10, ip, lsl #8        \n\t" /* mask ag */
        "orr    r7, r9, r10                 \n\t" /* br | ag */

        "add    r7, r5, r7                  \n\t" /* *dst = src + calc dest(r7) */

        /* ----------------- */
        "str    r7, [%[dst]], #4            \n\t" /* *dst = r7, increment dst by one (times 4) */
        /* ----------------- */

        "3:                                 \n\t" /* <exit> */

        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
        :
        : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
        );
}
#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_arm
#endif
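
#if 0
/* Reference sketch (not compiled): the scalar form of the 0xff00ff masking
 * trick used by the ARM assembly blitters above and below; it is the same
 * scheme as Skia's SkAlphaMulQ().  Splitting the pixel into its even bytes
 * (b, r) and odd bytes (a, g) leaves 8 empty bits above each channel, so
 * one 32-bit multiply by scale (0..256) scales two channels at once without
 * carries crossing between them.
 */
static inline uint32_t alpha_mul_q_sketch(uint32_t c, unsigned scale) {
    uint32_t mask = 0x00FF00FF;
    uint32_t rb = ((c & mask) * scale) >> 8;    /* scales b and r */
    uint32_t ag = ((c >> 8) & mask) * scale;    /* scales g and a */
    return (rb & mask) | (ag & ~mask);
}
#endif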
679 "rsb r10, r10, #256 \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */ 680 681 /* ---------------------- */ 682 683 /* src1, src1_scale */ 684 "and r11, r12, r5, lsr #8 \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */ 685 "and r4, r12, r5 \n\t" /* rb = r4 = r5 masked by r12 */ 686 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */ 687 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */ 688 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 689 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 690 "orr r5, r11, r4 \n\t" /* r5 = (src1, src_scale) */ 691 692 /* dst1, dst1_scale */ 693 "and r11, r12, r7, lsr #8 \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */ 694 "and r4, r12, r7 \n\t" /* rb = r4 = r7 masked by r12 */ 695 "mul r11, r11, r9 \n\t" /* ag = r11 times dst_scale (r9) */ 696 "mul r4, r4, r9 \n\t" /* rb = r4 times dst_scale (r9) */ 697 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 698 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 699 "orr r9, r11, r4 \n\t" /* r9 = (dst1, dst_scale) */ 700 701 /* ---------------------- */ 702 "add r9, r5, r9 \n\t" /* *dst = src plus dst both scaled */ 703 /* ---------------------- */ 704 705 /* ====================== */ 706 707 /* src2, src2_scale */ 708 "and r11, r12, r6, lsr #8 \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */ 709 "and r4, r12, r6 \n\t" /* rb = r4 = r6 masked by r12 */ 710 "mul r11, r11, %[alpha] \n\t" /* ag = r11 times src_scale */ 711 "mul r4, r4, %[alpha] \n\t" /* rb = r4 times src_scale */ 712 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 713 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 714 "orr r6, r11, r4 \n\t" /* r6 = (src2, src_scale) */ 715 716 /* dst2, dst2_scale */ 717 "and r11, r12, r8, lsr #8 \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */ 718 "and r4, r12, r8 \n\t" /* rb = r4 = r8 masked by r12 */ 719 "mul r11, r11, r10 \n\t" /* ag = r11 times dst_scale (r10) */ 720 "mul r4, r4, r10 \n\t" /* rb = r4 times dst_scale (r6) */ 721 "and r11, r11, r12, lsl #8 \n\t" /* ag masked by reverse mask (r12) */ 722 "and r4, r12, r4, lsr #8 \n\t" /* rb masked by mask (r12) */ 723 "orr r10, r11, r4 \n\t" /* r10 = (dst2, dst_scale) */ 724 725 "sub %[count], %[count], #2 \n\t" /* decrease count by 2 */ 726 /* ---------------------- */ 727 "add r10, r6, r10 \n\t" /* *dst = src plus dst both scaled */ 728 /* ---------------------- */ 729 "cmp %[count], #1 \n\t" /* compare count with 1 */ 730 /* ----------------- */ 731 "stm %[dst]!, {r9, r10} \n\t" /* copy r9 and r10 to r7 and r8 respectively */ 732 /* ----------------- */ 733 734 "bgt 1b \n\t" /* if %[count] greater than 1 reloop */ 735 "blt 3f \n\t" /* if %[count] less than 1 exit */ 736 /* else get into the single loop */ 737 /* Single Loop */ 738 "2: \n\t" /* <single loop> */ 739 "ldr r5, [%[src]], #4 \n\t" /* loading src pointer into r5: r5=src */ 740 "ldr r7, [%[dst]] \n\t" /* loading dst pointer into r7: r7=dst */ 741 742 "lsr r6, r5, #24 \n\t" /* src >> 24 */ 743 "and r8, r12, r5, lsr #8 \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */ 744 "smulbb r6, r6, %[alpha] \n\t" /* r6 = SkMulS16 with src_scale */ 745 "and r9, r12, r5 \n\t" /* rb = r9 = r5 masked by r12 */ 746 "lsr r6, r6, #8 \n\t" /* r6 >> 8 */ 747 "mul r8, r8, %[alpha] \n\t" /* ag = r8 times scale */ 748 "rsb r6, r6, #256 \n\t" /* r6 = 255 - r6 + 1 */ 749 750 /* src, src_scale */ 751 "mul r9, r9, %[alpha] \n\t" /* rb = r9 times scale */ 752 "and r8, r8, r12, lsl #8 \n\t" /* ag masked by reverse 

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

        /* run them N at a time through the NEON unit */
        /* note that each 1 is 4 bytes, each treated exactly the same,
         * so we can work under that guise. We *do* know that the src&dst
         * will be 32-bit aligned quantities, so we can specify that on
         * the load/store ops and do a neon 'reinterpret' to get us to
         * byte-sized (pun intended) pieces that we widen/multiply/shift.
         * we're limited at 128 bits in the wide ops, which is 8x16bits
         * or a pair of 32 bit src/dsts.
         */
        /* we *could* manually unroll this loop so that we load 128 bits
         * (as a pair of 64s) from each of src and dst, processing them
         * in pieces. This might give us a little better management of
         * the memory latency, but my initial attempts here did not
         * produce an instruction stream that looked all that nice.
         */
#define UNROLL  2
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint16x8_t src_wide, dst_wide;

            /* get 64 bits of src, widen it, multiply by src_scale */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
            src_wide = vmovl_u8(src_raw);
            /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
            src_wide = vmulq_u16(src_wide, vdupq_n_u16(src_scale));

            /* ditto with dst */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
            dst_wide = vmovl_u8(dst_raw);

            /* combine add with dst multiply into mul-accumulate */
            dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

            dst_final = vshrn_n_u16(dst_wide, 8);
            vst1_u32(dst, vreinterpret_u32_u8(dst_final));

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
        /* RBE: well, i don't like how gcc manages src/dst across the above
         * loop. it's constantly calculating src+bias, dst+bias and it only
         * adjusts the real ones when we leave the loop. Not sure why
         * it's "hoisting down" (hoisting implies above in my lexicon ;))
         * the adjustments to src/dst/count, but it does...
         * (might be SSA-style internal logic...)
         */

#if UNROLL == 2
        if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        }
#else
        if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
        }
#endif

#undef UNROLL
    }
}

#define S32_Blend_BlitRow32_PROC    S32_Blend_BlitRow32_neon
#else
#define S32_Blend_BlitRow32_PROC    NULL
#endif

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

#undef  DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
d3 asm("d3"); 981 982 asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" 983 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) 984 : "r" (src) 985 ); 986 sr = d0; sg = d1; sb = d2; sa = d3; 987 } 988 989 /* calculate 'd', which will be 0..7 */ 990 /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ 991#if SK_BUILD_FOR_ANDROID 992 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 993 alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1)); 994#else 995 alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); 996#endif 997 alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase)); 998 d = vshrn_n_u16(alpha8, 8); /* narrowing too */ 999 1000 /* sr = sr - (sr>>5) + d */ 1001 /* watching for 8-bit overflow. d is 0..7; risky range of 1002 * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 1003 * safe as long as we do ((sr-sr>>5) + d) */ 1004 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1005 sr = vadd_u8(sr, d); 1006 1007 /* sb = sb - (sb>>5) + d */ 1008 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1009 sb = vadd_u8(sb, d); 1010 1011 /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ 1012 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1013 sg = vadd_u8(sg, vshr_n_u8(d,1)); 1014 1015 /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ 1016 dst8 = vld1q_u16(dst); 1017 dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); 1018 dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); 1019 dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ 1020 1021 /* blend */ 1022#if 1 1023 /* SkAlpha255To256() semantic a+1 vs a+a>>7 */ 1024 /* originally 255-sa + 1 */ 1025 scale8 = vsubw_u8(vdupq_n_u16(256), sa); 1026#else 1027 scale8 = vsubw_u8(vdupq_n_u16(255), sa); 1028 scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); 1029#endif 1030 1031#if 1 1032 /* combine the addq and mul, save 3 insns */ 1033 scale8 = vshrq_n_u16(scale8, 3); 1034 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 1035 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 1036 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 1037#else 1038 /* known correct, but +3 insns over above */ 1039 scale8 = vshrq_n_u16(scale8, 3); 1040 dst_b = vmulq_u16(dst_b, scale8); 1041 dst_g = vmulq_u16(dst_g, scale8); 1042 dst_r = vmulq_u16(dst_r, scale8); 1043 1044 /* combine */ 1045 /* NB: vshll widens, need to preserve those bits */ 1046 dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); 1047 dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); 1048 dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); 1049#endif 1050 1051 /* repack to store */ 1052 dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); 1053 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 1054 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 1055 1056 vst1q_u16(dst, dst8); 1057 1058#if defined(DEBUG_OPAQUE_DITHER) 1059 /* verify my 8 elements match the temp buffer */ 1060 { 1061 int i, bad=0; 1062 static int invocation; 1063 1064 for (i=0;i<UNROLL;i++) 1065 if (tmpbuf[i] != dst[i]) bad=1; 1066 if (bad) { 1067 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 1068 invocation, offset); 1069 SkDebugf(" alpha 0x%x\n", alpha); 1070 for (i=0;i<UNROLL;i++) 1071 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 1072 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), 1073 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]); 1074 1075 showme16("alpha8", &alpha8, sizeof(alpha8)); 1076 showme16("scale8", &scale8, sizeof(scale8)); 1077 showme8("d", &d, sizeof(d)); 1078 showme16("dst8", &dst8, sizeof(dst8)); 1079 showme16("dst_b", &dst_b, sizeof(dst_b)); 1080 showme16("dst_g", &dst_g, sizeof(dst_g)); 

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but the ARM version is 26 insns/iteration and
 * this NEON version is 21 insns/iteration-of-8 (2.62 insns/element),
 * i.e. roughly a tenth of the ARM version's per-element instruction
 * count; that's pure instruction counts, not accounting for any
 * instruction or memory latencies.
 */

#undef DEBUG_S32_OPAQUE_DITHER

static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                        const SkPMColor* SK_RESTRICT src,
                                        int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb, sa;
            uint16x8_t dr, dg, db, da;
            uint16x8_t dst8;

            /* source is in ABGR ordering (R == lsb) */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                    : "r" (src)
                    );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            /* sr = sr - (sr>>5) + d */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d,1));
            /* XXX: check that the "d>>1" here is hoisted */

            /* pack high bits of each into 565 format (rgb, b is lsb) */
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

            /* store it */
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            /* always good to know if we generated good results */
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    SkPMColor c = src[i];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            x += UNROLL;    /* probably superfluous */
        }
    }
#undef UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
#else
#define S32_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////

static const SkBlitRow::Proc platform_565_procs[] = {
    // no dither
    S32_D565_Opaque_PROC,
    S32_D565_Blend_PROC,
    S32A_D565_Opaque_PROC,
    S32A_D565_Blend_PROC,

    // dither
    S32_D565_Opaque_Dither_PROC,
    S32_D565_Blend_Dither_PROC,
    S32A_D565_Opaque_Dither_PROC,
    NULL,   // S32A_D565_Blend_Dither
};

static const SkBlitRow::Proc platform_4444_procs[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};

static const SkBlitRow::Proc32 platform_32_procs[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_PROC,       // S32_Blend,
    S32A_Opaque_BlitRow32_PROC,     // S32A_Opaque,
    S32A_Blend_BlitRow32_PROC       // S32A_Blend
};

SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
    return platform_4444_procs[flags];
}

SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
    return platform_565_procs[flags];
}

SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return platform_32_procs[flags];
}
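
#if 0
/* Usage sketch (not compiled): callers build a flag bit-mask and use it
 * directly as the table index.  Per SkBlitRow.h, kGlobalAlpha_Flag is 0x01,
 * kSrcPixelAlpha_Flag is 0x02 and kDither_Flag is 0x04 (the exact values
 * are an assumption here; check the header).  So a dithered, per-pixel-
 * alpha, full-opacity blit resolves to entry 2 + 4 == 6, which is
 * S32A_D565_Opaque_Dither_PROC above; a NULL entry makes the caller fall
 * back to the portable implementation.
 */
SkBlitRow::Proc proc = SkBlitRow::PlatformProcs565(
        SkBlitRow::kSrcPixelAlpha_Flag | SkBlitRow::kDither_Flag);
#endif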

SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
    return NULL;
}

///////////////////////////////////////////////////////////////////////////////

SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
                                                     SkMask::Format maskFormat,
                                                     SkColor color) {
    return NULL;
}

SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
                                                 SkMask::Format maskFormat,
                                                 RowFlags flags) {
    return NULL;
}