// SkBlitRow_opts_arm_neon.cpp revision 90165c2269bc33ca3d6aaa73d528194daf48da4e
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
// De-interleaving load of 8 SkPMColors (32 bytes) via LD4.
// Fills vsrc.val[0..2] with the first three byte planes; the fourth plane is
// loaded by LD4 but deliberately not copied out (callers ignore alpha).
// Advances 'src' (passed by reference) past the 8 pixels consumed.
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

// Same as sk_vld4_u8_arm64_3 but also returns the fourth (alpha) plane in
// vsrc.val[3]. Advances 'src' by 8 pixels.
static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif

// Opaque 8888 -> 565 blit: converts 'count' SkPMColors to RGB565 and stores
// them to 'dst', 8 pixels per NEON iteration, scalar loop for the remainder.
// Requires alpha == 255 (source treated as fully opaque; dst is overwritten).
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    };

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    };
}

// 8888 -> 565 blit with a global (per-row) alpha: dst = dst + (src565 - dst) * scale,
// per channel, where scale = SkAlpha255To256(alpha). Requires alpha < 255.
void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src, int count,
                         U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        // Pin the de-interleaved planes to d0-d3 so vld4.8 can target them
        // directly; the alpha plane (d3) is loaded but intentionally unused.
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

#ifdef SK_CPU_ARM32
// Per-pixel-alpha 8888-over-565 blit (src-over), all-assembly ARM32 path.
// Requires alpha == 255 (no extra global alpha). Two asm bodies: a pipelined
// loop for count >= 8, and a masked partial load/store path for count < 8.
//
// NOTE(review): both asm blocks modify %[src]/%[dst] (post-increment
// addressing) and the first also writes %[tmp], yet these are bound as
// input-only "r" operands; they look like they should be "+r" outputs —
// confirm against upstream Skia before relying on this after a compiler
// upgrade.
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        int32_t tmp = 0;

        asm volatile (
            "ands       %[tmp], %[count], #7         \n\t"
            "vmov.u8    d31, #1<<7                   \n\t"
            "vld1.16    {q12}, [%[dst]]              \n\t"
            "vld4.8     {d0-d3}, [%[src]]            \n\t"
            // Thumb does not support the standard ARM conditional
            // instructions but instead requires the 'it' instruction
            // to signal conditional execution
            "it eq                                   \n\t"
            "moveq      %[tmp], #8                   \n\t"
            "mov        ip, %[dst]                   \n\t"

            "add        %[src], %[src], %[tmp], LSL#2   \n\t"
            "add        %[dst], %[dst], %[tmp], LSL#1   \n\t"
            "subs       %[count], %[count], %[tmp]   \n\t"
            "b          9f                           \n\t"
            // LOOP
            "2:                                      \n\t"

            "vld1.16    {q12}, [%[dst]]!             \n\t"
            "vld4.8     {d0-d3}, [%[src]]!           \n\t"
            "vst1.16    {q10}, [ip]                  \n\t"
            "sub        ip, %[dst], #8*2             \n\t"
            "subs       %[count], %[count], #8       \n\t"
            "9:                                      \n\t"
            "pld        [%[dst],#32]                 \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16  d4, q12                      \n\t"
            "vshr.u16   q11, q12, #5                 \n\t"
            "vshr.u16   q10, q12, #6+5               \n\t"
            "vmovn.u16  d5, q11                      \n\t"
            "vmovn.u16  d6, q10                      \n\t"
            "vshl.u8    d4, d4, #3                   \n\t"
            "vshl.u8    d5, d5, #2                   \n\t"
            "vshl.u8    d6, d6, #3                   \n\t"

            "vmovl.u8   q14, d31                     \n\t"
            "vmovl.u8   q13, d31                     \n\t"
            "vmovl.u8   q12, d31                     \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8     d30, d3                      \n\t"
            "vmlal.u8   q14, d30, d6                 \n\t"
            "vmlal.u8   q13, d30, d5                 \n\t"
            "vmlal.u8   q12, d30, d4                 \n\t"
            "vshr.u16   q8, q14, #5                  \n\t"
            "vshr.u16   q9, q13, #6                  \n\t"
            "vaddhn.u16 d6, q14, q8                  \n\t"
            "vshr.u16   q8, q12, #5                  \n\t"
            "vaddhn.u16 d5, q13, q9                  \n\t"
            "vaddhn.u16 d4, q12, q8                  \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

    #ifdef SK_PMCOLOR_IS_RGBA
            "vqadd.u8   d6, d6, d0                   \n\t"
            "vqadd.u8   d5, d5, d1                   \n\t"
            "vqadd.u8   d4, d4, d2                   \n\t"
    #else
            "vqadd.u8   d6, d6, d2                   \n\t"
            "vqadd.u8   d5, d5, d1                   \n\t"
            "vqadd.u8   d4, d4, d0                   \n\t"
    #endif

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8   q10, d6, #8                  \n\t"
            "vshll.u8   q3, d5, #8                   \n\t"
            "vshll.u8   q2, d4, #8                   \n\t"
            "vsri.u16   q10, q3, #5                  \n\t"
            "vsri.u16   q10, q2, #11                 \n\t"

            "bne        2b                           \n\t"

            "1:                                      \n\t"
            "vst1.16    {q10}, [ip]                  \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [src] "r" (src), [tmp] "r"(tmp)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
            "vmov.u8    d31, #1<<7                   \n\t"
            "mov        %[keep_dst], %[dst]          \n\t"

            "tst        %[count], #4                 \n\t"
            "beq        14f                          \n\t"
            "vld1.16    {d25}, [%[dst]]!             \n\t"
            "vld1.32    {q1}, [%[src]]!              \n\t"

            "14:                                     \n\t"
            "tst        %[count], #2                 \n\t"
            "beq        12f                          \n\t"
            "vld1.32    {d24[1]}, [%[dst]]!          \n\t"
            "vld1.32    {d1}, [%[src]]!              \n\t"

            "12:                                     \n\t"
            "tst        %[count], #1                 \n\t"
            "beq        11f                          \n\t"
            "vld1.16    {d24[1]}, [%[dst]]!          \n\t"
            "vld1.32    {d0[1]}, [%[src]]!           \n\t"

            "11:                                     \n\t"
            // unzips achieve the same as a vld4 operation
            "vuzp.u16   q0, q1                       \n\t"
            "vuzp.u8    d0, d1                       \n\t"
            "vuzp.u8    d2, d3                       \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16  d4, q12                      \n\t"
            "vshr.u16   q11, q12, #5                 \n\t"
            "vshr.u16   q10, q12, #6+5               \n\t"
            "vmovn.u16  d5, q11                      \n\t"
            "vmovn.u16  d6, q10                      \n\t"
            "vshl.u8    d4, d4, #3                   \n\t"
            "vshl.u8    d5, d5, #2                   \n\t"
            "vshl.u8    d6, d6, #3                   \n\t"

            "vmovl.u8   q14, d31                     \n\t"
            "vmovl.u8   q13, d31                     \n\t"
            "vmovl.u8   q12, d31                     \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8     d30, d3                      \n\t"
            "vmlal.u8   q14, d30, d6                 \n\t"
            "vmlal.u8   q13, d30, d5                 \n\t"
            "vmlal.u8   q12, d30, d4                 \n\t"
            "vshr.u16   q8, q14, #5                  \n\t"
            "vshr.u16   q9, q13, #6                  \n\t"
            "vaddhn.u16 d6, q14, q8                  \n\t"
            "vshr.u16   q8, q12, #5                  \n\t"
            "vaddhn.u16 d5, q13, q9                  \n\t"
            "vaddhn.u16 d4, q12, q8                  \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

    #ifdef SK_PMCOLOR_IS_RGBA
            "vqadd.u8   d6, d6, d0                   \n\t"
            "vqadd.u8   d5, d5, d1                   \n\t"
            "vqadd.u8   d4, d4, d2                   \n\t"
    #else
            "vqadd.u8   d6, d6, d2                   \n\t"
            "vqadd.u8   d5, d5, d1                   \n\t"
            "vqadd.u8   d4, d4, d0                   \n\t"
    #endif

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8   q10, d6, #8                  \n\t"
            "vshll.u8   q3, d5, #8                   \n\t"
            "vshll.u8   q2, d4, #8                   \n\t"
            "vsri.u16   q10, q3, #5                  \n\t"
            "vsri.u16   q10, q2, #11                 \n\t"

            // store
            "tst        %[count], #4                 \n\t"
            "beq        24f                          \n\t"
            "vst1.16    {d21}, [%[keep_dst]]!        \n\t"

            "24:                                     \n\t"
            "tst        %[count], #2                 \n\t"
            "beq        22f                          \n\t"
            "vst1.32    {d20[1]}, [%[keep_dst]]!     \n\t"

            "22:                                     \n\t"
            "tst        %[count], #1                 \n\t"
            "beq        21f                          \n\t"
            "vst1.16    {d20[1]}, [%[keep_dst]]!     \n\t"

            "21:                                     \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
}

#else // #ifdef SK_CPU_ARM32

// Per-pixel-alpha 8888-over-565 blit (src-over), A64 path: asm loop handles
// 16 pixels per iteration, scalar SkSrcOver32To16 handles the remainder.
// Requires alpha == 255.
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 16) {
        asm (
            "movi    v4.8h, #0x80                   \t\n"

            "1:                                     \t\n"
            "sub     %w[count], %w[count], #16      \t\n"
            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm    pldl1keep, [%[src],#512]       \t\n"
            "prfm    pldl1keep, [%[dst],#256]       \t\n"
            "ushr    v20.8h, v17.8h, #5             \t\n"
            "ushr    v31.8h, v16.8h, #5             \t\n"
            "xtn     v6.8b, v31.8h                  \t\n"
            "xtn2    v6.16b, v20.8h                 \t\n"
            "ushr    v20.8h, v17.8h, #11            \t\n"
            "shl     v19.16b, v6.16b, #2            \t\n"
            "ushr    v31.8h, v16.8h, #11            \t\n"
            "xtn     v22.8b, v31.8h                 \t\n"
            "xtn2    v22.16b, v20.8h                \t\n"
            "shl     v18.16b, v22.16b, #3           \t\n"
            "mvn     v3.16b, v3.16b                 \t\n"
            "xtn     v16.8b, v16.8h                 \t\n"
            "mov     v7.16b, v4.16b                 \t\n"
            "xtn2    v16.16b, v17.8h                \t\n"
            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
            "shl     v16.16b, v16.16b, #3           \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "ushr    v24.8h, v7.8h, #6              \t\n"
            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
            "ushr    v20.8h, v22.8h, #5             \t\n"
            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
            "cmp     %w[count], #16                 \t\n"
            "mov     v6.16b, v4.16b                 \t\n"
            "mov     v5.16b, v4.16b                 \t\n"
            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
            "mov     v17.16b, v4.16b                \t\n"
            "ushr    v19.8h, v6.8h, #5              \t\n"
            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
            "ushr    v18.8h, v5.8h, #6              \t\n"
            "ushr    v21.8h, v17.8h, #5             \t\n"
            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }
    // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

// Spread an SkPMColor's R/G/B into the "expanded 565" layout g:11 r:10 x:1 b:10
// used by SkExpand_rgb_16 / SkCompact_rgb_16 scaled blending.
static uint32_t pmcolor_to_expand16(SkPMColor c) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    return (g << 24) | (r << 13) | (b << 2);
}

// Blend a single (possibly translucent) color over a row of 565 pixels.
// Scalar preamble aligns dst to 8 bytes, NEON body does 16 pixels/iteration,
// scalar tail finishes the rest.
void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    uint32_t src_expand;
    unsigned scale;
    uint16x8_t vmask_blue;

    if (count <= 0) return;
    SkASSERT(((size_t)dst & 0x01) == 0);

    /*
     * This preamble code is in order to make dst aligned to 8 bytes
     * in the next multiple bytes read & write access.
     */
    src_expand = pmcolor_to_expand16(src);
    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

#define DST_ALIGN 8

    /*
     * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time.
     */
    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

    for (int i = 0; i < preamble_size; i+=2, dst++) {
        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
        if (--count == 0)
            break;
    }

    int count16 = 0;
    count16 = count >> 4;          // number of 16-pixel NEON iterations
    vmask_blue = vmovq_n_u16(SK_B16_MASK);

    if (count16) {
        uint16x8_t wide_sr;
        uint16x8_t wide_sg;
        uint16x8_t wide_sb;
        uint16x8_t wide_256_sa;

        unsigned sr = SkGetPackedR32(src);
        unsigned sg = SkGetPackedG32(src);
        unsigned sb = SkGetPackedB32(src);
        unsigned sa = SkGetPackedA32(src);

        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
        // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
        // thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift

        // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
        // thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift

        // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
        // thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift

        wide_256_sa =
            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3

        while (count16-- > 0) {
            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
            vdst1 = vld1q_u16(dst);
            dst += 8;
            vdst2 = vld1q_u16(dst);
            dst -= 8;    // to store dst again.

            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
            vdst1_b = vdst1 & vmask_blue;                              // extract blue
            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
            vdst2_b = vdst2 & vmask_blue;                              // extract blue
            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1

            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2

            vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
            vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
            vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue

            vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
            vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
            vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue

            vst1q_u16(dst, vdst1);
            dst += 8;
            vst1q_u16(dst, vdst2);
            dst += 8;
        }
    }

    count &= 0xF;    // scalar tail: pixels not covered by the 16-wide loop
    if (count > 0) {
        do {
            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
            dst += 1;
        } while (--count != 0);
    }
}

// Vector form of SkDiv255Round: (prod + 128 + ((prod + 128) >> 8)) >> 8,
// i.e. rounding division by 255 on eight u16 lanes.
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

// Per-pixel-alpha 8888-over-565 blit additionally scaled by a global alpha.
// Requires alpha < 255.
void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            // GCC > 4.6 understands the %h[] modifier for a whole uint8x8x4_t
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            // Older GCC: pin planes to d0-d3 and copy out by hand.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif


            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);                 // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                              // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);                // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
            unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
            unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};

// Dithered 8888 -> 565 blit with global alpha < 255: adds the ordered-dither
// offset per pixel before converting to 565 and alpha-blending with dst.
void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc.
        // (cont.) green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {

            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            // Pin planes to d0-d3 for vld4.8; alpha plane (d3) loaded but unused.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]!  "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
// 8888-over-8888 blit with global alpha: lerp of src and dst by
// src_scale/dst_scale, two pixels per NEON iteration plus a one-pixel tail.
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
#else
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);
#endif

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load a single pixel into lane 0 of each vector
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
#else
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);
#endif

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
// 8888-over-8888 blit with per-pixel alpha scaled by a global alpha < 255.
// Odd pixel handled first, then two pixels per NEON iteration.
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 > alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
#else
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);
#endif

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        // Replicates each pixel's alpha byte (offset 3) across its 4 lanes.
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
#else
            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
            // A 16-bit lane would overflow if we used 0xFFFF here,
            // so use an approximation with 0xFF00 that is off by 1,
            // and add back 1 after to get the correct value.
            // This is valid if alpha256 <= 255.
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
#endif

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
#ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
#else
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);
#endif

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#endif // #ifdef SK_CPU_ARM32

// Dithered per-pixel-alpha 8888-over-565 blit. The dither value is scaled by
// each pixel's alpha before being applied. Requires alpha == 255 (no global
// alpha on top of the per-pixel one).
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {

        uint8x8_t dbase;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8x4_t vsrc;
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
            {
            // Pin planes to d0-d3 for vld4.8; alpha (d3) IS used here.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm ("vld4.8    {d0-d3},[%[src]]!  "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
            }
#endif
            sa = vsrc.val[NEON_A];
            sr = vsrc.val[NEON_R];
            sg = vsrc.val[NEON_G];
            sb = vsrc.val[NEON_B];

            /* calculate 'd', which will be 0..7
             * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
             */
            alpha8 = vmovl_u8(dbase);
            alpha8 = vmlal_u8(alpha8, sa, dbase);
            d = vshrn_n_u16(alpha8, 8);    // narrowing too

            // sr = sr - (sr>>5) + d
            /* watching for 8-bit overflow. d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d)
             */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            // need to pick up 8 dst's -- at 16 bits each, 128 bits
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
            dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
            dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

            // blend
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);

            // combine the addq and mul, save 3 insns
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

            // repack to store
            dst8 = vshrq_n_u16(dst_b, 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

            dst += UNROLL;
            count -= UNROLL;
            // skip x += UNROLL, since it's
unchanged mod-4 1158 } while (count >= UNROLL); 1159 } 1160#undef UNROLL 1161 1162 // residuals 1163 if (count > 0) { 1164 DITHER_565_SCAN(y); 1165 do { 1166 SkPMColor c = *src++; 1167 SkPMColorAssert(c); 1168 if (c) { 1169 unsigned a = SkGetPackedA32(c); 1170 1171 // dither and alpha are just temporary variables to work-around 1172 // an ICE in debug. 1173 unsigned dither = DITHER_VALUE(x); 1174 unsigned alpha = SkAlpha255To256(a); 1175 int d = SkAlphaMul(dither, alpha); 1176 1177 unsigned sr = SkGetPackedR32(c); 1178 unsigned sg = SkGetPackedG32(c); 1179 unsigned sb = SkGetPackedB32(c); 1180 sr = SkDITHER_R32_FOR_565(sr, d); 1181 sg = SkDITHER_G32_FOR_565(sg, d); 1182 sb = SkDITHER_B32_FOR_565(sb, d); 1183 1184 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1185 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1186 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1187 // now src and dst expanded are in g:11 r:10 x:1 b:10 1188 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1189 } 1190 dst += 1; 1191 DITHER_INC_X(x); 1192 } while (--count != 0); 1193 } 1194} 1195 1196/////////////////////////////////////////////////////////////////////////////// 1197 1198void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 1199 const SkPMColor* SK_RESTRICT src, 1200 int count, U8CPU alpha, int x, int y) { 1201 SkASSERT(255 == alpha); 1202 1203#define UNROLL 8 1204 if (count >= UNROLL) { 1205 uint8x8_t d; 1206 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1207 d = vld1_u8(dstart); 1208 1209 while (count >= UNROLL) { 1210 uint8x8_t sr, sg, sb; 1211 uint16x8_t dr, dg, db; 1212 uint16x8_t dst8; 1213 uint8x8x4_t vsrc; 1214 1215#ifdef SK_CPU_ARM64 1216 vsrc = sk_vld4_u8_arm64_3(src); 1217#else 1218 { 1219 register uint8x8_t d0 asm("d0"); 1220 register uint8x8_t d1 asm("d1"); 1221 register uint8x8_t d2 asm("d2"); 1222 register uint8x8_t d3 asm("d3"); 1223 1224 asm ( 1225 "vld4.8 {d0-d3},[%[src]]! 
" 1226 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 1227 : 1228 ); 1229 vsrc.val[0] = d0; 1230 vsrc.val[1] = d1; 1231 vsrc.val[2] = d2; 1232 } 1233#endif 1234 sr = vsrc.val[NEON_R]; 1235 sg = vsrc.val[NEON_G]; 1236 sb = vsrc.val[NEON_B]; 1237 1238 /* XXX: if we want to prefetch, hide it in the above asm() 1239 * using the gcc __builtin_prefetch(), the prefetch will 1240 * fall to the bottom of the loop -- it won't stick up 1241 * at the top of the loop, just after the vld4. 1242 */ 1243 1244 // sr = sr - (sr>>5) + d 1245 sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1246 dr = vaddl_u8(sr, d); 1247 1248 // sb = sb - (sb>>5) + d 1249 sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1250 db = vaddl_u8(sb, d); 1251 1252 // sg = sg - (sg>>6) + d>>1; similar logic for overflows 1253 sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1254 dg = vaddl_u8(sg, vshr_n_u8(d, 1)); 1255 1256 // pack high bits of each into 565 format (rgb, b is lsb) 1257 dst8 = vshrq_n_u16(db, 3); 1258 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 1259 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); 1260 1261 // store it 1262 vst1q_u16(dst, dst8); 1263 1264 dst += UNROLL; 1265 // we don't need to increment src as the asm above has already done it 1266 count -= UNROLL; 1267 x += UNROLL; // probably superfluous 1268 } 1269 } 1270#undef UNROLL 1271 1272 // residuals 1273 if (count > 0) { 1274 DITHER_565_SCAN(y); 1275 do { 1276 SkPMColor c = *src++; 1277 SkPMColorAssert(c); 1278 SkASSERT(SkGetPackedA32(c) == 255); 1279 1280 unsigned dither = DITHER_VALUE(x); 1281 *dst++ = SkDitherRGB32To565(c, dither); 1282 DITHER_INC_X(x); 1283 } while (--count != 0); 1284 } 1285} 1286 1287/////////////////////////////////////////////////////////////////////////////// 1288 1289const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = { 1290 // no dither 1291 S32_D565_Opaque_neon, 1292 S32_D565_Blend_neon, 1293 S32A_D565_Opaque_neon, 1294#if 0 1295 S32A_D565_Blend_neon, 1296#else 1297 nullptr, // 
https://code.google.com/p/skia/issues/detail?id=2797 1298#endif 1299 1300 // dither 1301 S32_D565_Opaque_Dither_neon, 1302 S32_D565_Blend_Dither_neon, 1303 S32A_D565_Opaque_Dither_neon, 1304 nullptr, // S32A_D565_Blend_Dither 1305}; 1306 1307const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { 1308 Color32A_D565_neon, // Color32_D565, 1309 Color32A_D565_neon, // Color32A_D565, 1310 Color32A_D565_neon, // Color32_D565_Dither, 1311 Color32A_D565_neon, // Color32A_D565_Dither 1312}; 1313 1314const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1315 nullptr, // S32_Opaque, 1316 S32_Blend_BlitRow32_neon, // S32_Blend, 1317 nullptr, // Ported to SkOpts 1318#ifdef SK_CPU_ARM32 1319 S32A_Blend_BlitRow32_neon // S32A_Blend 1320#else 1321 nullptr 1322#endif 1323}; 1324