1#include "SkXfermode.h" 2#include "SkXfermode_proccoeff.h" 3#include "SkColorPriv.h" 4 5#include <arm_neon.h> 6#include "SkColor_opts_neon.h" 7#include "SkXfermode_opts_arm_neon.h" 8 9#define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) 10 11 12//////////////////////////////////////////////////////////////////////////////// 13// NEONized skia functions 14//////////////////////////////////////////////////////////////////////////////// 15 16static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) { 17 uint16x8_t tmp; 18 uint8x8_t ret; 19 20 tmp = vmull_u8(color, alpha); 21 tmp = vaddq_u16(tmp, vdupq_n_u16(128)); 22 tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8)); 23 24 ret = vshrn_n_u16(tmp, 8); 25 26 return ret; 27} 28 29static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) { 30 uint16x8_t ret; 31 32 ret = vmull_u8(color, alpha); 33 ret = vaddq_u16(ret, vdupq_n_u16(128)); 34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8)); 35 36 ret = vshrq_n_u16(ret, 8); 37 38 return ret; 39} 40 41static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) { 42 uint16x8_t tmp; 43 44#ifdef SK_CPU_ARM64 45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)), 46 vreinterpretq_u32_s32(p2)); 47#else 48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)), 49 vmovn_u32(vreinterpretq_u32_s32(p2))); 50#endif 51 52 tmp += vdupq_n_u16(128); 53 tmp += vshrq_n_u16(tmp, 8); 54 55 return vshrn_n_u16(tmp, 8); 56} 57 58static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) { 59 prod += vdupq_n_u16(128); 60 prod += vshrq_n_u16(prod, 8); 61 62 return vshrq_n_u16(prod, 8); 63} 64 65static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) { 66 uint8x8_t ret; 67 uint32x4_t cmp1, cmp2; 68 uint16x8_t cmp16; 69 uint8x8_t cmp8, cmp8_1; 70 71 // Test if <= 0 72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0)); 73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0)); 74#ifdef SK_CPU_ARM64 75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); 76#else 77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); 78#endif 79 cmp8_1 = vmovn_u16(cmp16); 80 81 // Init to zero 82 ret = vdup_n_u8(0); 83 84 // Test if >= 255*255 85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255)); 86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255)); 87#ifdef SK_CPU_ARM64 88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2); 89#else 90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2)); 91#endif 92 cmp8 = vmovn_u16(cmp16); 93 94 // Insert 255 where true 95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret); 96 97 // Calc SkDiv255Round 98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2); 99 100 // Insert where false and previous test false 101 cmp8 = cmp8 | cmp8_1; 102 ret = vbsl_u8(cmp8, ret, div); 103 104 // Return the final combination 105 return ret; 106} 107 108//////////////////////////////////////////////////////////////////////////////// 109// 1 pixel modeprocs 110//////////////////////////////////////////////////////////////////////////////// 111 112// kSrcATop_Mode, //!< [Da, Sc * Da + (1 - Sa) * Dc] 113SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) { 114 unsigned sa = SkGetPackedA32(src); 115 unsigned da = SkGetPackedA32(dst); 116 unsigned isa = 255 - sa; 117 118 uint8x8_t vda, visa, vsrc, vdst; 119 120 vda = vdup_n_u8(da); 121 visa = vdup_n_u8(isa); 122 123 uint16x8_t vsrc_wide, vdst_wide; 124 vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src))); 125 vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst))); 126 127 vsrc_wide += vdupq_n_u16(128); 128 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); 129 130 vdst_wide += vdupq_n_u16(128); 131 vdst_wide += vshrq_n_u16(vdst_wide, 8); 132 133 vsrc = vshrn_n_u16(vsrc_wide, 8); 134 vdst = vshrn_n_u16(vdst_wide, 8); 135 136 vsrc += vdst; 137 vsrc = vset_lane_u8(da, vsrc, 3); 138 139 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); 140} 141 142// kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)] 143SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) { 144 unsigned sa = SkGetPackedA32(src); 145 unsigned da = SkGetPackedA32(dst); 146 unsigned ida = 255 - da; 147 148 uint8x8_t vsa, vida, vsrc, vdst; 149 150 vsa = vdup_n_u8(sa); 151 vida = vdup_n_u8(ida); 152 153 uint16x8_t vsrc_wide, vdst_wide; 154 vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src))); 155 vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst))); 156 157 vsrc_wide += vdupq_n_u16(128); 158 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); 159 160 vdst_wide += vdupq_n_u16(128); 161 vdst_wide += vshrq_n_u16(vdst_wide, 8); 162 163 vsrc = vshrn_n_u16(vsrc_wide, 8); 164 vdst = vshrn_n_u16(vdst_wide, 8); 165 166 vsrc += vdst; 167 vsrc = vset_lane_u8(sa, vsrc, 3); 168 169 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); 170} 171 172// kXor_Mode [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc] 173SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) { 174 unsigned sa = SkGetPackedA32(src); 175 unsigned da = SkGetPackedA32(dst); 176 unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1); 177 unsigned isa = 255 - sa; 178 unsigned ida = 255 - da; 179 180 uint8x8_t vsrc, vdst, visa, vida; 181 uint16x8_t vsrc_wide, vdst_wide; 182 183 visa = vdup_n_u8(isa); 184 vida = vdup_n_u8(ida); 185 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); 186 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); 187 188 vsrc_wide = vmull_u8(vsrc, vida); 189 vdst_wide = vmull_u8(vdst, visa); 190 191 vsrc_wide += vdupq_n_u16(128); 192 vsrc_wide += vshrq_n_u16(vsrc_wide, 8); 193 194 vdst_wide += vdupq_n_u16(128); 195 vdst_wide += vshrq_n_u16(vdst_wide, 8); 196 197 vsrc = vshrn_n_u16(vsrc_wide, 8); 198 vdst = vshrn_n_u16(vdst_wide, 8); 199 200 vsrc += vdst; 201 202 vsrc = vset_lane_u8(ret_alpha, vsrc, 3); 203 204 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); 205} 206 207// kPlus_Mode 208SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) { 209 uint8x8_t vsrc, vdst; 210 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); 211 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); 212 vsrc = vqadd_u8(vsrc, vdst); 213 214 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0); 215} 216 217// kModulate_Mode 218SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) { 219 uint8x8_t vsrc, vdst, vres; 220 uint16x8_t vres_wide; 221 222 vsrc = vreinterpret_u8_u32(vdup_n_u32(src)); 223 vdst = vreinterpret_u8_u32(vdup_n_u32(dst)); 224 225 vres_wide = vmull_u8(vsrc, vdst); 226 227 vres_wide += vdupq_n_u16(128); 228 vres_wide += vshrq_n_u16(vres_wide, 8); 229 230 vres = vshrn_n_u16(vres_wide, 8); 231 232 return vget_lane_u32(vreinterpret_u32_u8(vres), 0); 233} 234 235//////////////////////////////////////////////////////////////////////////////// 236// 8 pixels modeprocs 237//////////////////////////////////////////////////////////////////////////////// 238 239uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 240 uint8x8x4_t ret; 241 uint16x8_t src_scale; 242 243 src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); 244 245 ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale); 246 ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale); 247 ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale); 248 ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale); 249 250 return ret; 251} 252 253uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 254 uint8x8x4_t ret; 255 uint16x8_t scale; 256 257 scale = SkAlpha255To256_neon8(dst.val[NEON_A]); 258 259 ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale); 260 ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale); 261 ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale); 262 ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale); 263 264 return ret; 265} 266 267uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 268 uint8x8x4_t ret; 269 uint16x8_t scale; 270 271 scale = SkAlpha255To256_neon8(src.val[NEON_A]); 272 273 ret = SkAlphaMulQ_neon8(dst, scale); 274 275 return ret; 276} 277 278uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 279 uint8x8x4_t ret; 280 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]); 281 282 ret = SkAlphaMulQ_neon8(src, scale); 283 284 return ret; 285} 286 287uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 288 uint8x8x4_t ret; 289 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]); 290 291 ret = SkAlphaMulQ_neon8(dst, scale); 292 293 return ret; 294} 295 296uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 297 uint8x8x4_t ret; 298 uint8x8_t isa; 299 300 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); 301 302 ret.val[NEON_A] = dst.val[NEON_A]; 303 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A]) 304 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); 305 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A]) 306 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); 307 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A]) 308 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); 309 310 return ret; 311} 312 313uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 314 uint8x8x4_t ret; 315 uint8x8_t ida; 316 317 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); 318 319 ret.val[NEON_A] = src.val[NEON_A]; 320 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) 321 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]); 322 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) 323 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]); 324 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) 325 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]); 326 327 return ret; 328} 329 330uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 331 uint8x8x4_t ret; 332 uint8x8_t isa, ida; 333 uint16x8_t tmp_wide, tmp_wide2; 334 335 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]); 336 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]); 337 338 // First calc alpha 339 tmp_wide = vmovl_u8(src.val[NEON_A]); 340 tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]); 341 tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1); 342 tmp_wide = vsubq_u16(tmp_wide, tmp_wide2); 343 ret.val[NEON_A] = vmovn_u16(tmp_wide); 344 345 // Then colors 346 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida) 347 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa); 348 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida) 349 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa); 350 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida) 351 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa); 352 353 return ret; 354} 355 356uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 357 uint8x8x4_t ret; 358 359 ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]); 360 ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]); 361 ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]); 362 ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]); 363 364 return ret; 365} 366 367uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 368 uint8x8x4_t ret; 369 370 ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]); 371 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]); 372 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]); 373 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]); 374 375 return ret; 376} 377 378static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) { 379 uint16x8_t tmp; 380 381 tmp = vaddl_u8(a, b); 382 tmp -= SkAlphaMulAlpha_neon8_16(a, b); 383 384 return vmovn_u16(tmp); 385} 386 387uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 388 uint8x8x4_t ret; 389 390 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 391 ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]); 392 ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]); 393 ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]); 394 395 return ret; 396} 397 398template <bool overlay> 399static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc, 400 uint8x8_t sa, uint8x8_t da) { 401 /* 402 * In the end we're gonna use (rc + tmp) with a different rc 403 * coming from an alternative. 404 * The whole value (rc + tmp) can always be expressed as 405 * VAL = COM - SUB in the if case 406 * VAL = COM + SUB - sa*da in the else case 407 * 408 * with COM = 255 * (sc + dc) 409 * and SUB = sc*da + dc*sa - 2*dc*sc 410 */ 411 412 // Prepare common subexpressions 413 uint16x8_t const255 = vdupq_n_u16(255); 414 uint16x8_t sc_plus_dc = vaddl_u8(sc, dc); 415 uint16x8_t scda = vmull_u8(sc, da); 416 uint16x8_t dcsa = vmull_u8(dc, sa); 417 uint16x8_t sada = vmull_u8(sa, da); 418 419 // Prepare non common subexpressions 420 uint16x8_t dc2, sc2; 421 uint32x4_t scdc2_1, scdc2_2; 422 if (overlay) { 423 dc2 = vshll_n_u8(dc, 1); 424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc))); 425#ifdef SK_CPU_ARM64 426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc)); 427#else 428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc))); 429#endif 430 } else { 431 sc2 = vshll_n_u8(sc, 1); 432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc))); 433#ifdef SK_CPU_ARM64 434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc)); 435#else 436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc))); 437#endif 438 } 439 440 // Calc COM 441 int32x4_t com1, com2; 442 com1 = vreinterpretq_s32_u32( 443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); 444 com2 = vreinterpretq_s32_u32( 445#ifdef SK_CPU_ARM64 446 vmull_high_u16(const255, sc_plus_dc)); 447#else 448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); 449#endif 450 451 // Calc SUB 452 int32x4_t sub1, sub2; 453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa))); 454#ifdef SK_CPU_ARM64 455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa)); 456#else 457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa))); 458#endif 459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1)); 460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2)); 461 462 // Compare 2*dc <= da 463 uint16x8_t cmp; 464 465 if (overlay) { 466 cmp = vcleq_u16(dc2, vmovl_u8(da)); 467 } else { 468 cmp = vcleq_u16(sc2, vmovl_u8(sa)); 469 } 470 471 // Prepare variables 472 int32x4_t val1_1, val1_2; 473 int32x4_t val2_1, val2_2; 474 uint32x4_t cmp1, cmp2; 475 476 // Doing a signed lengthening allows to save a few instructions 477 // thanks to sign extension. 478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp)))); 479#ifdef SK_CPU_ARM64 480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp))); 481#else 482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp)))); 483#endif 484 485 // Calc COM - SUB 486 val1_1 = com1 - sub1; 487 val1_2 = com2 - sub2; 488 489 // Calc COM + SUB - sa*da 490 val2_1 = com1 + sub1; 491 val2_2 = com2 + sub2; 492 493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada)))); 494#ifdef SK_CPU_ARM64 495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada))); 496#else 497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada)))); 498#endif 499 500 // Insert where needed 501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1); 502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2); 503 504 // Call the clamp_div255round function 505 return clamp_div255round_simd8_32(val1_1, val1_2); 506} 507 508static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc, 509 uint8x8_t sa, uint8x8_t da) { 510 return overlay_hardlight_color<true>(sc, dc, sa, da); 511} 512 513uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 514 uint8x8x4_t ret; 515 516 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 517 ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R], 518 src.val[NEON_A], dst.val[NEON_A]); 519 ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G], 520 src.val[NEON_A], dst.val[NEON_A]); 521 ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B], 522 src.val[NEON_A], dst.val[NEON_A]); 523 524 return ret; 525} 526 527template <bool lighten> 528static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc, 529 uint8x8_t sa, uint8x8_t da) { 530 uint16x8_t sd, ds, cmp, tmp, tmp2; 531 532 // Prepare 533 sd = vmull_u8(sc, da); 534 ds = vmull_u8(dc, sa); 535 536 // Do test 537 if (lighten) { 538 cmp = vcgtq_u16(sd, ds); 539 } else { 540 cmp = vcltq_u16(sd, ds); 541 } 542 543 // Assign if 544 tmp = vaddl_u8(sc, dc); 545 tmp2 = tmp; 546 tmp -= SkDiv255Round_neon8_16_16(ds); 547 548 // Calc else 549 tmp2 -= SkDiv255Round_neon8_16_16(sd); 550 551 // Insert where needed 552 tmp = vbslq_u16(cmp, tmp, tmp2); 553 554 return vmovn_u16(tmp); 555} 556 557static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc, 558 uint8x8_t sa, uint8x8_t da) { 559 return lighten_darken_color<false>(sc, dc, sa, da); 560} 561 562uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 563 uint8x8x4_t ret; 564 565 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 566 ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R], 567 src.val[NEON_A], dst.val[NEON_A]); 568 ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G], 569 src.val[NEON_A], dst.val[NEON_A]); 570 ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B], 571 src.val[NEON_A], dst.val[NEON_A]); 572 573 return ret; 574} 575 576static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc, 577 uint8x8_t sa, uint8x8_t da) { 578 return lighten_darken_color<true>(sc, dc, sa, da); 579} 580 581uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 582 uint8x8x4_t ret; 583 584 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 585 ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R], 586 src.val[NEON_A], dst.val[NEON_A]); 587 ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G], 588 src.val[NEON_A], dst.val[NEON_A]); 589 ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B], 590 src.val[NEON_A], dst.val[NEON_A]); 591 592 return ret; 593} 594 595static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc, 596 uint8x8_t sa, uint8x8_t da) { 597 return overlay_hardlight_color<false>(sc, dc, sa, da); 598} 599 600uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 601 uint8x8x4_t ret; 602 603 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 604 ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R], 605 src.val[NEON_A], dst.val[NEON_A]); 606 ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G], 607 src.val[NEON_A], dst.val[NEON_A]); 608 ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B], 609 src.val[NEON_A], dst.val[NEON_A]); 610 611 return ret; 612} 613 614static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc, 615 uint8x8_t sa, uint8x8_t da) { 616 uint16x8_t sd, ds, tmp; 617 int16x8_t val; 618 619 sd = vmull_u8(sc, da); 620 ds = vmull_u8(dc, sa); 621 622 tmp = vminq_u16(sd, ds); 623 tmp = SkDiv255Round_neon8_16_16(tmp); 624 tmp = vshlq_n_u16(tmp, 1); 625 626 val = vreinterpretq_s16_u16(vaddl_u8(sc, dc)); 627 628 val -= vreinterpretq_s16_u16(tmp); 629 630 val = vmaxq_s16(val, vdupq_n_s16(0)); 631 val = vminq_s16(val, vdupq_n_s16(255)); 632 633 return vmovn_u16(vreinterpretq_u16_s16(val)); 634} 635 636uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 637 uint8x8x4_t ret; 638 639 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 640 ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R], 641 src.val[NEON_A], dst.val[NEON_A]); 642 ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G], 643 src.val[NEON_A], dst.val[NEON_A]); 644 ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B], 645 src.val[NEON_A], dst.val[NEON_A]); 646 647 return ret; 648} 649 650static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc, 651 uint8x8_t sa, uint8x8_t da) { 652 /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */ 653 654 uint16x8_t sc_plus_dc, scdc, const255; 655 int32x4_t term1_1, term1_2, term2_1, term2_2; 656 657 /* Calc (sc + dc) and (sc * dc) */ 658 sc_plus_dc = vaddl_u8(sc, dc); 659 scdc = vmull_u8(sc, dc); 660 661 /* Prepare constants */ 662 const255 = vdupq_n_u16(255); 663 664 /* Calc the first term */ 665 term1_1 = vreinterpretq_s32_u32( 666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc))); 667 term1_2 = vreinterpretq_s32_u32( 668#ifdef SK_CPU_ARM64 669 vmull_high_u16(const255, sc_plus_dc)); 670#else 671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc))); 672#endif 673 674 /* Calc the second term */ 675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1)); 676#ifdef SK_CPU_ARM64 677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1)); 678#else 679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1)); 680#endif 681 682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2); 683} 684 685uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 686 uint8x8x4_t ret; 687 688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R], 690 src.val[NEON_A], dst.val[NEON_A]); 691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G], 692 src.val[NEON_A], dst.val[NEON_A]); 693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B], 694 src.val[NEON_A], dst.val[NEON_A]); 695 696 return ret; 697} 698 699static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc, 700 uint8x8_t sa, uint8x8_t da) { 701 uint32x4_t val1, val2; 702 uint16x8_t scdc, t1, t2; 703 704 t1 = vmull_u8(sc, vdup_n_u8(255) - da); 705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa); 706 scdc = vmull_u8(sc, dc); 707 708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2)); 709#ifdef SK_CPU_ARM64 710 val2 = vaddl_high_u16(t1, t2); 711#else 712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2)); 713#endif 714 715 val1 = vaddw_u16(val1, vget_low_u16(scdc)); 716#ifdef SK_CPU_ARM64 717 val2 = vaddw_high_u16(val2, scdc); 718#else 719 val2 = vaddw_u16(val2, vget_high_u16(scdc)); 720#endif 721 722 return clamp_div255round_simd8_32( 723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2)); 724} 725 726uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) { 727 uint8x8x4_t ret; 728 729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]); 730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R], 731 src.val[NEON_A], dst.val[NEON_A]); 732 ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G], 733 src.val[NEON_A], dst.val[NEON_A]); 734 ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B], 735 src.val[NEON_A], dst.val[NEON_A]); 736 737 return ret; 738} 739 740//////////////////////////////////////////////////////////////////////////////// 741 742typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst); 743 744extern SkXfermodeProcSIMD gNEONXfermodeProcs[]; 745 746SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer) 747 : INHERITED(buffer) { 748 fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]); 749} 750 751void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[], 752 int count, const SkAlpha aa[]) const { 753 SkASSERT(dst && src && count >= 0); 754 755 SkXfermodeProc proc = this->getProc(); 756 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); 757 SkASSERT(procSIMD != NULL); 758 759 if (NULL == aa) { 760 // Unrolled NEON code 761 while (count >= 8) { 762 uint8x8x4_t vsrc, vdst, vres; 763 764#ifdef SK_CPU_ARM64 765 vsrc = vld4_u8((uint8_t*)src); 766 vdst = vld4_u8((uint8_t*)dst); 767#else 768#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 769 asm volatile ( 770 "vld4.u8 %h[vsrc], [%[src]]! \t\n" 771 "vld4.u8 %h[vdst], [%[dst]] \t\n" 772 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src) 773 : [dst] "r" (dst) 774 : 775 ); 776#else 777 register uint8x8_t d0 asm("d0"); 778 register uint8x8_t d1 asm("d1"); 779 register uint8x8_t d2 asm("d2"); 780 register uint8x8_t d3 asm("d3"); 781 register uint8x8_t d4 asm("d4"); 782 register uint8x8_t d5 asm("d5"); 783 register uint8x8_t d6 asm("d6"); 784 register uint8x8_t d7 asm("d7"); 785 786 asm volatile ( 787 "vld4.u8 {d0-d3},[%[src]]!;" 788 "vld4.u8 {d4-d7},[%[dst]];" 789 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), 790 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7), 791 [src] "+&r" (src) 792 : [dst] "r" (dst) 793 : 794 ); 795 vsrc.val[0] = d0; vdst.val[0] = d4; 796 vsrc.val[1] = d1; vdst.val[1] = d5; 797 vsrc.val[2] = d2; vdst.val[2] = d6; 798 vsrc.val[3] = d3; vdst.val[3] = d7; 799#endif 800#endif // #ifdef SK_CPU_ARM64 801 802 vres = procSIMD(vsrc, vdst); 803 804 vst4_u8((uint8_t*)dst, vres); 805 806 count -= 8; 807 dst += 8; 808#ifdef SK_CPU_ARM64 809 src += 8; 810#endif 811 } 812 // Leftovers 813 for (int i = 0; i < count; i++) { 814 dst[i] = proc(src[i], dst[i]); 815 } 816 } else { 817 for (int i = count - 1; i >= 0; --i) { 818 unsigned a = aa[i]; 819 if (0 != a) { 820 SkPMColor dstC = dst[i]; 821 SkPMColor C = proc(src[i], dstC); 822 if (a != 0xFF) { 823 C = SkFourByteInterp_neon(C, dstC, a); 824 } 825 dst[i] = C; 826 } 827 } 828 } 829} 830 831void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst, 832 const SkPMColor* SK_RESTRICT src, int count, 833 const SkAlpha* SK_RESTRICT aa) const { 834 SkASSERT(dst && src && count >= 0); 835 836 SkXfermodeProc proc = this->getProc(); 837 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD); 838 SkASSERT(procSIMD != NULL); 839 840 if (NULL == aa) { 841 while(count >= 8) { 842 uint16x8_t vdst, vres16; 843 uint8x8x4_t vdst32, vsrc, vres; 844 845 vdst = vld1q_u16(dst); 846 847#ifdef SK_CPU_ARM64 848 vsrc = vld4_u8((uint8_t*)src); 849#else 850#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 851 asm volatile ( 852 "vld4.u8 %h[vsrc], [%[src]]! \t\n" 853 : [vsrc] "=w" (vsrc), [src] "+&r" (src) 854 : : 855 ); 856#else 857 register uint8x8_t d0 asm("d0"); 858 register uint8x8_t d1 asm("d1"); 859 register uint8x8_t d2 asm("d2"); 860 register uint8x8_t d3 asm("d3"); 861 862 asm volatile ( 863 "vld4.u8 {d0-d3},[%[src]]!;" 864 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), 865 [src] "+&r" (src) 866 : : 867 ); 868 vsrc.val[0] = d0; 869 vsrc.val[1] = d1; 870 vsrc.val[2] = d2; 871 vsrc.val[3] = d3; 872#endif 873#endif // #ifdef SK_CPU_ARM64 874 875 vdst32 = SkPixel16ToPixel32_neon8(vdst); 876 vres = procSIMD(vsrc, vdst32); 877 vres16 = SkPixel32ToPixel16_neon8(vres); 878 879 vst1q_u16(dst, vres16); 880 881 count -= 8; 882 dst += 8; 883#ifdef SK_CPU_ARM64 884 src += 8; 885#endif 886 } 887 for (int i = 0; i < count; i++) { 888 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); 889 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); 890 } 891 } else { 892 for (int i = count - 1; i >= 0; --i) { 893 unsigned a = aa[i]; 894 if (0 != a) { 895 SkPMColor dstC = SkPixel16ToPixel32(dst[i]); 896 SkPMColor C = proc(src[i], dstC); 897 if (0xFF != a) { 898 C = SkFourByteInterp_neon(C, dstC, a); 899 } 900 dst[i] = SkPixel32ToPixel16_ToU16(C); 901 } 902 } 903 } 904} 905 906#ifndef SK_IGNORE_TO_STRING 907void SkNEONProcCoeffXfermode::toString(SkString* str) const { 908 this->INHERITED::toString(str); 909} 910#endif 911 912//////////////////////////////////////////////////////////////////////////////// 913 914SkXfermodeProcSIMD gNEONXfermodeProcs[] = { 915 NULL, // kClear_Mode 916 NULL, // kSrc_Mode 917 NULL, // kDst_Mode 918 NULL, // kSrcOver_Mode 919 dstover_modeproc_neon8, 920 srcin_modeproc_neon8, 921 dstin_modeproc_neon8, 922 srcout_modeproc_neon8, 923 dstout_modeproc_neon8, 924 srcatop_modeproc_neon8, 925 dstatop_modeproc_neon8, 926 xor_modeproc_neon8, 927 plus_modeproc_neon8, 928 modulate_modeproc_neon8, 929 screen_modeproc_neon8, 930 931 overlay_modeproc_neon8, 932 darken_modeproc_neon8, 933 lighten_modeproc_neon8, 934 NULL, // kColorDodge_Mode 935 NULL, // kColorBurn_Mode 936 hardlight_modeproc_neon8, 937 NULL, // kSoftLight_Mode 938 difference_modeproc_neon8, 939 exclusion_modeproc_neon8, 940 multiply_modeproc_neon8, 941 942 NULL, // kHue_Mode 943 NULL, // kSaturation_Mode 944 NULL, // kColor_Mode 945 NULL, // kLuminosity_Mode 946}; 947 948SK_COMPILE_ASSERT( 949 SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1, 950 mode_count_arm 951); 952 953SkXfermodeProc gNEONXfermodeProcs1[] = { 954 NULL, // kClear_Mode 955 NULL, // kSrc_Mode 956 NULL, // kDst_Mode 957 NULL, // kSrcOver_Mode 958 NULL, // kDstOver_Mode 959 NULL, // kSrcIn_Mode 960 NULL, // kDstIn_Mode 961 NULL, // kSrcOut_Mode 962 NULL, // kDstOut_Mode 963 srcatop_modeproc_neon, 964 dstatop_modeproc_neon, 965 xor_modeproc_neon, 966 plus_modeproc_neon, 967 modulate_modeproc_neon, 968 NULL, // kScreen_Mode 969 970 NULL, // kOverlay_Mode 971 NULL, // kDarken_Mode 972 NULL, // kLighten_Mode 973 NULL, // kColorDodge_Mode 974 NULL, // kColorBurn_Mode 975 NULL, // kHardLight_Mode 976 NULL, // kSoftLight_Mode 977 NULL, // kDifference_Mode 978 NULL, // kExclusion_Mode 979 NULL, // kMultiply_Mode 980 981 NULL, // kHue_Mode 982 NULL, // kSaturation_Mode 983 NULL, // kColor_Mode 984 NULL, // kLuminosity_Mode 985}; 986 987SK_COMPILE_ASSERT( 988 SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1, 989 mode1_count_arm 990); 991 992SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec, 993 SkXfermode::Mode mode) { 994 995 void* procSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[mode]); 996 997 if (procSIMD != NULL) { 998 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD)); 999 } 1000 return NULL; 1001} 1002 1003SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) { 1004 return gNEONXfermodeProcs1[mode]; 1005} 1006