1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 9#include "SkBlitRow_opts_SSE2.h" 10#include "SkBitmapProcState_opts_SSE2.h" 11#include "SkColorPriv.h" 12#include "SkUtils.h" 13 14#include <emmintrin.h> 15 16/* SSE2 version of S32_Blend_BlitRow32() 17 * portable version is in core/SkBlitRow_D32.cpp 18 */ 19void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 20 const SkPMColor* SK_RESTRICT src, 21 int count, U8CPU alpha) { 22 SkASSERT(alpha <= 255); 23 if (count <= 0) { 24 return; 25 } 26 27 uint32_t src_scale = SkAlpha255To256(alpha); 28 uint32_t dst_scale = 256 - src_scale; 29 30 if (count >= 4) { 31 SkASSERT(((size_t)dst & 0x03) == 0); 32 while (((size_t)dst & 0x0F) != 0) { 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 34 src++; 35 dst++; 36 count--; 37 } 38 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); 40 __m128i *d = reinterpret_cast<__m128i*>(dst); 41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 43 44 // Move scale factors to upper byte of word 45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 47 while (count >= 4) { 48 // Load 4 pixels each of src and dest. 49 __m128i src_pixel = _mm_loadu_si128(s); 50 __m128i dst_pixel = _mm_load_si128(d); 51 52 // Interleave Atom port 0/1 operations based on the execution port 53 // constraints that multiply can only be executed on port 0 (while 54 // boolean operations can be executed on either port 0 or port 1) 55 // because GCC currently doesn't do a good job scheduling 56 // instructions based on these constraints. 57 58 // Get red and blue pixels into lower byte of each word. 59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 61 62 // Multiply by scale. 63 // (4 x (0, rs.h, 0, bs.h)) 64 // where rs.h stands for the higher byte of r * scale, and 65 // bs.h the higher byte of b * scale. 66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 67 68 // Get alpha and green pixels into higher byte of each word. 69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 71 72 // Multiply by scale. 73 // (4 x (as.h, as.l, gs.h, gs.l)) 74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 75 76 // Clear the lower byte of the a*scale and g*scale results 77 // (4 x (as.h, 0, gs.h, 0)) 78 src_ag = _mm_and_si128(src_ag, ag_mask); 79 80 // Operations the destination pixels are the same as on the 81 // source pixels. See the comments above. 82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 86 dst_ag = _mm_and_si128(dst_ag, ag_mask); 87 88 // Combine back into RGBA. 89 // (4 x (as.h, rs.h, gs.h, bs.h)) 90 src_pixel = _mm_or_si128(src_rb, src_ag); 91 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 92 93 // Add result 94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 95 _mm_store_si128(d, result); 96 s++; 97 d++; 98 count -= 4; 99 } 100 src = reinterpret_cast<const SkPMColor*>(s); 101 dst = reinterpret_cast<SkPMColor*>(d); 102 } 103 104 while (count > 0) { 105 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 106 src++; 107 dst++; 108 count--; 109 } 110} 111 112void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 113 const SkPMColor* SK_RESTRICT src, 114 int count, U8CPU alpha) { 115 SkASSERT(alpha == 255); 116 if (count <= 0) { 117 return; 118 } 119 120 if (count >= 4) { 121 SkASSERT(((size_t)dst & 0x03) == 0); 122 while (((size_t)dst & 0x0F) != 0) { 123 *dst = SkPMSrcOver(*src, *dst); 124 src++; 125 dst++; 126 count--; 127 } 128 129 const __m128i *s = reinterpret_cast<const __m128i*>(src); 130 __m128i *d = reinterpret_cast<__m128i*>(dst); 131#ifdef SK_USE_ACCURATE_BLENDING 132 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 133 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 134 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 135 while (count >= 4) { 136 // Load 4 pixels 137 __m128i src_pixel = _mm_loadu_si128(s); 138 __m128i dst_pixel = _mm_load_si128(d); 139 140 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 141 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 142 // Shift alphas down to lower 8 bits of each quad. 143 __m128i alpha = _mm_srli_epi32(src_pixel, 24); 144 145 // Copy alpha to upper 3rd byte of each quad 146 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 147 148 // Subtract alphas from 255, to get 0..255 149 alpha = _mm_sub_epi16(c_255, alpha); 150 151 // Multiply by red and blue by src alpha. 152 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 153 // Multiply by alpha and green by src alpha. 154 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 155 156 // dst_rb_low = (dst_rb >> 8) 157 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 158 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 159 160 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 161 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 162 dst_rb = _mm_add_epi16(dst_rb, c_128); 163 dst_rb = _mm_srli_epi16(dst_rb, 8); 164 165 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 166 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 167 dst_ag = _mm_add_epi16(dst_ag, c_128); 168 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 169 170 // Combine back into RGBA. 171 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 172 173 // Add result 174 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 175 _mm_store_si128(d, result); 176 s++; 177 d++; 178 count -= 4; 179 } 180 #else 181 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 182 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 183 while (count >= 4) { 184 // Load 4 pixels 185 __m128i src_pixel = _mm_loadu_si128(s); 186 __m128i dst_pixel = _mm_load_si128(d); 187 188 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 189 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 190 191 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 192 __m128i alpha = _mm_srli_epi16(src_pixel, 8); 193 194 // (a0, a0, a1, a1, a2, g2, a3, g3) 195 alpha = _mm_shufflehi_epi16(alpha, 0xF5); 196 197 // (a0, a0, a1, a1, a2, a2, a3, a3) 198 alpha = _mm_shufflelo_epi16(alpha, 0xF5); 199 200 // Subtract alphas from 256, to get 1..256 201 alpha = _mm_sub_epi16(c_256, alpha); 202 203 // Multiply by red and blue by src alpha. 204 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 205 // Multiply by alpha and green by src alpha. 206 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 207 208 // Divide by 256. 209 dst_rb = _mm_srli_epi16(dst_rb, 8); 210 211 // Mask out high bits (already in the right place) 212 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 213 214 // Combine back into RGBA. 215 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 216 217 // Add result 218 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 219 _mm_store_si128(d, result); 220 s++; 221 d++; 222 count -= 4; 223 } 224#endif 225 src = reinterpret_cast<const SkPMColor*>(s); 226 dst = reinterpret_cast<SkPMColor*>(d); 227 } 228 229 while (count > 0) { 230 *dst = SkPMSrcOver(*src, *dst); 231 src++; 232 dst++; 233 count--; 234 } 235} 236 237void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 238 const SkPMColor* SK_RESTRICT src, 239 int count, U8CPU alpha) { 240 SkASSERT(alpha <= 255); 241 if (count <= 0) { 242 return; 243 } 244 245 if (count >= 4) { 246 while (((size_t)dst & 0x0F) != 0) { 247 *dst = SkBlendARGB32(*src, *dst, alpha); 248 src++; 249 dst++; 250 count--; 251 } 252 253 uint32_t src_scale = SkAlpha255To256(alpha); 254 255 const __m128i *s = reinterpret_cast<const __m128i*>(src); 256 __m128i *d = reinterpret_cast<__m128i*>(dst); 257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 260 while (count >= 4) { 261 // Load 4 pixels each of src and dest. 262 __m128i src_pixel = _mm_loadu_si128(s); 263 __m128i dst_pixel = _mm_load_si128(d); 264 265 // Get red and blue pixels into lower byte of each word. 266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 268 269 // Get alpha and green into lower byte of each word. 270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 272 273 // Put per-pixel alpha in low byte of each word. 274 // After the following two statements, the dst_alpha looks like 275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 278 279 // dst_alpha = dst_alpha * src_scale 280 // Because src_scales are in the higher byte of each word and 281 // we use mulhi here, the resulting alpha values are already 282 // in the right place and don't need to be divided by 256. 283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 285 286 // Subtract alphas from 256, to get 1..256 287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 288 289 // Multiply red and blue by dst pixel alpha. 290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 291 // Multiply alpha and green by dst pixel alpha. 292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 293 294 // Multiply red and blue by global alpha. 295 // (4 x (0, rs.h, 0, bs.h)) 296 // where rs.h stands for the higher byte of r * src_scale, 297 // and bs.h the higher byte of b * src_scale. 298 // Again, because we use mulhi, the resuling red and blue 299 // values are already in the right place and don't need to 300 // be divided by 256. 301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 302 // Multiply alpha and green by global alpha. 303 // (4 x (0, as.h, 0, gs.h)) 304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 305 306 // Divide by 256. 307 dst_rb = _mm_srli_epi16(dst_rb, 8); 308 309 // Mask out low bits (goodies already in the right place; no need to divide) 310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 311 // Shift alpha and green to higher byte of each word. 312 // (4 x (as.h, 0, gs.h, 0)) 313 src_ag = _mm_slli_epi16(src_ag, 8); 314 315 // Combine back into RGBA. 316 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 317 src_pixel = _mm_or_si128(src_rb, src_ag); 318 319 // Add two pixels into result. 320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 321 _mm_store_si128(d, result); 322 s++; 323 d++; 324 count -= 4; 325 } 326 src = reinterpret_cast<const SkPMColor*>(s); 327 dst = reinterpret_cast<SkPMColor*>(d); 328 } 329 330 while (count > 0) { 331 *dst = SkBlendARGB32(*src, *dst, alpha); 332 src++; 333 dst++; 334 count--; 335 } 336} 337 338/* SSE2 version of Color32() 339 * portable version is in core/SkBlitRow_D32.cpp 340 */ 341void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 342 SkPMColor color) { 343 344 if (count <= 0) { 345 return; 346 } 347 348 if (0 == color) { 349 if (src != dst) { 350 memcpy(dst, src, count * sizeof(SkPMColor)); 351 } 352 return; 353 } 354 355 unsigned colorA = SkGetPackedA32(color); 356 if (255 == colorA) { 357 sk_memset32(dst, color, count); 358 } else { 359 unsigned scale = 256 - SkAlpha255To256(colorA); 360 361 if (count >= 4) { 362 SkASSERT(((size_t)dst & 0x03) == 0); 363 while (((size_t)dst & 0x0F) != 0) { 364 *dst = color + SkAlphaMulQ(*src, scale); 365 src++; 366 dst++; 367 count--; 368 } 369 370 const __m128i *s = reinterpret_cast<const __m128i*>(src); 371 __m128i *d = reinterpret_cast<__m128i*>(dst); 372 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 373 __m128i src_scale_wide = _mm_set1_epi16(scale); 374 __m128i color_wide = _mm_set1_epi32(color); 375 while (count >= 4) { 376 // Load 4 pixels each of src and dest. 377 __m128i src_pixel = _mm_loadu_si128(s); 378 379 // Get red and blue pixels into lower byte of each word. 380 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 381 382 // Get alpha and green into lower byte of each word. 383 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 384 385 // Multiply by scale. 386 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 387 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 388 389 // Divide by 256. 390 src_rb = _mm_srli_epi16(src_rb, 8); 391 src_ag = _mm_andnot_si128(rb_mask, src_ag); 392 393 // Combine back into RGBA. 394 src_pixel = _mm_or_si128(src_rb, src_ag); 395 396 // Add color to result. 397 __m128i result = _mm_add_epi8(color_wide, src_pixel); 398 399 // Store result. 400 _mm_store_si128(d, result); 401 s++; 402 d++; 403 count -= 4; 404 } 405 src = reinterpret_cast<const SkPMColor*>(s); 406 dst = reinterpret_cast<SkPMColor*>(d); 407 } 408 409 while (count > 0) { 410 *dst = color + SkAlphaMulQ(*src, scale); 411 src += 1; 412 dst += 1; 413 count--; 414 } 415 } 416} 417 418void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 419 size_t maskRB, SkColor origColor, 420 int width, int height) { 421 SkPMColor color = SkPreMultiplyColor(origColor); 422 size_t dstOffset = dstRB - (width << 2); 423 size_t maskOffset = maskRB - width; 424 SkPMColor* dst = (SkPMColor *)device; 425 const uint8_t* mask = (const uint8_t*)maskPtr; 426 do { 427 int count = width; 428 if (count >= 4) { 429 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 430 *dst = SkBlendARGB32(color, *dst, *mask); 431 mask++; 432 dst++; 433 count--; 434 } 435 __m128i *d = reinterpret_cast<__m128i*>(dst); 436 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 437 __m128i c_256 = _mm_set1_epi16(256); 438 __m128i c_1 = _mm_set1_epi16(1); 439 __m128i src_pixel = _mm_set1_epi32(color); 440 while (count >= 4) { 441 // Load 4 pixels each of src and dest. 442 __m128i dst_pixel = _mm_load_si128(d); 443 444 //set the aphla value 445 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 446 0, *(mask+3),0, \ 447 *(mask+2),0, *(mask+2),\ 448 0,*(mask+1), 0,*(mask+1),\ 449 0, *mask,0,*mask); 450 451 //call SkAlpha255To256() 452 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 453 454 // Get red and blue pixels into lower byte of each word. 455 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 456 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 457 458 // Get alpha and green into lower byte of each word. 459 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 460 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 461 462 // Put per-pixel alpha in low byte of each word. 463 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 464 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 465 466 // dst_alpha = dst_alpha * src_scale 467 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 468 469 // Divide by 256. 470 dst_alpha = _mm_srli_epi16(dst_alpha, 8); 471 472 // Subtract alphas from 256, to get 1..256 473 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 474 // Multiply red and blue by dst pixel alpha. 475 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 476 // Multiply alpha and green by dst pixel alpha. 477 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 478 479 // Multiply red and blue by global alpha. 480 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 481 // Multiply alpha and green by global alpha. 482 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 483 // Divide by 256. 484 dst_rb = _mm_srli_epi16(dst_rb, 8); 485 src_rb = _mm_srli_epi16(src_rb, 8); 486 487 // Mask out low bits (goodies already in the right place; no need to divide) 488 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 489 src_ag = _mm_andnot_si128(rb_mask, src_ag); 490 491 // Combine back into RGBA. 492 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 493 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 494 495 // Add two pixels into result. 496 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 497 _mm_store_si128(d, result); 498 // load the next 4 pixel 499 mask = mask + 4; 500 d++; 501 count -= 4; 502 } 503 dst = reinterpret_cast<SkPMColor *>(d); 504 } 505 while(count > 0) { 506 *dst= SkBlendARGB32(color, *dst, *mask); 507 dst += 1; 508 mask++; 509 count --; 510 } 511 dst = (SkPMColor *)((char*)dst + dstOffset); 512 mask += maskOffset; 513 } while (--height != 0); 514} 515 516// The following (left) shifts cause the top 5 bits of the mask components to 517// line up with the corresponding components in an SkPMColor. 518// Note that the mask's RGB16 order may differ from the SkPMColor order. 519#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 520#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 521#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 522 523#if SK_R16x5_R32x5_SHIFT == 0 524 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 525#elif SK_R16x5_R32x5_SHIFT > 0 526 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 527#else 528 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 529#endif 530 531#if SK_G16x5_G32x5_SHIFT == 0 532 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 533#elif SK_G16x5_G32x5_SHIFT > 0 534 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 535#else 536 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 537#endif 538 539#if SK_B16x5_B32x5_SHIFT == 0 540 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 541#elif SK_B16x5_B32x5_SHIFT > 0 542 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 543#else 544 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 545#endif 546 547static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 548 __m128i &mask, __m128i &srcA) { 549 // In the following comments, the components of src, dst and mask are 550 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 551 // by an R, G, B, or A suffix. Components of one of the four pixels that 552 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 553 // example is the blue channel of the second destination pixel. Memory 554 // layout is shown for an ARGB byte order in a color value. 555 556 // src and srcA store 8-bit values interleaved with zeros. 557 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 558 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 559 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 560 // mask stores 16-bit values (compressed three channels) interleaved with zeros. 561 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 562 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 563 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 564 565 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 566 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 567 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 568 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 569 570 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 571 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 572 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 573 574 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 575 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 576 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 577 578 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 579 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 580 // 8-bit position 581 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 582 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 583 mask = _mm_or_si128(_mm_or_si128(r, g), b); 584 585 // Interleave R,G,B into the lower byte of word. 586 // i.e. split the sixteen 8-bit values from mask into two sets of eight 587 // 16-bit values, padded by zero. 588 __m128i maskLo, maskHi; 589 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 590 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 591 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 592 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 593 594 // Upscale from 0..31 to 0..32 595 // (allows to replace division by left-shift further down) 596 // Left-shift each component by 4 and add the result back to that component, 597 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 598 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 599 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 600 601 // Multiply each component of maskLo and maskHi by srcA 602 maskLo = _mm_mullo_epi16(maskLo, srcA); 603 maskHi = _mm_mullo_epi16(maskHi, srcA); 604 605 // Left shift mask components by 8 (divide by 256) 606 maskLo = _mm_srli_epi16(maskLo, 8); 607 maskHi = _mm_srli_epi16(maskHi, 8); 608 609 // Interleave R,G,B into the lower byte of the word 610 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 611 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 612 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 613 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 614 615 // mask = (src - dst) * mask 616 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 617 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 618 619 // mask = (src - dst) * mask >> 5 620 maskLo = _mm_srai_epi16(maskLo, 5); 621 maskHi = _mm_srai_epi16(maskHi, 5); 622 623 // Add two pixels into result. 624 // result = dst + ((src - dst) * mask >> 5) 625 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 626 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 627 628 // Pack into 4 32bit dst pixels. 629 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 630 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 631 // clamping to 255 if necessary. 632 return _mm_packus_epi16(resultLo, resultHi); 633} 634 635static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 636 __m128i &mask) { 637 // In the following comments, the components of src, dst and mask are 638 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 639 // by an R, G, B, or A suffix. Components of one of the four pixels that 640 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 641 // example is the blue channel of the second destination pixel. Memory 642 // layout is shown for an ARGB byte order in a color value. 643 644 // src and srcA store 8-bit values interleaved with zeros. 645 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 646 // mask stores 16-bit values (shown as high and low bytes) interleaved with 647 // zeros 648 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 649 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 650 651 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 652 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 653 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 654 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 655 656 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 657 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 658 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 659 660 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 661 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 662 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 663 664 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 665 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 666 // 8-bit position 667 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 668 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 669 mask = _mm_or_si128(_mm_or_si128(r, g), b); 670 671 // Interleave R,G,B into the lower byte of word. 672 // i.e. split the sixteen 8-bit values from mask into two sets of eight 673 // 16-bit values, padded by zero. 674 __m128i maskLo, maskHi; 675 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 676 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 677 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 678 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 679 680 // Upscale from 0..31 to 0..32 681 // (allows to replace division by left-shift further down) 682 // Left-shift each component by 4 and add the result back to that component, 683 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 684 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 685 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 686 687 // Interleave R,G,B into the lower byte of the word 688 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 689 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 690 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 691 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 692 693 // mask = (src - dst) * mask 694 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 695 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 696 697 // mask = (src - dst) * mask >> 5 698 maskLo = _mm_srai_epi16(maskLo, 5); 699 maskHi = _mm_srai_epi16(maskHi, 5); 700 701 // Add two pixels into result. 702 // result = dst + ((src - dst) * mask >> 5) 703 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 704 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 705 706 // Pack into 4 32bit dst pixels and force opaque. 707 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 708 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 709 // clamping to 255 if necessary. Set alpha components to 0xFF. 710 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 711 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 712} 713 714void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 715 SkColor src, int width, SkPMColor) { 716 if (width <= 0) { 717 return; 718 } 719 720 int srcA = SkColorGetA(src); 721 int srcR = SkColorGetR(src); 722 int srcG = SkColorGetG(src); 723 int srcB = SkColorGetB(src); 724 725 srcA = SkAlpha255To256(srcA); 726 727 if (width >= 4) { 728 SkASSERT(((size_t)dst & 0x03) == 0); 729 while (((size_t)dst & 0x0F) != 0) { 730 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 731 mask++; 732 dst++; 733 width--; 734 } 735 736 __m128i *d = reinterpret_cast<__m128i*>(dst); 737 // Set alpha to 0xFF and replicate source four times in SSE register. 738 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 739 // Interleave with zeros to get two sets of four 16-bit values. 740 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 741 // Set srcA_sse to contain eight copies of srcA, padded with zero. 742 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 743 __m128i srcA_sse = _mm_set1_epi16(srcA); 744 while (width >= 4) { 745 // Load four destination pixels into dst_sse. 746 __m128i dst_sse = _mm_load_si128(d); 747 // Load four 16-bit masks into lower half of mask_sse. 748 __m128i mask_sse = _mm_loadl_epi64( 749 reinterpret_cast<const __m128i*>(mask)); 750 751 // Check whether masks are equal to 0 and get the highest bit 752 // of each byte of result, if masks are all zero, we will get 753 // pack_cmp to 0xFFFF 754 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 755 _mm_setzero_si128())); 756 757 // if mask pixels are not all zero, we will blend the dst pixels 758 if (pack_cmp != 0xFFFF) { 759 // Unpack 4 16bit mask pixels to 760 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 761 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 762 mask_sse = _mm_unpacklo_epi16(mask_sse, 763 _mm_setzero_si128()); 764 765 // Process 4 32bit dst pixels 766 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 767 mask_sse, srcA_sse); 768 _mm_store_si128(d, result); 769 } 770 771 d++; 772 mask += 4; 773 width -= 4; 774 } 775 776 dst = reinterpret_cast<SkPMColor*>(d); 777 } 778 779 while (width > 0) { 780 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 781 mask++; 782 dst++; 783 width--; 784 } 785} 786 787void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 788 SkColor src, int width, SkPMColor opaqueDst) { 789 if (width <= 0) { 790 return; 791 } 792 793 int srcR = SkColorGetR(src); 794 int srcG = SkColorGetG(src); 795 int srcB = SkColorGetB(src); 796 797 if (width >= 4) { 798 SkASSERT(((size_t)dst & 0x03) == 0); 799 while (((size_t)dst & 0x0F) != 0) { 800 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 801 mask++; 802 dst++; 803 width--; 804 } 805 806 __m128i *d = reinterpret_cast<__m128i*>(dst); 807 // Set alpha to 0xFF and replicate source four times in SSE register. 808 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 809 // Set srcA_sse to contain eight copies of srcA, padded with zero. 810 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 811 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 812 while (width >= 4) { 813 // Load four destination pixels into dst_sse. 814 __m128i dst_sse = _mm_load_si128(d); 815 // Load four 16-bit masks into lower half of mask_sse. 816 __m128i mask_sse = _mm_loadl_epi64( 817 reinterpret_cast<const __m128i*>(mask)); 818 819 // Check whether masks are equal to 0 and get the highest bit 820 // of each byte of result, if masks are all zero, we will get 821 // pack_cmp to 0xFFFF 822 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 823 _mm_setzero_si128())); 824 825 // if mask pixels are not all zero, we will blend the dst pixels 826 if (pack_cmp != 0xFFFF) { 827 // Unpack 4 16bit mask pixels to 828 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 829 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 830 mask_sse = _mm_unpacklo_epi16(mask_sse, 831 _mm_setzero_si128()); 832 833 // Process 4 32bit dst pixels 834 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 835 mask_sse); 836 _mm_store_si128(d, result); 837 } 838 839 d++; 840 mask += 4; 841 width -= 4; 842 } 843 844 dst = reinterpret_cast<SkPMColor*>(d); 845 } 846 847 while (width > 0) { 848 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 849 mask++; 850 dst++; 851 width--; 852 } 853} 854