1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 9#include "SkBlitRow_opts_SSE2.h" 10#include "SkColorPriv.h" 11#include "SkUtils.h" 12 13#include <emmintrin.h> 14 15/* SSE2 version of S32_Blend_BlitRow32() 16 * portable version is in core/SkBlitRow_D32.cpp 17 */ 18void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 19 const SkPMColor* SK_RESTRICT src, 20 int count, U8CPU alpha) { 21 SkASSERT(alpha <= 255); 22 if (count <= 0) { 23 return; 24 } 25 26 uint32_t src_scale = SkAlpha255To256(alpha); 27 uint32_t dst_scale = 256 - src_scale; 28 29 if (count >= 4) { 30 SkASSERT(((size_t)dst & 0x03) == 0); 31 while (((size_t)dst & 0x0F) != 0) { 32 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 33 src++; 34 dst++; 35 count--; 36 } 37 38 const __m128i *s = reinterpret_cast<const __m128i*>(src); 39 __m128i *d = reinterpret_cast<__m128i*>(dst); 40 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 41 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 42 43 // Move scale factors to upper byte of word 44 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 45 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 46 while (count >= 4) { 47 // Load 4 pixels each of src and dest. 48 __m128i src_pixel = _mm_loadu_si128(s); 49 __m128i dst_pixel = _mm_load_si128(d); 50 51 // Interleave Atom port 0/1 operations based on the execution port 52 // constraints that multiply can only be executed on port 0 (while 53 // boolean operations can be executed on either port 0 or port 1) 54 // because GCC currently doesn't do a good job scheduling 55 // instructions based on these constraints. 56 57 // Get red and blue pixels into lower byte of each word. 58 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 59 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 60 61 // Multiply by scale. 62 // (4 x (0, rs.h, 0, bs.h)) 63 // where rs.h stands for the higher byte of r * scale, and 64 // bs.h the higher byte of b * scale. 65 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 66 67 // Get alpha and green pixels into higher byte of each word. 68 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 69 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 70 71 // Multiply by scale. 72 // (4 x (as.h, as.l, gs.h, gs.l)) 73 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 74 75 // Clear the lower byte of the a*scale and g*scale results 76 // (4 x (as.h, 0, gs.h, 0)) 77 src_ag = _mm_and_si128(src_ag, ag_mask); 78 79 // Operations the destination pixels are the same as on the 80 // source pixels. See the comments above. 81 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 82 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 83 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 84 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 85 dst_ag = _mm_and_si128(dst_ag, ag_mask); 86 87 // Combine back into RGBA. 88 // (4 x (as.h, rs.h, gs.h, bs.h)) 89 src_pixel = _mm_or_si128(src_rb, src_ag); 90 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 91 92 // Add result 93 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 94 _mm_store_si128(d, result); 95 s++; 96 d++; 97 count -= 4; 98 } 99 src = reinterpret_cast<const SkPMColor*>(s); 100 dst = reinterpret_cast<SkPMColor*>(d); 101 } 102 103 while (count > 0) { 104 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 105 src++; 106 dst++; 107 count--; 108 } 109} 110 111void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 112 const SkPMColor* SK_RESTRICT src, 113 int count, U8CPU alpha) { 114 SkASSERT(alpha == 255); 115 if (count <= 0) { 116 return; 117 } 118 119 if (count >= 4) { 120 SkASSERT(((size_t)dst & 0x03) == 0); 121 while (((size_t)dst & 0x0F) != 0) { 122 *dst = SkPMSrcOver(*src, *dst); 123 src++; 124 dst++; 125 count--; 126 } 127 128 const __m128i *s = reinterpret_cast<const __m128i*>(src); 129 __m128i *d = reinterpret_cast<__m128i*>(dst); 130#ifdef SK_USE_ACCURATE_BLENDING 131 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 132 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 133 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 134 while (count >= 4) { 135 // Load 4 pixels 136 __m128i src_pixel = _mm_loadu_si128(s); 137 __m128i dst_pixel = _mm_load_si128(d); 138 139 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 140 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 141 // Shift alphas down to lower 8 bits of each quad. 142 __m128i alpha = _mm_srli_epi32(src_pixel, 24); 143 144 // Copy alpha to upper 3rd byte of each quad 145 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 146 147 // Subtract alphas from 255, to get 0..255 148 alpha = _mm_sub_epi16(c_255, alpha); 149 150 // Multiply by red and blue by src alpha. 151 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 152 // Multiply by alpha and green by src alpha. 153 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 154 155 // dst_rb_low = (dst_rb >> 8) 156 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 157 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 158 159 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 160 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 161 dst_rb = _mm_add_epi16(dst_rb, c_128); 162 dst_rb = _mm_srli_epi16(dst_rb, 8); 163 164 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 165 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 166 dst_ag = _mm_add_epi16(dst_ag, c_128); 167 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 168 169 // Combine back into RGBA. 170 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 171 172 // Add result 173 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 174 _mm_store_si128(d, result); 175 s++; 176 d++; 177 count -= 4; 178 } 179 #else 180 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 181 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 182 while (count >= 4) { 183 // Load 4 pixels 184 __m128i src_pixel = _mm_loadu_si128(s); 185 __m128i dst_pixel = _mm_load_si128(d); 186 187 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 188 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 189 190 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 191 __m128i alpha = _mm_srli_epi16(src_pixel, 8); 192 193 // (a0, a0, a1, a1, a2, g2, a3, g3) 194 alpha = _mm_shufflehi_epi16(alpha, 0xF5); 195 196 // (a0, a0, a1, a1, a2, a2, a3, a3) 197 alpha = _mm_shufflelo_epi16(alpha, 0xF5); 198 199 // Subtract alphas from 256, to get 1..256 200 alpha = _mm_sub_epi16(c_256, alpha); 201 202 // Multiply by red and blue by src alpha. 203 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 204 // Multiply by alpha and green by src alpha. 205 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 206 207 // Divide by 256. 208 dst_rb = _mm_srli_epi16(dst_rb, 8); 209 210 // Mask out high bits (already in the right place) 211 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 212 213 // Combine back into RGBA. 214 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 215 216 // Add result 217 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 218 _mm_store_si128(d, result); 219 s++; 220 d++; 221 count -= 4; 222 } 223#endif 224 src = reinterpret_cast<const SkPMColor*>(s); 225 dst = reinterpret_cast<SkPMColor*>(d); 226 } 227 228 while (count > 0) { 229 *dst = SkPMSrcOver(*src, *dst); 230 src++; 231 dst++; 232 count--; 233 } 234} 235 236void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 237 const SkPMColor* SK_RESTRICT src, 238 int count, U8CPU alpha) { 239 SkASSERT(alpha <= 255); 240 if (count <= 0) { 241 return; 242 } 243 244 if (count >= 4) { 245 while (((size_t)dst & 0x0F) != 0) { 246 *dst = SkBlendARGB32(*src, *dst, alpha); 247 src++; 248 dst++; 249 count--; 250 } 251 252 uint32_t src_scale = SkAlpha255To256(alpha); 253 254 const __m128i *s = reinterpret_cast<const __m128i*>(src); 255 __m128i *d = reinterpret_cast<__m128i*>(dst); 256 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 257 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 258 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 259 while (count >= 4) { 260 // Load 4 pixels each of src and dest. 261 __m128i src_pixel = _mm_loadu_si128(s); 262 __m128i dst_pixel = _mm_load_si128(d); 263 264 // Get red and blue pixels into lower byte of each word. 265 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 266 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 267 268 // Get alpha and green into lower byte of each word. 269 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 270 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 271 272 // Put per-pixel alpha in low byte of each word. 273 // After the following two statements, the dst_alpha looks like 274 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 275 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 276 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 277 278 // dst_alpha = dst_alpha * src_scale 279 // Because src_scales are in the higher byte of each word and 280 // we use mulhi here, the resulting alpha values are already 281 // in the right place and don't need to be divided by 256. 282 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 283 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 284 285 // Subtract alphas from 256, to get 1..256 286 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 287 288 // Multiply red and blue by dst pixel alpha. 289 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 290 // Multiply alpha and green by dst pixel alpha. 291 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 292 293 // Multiply red and blue by global alpha. 294 // (4 x (0, rs.h, 0, bs.h)) 295 // where rs.h stands for the higher byte of r * src_scale, 296 // and bs.h the higher byte of b * src_scale. 297 // Again, because we use mulhi, the resuling red and blue 298 // values are already in the right place and don't need to 299 // be divided by 256. 300 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 301 // Multiply alpha and green by global alpha. 302 // (4 x (0, as.h, 0, gs.h)) 303 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 304 305 // Divide by 256. 306 dst_rb = _mm_srli_epi16(dst_rb, 8); 307 308 // Mask out low bits (goodies already in the right place; no need to divide) 309 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 310 // Shift alpha and green to higher byte of each word. 311 // (4 x (as.h, 0, gs.h, 0)) 312 src_ag = _mm_slli_epi16(src_ag, 8); 313 314 // Combine back into RGBA. 315 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 316 src_pixel = _mm_or_si128(src_rb, src_ag); 317 318 // Add two pixels into result. 319 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 320 _mm_store_si128(d, result); 321 s++; 322 d++; 323 count -= 4; 324 } 325 src = reinterpret_cast<const SkPMColor*>(s); 326 dst = reinterpret_cast<SkPMColor*>(d); 327 } 328 329 while (count > 0) { 330 *dst = SkBlendARGB32(*src, *dst, alpha); 331 src++; 332 dst++; 333 count--; 334 } 335} 336 337/* SSE2 version of Color32() 338 * portable version is in core/SkBlitRow_D32.cpp 339 */ 340void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 341 SkPMColor color) { 342 343 if (count <= 0) { 344 return; 345 } 346 347 if (0 == color) { 348 if (src != dst) { 349 memcpy(dst, src, count * sizeof(SkPMColor)); 350 } 351 return; 352 } 353 354 unsigned colorA = SkGetPackedA32(color); 355 if (255 == colorA) { 356 sk_memset32(dst, color, count); 357 } else { 358 unsigned scale = 256 - SkAlpha255To256(colorA); 359 360 if (count >= 4) { 361 SkASSERT(((size_t)dst & 0x03) == 0); 362 while (((size_t)dst & 0x0F) != 0) { 363 *dst = color + SkAlphaMulQ(*src, scale); 364 src++; 365 dst++; 366 count--; 367 } 368 369 const __m128i *s = reinterpret_cast<const __m128i*>(src); 370 __m128i *d = reinterpret_cast<__m128i*>(dst); 371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 372 __m128i src_scale_wide = _mm_set1_epi16(scale); 373 __m128i color_wide = _mm_set1_epi32(color); 374 while (count >= 4) { 375 // Load 4 pixels each of src and dest. 376 __m128i src_pixel = _mm_loadu_si128(s); 377 378 // Get red and blue pixels into lower byte of each word. 379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 380 381 // Get alpha and green into lower byte of each word. 382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 383 384 // Multiply by scale. 385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 387 388 // Divide by 256. 389 src_rb = _mm_srli_epi16(src_rb, 8); 390 src_ag = _mm_andnot_si128(rb_mask, src_ag); 391 392 // Combine back into RGBA. 393 src_pixel = _mm_or_si128(src_rb, src_ag); 394 395 // Add color to result. 396 __m128i result = _mm_add_epi8(color_wide, src_pixel); 397 398 // Store result. 399 _mm_store_si128(d, result); 400 s++; 401 d++; 402 count -= 4; 403 } 404 src = reinterpret_cast<const SkPMColor*>(s); 405 dst = reinterpret_cast<SkPMColor*>(d); 406 } 407 408 while (count > 0) { 409 *dst = color + SkAlphaMulQ(*src, scale); 410 src += 1; 411 dst += 1; 412 count--; 413 } 414 } 415} 416 417void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 418 size_t maskRB, SkColor origColor, 419 int width, int height) { 420 SkPMColor color = SkPreMultiplyColor(origColor); 421 size_t dstOffset = dstRB - (width << 2); 422 size_t maskOffset = maskRB - width; 423 SkPMColor* dst = (SkPMColor *)device; 424 const uint8_t* mask = (const uint8_t*)maskPtr; 425 do { 426 int count = width; 427 if (count >= 4) { 428 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 429 *dst = SkBlendARGB32(color, *dst, *mask); 430 mask++; 431 dst++; 432 count--; 433 } 434 __m128i *d = reinterpret_cast<__m128i*>(dst); 435 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 436 __m128i c_256 = _mm_set1_epi16(256); 437 __m128i c_1 = _mm_set1_epi16(1); 438 __m128i src_pixel = _mm_set1_epi32(color); 439 while (count >= 4) { 440 // Load 4 pixels each of src and dest. 441 __m128i dst_pixel = _mm_load_si128(d); 442 443 //set the aphla value 444 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 445 0, *(mask+3),0, \ 446 *(mask+2),0, *(mask+2),\ 447 0,*(mask+1), 0,*(mask+1),\ 448 0, *mask,0,*mask); 449 450 //call SkAlpha255To256() 451 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 452 453 // Get red and blue pixels into lower byte of each word. 454 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 455 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 456 457 // Get alpha and green into lower byte of each word. 458 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 459 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 460 461 // Put per-pixel alpha in low byte of each word. 462 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 463 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 464 465 // dst_alpha = dst_alpha * src_scale 466 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 467 468 // Divide by 256. 469 dst_alpha = _mm_srli_epi16(dst_alpha, 8); 470 471 // Subtract alphas from 256, to get 1..256 472 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 473 // Multiply red and blue by dst pixel alpha. 474 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 475 // Multiply alpha and green by dst pixel alpha. 476 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 477 478 // Multiply red and blue by global alpha. 479 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 480 // Multiply alpha and green by global alpha. 481 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 482 // Divide by 256. 483 dst_rb = _mm_srli_epi16(dst_rb, 8); 484 src_rb = _mm_srli_epi16(src_rb, 8); 485 486 // Mask out low bits (goodies already in the right place; no need to divide) 487 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 488 src_ag = _mm_andnot_si128(rb_mask, src_ag); 489 490 // Combine back into RGBA. 491 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 492 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 493 494 // Add two pixels into result. 495 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 496 _mm_store_si128(d, result); 497 // load the next 4 pixel 498 mask = mask + 4; 499 d++; 500 count -= 4; 501 } 502 dst = reinterpret_cast<SkPMColor *>(d); 503 } 504 while(count > 0) { 505 *dst= SkBlendARGB32(color, *dst, *mask); 506 dst += 1; 507 mask++; 508 count --; 509 } 510 dst = (SkPMColor *)((char*)dst + dstOffset); 511 mask += maskOffset; 512 } while (--height != 0); 513} 514 515static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst, 516 __m128i &mask, __m128i &scale) { 517 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 518 __m128i r = _mm_and_si128(_mm_slli_epi32(mask, 519 16-SK_R16_SHIFT-(SK_R16_BITS-5)), 520 _mm_set1_epi32(0x001F0000)); 521 522 __m128i g = _mm_and_si128(_mm_slli_epi32(mask, 523 8-SK_G16_SHIFT-(SK_G16_BITS-5)), 524 _mm_set1_epi32(0x00001F00)); 525 526 __m128i b = _mm_and_si128(_mm_slli_epi32(mask, 527 SK_B16_BITS-5), 528 _mm_set1_epi32(0x0000001F)); 529 530 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 531 mask = _mm_or_si128(_mm_or_si128(r, g), b); 532 533 // Interleave R,G,B into the lower byte of word. 534 __m128i maskLo, maskHi; 535 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 536 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 537 538 // Upscale to 0..32 539 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 540 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 541 542 maskLo = _mm_mullo_epi16(maskLo, scale); 543 maskHi = _mm_mullo_epi16(maskHi, scale); 544 545 maskLo = _mm_srli_epi16(maskLo, 8); 546 maskHi = _mm_srli_epi16(maskHi, 8); 547 548 // Interleave R,G,B into the lower byte of the word. 549 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 550 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 551 552 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); 553 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); 554 555 maskLo = _mm_srai_epi16(maskLo, 5); 556 maskHi = _mm_srai_epi16(maskHi, 5); 557 558 // Add two pixels into result. 559 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 560 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 561 562 // Pack into 4 32bit dst pixels 563 return _mm_packus_epi16(resultLo, resultHi); 564} 565 566static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst, 567 __m128i &mask) { 568 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 569 __m128i r = _mm_and_si128(_mm_slli_epi32(mask, 570 16-SK_R16_SHIFT-(SK_R16_BITS-5)), 571 _mm_set1_epi32(0x001F0000)); 572 573 __m128i g = _mm_and_si128(_mm_slli_epi32(mask, 574 8-SK_G16_SHIFT-(SK_G16_BITS-5)), 575 _mm_set1_epi32(0x00001F00)); 576 577 __m128i b = _mm_and_si128(_mm_slli_epi32(mask, SK_B16_BITS-5), 578 _mm_set1_epi32(0x0000001F)); 579 580 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 581 mask = _mm_or_si128(_mm_or_si128(r, g), b); 582 583 // Interleave R,G,B into the lower byte of word. 584 __m128i maskLo, maskHi; 585 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 586 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 587 588 // Upscale to 0..32 589 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 590 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 591 592 // Interleave R,G,B into the lower byte of the word. 593 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 594 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 595 596 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo)); 597 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi)); 598 599 maskLo = _mm_srai_epi16(maskLo, 5); 600 maskHi = _mm_srai_epi16(maskHi, 5); 601 602 // Add two pixels into result. 603 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 604 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 605 606 // Pack into 4 32bit dst pixels 607 return _mm_packus_epi16(resultLo, resultHi); 608} 609 610void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], 611 SkColor color, int width, SkPMColor) { 612 if (width <= 0) { 613 return; 614 } 615 616 int srcA = SkColorGetA(color); 617 int srcR = SkColorGetR(color); 618 int srcG = SkColorGetG(color); 619 int srcB = SkColorGetB(color); 620 621 srcA = SkAlpha255To256(srcA); 622 623 if (width >= 4) { 624 SkASSERT(((size_t)dst & 0x03) == 0); 625 while (((size_t)dst & 0x0F) != 0) { 626 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); 627 src++; 628 dst++; 629 width--; 630 } 631 632 __m128i *d = reinterpret_cast<__m128i*>(dst); 633 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 634 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); 635 __m128i scale = _mm_set1_epi16(srcA); 636 while (width >= 4) { 637 __m128i dst_pixel = _mm_load_si128(d); 638 __m128i mask_pixel = _mm_loadl_epi64( 639 reinterpret_cast<const __m128i*>(src)); 640 641 // Check whether mask_pixels are equal to 0 and get the highest bit 642 // of each byte of result, if mask pixes are all zero, we will get 643 // pack_cmp to 0xFFFF 644 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, 645 _mm_setzero_si128())); 646 647 // if mask pixels are not all zero, we will blend the dst pixels 648 if (pack_cmp != 0xFFFF) { 649 // Unpack 4 16bit mask pixels to 650 // (p0, 0, p1, 0, p2, 0, p3, 0) 651 mask_pixel = _mm_unpacklo_epi16(mask_pixel, 652 _mm_setzero_si128()); 653 654 // Process 4 32bit dst pixels 655 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel, 656 mask_pixel, scale); 657 _mm_store_si128(d, result); 658 } 659 660 d++; 661 src += 4; 662 width -= 4; 663 } 664 665 dst = reinterpret_cast<SkPMColor*>(d); 666 } 667 668 while (width > 0) { 669 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src); 670 src++; 671 dst++; 672 width--; 673 } 674} 675 676void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], 677 SkColor color, int width, SkPMColor opaqueDst) { 678 if (width <= 0) { 679 return; 680 } 681 682 int srcR = SkColorGetR(color); 683 int srcG = SkColorGetG(color); 684 int srcB = SkColorGetB(color); 685 686 if (width >= 4) { 687 SkASSERT(((size_t)dst & 0x03) == 0); 688 while (((size_t)dst & 0x0F) != 0) { 689 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); 690 src++; 691 dst++; 692 width--; 693 } 694 695 __m128i *d = reinterpret_cast<__m128i*>(dst); 696 __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 697 srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128()); 698 while (width >= 4) { 699 __m128i dst_pixel = _mm_load_si128(d); 700 __m128i mask_pixel = _mm_loadl_epi64( 701 reinterpret_cast<const __m128i*>(src)); 702 703 // Check whether mask_pixels are equal to 0 and get the highest bit 704 // of each byte of result, if mask pixes are all zero, we will get 705 // pack_cmp to 0xFFFF 706 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel, 707 _mm_setzero_si128())); 708 709 // if mask pixels are not all zero, we will blend the dst pixels 710 if (pack_cmp != 0xFFFF) { 711 // Unpack 4 16bit mask pixels to 712 // (p0, 0, p1, 0, p2, 0, p3, 0) 713 mask_pixel = _mm_unpacklo_epi16(mask_pixel, 714 _mm_setzero_si128()); 715 716 // Process 4 32bit dst pixels 717 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel, 718 mask_pixel); 719 _mm_store_si128(d, result); 720 } 721 722 d++; 723 src += 4; 724 width -= 4; 725 } 726 727 dst = reinterpret_cast<SkPMColor*>(d); 728 } 729 730 while (width > 0) { 731 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst); 732 src++; 733 dst++; 734 width--; 735 } 736} 737