SkBlitRow_opts_SSE2.cpp revision 39ce33a1facae795eb2f02e35674702de7eb23b5
1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 9#include "SkBlitRow_opts_SSE2.h" 10#include "SkBitmapProcState_opts_SSE2.h" 11#include "SkColorPriv.h" 12#include "SkColor_opts_SSE2.h" 13#include "SkUtils.h" 14 15#include <emmintrin.h> 16 17/* SSE2 version of S32_Blend_BlitRow32() 18 * portable version is in core/SkBlitRow_D32.cpp 19 */ 20void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 21 const SkPMColor* SK_RESTRICT src, 22 int count, U8CPU alpha) { 23 SkASSERT(alpha <= 255); 24 if (count <= 0) { 25 return; 26 } 27 28 uint32_t src_scale = SkAlpha255To256(alpha); 29 uint32_t dst_scale = 256 - src_scale; 30 31 if (count >= 4) { 32 SkASSERT(((size_t)dst & 0x03) == 0); 33 while (((size_t)dst & 0x0F) != 0) { 34 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 35 src++; 36 dst++; 37 count--; 38 } 39 40 const __m128i *s = reinterpret_cast<const __m128i*>(src); 41 __m128i *d = reinterpret_cast<__m128i*>(dst); 42 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 43 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 44 45 // Move scale factors to upper byte of word 46 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 47 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 48 while (count >= 4) { 49 // Load 4 pixels each of src and dest. 50 __m128i src_pixel = _mm_loadu_si128(s); 51 __m128i dst_pixel = _mm_load_si128(d); 52 53 // Interleave Atom port 0/1 operations based on the execution port 54 // constraints that multiply can only be executed on port 0 (while 55 // boolean operations can be executed on either port 0 or port 1) 56 // because GCC currently doesn't do a good job scheduling 57 // instructions based on these constraints. 58 59 // Get red and blue pixels into lower byte of each word. 60 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 61 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 62 63 // Multiply by scale. 64 // (4 x (0, rs.h, 0, bs.h)) 65 // where rs.h stands for the higher byte of r * scale, and 66 // bs.h the higher byte of b * scale. 67 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 68 69 // Get alpha and green pixels into higher byte of each word. 70 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 71 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 72 73 // Multiply by scale. 74 // (4 x (as.h, as.l, gs.h, gs.l)) 75 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 76 77 // Clear the lower byte of the a*scale and g*scale results 78 // (4 x (as.h, 0, gs.h, 0)) 79 src_ag = _mm_and_si128(src_ag, ag_mask); 80 81 // Operations the destination pixels are the same as on the 82 // source pixels. See the comments above. 83 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 84 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 85 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 86 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 87 dst_ag = _mm_and_si128(dst_ag, ag_mask); 88 89 // Combine back into RGBA. 90 // (4 x (as.h, rs.h, gs.h, bs.h)) 91 src_pixel = _mm_or_si128(src_rb, src_ag); 92 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 93 94 // Add result 95 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 96 _mm_store_si128(d, result); 97 s++; 98 d++; 99 count -= 4; 100 } 101 src = reinterpret_cast<const SkPMColor*>(s); 102 dst = reinterpret_cast<SkPMColor*>(d); 103 } 104 105 while (count > 0) { 106 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 107 src++; 108 dst++; 109 count--; 110 } 111} 112 113void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 114 const SkPMColor* SK_RESTRICT src, 115 int count, U8CPU alpha) { 116 SkASSERT(alpha == 255); 117 if (count <= 0) { 118 return; 119 } 120 121 if (count >= 4) { 122 SkASSERT(((size_t)dst & 0x03) == 0); 123 while (((size_t)dst & 0x0F) != 0) { 124 *dst = SkPMSrcOver(*src, *dst); 125 src++; 126 dst++; 127 count--; 128 } 129 130 const __m128i *s = reinterpret_cast<const __m128i*>(src); 131 __m128i *d = reinterpret_cast<__m128i*>(dst); 132#ifdef SK_USE_ACCURATE_BLENDING 133 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 134 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 135 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 136 while (count >= 4) { 137 // Load 4 pixels 138 __m128i src_pixel = _mm_loadu_si128(s); 139 __m128i dst_pixel = _mm_load_si128(d); 140 141 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 142 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 143 // Shift alphas down to lower 8 bits of each quad. 144 __m128i alpha = _mm_srli_epi32(src_pixel, 24); 145 146 // Copy alpha to upper 3rd byte of each quad 147 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 148 149 // Subtract alphas from 255, to get 0..255 150 alpha = _mm_sub_epi16(c_255, alpha); 151 152 // Multiply by red and blue by src alpha. 153 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 154 // Multiply by alpha and green by src alpha. 155 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 156 157 // dst_rb_low = (dst_rb >> 8) 158 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 159 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 160 161 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 162 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 163 dst_rb = _mm_add_epi16(dst_rb, c_128); 164 dst_rb = _mm_srli_epi16(dst_rb, 8); 165 166 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 167 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 168 dst_ag = _mm_add_epi16(dst_ag, c_128); 169 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 170 171 // Combine back into RGBA. 172 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 173 174 // Add result 175 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 176 _mm_store_si128(d, result); 177 s++; 178 d++; 179 count -= 4; 180 } 181 #else 182 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 183 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 184 while (count >= 4) { 185 // Load 4 pixels 186 __m128i src_pixel = _mm_loadu_si128(s); 187 __m128i dst_pixel = _mm_load_si128(d); 188 189 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 190 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 191 192 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 193 __m128i alpha = _mm_srli_epi16(src_pixel, 8); 194 195 // (a0, a0, a1, a1, a2, g2, a3, g3) 196 alpha = _mm_shufflehi_epi16(alpha, 0xF5); 197 198 // (a0, a0, a1, a1, a2, a2, a3, a3) 199 alpha = _mm_shufflelo_epi16(alpha, 0xF5); 200 201 // Subtract alphas from 256, to get 1..256 202 alpha = _mm_sub_epi16(c_256, alpha); 203 204 // Multiply by red and blue by src alpha. 205 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 206 // Multiply by alpha and green by src alpha. 207 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 208 209 // Divide by 256. 210 dst_rb = _mm_srli_epi16(dst_rb, 8); 211 212 // Mask out high bits (already in the right place) 213 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 214 215 // Combine back into RGBA. 216 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 217 218 // Add result 219 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 220 _mm_store_si128(d, result); 221 s++; 222 d++; 223 count -= 4; 224 } 225#endif 226 src = reinterpret_cast<const SkPMColor*>(s); 227 dst = reinterpret_cast<SkPMColor*>(d); 228 } 229 230 while (count > 0) { 231 *dst = SkPMSrcOver(*src, *dst); 232 src++; 233 dst++; 234 count--; 235 } 236} 237 238void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 239 const SkPMColor* SK_RESTRICT src, 240 int count, U8CPU alpha) { 241 SkASSERT(alpha <= 255); 242 if (count <= 0) { 243 return; 244 } 245 246 if (count >= 4) { 247 while (((size_t)dst & 0x0F) != 0) { 248 *dst = SkBlendARGB32(*src, *dst, alpha); 249 src++; 250 dst++; 251 count--; 252 } 253 254 uint32_t src_scale = SkAlpha255To256(alpha); 255 256 const __m128i *s = reinterpret_cast<const __m128i*>(src); 257 __m128i *d = reinterpret_cast<__m128i*>(dst); 258 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 259 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 260 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 261 while (count >= 4) { 262 // Load 4 pixels each of src and dest. 263 __m128i src_pixel = _mm_loadu_si128(s); 264 __m128i dst_pixel = _mm_load_si128(d); 265 266 // Get red and blue pixels into lower byte of each word. 267 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 268 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 269 270 // Get alpha and green into lower byte of each word. 271 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 272 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 273 274 // Put per-pixel alpha in low byte of each word. 275 // After the following two statements, the dst_alpha looks like 276 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 277 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 278 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 279 280 // dst_alpha = dst_alpha * src_scale 281 // Because src_scales are in the higher byte of each word and 282 // we use mulhi here, the resulting alpha values are already 283 // in the right place and don't need to be divided by 256. 284 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 285 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 286 287 // Subtract alphas from 256, to get 1..256 288 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 289 290 // Multiply red and blue by dst pixel alpha. 291 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 292 // Multiply alpha and green by dst pixel alpha. 293 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 294 295 // Multiply red and blue by global alpha. 296 // (4 x (0, rs.h, 0, bs.h)) 297 // where rs.h stands for the higher byte of r * src_scale, 298 // and bs.h the higher byte of b * src_scale. 299 // Again, because we use mulhi, the resuling red and blue 300 // values are already in the right place and don't need to 301 // be divided by 256. 302 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 303 // Multiply alpha and green by global alpha. 304 // (4 x (0, as.h, 0, gs.h)) 305 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 306 307 // Divide by 256. 308 dst_rb = _mm_srli_epi16(dst_rb, 8); 309 310 // Mask out low bits (goodies already in the right place; no need to divide) 311 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 312 // Shift alpha and green to higher byte of each word. 313 // (4 x (as.h, 0, gs.h, 0)) 314 src_ag = _mm_slli_epi16(src_ag, 8); 315 316 // Combine back into RGBA. 317 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 318 src_pixel = _mm_or_si128(src_rb, src_ag); 319 320 // Add two pixels into result. 321 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 322 _mm_store_si128(d, result); 323 s++; 324 d++; 325 count -= 4; 326 } 327 src = reinterpret_cast<const SkPMColor*>(s); 328 dst = reinterpret_cast<SkPMColor*>(d); 329 } 330 331 while (count > 0) { 332 *dst = SkBlendARGB32(*src, *dst, alpha); 333 src++; 334 dst++; 335 count--; 336 } 337} 338 339/* SSE2 version of Color32() 340 * portable version is in core/SkBlitRow_D32.cpp 341 */ 342void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 343 SkPMColor color) { 344 345 if (count <= 0) { 346 return; 347 } 348 349 if (0 == color) { 350 if (src != dst) { 351 memcpy(dst, src, count * sizeof(SkPMColor)); 352 } 353 return; 354 } 355 356 unsigned colorA = SkGetPackedA32(color); 357 if (255 == colorA) { 358 sk_memset32(dst, color, count); 359 } else { 360 unsigned scale = 256 - SkAlpha255To256(colorA); 361 362 if (count >= 4) { 363 SkASSERT(((size_t)dst & 0x03) == 0); 364 while (((size_t)dst & 0x0F) != 0) { 365 *dst = color + SkAlphaMulQ(*src, scale); 366 src++; 367 dst++; 368 count--; 369 } 370 371 const __m128i *s = reinterpret_cast<const __m128i*>(src); 372 __m128i *d = reinterpret_cast<__m128i*>(dst); 373 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 374 __m128i src_scale_wide = _mm_set1_epi16(scale); 375 __m128i color_wide = _mm_set1_epi32(color); 376 while (count >= 4) { 377 // Load 4 pixels each of src and dest. 378 __m128i src_pixel = _mm_loadu_si128(s); 379 380 // Get red and blue pixels into lower byte of each word. 381 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 382 383 // Get alpha and green into lower byte of each word. 384 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 385 386 // Multiply by scale. 387 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 388 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 389 390 // Divide by 256. 391 src_rb = _mm_srli_epi16(src_rb, 8); 392 src_ag = _mm_andnot_si128(rb_mask, src_ag); 393 394 // Combine back into RGBA. 395 src_pixel = _mm_or_si128(src_rb, src_ag); 396 397 // Add color to result. 398 __m128i result = _mm_add_epi8(color_wide, src_pixel); 399 400 // Store result. 401 _mm_store_si128(d, result); 402 s++; 403 d++; 404 count -= 4; 405 } 406 src = reinterpret_cast<const SkPMColor*>(s); 407 dst = reinterpret_cast<SkPMColor*>(d); 408 } 409 410 while (count > 0) { 411 *dst = color + SkAlphaMulQ(*src, scale); 412 src += 1; 413 dst += 1; 414 count--; 415 } 416 } 417} 418 419void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 420 size_t maskRB, SkColor origColor, 421 int width, int height) { 422 SkPMColor color = SkPreMultiplyColor(origColor); 423 size_t dstOffset = dstRB - (width << 2); 424 size_t maskOffset = maskRB - width; 425 SkPMColor* dst = (SkPMColor *)device; 426 const uint8_t* mask = (const uint8_t*)maskPtr; 427 do { 428 int count = width; 429 if (count >= 4) { 430 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 431 *dst = SkBlendARGB32(color, *dst, *mask); 432 mask++; 433 dst++; 434 count--; 435 } 436 __m128i *d = reinterpret_cast<__m128i*>(dst); 437 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 438 __m128i c_256 = _mm_set1_epi16(256); 439 __m128i c_1 = _mm_set1_epi16(1); 440 __m128i src_pixel = _mm_set1_epi32(color); 441 while (count >= 4) { 442 // Load 4 pixels each of src and dest. 443 __m128i dst_pixel = _mm_load_si128(d); 444 445 //set the aphla value 446 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 447 0, *(mask+3),0, \ 448 *(mask+2),0, *(mask+2),\ 449 0,*(mask+1), 0,*(mask+1),\ 450 0, *mask,0,*mask); 451 452 //call SkAlpha255To256() 453 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 454 455 // Get red and blue pixels into lower byte of each word. 456 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 457 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 458 459 // Get alpha and green into lower byte of each word. 460 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 461 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 462 463 // Put per-pixel alpha in low byte of each word. 464 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 465 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 466 467 // dst_alpha = dst_alpha * src_scale 468 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 469 470 // Divide by 256. 471 dst_alpha = _mm_srli_epi16(dst_alpha, 8); 472 473 // Subtract alphas from 256, to get 1..256 474 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 475 // Multiply red and blue by dst pixel alpha. 476 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 477 // Multiply alpha and green by dst pixel alpha. 478 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 479 480 // Multiply red and blue by global alpha. 481 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 482 // Multiply alpha and green by global alpha. 483 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 484 // Divide by 256. 485 dst_rb = _mm_srli_epi16(dst_rb, 8); 486 src_rb = _mm_srli_epi16(src_rb, 8); 487 488 // Mask out low bits (goodies already in the right place; no need to divide) 489 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 490 src_ag = _mm_andnot_si128(rb_mask, src_ag); 491 492 // Combine back into RGBA. 493 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 494 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 495 496 // Add two pixels into result. 497 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 498 _mm_store_si128(d, result); 499 // load the next 4 pixel 500 mask = mask + 4; 501 d++; 502 count -= 4; 503 } 504 dst = reinterpret_cast<SkPMColor *>(d); 505 } 506 while(count > 0) { 507 *dst= SkBlendARGB32(color, *dst, *mask); 508 dst += 1; 509 mask++; 510 count --; 511 } 512 dst = (SkPMColor *)((char*)dst + dstOffset); 513 mask += maskOffset; 514 } while (--height != 0); 515} 516 517// The following (left) shifts cause the top 5 bits of the mask components to 518// line up with the corresponding components in an SkPMColor. 519// Note that the mask's RGB16 order may differ from the SkPMColor order. 520#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 521#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 522#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 523 524#if SK_R16x5_R32x5_SHIFT == 0 525 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 526#elif SK_R16x5_R32x5_SHIFT > 0 527 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 528#else 529 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 530#endif 531 532#if SK_G16x5_G32x5_SHIFT == 0 533 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 534#elif SK_G16x5_G32x5_SHIFT > 0 535 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 536#else 537 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 538#endif 539 540#if SK_B16x5_B32x5_SHIFT == 0 541 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 542#elif SK_B16x5_B32x5_SHIFT > 0 543 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 544#else 545 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 546#endif 547 548static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 549 __m128i &mask, __m128i &srcA) { 550 // In the following comments, the components of src, dst and mask are 551 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 552 // by an R, G, B, or A suffix. Components of one of the four pixels that 553 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 554 // example is the blue channel of the second destination pixel. Memory 555 // layout is shown for an ARGB byte order in a color value. 556 557 // src and srcA store 8-bit values interleaved with zeros. 558 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 559 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 560 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 561 // mask stores 16-bit values (compressed three channels) interleaved with zeros. 562 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 563 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 564 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 565 566 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 567 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 568 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 569 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 570 571 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 572 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 573 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 574 575 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 576 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 577 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 578 579 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 580 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 581 // 8-bit position 582 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 583 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 584 mask = _mm_or_si128(_mm_or_si128(r, g), b); 585 586 // Interleave R,G,B into the lower byte of word. 587 // i.e. split the sixteen 8-bit values from mask into two sets of eight 588 // 16-bit values, padded by zero. 589 __m128i maskLo, maskHi; 590 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 591 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 592 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 593 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 594 595 // Upscale from 0..31 to 0..32 596 // (allows to replace division by left-shift further down) 597 // Left-shift each component by 4 and add the result back to that component, 598 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 599 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 600 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 601 602 // Multiply each component of maskLo and maskHi by srcA 603 maskLo = _mm_mullo_epi16(maskLo, srcA); 604 maskHi = _mm_mullo_epi16(maskHi, srcA); 605 606 // Left shift mask components by 8 (divide by 256) 607 maskLo = _mm_srli_epi16(maskLo, 8); 608 maskHi = _mm_srli_epi16(maskHi, 8); 609 610 // Interleave R,G,B into the lower byte of the word 611 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 612 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 613 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 614 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 615 616 // mask = (src - dst) * mask 617 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 618 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 619 620 // mask = (src - dst) * mask >> 5 621 maskLo = _mm_srai_epi16(maskLo, 5); 622 maskHi = _mm_srai_epi16(maskHi, 5); 623 624 // Add two pixels into result. 625 // result = dst + ((src - dst) * mask >> 5) 626 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 627 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 628 629 // Pack into 4 32bit dst pixels. 630 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 631 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 632 // clamping to 255 if necessary. 633 return _mm_packus_epi16(resultLo, resultHi); 634} 635 636static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 637 __m128i &mask) { 638 // In the following comments, the components of src, dst and mask are 639 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 640 // by an R, G, B, or A suffix. Components of one of the four pixels that 641 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 642 // example is the blue channel of the second destination pixel. Memory 643 // layout is shown for an ARGB byte order in a color value. 644 645 // src and srcA store 8-bit values interleaved with zeros. 646 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 647 // mask stores 16-bit values (shown as high and low bytes) interleaved with 648 // zeros 649 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 650 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 651 652 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 653 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 654 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 655 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 656 657 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 658 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 659 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 660 661 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 662 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 663 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 664 665 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 666 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 667 // 8-bit position 668 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 669 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 670 mask = _mm_or_si128(_mm_or_si128(r, g), b); 671 672 // Interleave R,G,B into the lower byte of word. 673 // i.e. split the sixteen 8-bit values from mask into two sets of eight 674 // 16-bit values, padded by zero. 675 __m128i maskLo, maskHi; 676 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 677 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 678 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 679 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 680 681 // Upscale from 0..31 to 0..32 682 // (allows to replace division by left-shift further down) 683 // Left-shift each component by 4 and add the result back to that component, 684 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 685 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 686 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 687 688 // Interleave R,G,B into the lower byte of the word 689 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 690 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 691 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 692 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 693 694 // mask = (src - dst) * mask 695 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 696 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 697 698 // mask = (src - dst) * mask >> 5 699 maskLo = _mm_srai_epi16(maskLo, 5); 700 maskHi = _mm_srai_epi16(maskHi, 5); 701 702 // Add two pixels into result. 703 // result = dst + ((src - dst) * mask >> 5) 704 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 705 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 706 707 // Pack into 4 32bit dst pixels and force opaque. 708 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 709 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 710 // clamping to 255 if necessary. Set alpha components to 0xFF. 711 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 712 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 713} 714 715void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 716 SkColor src, int width, SkPMColor) { 717 if (width <= 0) { 718 return; 719 } 720 721 int srcA = SkColorGetA(src); 722 int srcR = SkColorGetR(src); 723 int srcG = SkColorGetG(src); 724 int srcB = SkColorGetB(src); 725 726 srcA = SkAlpha255To256(srcA); 727 728 if (width >= 4) { 729 SkASSERT(((size_t)dst & 0x03) == 0); 730 while (((size_t)dst & 0x0F) != 0) { 731 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 732 mask++; 733 dst++; 734 width--; 735 } 736 737 __m128i *d = reinterpret_cast<__m128i*>(dst); 738 // Set alpha to 0xFF and replicate source four times in SSE register. 739 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 740 // Interleave with zeros to get two sets of four 16-bit values. 741 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 742 // Set srcA_sse to contain eight copies of srcA, padded with zero. 743 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 744 __m128i srcA_sse = _mm_set1_epi16(srcA); 745 while (width >= 4) { 746 // Load four destination pixels into dst_sse. 747 __m128i dst_sse = _mm_load_si128(d); 748 // Load four 16-bit masks into lower half of mask_sse. 749 __m128i mask_sse = _mm_loadl_epi64( 750 reinterpret_cast<const __m128i*>(mask)); 751 752 // Check whether masks are equal to 0 and get the highest bit 753 // of each byte of result, if masks are all zero, we will get 754 // pack_cmp to 0xFFFF 755 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 756 _mm_setzero_si128())); 757 758 // if mask pixels are not all zero, we will blend the dst pixels 759 if (pack_cmp != 0xFFFF) { 760 // Unpack 4 16bit mask pixels to 761 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 762 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 763 mask_sse = _mm_unpacklo_epi16(mask_sse, 764 _mm_setzero_si128()); 765 766 // Process 4 32bit dst pixels 767 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 768 mask_sse, srcA_sse); 769 _mm_store_si128(d, result); 770 } 771 772 d++; 773 mask += 4; 774 width -= 4; 775 } 776 777 dst = reinterpret_cast<SkPMColor*>(d); 778 } 779 780 while (width > 0) { 781 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 782 mask++; 783 dst++; 784 width--; 785 } 786} 787 788void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 789 SkColor src, int width, SkPMColor opaqueDst) { 790 if (width <= 0) { 791 return; 792 } 793 794 int srcR = SkColorGetR(src); 795 int srcG = SkColorGetG(src); 796 int srcB = SkColorGetB(src); 797 798 if (width >= 4) { 799 SkASSERT(((size_t)dst & 0x03) == 0); 800 while (((size_t)dst & 0x0F) != 0) { 801 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 802 mask++; 803 dst++; 804 width--; 805 } 806 807 __m128i *d = reinterpret_cast<__m128i*>(dst); 808 // Set alpha to 0xFF and replicate source four times in SSE register. 809 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 810 // Set srcA_sse to contain eight copies of srcA, padded with zero. 811 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 812 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 813 while (width >= 4) { 814 // Load four destination pixels into dst_sse. 815 __m128i dst_sse = _mm_load_si128(d); 816 // Load four 16-bit masks into lower half of mask_sse. 817 __m128i mask_sse = _mm_loadl_epi64( 818 reinterpret_cast<const __m128i*>(mask)); 819 820 // Check whether masks are equal to 0 and get the highest bit 821 // of each byte of result, if masks are all zero, we will get 822 // pack_cmp to 0xFFFF 823 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 824 _mm_setzero_si128())); 825 826 // if mask pixels are not all zero, we will blend the dst pixels 827 if (pack_cmp != 0xFFFF) { 828 // Unpack 4 16bit mask pixels to 829 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 830 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 831 mask_sse = _mm_unpacklo_epi16(mask_sse, 832 _mm_setzero_si128()); 833 834 // Process 4 32bit dst pixels 835 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 836 mask_sse); 837 _mm_store_si128(d, result); 838 } 839 840 d++; 841 mask += 4; 842 width -= 4; 843 } 844 845 dst = reinterpret_cast<SkPMColor*>(d); 846 } 847 848 while (width > 0) { 849 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 850 mask++; 851 dst++; 852 width--; 853 } 854} 855 856/* SSE2 version of S32_D565_Opaque() 857 * portable version is in core/SkBlitRow_D16.cpp 858 */ 859void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 860 const SkPMColor* SK_RESTRICT src, int count, 861 U8CPU alpha, int /*x*/, int /*y*/) { 862 SkASSERT(255 == alpha); 863 864 if (count <= 0) { 865 return; 866 } 867 868 if (count >= 8) { 869 while (((size_t)dst & 0x0F) != 0) { 870 SkPMColor c = *src++; 871 SkPMColorAssert(c); 872 873 *dst++ = SkPixel32ToPixel16_ToU16(c); 874 count--; 875 } 876 877 const __m128i* s = reinterpret_cast<const __m128i*>(src); 878 __m128i* d = reinterpret_cast<__m128i*>(dst); 879 __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK); 880 __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK); 881 __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK); 882 883 while (count >= 8) { 884 // Load 8 pixels of src. 885 __m128i src_pixel1 = _mm_loadu_si128(s++); 886 __m128i src_pixel2 = _mm_loadu_si128(s++); 887 888 // Calculate result r. 889 __m128i r1 = _mm_srli_epi32(src_pixel1, 890 SK_R32_SHIFT + (8 - SK_R16_BITS)); 891 r1 = _mm_and_si128(r1, r16_mask); 892 __m128i r2 = _mm_srli_epi32(src_pixel2, 893 SK_R32_SHIFT + (8 - SK_R16_BITS)); 894 r2 = _mm_and_si128(r2, r16_mask); 895 __m128i r = _mm_packs_epi32(r1, r2); 896 897 // Calculate result g. 898 __m128i g1 = _mm_srli_epi32(src_pixel1, 899 SK_G32_SHIFT + (8 - SK_G16_BITS)); 900 g1 = _mm_and_si128(g1, g16_mask); 901 __m128i g2 = _mm_srli_epi32(src_pixel2, 902 SK_G32_SHIFT + (8 - SK_G16_BITS)); 903 g2 = _mm_and_si128(g2, g16_mask); 904 __m128i g = _mm_packs_epi32(g1, g2); 905 906 // Calculate result b. 907 __m128i b1 = _mm_srli_epi32(src_pixel1, 908 SK_B32_SHIFT + (8 - SK_B16_BITS)); 909 b1 = _mm_and_si128(b1, b16_mask); 910 __m128i b2 = _mm_srli_epi32(src_pixel2, 911 SK_B32_SHIFT + (8 - SK_B16_BITS)); 912 b2 = _mm_and_si128(b2, b16_mask); 913 __m128i b = _mm_packs_epi32(b1, b2); 914 915 // Store 8 16-bit colors in dst. 916 __m128i d_pixel = SkPackRGB16_SSE(r, g, b); 917 _mm_store_si128(d++, d_pixel); 918 count -= 8; 919 } 920 src = reinterpret_cast<const SkPMColor*>(s); 921 dst = reinterpret_cast<uint16_t*>(d); 922 } 923 924 if (count > 0) { 925 do { 926 SkPMColor c = *src++; 927 SkPMColorAssert(c); 928 *dst++ = SkPixel32ToPixel16_ToU16(c); 929 } while (--count != 0); 930 } 931} 932 933/* SSE2 version of S32A_D565_Opaque() 934 * portable version is in core/SkBlitRow_D16.cpp 935 */ 936void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 937 const SkPMColor* SK_RESTRICT src, 938 int count, U8CPU alpha, int /*x*/, int /*y*/) { 939 SkASSERT(255 == alpha); 940 941 if (count <= 0) { 942 return; 943 } 944 945 if (count >= 8) { 946 // Make dst 16 bytes alignment 947 while (((size_t)dst & 0x0F) != 0) { 948 SkPMColor c = *src++; 949 if (c) { 950 *dst = SkSrcOver32To16(c, *dst); 951 } 952 dst += 1; 953 count--; 954 } 955 956 const __m128i* s = reinterpret_cast<const __m128i*>(src); 957 __m128i* d = reinterpret_cast<__m128i*>(dst); 958 __m128i var255 = _mm_set1_epi16(255); 959 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 960 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 961 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 962 963 while (count >= 8) { 964 // Load 8 pixels of src. 965 __m128i src_pixel1 = _mm_loadu_si128(s++); 966 __m128i src_pixel2 = _mm_loadu_si128(s++); 967 968 // Check whether src pixels are equal to 0 and get the highest bit 969 // of each byte of result, if src pixels are all zero, src_cmp1 and 970 // src_cmp2 will be 0xFFFF. 971 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, 972 _mm_setzero_si128())); 973 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, 974 _mm_setzero_si128())); 975 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { 976 d++; 977 count -= 8; 978 continue; 979 } 980 981 // Load 8 pixels of dst. 982 __m128i dst_pixel = _mm_load_si128(d); 983 984 // Extract A from src. 985 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); 986 sa1 = _mm_srli_epi32(sa1, 24); 987 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); 988 sa2 = _mm_srli_epi32(sa2, 24); 989 __m128i sa = _mm_packs_epi32(sa1, sa2); 990 991 // Extract R from src. 992 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT)); 993 sr1 = _mm_srli_epi32(sr1, 24); 994 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT)); 995 sr2 = _mm_srli_epi32(sr2, 24); 996 __m128i sr = _mm_packs_epi32(sr1, sr2); 997 998 // Extract G from src. 999 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT)); 1000 sg1 = _mm_srli_epi32(sg1, 24); 1001 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT)); 1002 sg2 = _mm_srli_epi32(sg2, 24); 1003 __m128i sg = _mm_packs_epi32(sg1, sg2); 1004 1005 // Extract B from src. 1006 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT)); 1007 sb1 = _mm_srli_epi32(sb1, 24); 1008 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT)); 1009 sb2 = _mm_srli_epi32(sb2, 24); 1010 __m128i sb = _mm_packs_epi32(sb1, sb2); 1011 1012 // Extract R G B from dst. 1013 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT); 1014 dr = _mm_and_si128(dr, r16_mask); 1015 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT); 1016 dg = _mm_and_si128(dg, g16_mask); 1017 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT); 1018 db = _mm_and_si128(db, b16_mask); 1019 1020 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa 1021 1022 // Calculate R G B of result. 1023 // Original algorithm is in SkSrcOver32To16(). 1024 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS)); 1025 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); 1026 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS)); 1027 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); 1028 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS)); 1029 db = _mm_srli_epi16(db, 8 - SK_B16_BITS); 1030 1031 // Pack R G B into 16-bit color. 1032 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); 1033 1034 // Store 8 16-bit colors in dst. 1035 _mm_store_si128(d++, d_pixel); 1036 count -= 8; 1037 } 1038 1039 src = reinterpret_cast<const SkPMColor*>(s); 1040 dst = reinterpret_cast<uint16_t*>(d); 1041 } 1042 1043 if (count > 0) { 1044 do { 1045 SkPMColor c = *src++; 1046 SkPMColorAssert(c); 1047 if (c) { 1048 *dst = SkSrcOver32To16(c, *dst); 1049 } 1050 dst += 1; 1051 } while (--count != 0); 1052 } 1053} 1054