1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include <emmintrin.h> 9#include "SkBitmapProcState_opts_SSE2.h" 10#include "SkBlitRow_opts_SSE2.h" 11#include "SkColorData.h" 12#include "SkColor_opts_SSE2.h" 13#include "SkDither.h" 14#include "SkMSAN.h" 15#include "SkUtils.h" 16 17/* SSE2 version of S32_Blend_BlitRow32() 18 * portable version is in core/SkBlitRow_D32.cpp 19 */ 20void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 21 const SkPMColor* SK_RESTRICT src, 22 int count, U8CPU alpha) { 23 SkASSERT(alpha <= 255); 24 if (count <= 0) { 25 return; 26 } 27 28 uint32_t src_scale = SkAlpha255To256(alpha); 29 30 if (count >= 4) { 31 SkASSERT(((size_t)dst & 0x03) == 0); 32 while (((size_t)dst & 0x0F) != 0) { 33 *dst = SkPMLerp(*src, *dst, src_scale); 34 src++; 35 dst++; 36 count--; 37 } 38 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); 40 __m128i *d = reinterpret_cast<__m128i*>(dst); 41 42 while (count >= 4) { 43 // Load 4 pixels each of src and dest. 44 __m128i src_pixel = _mm_loadu_si128(s); 45 __m128i dst_pixel = _mm_load_si128(d); 46 47 __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale); 48 _mm_store_si128(d, result); 49 s++; 50 d++; 51 count -= 4; 52 } 53 src = reinterpret_cast<const SkPMColor*>(s); 54 dst = reinterpret_cast<SkPMColor*>(d); 55 } 56 57 while (count > 0) { 58 *dst = SkPMLerp(*src, *dst, src_scale); 59 src++; 60 dst++; 61 count--; 62 } 63} 64 65void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 66 const SkPMColor* SK_RESTRICT src, 67 int count, U8CPU alpha) { 68 SkASSERT(alpha <= 255); 69 if (count <= 0) { 70 return; 71 } 72 73 if (count >= 4) { 74 while (((size_t)dst & 0x0F) != 0) { 75 *dst = SkBlendARGB32(*src, *dst, alpha); 76 src++; 77 dst++; 78 count--; 79 } 80 81 const __m128i *s = reinterpret_cast<const __m128i*>(src); 82 __m128i *d = reinterpret_cast<__m128i*>(dst); 83 while (count >= 4) { 84 // Load 4 pixels each of src and dest. 85 __m128i src_pixel = _mm_loadu_si128(s); 86 __m128i dst_pixel = _mm_load_si128(d); 87 88 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha); 89 _mm_store_si128(d, result); 90 s++; 91 d++; 92 count -= 4; 93 } 94 src = reinterpret_cast<const SkPMColor*>(s); 95 dst = reinterpret_cast<SkPMColor*>(d); 96 } 97 98 while (count > 0) { 99 *dst = SkBlendARGB32(*src, *dst, alpha); 100 src++; 101 dst++; 102 count--; 103 } 104} 105 106// The following (left) shifts cause the top 5 bits of the mask components to 107// line up with the corresponding components in an SkPMColor. 108// Note that the mask's RGB16 order may differ from the SkPMColor order. 109#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 110#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 111#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 112 113#if SK_R16x5_R32x5_SHIFT == 0 114 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 115#elif SK_R16x5_R32x5_SHIFT > 0 116 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 117#else 118 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 119#endif 120 121#if SK_G16x5_G32x5_SHIFT == 0 122 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 123#elif SK_G16x5_G32x5_SHIFT > 0 124 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 125#else 126 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 127#endif 128 129#if SK_B16x5_B32x5_SHIFT == 0 130 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 131#elif SK_B16x5_B32x5_SHIFT > 0 132 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 133#else 134 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 135#endif 136 137static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 138 __m128i &mask, __m128i &srcA) { 139 // In the following comments, the components of src, dst and mask are 140 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 141 // by an R, G, B, or A suffix. Components of one of the four pixels that 142 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 143 // example is the blue channel of the second destination pixel. Memory 144 // layout is shown for an ARGB byte order in a color value. 145 146 // src and srcA store 8-bit values interleaved with zeros. 147 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 148 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 149 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 150 // mask stores 16-bit values (compressed three channels) interleaved with zeros. 151 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 152 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 153 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 154 155 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 156 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 157 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 158 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 159 160 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 161 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 162 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 163 164 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 165 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 166 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 167 168 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 169 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 170 // 8-bit position 171 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 172 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 173 mask = _mm_or_si128(_mm_or_si128(r, g), b); 174 175 // Interleave R,G,B into the lower byte of word. 176 // i.e. split the sixteen 8-bit values from mask into two sets of eight 177 // 16-bit values, padded by zero. 178 __m128i maskLo, maskHi; 179 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 180 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 181 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 182 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 183 184 // Upscale from 0..31 to 0..32 185 // (allows to replace division by left-shift further down) 186 // Left-shift each component by 4 and add the result back to that component, 187 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 188 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 189 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 190 191 // Multiply each component of maskLo and maskHi by srcA 192 maskLo = _mm_mullo_epi16(maskLo, srcA); 193 maskHi = _mm_mullo_epi16(maskHi, srcA); 194 195 // Left shift mask components by 8 (divide by 256) 196 maskLo = _mm_srli_epi16(maskLo, 8); 197 maskHi = _mm_srli_epi16(maskHi, 8); 198 199 // Interleave R,G,B into the lower byte of the word 200 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 201 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 202 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 203 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 204 205 // mask = (src - dst) * mask 206 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 207 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 208 209 // mask = (src - dst) * mask >> 5 210 maskLo = _mm_srai_epi16(maskLo, 5); 211 maskHi = _mm_srai_epi16(maskHi, 5); 212 213 // Add two pixels into result. 214 // result = dst + ((src - dst) * mask >> 5) 215 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 216 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 217 218 // Pack into 4 32bit dst pixels. 219 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 220 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 221 // clamping to 255 if necessary. 222 return _mm_packus_epi16(resultLo, resultHi); 223} 224 225static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 226 __m128i &mask) { 227 // In the following comments, the components of src, dst and mask are 228 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 229 // by an R, G, B, or A suffix. Components of one of the four pixels that 230 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 231 // example is the blue channel of the second destination pixel. Memory 232 // layout is shown for an ARGB byte order in a color value. 233 234 // src and srcA store 8-bit values interleaved with zeros. 235 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 236 // mask stores 16-bit values (shown as high and low bytes) interleaved with 237 // zeros 238 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 239 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 240 241 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 242 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 243 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 244 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 245 246 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 247 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 248 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 249 250 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 251 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 252 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 253 254 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 255 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 256 // 8-bit position 257 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 258 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 259 mask = _mm_or_si128(_mm_or_si128(r, g), b); 260 261 // Interleave R,G,B into the lower byte of word. 262 // i.e. split the sixteen 8-bit values from mask into two sets of eight 263 // 16-bit values, padded by zero. 264 __m128i maskLo, maskHi; 265 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 266 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 267 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 268 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 269 270 // Upscale from 0..31 to 0..32 271 // (allows to replace division by left-shift further down) 272 // Left-shift each component by 4 and add the result back to that component, 273 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 274 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 275 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 276 277 // Interleave R,G,B into the lower byte of the word 278 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 279 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 280 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 281 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 282 283 // mask = (src - dst) * mask 284 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 285 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 286 287 // mask = (src - dst) * mask >> 5 288 maskLo = _mm_srai_epi16(maskLo, 5); 289 maskHi = _mm_srai_epi16(maskHi, 5); 290 291 // Add two pixels into result. 292 // result = dst + ((src - dst) * mask >> 5) 293 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 294 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 295 296 // Pack into 4 32bit dst pixels and force opaque. 297 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 298 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 299 // clamping to 255 if necessary. Set alpha components to 0xFF. 300 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 301 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 302} 303 304void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 305 SkColor src, int width, SkPMColor) { 306 if (width <= 0) { 307 return; 308 } 309 310 int srcA = SkColorGetA(src); 311 int srcR = SkColorGetR(src); 312 int srcG = SkColorGetG(src); 313 int srcB = SkColorGetB(src); 314 315 srcA = SkAlpha255To256(srcA); 316 317 if (width >= 4) { 318 SkASSERT(((size_t)dst & 0x03) == 0); 319 while (((size_t)dst & 0x0F) != 0) { 320 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 321 mask++; 322 dst++; 323 width--; 324 } 325 326 __m128i *d = reinterpret_cast<__m128i*>(dst); 327 // Set alpha to 0xFF and replicate source four times in SSE register. 328 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 329 // Interleave with zeros to get two sets of four 16-bit values. 330 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 331 // Set srcA_sse to contain eight copies of srcA, padded with zero. 332 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 333 __m128i srcA_sse = _mm_set1_epi16(srcA); 334 while (width >= 4) { 335 // Load four destination pixels into dst_sse. 336 __m128i dst_sse = _mm_load_si128(d); 337 // Load four 16-bit masks into lower half of mask_sse. 338 __m128i mask_sse = _mm_loadl_epi64( 339 reinterpret_cast<const __m128i*>(mask)); 340 341 // Check whether masks are equal to 0 and get the highest bit 342 // of each byte of result, if masks are all zero, we will get 343 // pack_cmp to 0xFFFF 344 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 345 _mm_setzero_si128())); 346 347 // if mask pixels are not all zero, we will blend the dst pixels 348 if (pack_cmp != 0xFFFF) { 349 // Unpack 4 16bit mask pixels to 350 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 351 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 352 mask_sse = _mm_unpacklo_epi16(mask_sse, 353 _mm_setzero_si128()); 354 355 // Process 4 32bit dst pixels 356 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 357 mask_sse, srcA_sse); 358 _mm_store_si128(d, result); 359 } 360 361 d++; 362 mask += 4; 363 width -= 4; 364 } 365 366 dst = reinterpret_cast<SkPMColor*>(d); 367 } 368 369 while (width > 0) { 370 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 371 mask++; 372 dst++; 373 width--; 374 } 375} 376 377void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 378 SkColor src, int width, SkPMColor opaqueDst) { 379 if (width <= 0) { 380 return; 381 } 382 383 int srcR = SkColorGetR(src); 384 int srcG = SkColorGetG(src); 385 int srcB = SkColorGetB(src); 386 387 if (width >= 4) { 388 SkASSERT(((size_t)dst & 0x03) == 0); 389 while (((size_t)dst & 0x0F) != 0) { 390 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 391 mask++; 392 dst++; 393 width--; 394 } 395 396 __m128i *d = reinterpret_cast<__m128i*>(dst); 397 // Set alpha to 0xFF and replicate source four times in SSE register. 398 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 399 // Set srcA_sse to contain eight copies of srcA, padded with zero. 400 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 401 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 402 while (width >= 4) { 403 // Load four destination pixels into dst_sse. 404 __m128i dst_sse = _mm_load_si128(d); 405 // Load four 16-bit masks into lower half of mask_sse. 406 __m128i mask_sse = _mm_loadl_epi64( 407 reinterpret_cast<const __m128i*>(mask)); 408 409 // Check whether masks are equal to 0 and get the highest bit 410 // of each byte of result, if masks are all zero, we will get 411 // pack_cmp to 0xFFFF 412 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 413 _mm_setzero_si128())); 414 415 // if mask pixels are not all zero, we will blend the dst pixels 416 if (pack_cmp != 0xFFFF) { 417 // Unpack 4 16bit mask pixels to 418 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 419 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 420 mask_sse = _mm_unpacklo_epi16(mask_sse, 421 _mm_setzero_si128()); 422 423 // Process 4 32bit dst pixels 424 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 425 mask_sse); 426 _mm_store_si128(d, result); 427 } 428 429 d++; 430 mask += 4; 431 width -= 4; 432 } 433 434 dst = reinterpret_cast<SkPMColor*>(d); 435 } 436 437 while (width > 0) { 438 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 439 mask++; 440 dst++; 441 width--; 442 } 443} 444