1/* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8#include <emmintrin.h> 9#include "SkBitmapProcState_opts_SSE2.h" 10#include "SkBlitRow_opts_SSE2.h" 11#include "SkColorPriv.h" 12#include "SkColor_opts_SSE2.h" 13#include "SkDither.h" 14#include "SkMSAN.h" 15#include "SkUtils.h" 16 17/* SSE2 version of S32_Blend_BlitRow32() 18 * portable version is in core/SkBlitRow_D32.cpp 19 */ 20void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 21 const SkPMColor* SK_RESTRICT src, 22 int count, U8CPU alpha) { 23 SkASSERT(alpha <= 255); 24 if (count <= 0) { 25 return; 26 } 27 28 uint32_t src_scale = SkAlpha255To256(alpha); 29 30 if (count >= 4) { 31 SkASSERT(((size_t)dst & 0x03) == 0); 32 while (((size_t)dst & 0x0F) != 0) { 33 *dst = SkPMLerp(*src, *dst, src_scale); 34 src++; 35 dst++; 36 count--; 37 } 38 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); 40 __m128i *d = reinterpret_cast<__m128i*>(dst); 41 42 while (count >= 4) { 43 // Load 4 pixels each of src and dest. 44 __m128i src_pixel = _mm_loadu_si128(s); 45 __m128i dst_pixel = _mm_load_si128(d); 46 47 __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale); 48 _mm_store_si128(d, result); 49 s++; 50 d++; 51 count -= 4; 52 } 53 src = reinterpret_cast<const SkPMColor*>(s); 54 dst = reinterpret_cast<SkPMColor*>(d); 55 } 56 57 while (count > 0) { 58 *dst = SkPMLerp(*src, *dst, src_scale); 59 src++; 60 dst++; 61 count--; 62 } 63} 64 65void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 66 const SkPMColor* SK_RESTRICT src, 67 int count, U8CPU alpha) { 68 SkASSERT(alpha <= 255); 69 if (count <= 0) { 70 return; 71 } 72 73 if (count >= 4) { 74 while (((size_t)dst & 0x0F) != 0) { 75 *dst = SkBlendARGB32(*src, *dst, alpha); 76 src++; 77 dst++; 78 count--; 79 } 80 81 const __m128i *s = reinterpret_cast<const __m128i*>(src); 82 __m128i *d = reinterpret_cast<__m128i*>(dst); 83 while (count >= 4) { 84 // Load 4 pixels each of src and dest. 85 __m128i src_pixel = _mm_loadu_si128(s); 86 __m128i dst_pixel = _mm_load_si128(d); 87 88 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha); 89 _mm_store_si128(d, result); 90 s++; 91 d++; 92 count -= 4; 93 } 94 src = reinterpret_cast<const SkPMColor*>(s); 95 dst = reinterpret_cast<SkPMColor*>(d); 96 } 97 98 while (count > 0) { 99 *dst = SkBlendARGB32(*src, *dst, alpha); 100 src++; 101 dst++; 102 count--; 103 } 104} 105 106void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) { 107 SkASSERT(count > 0); 108 109 uint32_t src_expand = (SkGetPackedG32(src) << 24) | 110 (SkGetPackedR32(src) << 13) | 111 (SkGetPackedB32(src) << 2); 112 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; 113 114 // Check if we have enough pixels to run SIMD 115 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { 116 __m128i* dst_wide; 117 const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2); 118 const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3); 119 const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2); 120 const __m128i scale_wide = _mm_set1_epi16(scale); 121 const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK); 122 const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT); 123 124 // Align dst to an even 16 byte address (0-7 pixels) 125 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { 126 *dst = SkBlend32_RGB16(src_expand, *dst, scale); 127 dst += 1; 128 count--; 129 } 130 131 dst_wide = reinterpret_cast<__m128i*>(dst); 132 do { 133 // Load eight RGB565 pixels 134 __m128i pixels = _mm_load_si128(dst_wide); 135 136 // Mask out sub-pixels 137 __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT); 138 __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS); 139 pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS); 140 __m128i pixel_B = _mm_and_si128(pixels, mask_blue); 141 142 // Scale with alpha 143 pixel_R = _mm_mullo_epi16(pixel_R, scale_wide); 144 pixel_G = _mm_mullo_epi16(pixel_G, scale_wide); 145 pixel_B = _mm_mullo_epi16(pixel_B, scale_wide); 146 147 // Add src_X_wide and shift down again 148 pixel_R = _mm_add_epi16(pixel_R, src_R_wide); 149 pixel_R = _mm_srli_epi16(pixel_R, 5); 150 pixel_G = _mm_add_epi16(pixel_G, src_G_wide); 151 pixel_B = _mm_add_epi16(pixel_B, src_B_wide); 152 pixel_B = _mm_srli_epi16(pixel_B, 5); 153 154 // Combine into RGB565 and store 155 pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT); 156 pixel_G = _mm_and_si128(pixel_G, mask_green); 157 pixels = _mm_or_si128(pixel_R, pixel_G); 158 pixels = _mm_or_si128(pixels, pixel_B); 159 _mm_store_si128(dst_wide, pixels); 160 count -= 8; 161 dst_wide++; 162 } while (count >= 8); 163 164 dst = reinterpret_cast<uint16_t*>(dst_wide); 165 } 166 167 // Small loop to handle remaining pixels. 168 while (count > 0) { 169 *dst = SkBlend32_RGB16(src_expand, *dst, scale); 170 dst += 1; 171 count--; 172 } 173} 174 175// The following (left) shifts cause the top 5 bits of the mask components to 176// line up with the corresponding components in an SkPMColor. 177// Note that the mask's RGB16 order may differ from the SkPMColor order. 178#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 179#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 180#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 181 182#if SK_R16x5_R32x5_SHIFT == 0 183 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 184#elif SK_R16x5_R32x5_SHIFT > 0 185 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 186#else 187 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 188#endif 189 190#if SK_G16x5_G32x5_SHIFT == 0 191 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 192#elif SK_G16x5_G32x5_SHIFT > 0 193 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 194#else 195 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 196#endif 197 198#if SK_B16x5_B32x5_SHIFT == 0 199 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 200#elif SK_B16x5_B32x5_SHIFT > 0 201 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 202#else 203 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 204#endif 205 206static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 207 __m128i &mask, __m128i &srcA) { 208 // In the following comments, the components of src, dst and mask are 209 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 210 // by an R, G, B, or A suffix. Components of one of the four pixels that 211 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 212 // example is the blue channel of the second destination pixel. Memory 213 // layout is shown for an ARGB byte order in a color value. 214 215 // src and srcA store 8-bit values interleaved with zeros. 216 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 217 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 218 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 219 // mask stores 16-bit values (compressed three channels) interleaved with zeros. 220 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 221 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 222 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 223 224 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 225 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 226 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 227 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 228 229 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 230 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 231 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 232 233 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 234 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 235 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 236 237 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 238 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 239 // 8-bit position 240 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 241 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 242 mask = _mm_or_si128(_mm_or_si128(r, g), b); 243 244 // Interleave R,G,B into the lower byte of word. 245 // i.e. split the sixteen 8-bit values from mask into two sets of eight 246 // 16-bit values, padded by zero. 247 __m128i maskLo, maskHi; 248 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 249 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 250 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 251 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 252 253 // Upscale from 0..31 to 0..32 254 // (allows to replace division by left-shift further down) 255 // Left-shift each component by 4 and add the result back to that component, 256 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 257 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 258 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 259 260 // Multiply each component of maskLo and maskHi by srcA 261 maskLo = _mm_mullo_epi16(maskLo, srcA); 262 maskHi = _mm_mullo_epi16(maskHi, srcA); 263 264 // Left shift mask components by 8 (divide by 256) 265 maskLo = _mm_srli_epi16(maskLo, 8); 266 maskHi = _mm_srli_epi16(maskHi, 8); 267 268 // Interleave R,G,B into the lower byte of the word 269 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 270 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 271 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 272 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 273 274 // mask = (src - dst) * mask 275 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 276 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 277 278 // mask = (src - dst) * mask >> 5 279 maskLo = _mm_srai_epi16(maskLo, 5); 280 maskHi = _mm_srai_epi16(maskHi, 5); 281 282 // Add two pixels into result. 283 // result = dst + ((src - dst) * mask >> 5) 284 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 285 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 286 287 // Pack into 4 32bit dst pixels. 288 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 289 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 290 // clamping to 255 if necessary. 291 return _mm_packus_epi16(resultLo, resultHi); 292} 293 294static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 295 __m128i &mask) { 296 // In the following comments, the components of src, dst and mask are 297 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 298 // by an R, G, B, or A suffix. Components of one of the four pixels that 299 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 300 // example is the blue channel of the second destination pixel. Memory 301 // layout is shown for an ARGB byte order in a color value. 302 303 // src and srcA store 8-bit values interleaved with zeros. 304 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 305 // mask stores 16-bit values (shown as high and low bytes) interleaved with 306 // zeros 307 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 308 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 309 310 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 311 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 312 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 313 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 314 315 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 316 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 317 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 318 319 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 320 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 321 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 322 323 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 324 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 325 // 8-bit position 326 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 327 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 328 mask = _mm_or_si128(_mm_or_si128(r, g), b); 329 330 // Interleave R,G,B into the lower byte of word. 331 // i.e. split the sixteen 8-bit values from mask into two sets of eight 332 // 16-bit values, padded by zero. 333 __m128i maskLo, maskHi; 334 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 335 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 336 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 337 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 338 339 // Upscale from 0..31 to 0..32 340 // (allows to replace division by left-shift further down) 341 // Left-shift each component by 4 and add the result back to that component, 342 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 343 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 344 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 345 346 // Interleave R,G,B into the lower byte of the word 347 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 348 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 349 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 350 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 351 352 // mask = (src - dst) * mask 353 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 354 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 355 356 // mask = (src - dst) * mask >> 5 357 maskLo = _mm_srai_epi16(maskLo, 5); 358 maskHi = _mm_srai_epi16(maskHi, 5); 359 360 // Add two pixels into result. 361 // result = dst + ((src - dst) * mask >> 5) 362 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 363 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 364 365 // Pack into 4 32bit dst pixels and force opaque. 366 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 367 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 368 // clamping to 255 if necessary. Set alpha components to 0xFF. 369 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 370 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 371} 372 373void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 374 SkColor src, int width, SkPMColor) { 375 if (width <= 0) { 376 return; 377 } 378 379 int srcA = SkColorGetA(src); 380 int srcR = SkColorGetR(src); 381 int srcG = SkColorGetG(src); 382 int srcB = SkColorGetB(src); 383 384 srcA = SkAlpha255To256(srcA); 385 386 if (width >= 4) { 387 SkASSERT(((size_t)dst & 0x03) == 0); 388 while (((size_t)dst & 0x0F) != 0) { 389 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 390 mask++; 391 dst++; 392 width--; 393 } 394 395 __m128i *d = reinterpret_cast<__m128i*>(dst); 396 // Set alpha to 0xFF and replicate source four times in SSE register. 397 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 398 // Interleave with zeros to get two sets of four 16-bit values. 399 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 400 // Set srcA_sse to contain eight copies of srcA, padded with zero. 401 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 402 __m128i srcA_sse = _mm_set1_epi16(srcA); 403 while (width >= 4) { 404 // Load four destination pixels into dst_sse. 405 __m128i dst_sse = _mm_load_si128(d); 406 // Load four 16-bit masks into lower half of mask_sse. 407 __m128i mask_sse = _mm_loadl_epi64( 408 reinterpret_cast<const __m128i*>(mask)); 409 410 // Check whether masks are equal to 0 and get the highest bit 411 // of each byte of result, if masks are all zero, we will get 412 // pack_cmp to 0xFFFF 413 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 414 _mm_setzero_si128())); 415 416 // if mask pixels are not all zero, we will blend the dst pixels 417 if (pack_cmp != 0xFFFF) { 418 // Unpack 4 16bit mask pixels to 419 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 420 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 421 mask_sse = _mm_unpacklo_epi16(mask_sse, 422 _mm_setzero_si128()); 423 424 // Process 4 32bit dst pixels 425 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 426 mask_sse, srcA_sse); 427 _mm_store_si128(d, result); 428 } 429 430 d++; 431 mask += 4; 432 width -= 4; 433 } 434 435 dst = reinterpret_cast<SkPMColor*>(d); 436 } 437 438 while (width > 0) { 439 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 440 mask++; 441 dst++; 442 width--; 443 } 444} 445 446void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 447 SkColor src, int width, SkPMColor opaqueDst) { 448 if (width <= 0) { 449 return; 450 } 451 452 int srcR = SkColorGetR(src); 453 int srcG = SkColorGetG(src); 454 int srcB = SkColorGetB(src); 455 456 if (width >= 4) { 457 SkASSERT(((size_t)dst & 0x03) == 0); 458 while (((size_t)dst & 0x0F) != 0) { 459 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 460 mask++; 461 dst++; 462 width--; 463 } 464 465 __m128i *d = reinterpret_cast<__m128i*>(dst); 466 // Set alpha to 0xFF and replicate source four times in SSE register. 467 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 468 // Set srcA_sse to contain eight copies of srcA, padded with zero. 469 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 470 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 471 while (width >= 4) { 472 // Load four destination pixels into dst_sse. 473 __m128i dst_sse = _mm_load_si128(d); 474 // Load four 16-bit masks into lower half of mask_sse. 475 __m128i mask_sse = _mm_loadl_epi64( 476 reinterpret_cast<const __m128i*>(mask)); 477 478 // Check whether masks are equal to 0 and get the highest bit 479 // of each byte of result, if masks are all zero, we will get 480 // pack_cmp to 0xFFFF 481 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 482 _mm_setzero_si128())); 483 484 // if mask pixels are not all zero, we will blend the dst pixels 485 if (pack_cmp != 0xFFFF) { 486 // Unpack 4 16bit mask pixels to 487 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 488 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 489 mask_sse = _mm_unpacklo_epi16(mask_sse, 490 _mm_setzero_si128()); 491 492 // Process 4 32bit dst pixels 493 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 494 mask_sse); 495 _mm_store_si128(d, result); 496 } 497 498 d++; 499 mask += 4; 500 width -= 4; 501 } 502 503 dst = reinterpret_cast<SkPMColor*>(d); 504 } 505 506 while (width > 0) { 507 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 508 mask++; 509 dst++; 510 width--; 511 } 512} 513 514/* SSE2 version of S32_D565_Opaque() 515 * portable version is in core/SkBlitRow_D16.cpp 516 */ 517void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 518 const SkPMColor* SK_RESTRICT src, int count, 519 U8CPU alpha, int /*x*/, int /*y*/) { 520 SkASSERT(255 == alpha); 521 522 if (count <= 0) { 523 return; 524 } 525 526 if (count >= 8) { 527 while (((size_t)dst & 0x0F) != 0) { 528 SkPMColor c = *src++; 529 SkPMColorAssert(c); 530 531 *dst++ = SkPixel32ToPixel16_ToU16(c); 532 count--; 533 } 534 535 const __m128i* s = reinterpret_cast<const __m128i*>(src); 536 __m128i* d = reinterpret_cast<__m128i*>(dst); 537 538 while (count >= 8) { 539 // Load 8 pixels of src. 540 __m128i src_pixel1 = _mm_loadu_si128(s++); 541 __m128i src_pixel2 = _mm_loadu_si128(s++); 542 543 __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2); 544 _mm_store_si128(d++, d_pixel); 545 count -= 8; 546 } 547 src = reinterpret_cast<const SkPMColor*>(s); 548 dst = reinterpret_cast<uint16_t*>(d); 549 } 550 551 if (count > 0) { 552 do { 553 SkPMColor c = *src++; 554 SkPMColorAssert(c); 555 *dst++ = SkPixel32ToPixel16_ToU16(c); 556 } while (--count != 0); 557 } 558} 559 560/* SSE2 version of S32A_D565_Opaque() 561 * portable version is in core/SkBlitRow_D16.cpp 562 */ 563void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 564 const SkPMColor* SK_RESTRICT src, 565 int count, U8CPU alpha, int /*x*/, int /*y*/) { 566 SkASSERT(255 == alpha); 567 568 if (count <= 0) { 569 return; 570 } 571 572 if (count >= 8) { 573 // Make dst 16 bytes alignment 574 while (((size_t)dst & 0x0F) != 0) { 575 SkPMColor c = *src++; 576 if (c) { 577 *dst = SkSrcOver32To16(c, *dst); 578 } 579 dst += 1; 580 count--; 581 } 582 583 const __m128i* s = reinterpret_cast<const __m128i*>(src); 584 __m128i* d = reinterpret_cast<__m128i*>(dst); 585 __m128i var255 = _mm_set1_epi16(255); 586 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 587 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 588 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 589 590 while (count >= 8) { 591 // Load 8 pixels of src. 592 __m128i src_pixel1 = _mm_loadu_si128(s++); 593 __m128i src_pixel2 = _mm_loadu_si128(s++); 594 595 // Check whether src pixels are equal to 0 and get the highest bit 596 // of each byte of result, if src pixels are all zero, src_cmp1 and 597 // src_cmp2 will be 0xFFFF. 598 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, 599 _mm_setzero_si128())); 600 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, 601 _mm_setzero_si128())); 602 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { 603 d++; 604 count -= 8; 605 continue; 606 } 607 608 // Load 8 pixels of dst. 609 __m128i dst_pixel = _mm_load_si128(d); 610 611 // Extract A from src. 612 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); 613 sa1 = _mm_srli_epi32(sa1, 24); 614 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); 615 sa2 = _mm_srli_epi32(sa2, 24); 616 __m128i sa = _mm_packs_epi32(sa1, sa2); 617 618 // Extract R from src. 619 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 620 sr1 = _mm_srli_epi32(sr1, 24); 621 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 622 sr2 = _mm_srli_epi32(sr2, 24); 623 __m128i sr = _mm_packs_epi32(sr1, sr2); 624 625 // Extract G from src. 626 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 627 sg1 = _mm_srli_epi32(sg1, 24); 628 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 629 sg2 = _mm_srli_epi32(sg2, 24); 630 __m128i sg = _mm_packs_epi32(sg1, sg2); 631 632 // Extract B from src. 633 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 634 sb1 = _mm_srli_epi32(sb1, 24); 635 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 636 sb2 = _mm_srli_epi32(sb2, 24); 637 __m128i sb = _mm_packs_epi32(sb1, sb2); 638 639 // Extract R G B from dst. 640 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); 641 dr = _mm_and_si128(dr, r16_mask); 642 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); 643 dg = _mm_and_si128(dg, g16_mask); 644 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); 645 db = _mm_and_si128(db, b16_mask); 646 647 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa 648 649 // Calculate R G B of result. 650 // Original algorithm is in SkSrcOver32To16(). 651 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS)); 652 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); 653 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS)); 654 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); 655 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS)); 656 db = _mm_srli_epi16(db, 8 - SK_B16_BITS); 657 658 // Pack R G B into 16-bit color. 659 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); 660 661 // Store 8 16-bit colors in dst. 662 _mm_store_si128(d++, d_pixel); 663 count -= 8; 664 } 665 666 src = reinterpret_cast<const SkPMColor*>(s); 667 dst = reinterpret_cast<uint16_t*>(d); 668 } 669 670 if (count > 0) { 671 do { 672 SkPMColor c = *src++; 673 SkPMColorAssert(c); 674 if (c) { 675 *dst = SkSrcOver32To16(c, *dst); 676 } 677 dst += 1; 678 } while (--count != 0); 679 } 680} 681 682void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, 683 const SkPMColor* SK_RESTRICT src, 684 int count, U8CPU alpha, int x, int y) { 685 SkASSERT(255 == alpha); 686 687 if (count <= 0) { 688 return; 689 } 690 691 if (count >= 8) { 692 while (((size_t)dst & 0x0F) != 0) { 693 DITHER_565_SCAN(y); 694 SkPMColor c = *src++; 695 SkPMColorAssert(c); 696 697 unsigned dither = DITHER_VALUE(x); 698 *dst++ = SkDitherRGB32To565(c, dither); 699 DITHER_INC_X(x); 700 count--; 701 } 702 703 unsigned short dither_value[8]; 704 __m128i dither; 705#ifdef ENABLE_DITHER_MATRIX_4X4 706 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; 707 dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; 708 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; 709 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; 710 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; 711#else 712 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; 713 dither_value[0] = dither_value[4] = (dither_scan 714 >> (((x) & 3) << 2)) & 0xF; 715 dither_value[1] = dither_value[5] = (dither_scan 716 >> (((x + 1) & 3) << 2)) & 0xF; 717 dither_value[2] = dither_value[6] = (dither_scan 718 >> (((x + 2) & 3) << 2)) & 0xF; 719 dither_value[3] = dither_value[7] = (dither_scan 720 >> (((x + 3) & 3) << 2)) & 0xF; 721#endif 722 dither = _mm_loadu_si128((__m128i*) dither_value); 723 724 const __m128i* s = reinterpret_cast<const __m128i*>(src); 725 __m128i* d = reinterpret_cast<__m128i*>(dst); 726 727 while (count >= 8) { 728 // Load 8 pixels of src. 729 __m128i src_pixel1 = _mm_loadu_si128(s++); 730 __m128i src_pixel2 = _mm_loadu_si128(s++); 731 732 // Extract R from src. 733 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 734 sr1 = _mm_srli_epi32(sr1, 24); 735 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 736 sr2 = _mm_srli_epi32(sr2, 24); 737 __m128i sr = _mm_packs_epi32(sr1, sr2); 738 739 // SkDITHER_R32To565(sr, dither) 740 __m128i sr_offset = _mm_srli_epi16(sr, 5); 741 sr = _mm_add_epi16(sr, dither); 742 sr = _mm_sub_epi16(sr, sr_offset); 743 sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS); 744 745 // Extract G from src. 746 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 747 sg1 = _mm_srli_epi32(sg1, 24); 748 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 749 sg2 = _mm_srli_epi32(sg2, 24); 750 __m128i sg = _mm_packs_epi32(sg1, sg2); 751 752 // SkDITHER_R32To565(sg, dither) 753 __m128i sg_offset = _mm_srli_epi16(sg, 6); 754 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1)); 755 sg = _mm_sub_epi16(sg, sg_offset); 756 sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS); 757 758 // Extract B from src. 759 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 760 sb1 = _mm_srli_epi32(sb1, 24); 761 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 762 sb2 = _mm_srli_epi32(sb2, 24); 763 __m128i sb = _mm_packs_epi32(sb1, sb2); 764 765 // SkDITHER_R32To565(sb, dither) 766 __m128i sb_offset = _mm_srli_epi16(sb, 5); 767 sb = _mm_add_epi16(sb, dither); 768 sb = _mm_sub_epi16(sb, sb_offset); 769 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); 770 771 // Pack and store 16-bit dst pixel. 772 __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb); 773 _mm_store_si128(d++, d_pixel); 774 775 count -= 8; 776 x += 8; 777 } 778 779 src = reinterpret_cast<const SkPMColor*>(s); 780 dst = reinterpret_cast<uint16_t*>(d); 781 } 782 783 if (count > 0) { 784 DITHER_565_SCAN(y); 785 do { 786 SkPMColor c = *src++; 787 SkPMColorAssert(c); 788 789 unsigned dither = DITHER_VALUE(x); 790 *dst++ = SkDitherRGB32To565(c, dither); 791 DITHER_INC_X(x); 792 } while (--count != 0); 793 } 794} 795 796/* SSE2 version of S32A_D565_Opaque_Dither() 797 * portable version is in core/SkBlitRow_D16.cpp 798 */ 799void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, 800 const SkPMColor* SK_RESTRICT src, 801 int count, U8CPU alpha, int x, int y) { 802 SkASSERT(255 == alpha); 803 804 if (count <= 0) { 805 return; 806 } 807 808 if (count >= 8) { 809 while (((size_t)dst & 0x0F) != 0) { 810 DITHER_565_SCAN(y); 811 SkPMColor c = *src++; 812 SkPMColorAssert(c); 813 if (c) { 814 unsigned a = SkGetPackedA32(c); 815 816 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); 817 818 unsigned sr = SkGetPackedR32(c); 819 unsigned sg = SkGetPackedG32(c); 820 unsigned sb = SkGetPackedB32(c); 821 sr = SkDITHER_R32_FOR_565(sr, d); 822 sg = SkDITHER_G32_FOR_565(sg, d); 823 sb = SkDITHER_B32_FOR_565(sb, d); 824 825 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 826 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 827 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 828 // now src and dst expanded are in g:11 r:10 x:1 b:10 829 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 830 } 831 dst += 1; 832 DITHER_INC_X(x); 833 count--; 834 } 835 836 unsigned short dither_value[8]; 837 __m128i dither, dither_cur; 838#ifdef ENABLE_DITHER_MATRIX_4X4 839 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; 840 dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; 841 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; 842 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; 843 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; 844#else 845 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; 846 dither_value[0] = dither_value[4] = (dither_scan 847 >> (((x) & 3) << 2)) & 0xF; 848 dither_value[1] = dither_value[5] = (dither_scan 849 >> (((x + 1) & 3) << 2)) & 0xF; 850 dither_value[2] = dither_value[6] = (dither_scan 851 >> (((x + 2) & 3) << 2)) & 0xF; 852 dither_value[3] = dither_value[7] = (dither_scan 853 >> (((x + 3) & 3) << 2)) & 0xF; 854#endif 855 dither = _mm_loadu_si128((__m128i*) dither_value); 856 857 const __m128i* s = reinterpret_cast<const __m128i*>(src); 858 __m128i* d = reinterpret_cast<__m128i*>(dst); 859 __m128i var256 = _mm_set1_epi16(256); 860 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 861 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 862 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 863 864 while (count >= 8) { 865 // Load 8 pixels of src and dst. 866 __m128i src_pixel1 = _mm_loadu_si128(s++); 867 __m128i src_pixel2 = _mm_loadu_si128(s++); 868 __m128i dst_pixel = _mm_load_si128(d); 869 870 // Extract A from src. 871 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); 872 sa1 = _mm_srli_epi32(sa1, 24); 873 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); 874 sa2 = _mm_srli_epi32(sa2, 24); 875 __m128i sa = _mm_packs_epi32(sa1, sa2); 876 877 // Calculate current dither value. 878 dither_cur = _mm_mullo_epi16(dither, 879 _mm_add_epi16(sa, _mm_set1_epi16(1))); 880 dither_cur = _mm_srli_epi16(dither_cur, 8); 881 882 // Extract R from src. 883 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 884 sr1 = _mm_srli_epi32(sr1, 24); 885 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 886 sr2 = _mm_srli_epi32(sr2, 24); 887 __m128i sr = _mm_packs_epi32(sr1, sr2); 888 889 // SkDITHER_R32_FOR_565(sr, d) 890 __m128i sr_offset = _mm_srli_epi16(sr, 5); 891 sr = _mm_add_epi16(sr, dither_cur); 892 sr = _mm_sub_epi16(sr, sr_offset); 893 894 // Expand sr. 895 sr = _mm_slli_epi16(sr, 2); 896 897 // Extract G from src. 898 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 899 sg1 = _mm_srli_epi32(sg1, 24); 900 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 901 sg2 = _mm_srli_epi32(sg2, 24); 902 __m128i sg = _mm_packs_epi32(sg1, sg2); 903 904 // sg = SkDITHER_G32_FOR_565(sg, d). 905 __m128i sg_offset = _mm_srli_epi16(sg, 6); 906 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1)); 907 sg = _mm_sub_epi16(sg, sg_offset); 908 909 // Expand sg. 910 sg = _mm_slli_epi16(sg, 3); 911 912 // Extract B from src. 913 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 914 sb1 = _mm_srli_epi32(sb1, 24); 915 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 916 sb2 = _mm_srli_epi32(sb2, 24); 917 __m128i sb = _mm_packs_epi32(sb1, sb2); 918 919 // sb = SkDITHER_B32_FOR_565(sb, d). 920 __m128i sb_offset = _mm_srli_epi16(sb, 5); 921 sb = _mm_add_epi16(sb, dither_cur); 922 sb = _mm_sub_epi16(sb, sb_offset); 923 924 // Expand sb. 925 sb = _mm_slli_epi16(sb, 2); 926 927 // Extract R G B from dst. 928 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); 929 dr = _mm_and_si128(dr, r16_mask); 930 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); 931 dg = _mm_and_si128(dg, g16_mask); 932 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); 933 db = _mm_and_si128(db, b16_mask); 934 935 // SkAlpha255To256(255 - a) >> 3 936 __m128i isa = _mm_sub_epi16(var256, sa); 937 isa = _mm_srli_epi16(isa, 3); 938 939 dr = _mm_mullo_epi16(dr, isa); 940 dr = _mm_add_epi16(dr, sr); 941 dr = _mm_srli_epi16(dr, 5); 942 943 dg = _mm_mullo_epi16(dg, isa); 944 dg = _mm_add_epi16(dg, sg); 945 dg = _mm_srli_epi16(dg, 5); 946 947 db = _mm_mullo_epi16(db, isa); 948 db = _mm_add_epi16(db, sb); 949 db = _mm_srli_epi16(db, 5); 950 951 // Package and store dst pixel. 952 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); 953 _mm_store_si128(d++, d_pixel); 954 955 count -= 8; 956 x += 8; 957 } 958 959 src = reinterpret_cast<const SkPMColor*>(s); 960 dst = reinterpret_cast<uint16_t*>(d); 961 } 962 963 if (count > 0) { 964 DITHER_565_SCAN(y); 965 do { 966 SkPMColor c = *src++; 967 SkPMColorAssert(c); 968 if (c) { 969 unsigned a = SkGetPackedA32(c); 970 971 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); 972 973 unsigned sr = SkGetPackedR32(c); 974 unsigned sg = SkGetPackedG32(c); 975 unsigned sb = SkGetPackedB32(c); 976 sr = SkDITHER_R32_FOR_565(sr, d); 977 sg = SkDITHER_G32_FOR_565(sg, d); 978 sb = SkDITHER_B32_FOR_565(sb, d); 979 980 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 981 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 982 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 983 // now src and dst expanded are in g:11 r:10 x:1 b:10 984 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 985 } 986 dst += 1; 987 DITHER_INC_X(x); 988 } while (--count != 0); 989 } 990} 991