1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkBlitRow_opts_SSE2.h"
11#include "SkColorPriv.h"
12#include "SkColor_opts_SSE2.h"
13#include "SkDither.h"
14#include "SkMSAN.h"
15#include "SkUtils.h"
16
17/* SSE2 version of S32_Blend_BlitRow32()
18 * portable version is in core/SkBlitRow_D32.cpp
19 */
20void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21                              const SkPMColor* SK_RESTRICT src,
22                              int count, U8CPU alpha) {
23    SkASSERT(alpha <= 255);
24    if (count <= 0) {
25        return;
26    }
27
28    uint32_t src_scale = SkAlpha255To256(alpha);
29
30    if (count >= 4) {
31        SkASSERT(((size_t)dst & 0x03) == 0);
32        while (((size_t)dst & 0x0F) != 0) {
33            *dst = SkPMLerp(*src, *dst, src_scale);
34            src++;
35            dst++;
36            count--;
37        }
38
39        const __m128i *s = reinterpret_cast<const __m128i*>(src);
40        __m128i *d = reinterpret_cast<__m128i*>(dst);
41
42        while (count >= 4) {
43            // Load 4 pixels each of src and dest.
44            __m128i src_pixel = _mm_loadu_si128(s);
45            __m128i dst_pixel = _mm_load_si128(d);
46
47            __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
48            _mm_store_si128(d, result);
49            s++;
50            d++;
51            count -= 4;
52        }
53        src = reinterpret_cast<const SkPMColor*>(s);
54        dst = reinterpret_cast<SkPMColor*>(d);
55    }
56
57    while (count > 0) {
58        *dst = SkPMLerp(*src, *dst, src_scale);
59        src++;
60        dst++;
61        count--;
62    }
63}
64
65void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
66                               const SkPMColor* SK_RESTRICT src,
67                               int count, U8CPU alpha) {
68    SkASSERT(alpha <= 255);
69    if (count <= 0) {
70        return;
71    }
72
73    if (count >= 4) {
74        while (((size_t)dst & 0x0F) != 0) {
75            *dst = SkBlendARGB32(*src, *dst, alpha);
76            src++;
77            dst++;
78            count--;
79        }
80
81        const __m128i *s = reinterpret_cast<const __m128i*>(src);
82        __m128i *d = reinterpret_cast<__m128i*>(dst);
83        while (count >= 4) {
84            // Load 4 pixels each of src and dest.
85            __m128i src_pixel = _mm_loadu_si128(s);
86            __m128i dst_pixel = _mm_load_si128(d);
87
88            __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
89            _mm_store_si128(d, result);
90            s++;
91            d++;
92            count -= 4;
93        }
94        src = reinterpret_cast<const SkPMColor*>(s);
95        dst = reinterpret_cast<SkPMColor*>(d);
96    }
97
98    while (count > 0) {
99        *dst = SkBlendARGB32(*src, *dst, alpha);
100        src++;
101        dst++;
102        count--;
103    }
104}
105
106void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
107    SkASSERT(count > 0);
108
109    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
110                          (SkGetPackedR32(src) << 13) |
111                          (SkGetPackedB32(src) << 2);
112    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
113
114    // Check if we have enough pixels to run SIMD
115    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
116        __m128i* dst_wide;
117        const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
118        const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
119        const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
120        const __m128i scale_wide = _mm_set1_epi16(scale);
121        const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
122        const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
123
124        // Align dst to an even 16 byte address (0-7 pixels)
125        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
126            *dst = SkBlend32_RGB16(src_expand, *dst, scale);
127            dst += 1;
128            count--;
129        }
130
131        dst_wide = reinterpret_cast<__m128i*>(dst);
132        do {
133            // Load eight RGB565 pixels
134            __m128i pixels = _mm_load_si128(dst_wide);
135
136            // Mask out sub-pixels
137            __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
138            __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
139            pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
140            __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
141
142            // Scale with alpha
143            pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
144            pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
145            pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
146
147            // Add src_X_wide and shift down again
148            pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
149            pixel_R = _mm_srli_epi16(pixel_R, 5);
150            pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
151            pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
152            pixel_B = _mm_srli_epi16(pixel_B, 5);
153
154            // Combine into RGB565 and store
155            pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
156            pixel_G = _mm_and_si128(pixel_G, mask_green);
157            pixels = _mm_or_si128(pixel_R, pixel_G);
158            pixels = _mm_or_si128(pixels, pixel_B);
159            _mm_store_si128(dst_wide, pixels);
160            count -= 8;
161            dst_wide++;
162        } while (count >= 8);
163
164        dst = reinterpret_cast<uint16_t*>(dst_wide);
165    }
166
167    // Small loop to handle remaining pixels.
168    while (count > 0) {
169        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
170        dst += 1;
171        count--;
172    }
173}
174
// Shift amounts that move each channel of a 16-bit mask value so that the
// channel's top 5 bits line up with the low 5 bits of the corresponding
// component position in an SkPMColor. A positive amount means a left shift,
// a negative amount a right shift by its magnitude.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// The macros below apply the shift to every 32-bit lane of x. The result is
// "unmasked": neighbouring channel bits are still present and must be
// ANDed away by the caller.

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
205
206static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
207                                 __m128i &mask, __m128i &srcA) {
208    // In the following comments, the components of src, dst and mask are
209    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
210    // by an R, G, B, or A suffix. Components of one of the four pixels that
211    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
212    // example is the blue channel of the second destination pixel. Memory
213    // layout is shown for an ARGB byte order in a color value.
214
215    // src and srcA store 8-bit values interleaved with zeros.
216    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
217    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
218    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
219    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
220    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
221    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
222    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
223
224    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
225    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
226    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
227                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
228
229    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
230    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
231                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
232
233    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
234    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
235                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
236
237    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
238    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
239    // 8-bit position
240    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
241    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
242    mask = _mm_or_si128(_mm_or_si128(r, g), b);
243
244    // Interleave R,G,B into the lower byte of word.
245    // i.e. split the sixteen 8-bit values from mask into two sets of eight
246    // 16-bit values, padded by zero.
247    __m128i maskLo, maskHi;
248    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
249    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
250    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
251    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
252
253    // Upscale from 0..31 to 0..32
254    // (allows to replace division by left-shift further down)
255    // Left-shift each component by 4 and add the result back to that component,
256    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
257    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
258    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
259
260    // Multiply each component of maskLo and maskHi by srcA
261    maskLo = _mm_mullo_epi16(maskLo, srcA);
262    maskHi = _mm_mullo_epi16(maskHi, srcA);
263
264    // Left shift mask components by 8 (divide by 256)
265    maskLo = _mm_srli_epi16(maskLo, 8);
266    maskHi = _mm_srli_epi16(maskHi, 8);
267
268    // Interleave R,G,B into the lower byte of the word
269    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
270    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
271    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
272    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
273
274    // mask = (src - dst) * mask
275    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
276    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
277
278    // mask = (src - dst) * mask >> 5
279    maskLo = _mm_srai_epi16(maskLo, 5);
280    maskHi = _mm_srai_epi16(maskHi, 5);
281
282    // Add two pixels into result.
283    // result = dst + ((src - dst) * mask >> 5)
284    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
285    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
286
287    // Pack into 4 32bit dst pixels.
288    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
289    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
290    // clamping to 255 if necessary.
291    return _mm_packus_epi16(resultLo, resultHi);
292}
293
294static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
295                                       __m128i &mask) {
296    // In the following comments, the components of src, dst and mask are
297    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
298    // by an R, G, B, or A suffix. Components of one of the four pixels that
299    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
300    // example is the blue channel of the second destination pixel. Memory
301    // layout is shown for an ARGB byte order in a color value.
302
303    // src and srcA store 8-bit values interleaved with zeros.
304    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
305    // mask stores 16-bit values (shown as high and low bytes) interleaved with
306    // zeros
307    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
308    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
309
310    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
311    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
312    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
313                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));
314
315    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
316    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
317                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));
318
319    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
320    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
321                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));
322
323    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
324    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
325    // 8-bit position
326    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
327    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
328    mask = _mm_or_si128(_mm_or_si128(r, g), b);
329
330    // Interleave R,G,B into the lower byte of word.
331    // i.e. split the sixteen 8-bit values from mask into two sets of eight
332    // 16-bit values, padded by zero.
333    __m128i maskLo, maskHi;
334    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
335    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
336    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
337    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
338
339    // Upscale from 0..31 to 0..32
340    // (allows to replace division by left-shift further down)
341    // Left-shift each component by 4 and add the result back to that component,
342    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
343    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
344    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
345
346    // Interleave R,G,B into the lower byte of the word
347    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
348    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
349    // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
350    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
351
352    // mask = (src - dst) * mask
353    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
354    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
355
356    // mask = (src - dst) * mask >> 5
357    maskLo = _mm_srai_epi16(maskLo, 5);
358    maskHi = _mm_srai_epi16(maskHi, 5);
359
360    // Add two pixels into result.
361    // result = dst + ((src - dst) * mask >> 5)
362    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
363    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
364
365    // Pack into 4 32bit dst pixels and force opaque.
366    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
367    // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
368    // clamping to 255 if necessary. Set alpha components to 0xFF.
369    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
370                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
371}
372
373void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
374                         SkColor src, int width, SkPMColor) {
375    if (width <= 0) {
376        return;
377    }
378
379    int srcA = SkColorGetA(src);
380    int srcR = SkColorGetR(src);
381    int srcG = SkColorGetG(src);
382    int srcB = SkColorGetB(src);
383
384    srcA = SkAlpha255To256(srcA);
385
386    if (width >= 4) {
387        SkASSERT(((size_t)dst & 0x03) == 0);
388        while (((size_t)dst & 0x0F) != 0) {
389            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
390            mask++;
391            dst++;
392            width--;
393        }
394
395        __m128i *d = reinterpret_cast<__m128i*>(dst);
396        // Set alpha to 0xFF and replicate source four times in SSE register.
397        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
398        // Interleave with zeros to get two sets of four 16-bit values.
399        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
400        // Set srcA_sse to contain eight copies of srcA, padded with zero.
401        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
402        __m128i srcA_sse = _mm_set1_epi16(srcA);
403        while (width >= 4) {
404            // Load four destination pixels into dst_sse.
405            __m128i dst_sse = _mm_load_si128(d);
406            // Load four 16-bit masks into lower half of mask_sse.
407            __m128i mask_sse = _mm_loadl_epi64(
408                                   reinterpret_cast<const __m128i*>(mask));
409
410            // Check whether masks are equal to 0 and get the highest bit
411            // of each byte of result, if masks are all zero, we will get
412            // pack_cmp to 0xFFFF
413            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
414                                             _mm_setzero_si128()));
415
416            // if mask pixels are not all zero, we will blend the dst pixels
417            if (pack_cmp != 0xFFFF) {
418                // Unpack 4 16bit mask pixels to
419                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
420                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
421                mask_sse = _mm_unpacklo_epi16(mask_sse,
422                                              _mm_setzero_si128());
423
424                // Process 4 32bit dst pixels
425                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
426                                                   mask_sse, srcA_sse);
427                _mm_store_si128(d, result);
428            }
429
430            d++;
431            mask += 4;
432            width -= 4;
433        }
434
435        dst = reinterpret_cast<SkPMColor*>(d);
436    }
437
438    while (width > 0) {
439        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
440        mask++;
441        dst++;
442        width--;
443    }
444}
445
446void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
447                               SkColor src, int width, SkPMColor opaqueDst) {
448    if (width <= 0) {
449        return;
450    }
451
452    int srcR = SkColorGetR(src);
453    int srcG = SkColorGetG(src);
454    int srcB = SkColorGetB(src);
455
456    if (width >= 4) {
457        SkASSERT(((size_t)dst & 0x03) == 0);
458        while (((size_t)dst & 0x0F) != 0) {
459            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
460            mask++;
461            dst++;
462            width--;
463        }
464
465        __m128i *d = reinterpret_cast<__m128i*>(dst);
466        // Set alpha to 0xFF and replicate source four times in SSE register.
467        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
468        // Set srcA_sse to contain eight copies of srcA, padded with zero.
469        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
470        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
471        while (width >= 4) {
472            // Load four destination pixels into dst_sse.
473            __m128i dst_sse = _mm_load_si128(d);
474            // Load four 16-bit masks into lower half of mask_sse.
475            __m128i mask_sse = _mm_loadl_epi64(
476                                   reinterpret_cast<const __m128i*>(mask));
477
478            // Check whether masks are equal to 0 and get the highest bit
479            // of each byte of result, if masks are all zero, we will get
480            // pack_cmp to 0xFFFF
481            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
482                                             _mm_setzero_si128()));
483
484            // if mask pixels are not all zero, we will blend the dst pixels
485            if (pack_cmp != 0xFFFF) {
486                // Unpack 4 16bit mask pixels to
487                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
488                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
489                mask_sse = _mm_unpacklo_epi16(mask_sse,
490                                              _mm_setzero_si128());
491
492                // Process 4 32bit dst pixels
493                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
494                                                         mask_sse);
495                _mm_store_si128(d, result);
496            }
497
498            d++;
499            mask += 4;
500            width -= 4;
501        }
502
503        dst = reinterpret_cast<SkPMColor*>(d);
504    }
505
506    while (width > 0) {
507        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
508        mask++;
509        dst++;
510        width--;
511    }
512}
513
514/* SSE2 version of S32_D565_Opaque()
515 * portable version is in core/SkBlitRow_D16.cpp
516 */
517void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
518                          const SkPMColor* SK_RESTRICT src, int count,
519                          U8CPU alpha, int /*x*/, int /*y*/) {
520    SkASSERT(255 == alpha);
521
522    if (count <= 0) {
523        return;
524    }
525
526    if (count >= 8) {
527        while (((size_t)dst & 0x0F) != 0) {
528            SkPMColor c = *src++;
529            SkPMColorAssert(c);
530
531            *dst++ = SkPixel32ToPixel16_ToU16(c);
532            count--;
533        }
534
535        const __m128i* s = reinterpret_cast<const __m128i*>(src);
536        __m128i* d = reinterpret_cast<__m128i*>(dst);
537
538        while (count >= 8) {
539            // Load 8 pixels of src.
540            __m128i src_pixel1 = _mm_loadu_si128(s++);
541            __m128i src_pixel2 = _mm_loadu_si128(s++);
542
543            __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
544            _mm_store_si128(d++, d_pixel);
545            count -= 8;
546        }
547        src = reinterpret_cast<const SkPMColor*>(s);
548        dst = reinterpret_cast<uint16_t*>(d);
549    }
550
551    if (count > 0) {
552        do {
553            SkPMColor c = *src++;
554            SkPMColorAssert(c);
555            *dst++ = SkPixel32ToPixel16_ToU16(c);
556        } while (--count != 0);
557    }
558}
559
560/* SSE2 version of S32A_D565_Opaque()
561 * portable version is in core/SkBlitRow_D16.cpp
562 */
563void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
564                           const SkPMColor* SK_RESTRICT src,
565                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
566    SkASSERT(255 == alpha);
567
568    if (count <= 0) {
569        return;
570    }
571
572    if (count >= 8) {
573        // Make dst 16 bytes alignment
574        while (((size_t)dst & 0x0F) != 0) {
575            SkPMColor c = *src++;
576            if (c) {
577              *dst = SkSrcOver32To16(c, *dst);
578            }
579            dst += 1;
580            count--;
581        }
582
583        const __m128i* s = reinterpret_cast<const __m128i*>(src);
584        __m128i* d = reinterpret_cast<__m128i*>(dst);
585        __m128i var255 = _mm_set1_epi16(255);
586        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
587        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
588        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
589
590        while (count >= 8) {
591            // Load 8 pixels of src.
592            __m128i src_pixel1 = _mm_loadu_si128(s++);
593            __m128i src_pixel2 = _mm_loadu_si128(s++);
594
595            // Check whether src pixels are equal to 0 and get the highest bit
596            // of each byte of result, if src pixels are all zero, src_cmp1 and
597            // src_cmp2 will be 0xFFFF.
598            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
599                                             _mm_setzero_si128()));
600            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
601                                             _mm_setzero_si128()));
602            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
603                d++;
604                count -= 8;
605                continue;
606            }
607
608            // Load 8 pixels of dst.
609            __m128i dst_pixel = _mm_load_si128(d);
610
611            // Extract A from src.
612            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
613            sa1 = _mm_srli_epi32(sa1, 24);
614            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
615            sa2 = _mm_srli_epi32(sa2, 24);
616            __m128i sa = _mm_packs_epi32(sa1, sa2);
617
618            // Extract R from src.
619            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
620            sr1 = _mm_srli_epi32(sr1, 24);
621            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
622            sr2 = _mm_srli_epi32(sr2, 24);
623            __m128i sr = _mm_packs_epi32(sr1, sr2);
624
625            // Extract G from src.
626            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
627            sg1 = _mm_srli_epi32(sg1, 24);
628            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
629            sg2 = _mm_srli_epi32(sg2, 24);
630            __m128i sg = _mm_packs_epi32(sg1, sg2);
631
632            // Extract B from src.
633            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
634            sb1 = _mm_srli_epi32(sb1, 24);
635            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
636            sb2 = _mm_srli_epi32(sb2, 24);
637            __m128i sb = _mm_packs_epi32(sb1, sb2);
638
639            // Extract R G B from dst.
640            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
641            dr = _mm_and_si128(dr, r16_mask);
642            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
643            dg = _mm_and_si128(dg, g16_mask);
644            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
645            db = _mm_and_si128(db, b16_mask);
646
647            __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
648
649            // Calculate R G B of result.
650            // Original algorithm is in SkSrcOver32To16().
651            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
652            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
653            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
654            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
655            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
656            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
657
658            // Pack R G B into 16-bit color.
659            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
660
661            // Store 8 16-bit colors in dst.
662            _mm_store_si128(d++, d_pixel);
663            count -= 8;
664        }
665
666        src = reinterpret_cast<const SkPMColor*>(s);
667        dst = reinterpret_cast<uint16_t*>(d);
668    }
669
670    if (count > 0) {
671        do {
672            SkPMColor c = *src++;
673            SkPMColorAssert(c);
674            if (c) {
675                *dst = SkSrcOver32To16(c, *dst);
676            }
677            dst += 1;
678        } while (--count != 0);
679    }
680}
681
/* Converts `count` 32-bit src pixels to 16-bit dst pixels with ordered
 * dithering; the previous contents of dst are not read. `alpha` must be 255.
 *
 * x and y select the position in the dither matrix: y picks the row
 * ((y) & 3) and x the column, advancing by one per pixel.
 * Rows of eight or more pixels run a scalar prologue until dst is 16-byte
 * aligned, then eight pixels per iteration with SSE2, then a scalar tail.
 */
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Scalar prologue: one pixel at a time until dst is 16-byte aligned.
        while (((size_t)dst & 0x0F) != 0) {
            // NOTE(review): DITHER_565_SCAN presumably declares the scan-line
            // state read by DITHER_VALUE; it sits inside the loop body,
            // which keeps it out of the scope of the `dither_scan` local
            // declared further down - confirm against SkDither.h.
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
            count--;
        }

        // Build the per-lane dither values for eight consecutive pixels
        // starting at x. The pattern repeats every four columns, so lanes
        // 4-7 duplicate lanes 0-3.
        unsigned short dither_value[8];
        __m128i dither;
#ifdef ENABLE_DITHER_MATRIX_4X4
        // One byte per column.
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Four 4-bit entries packed into one 16-bit word; column i is the
        // nibble at bit offset i * 4.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Extract R from src: shift the component to the top byte of
            // each 32-bit lane, back down to the bottom, then narrow the
            // eight values into 16-bit lanes.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // Dither red: (sr + dither - (sr >> 5)) >> (R32 bits - R16 bits)
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither);
            sr = _mm_sub_epi16(sr, sr_offset);
            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Dither green with half the dither value:
            // (sg + (dither >> 1) - (sg >> 6)) >> (G32 bits - G16 bits)
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
            sg = _mm_sub_epi16(sg, sg_offset);
            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // Dither blue: (sb + dither - (sb >> 5)) >> (B32 bits - B16 bits)
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither);
            sb = _mm_sub_epi16(sb, sb_offset);
            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

            // Pack and store 16-bit dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            // Advancing by 8 leaves (x & 3) unchanged, so the dither vector
            // built above stays valid for the next group.
            x += 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail; this is also the whole row when count < 8.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
795
796/* SSE2 version of S32A_D565_Opaque_Dither()
797 * portable version is in core/SkBlitRow_D16.cpp
798 */
799void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
800                                  const SkPMColor* SK_RESTRICT src,
801                                  int count, U8CPU alpha, int x, int y) {
802    SkASSERT(255 == alpha);
803
804    if (count <= 0) {
805        return;
806    }
807
808    if (count >= 8) {
809        while (((size_t)dst & 0x0F) != 0) {
810            DITHER_565_SCAN(y);
811            SkPMColor c = *src++;
812            SkPMColorAssert(c);
813            if (c) {
814                unsigned a = SkGetPackedA32(c);
815
816                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
817
818                unsigned sr = SkGetPackedR32(c);
819                unsigned sg = SkGetPackedG32(c);
820                unsigned sb = SkGetPackedB32(c);
821                sr = SkDITHER_R32_FOR_565(sr, d);
822                sg = SkDITHER_G32_FOR_565(sg, d);
823                sb = SkDITHER_B32_FOR_565(sb, d);
824
825                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
826                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
827                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
828                // now src and dst expanded are in g:11 r:10 x:1 b:10
829                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
830            }
831            dst += 1;
832            DITHER_INC_X(x);
833            count--;
834        }
835
836        unsigned short dither_value[8];
837        __m128i dither, dither_cur;
838#ifdef ENABLE_DITHER_MATRIX_4X4
839        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
840        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
841        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
842        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
843        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
844#else
845        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
846        dither_value[0] = dither_value[4] = (dither_scan
847                                             >> (((x) & 3) << 2)) & 0xF;
848        dither_value[1] = dither_value[5] = (dither_scan
849                                             >> (((x + 1) & 3) << 2)) & 0xF;
850        dither_value[2] = dither_value[6] = (dither_scan
851                                             >> (((x + 2) & 3) << 2)) & 0xF;
852        dither_value[3] = dither_value[7] = (dither_scan
853                                             >> (((x + 3) & 3) << 2)) & 0xF;
854#endif
855        dither = _mm_loadu_si128((__m128i*) dither_value);
856
857        const __m128i* s = reinterpret_cast<const __m128i*>(src);
858        __m128i* d = reinterpret_cast<__m128i*>(dst);
859        __m128i var256 = _mm_set1_epi16(256);
860        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
861        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
862        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
863
864        while (count >= 8) {
865            // Load 8 pixels of src and dst.
866            __m128i src_pixel1 = _mm_loadu_si128(s++);
867            __m128i src_pixel2 = _mm_loadu_si128(s++);
868            __m128i dst_pixel = _mm_load_si128(d);
869
870            // Extract A from src.
871            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
872            sa1 = _mm_srli_epi32(sa1, 24);
873            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
874            sa2 = _mm_srli_epi32(sa2, 24);
875            __m128i sa = _mm_packs_epi32(sa1, sa2);
876
877            // Calculate current dither value.
878            dither_cur = _mm_mullo_epi16(dither,
879                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
880            dither_cur = _mm_srli_epi16(dither_cur, 8);
881
882            // Extract R from src.
883            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
884            sr1 = _mm_srli_epi32(sr1, 24);
885            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
886            sr2 = _mm_srli_epi32(sr2, 24);
887            __m128i sr = _mm_packs_epi32(sr1, sr2);
888
889            // SkDITHER_R32_FOR_565(sr, d)
890            __m128i sr_offset = _mm_srli_epi16(sr, 5);
891            sr = _mm_add_epi16(sr, dither_cur);
892            sr = _mm_sub_epi16(sr, sr_offset);
893
894            // Expand sr.
895            sr = _mm_slli_epi16(sr, 2);
896
897            // Extract G from src.
898            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
899            sg1 = _mm_srli_epi32(sg1, 24);
900            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
901            sg2 = _mm_srli_epi32(sg2, 24);
902            __m128i sg = _mm_packs_epi32(sg1, sg2);
903
904            // sg = SkDITHER_G32_FOR_565(sg, d).
905            __m128i sg_offset = _mm_srli_epi16(sg, 6);
906            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
907            sg = _mm_sub_epi16(sg, sg_offset);
908
909            // Expand sg.
910            sg = _mm_slli_epi16(sg, 3);
911
912            // Extract B from src.
913            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
914            sb1 = _mm_srli_epi32(sb1, 24);
915            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
916            sb2 = _mm_srli_epi32(sb2, 24);
917            __m128i sb = _mm_packs_epi32(sb1, sb2);
918
919            // sb = SkDITHER_B32_FOR_565(sb, d).
920            __m128i sb_offset = _mm_srli_epi16(sb, 5);
921            sb = _mm_add_epi16(sb, dither_cur);
922            sb = _mm_sub_epi16(sb, sb_offset);
923
924            // Expand sb.
925            sb = _mm_slli_epi16(sb, 2);
926
927            // Extract R G B from dst.
928            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
929            dr = _mm_and_si128(dr, r16_mask);
930            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
931            dg = _mm_and_si128(dg, g16_mask);
932            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
933            db = _mm_and_si128(db, b16_mask);
934
935            // SkAlpha255To256(255 - a) >> 3
936            __m128i isa = _mm_sub_epi16(var256, sa);
937            isa = _mm_srli_epi16(isa, 3);
938
939            dr = _mm_mullo_epi16(dr, isa);
940            dr = _mm_add_epi16(dr, sr);
941            dr = _mm_srli_epi16(dr, 5);
942
943            dg = _mm_mullo_epi16(dg, isa);
944            dg = _mm_add_epi16(dg, sg);
945            dg = _mm_srli_epi16(dg, 5);
946
947            db = _mm_mullo_epi16(db, isa);
948            db = _mm_add_epi16(db, sb);
949            db = _mm_srli_epi16(db, 5);
950
951            // Package and store dst pixel.
952            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
953            _mm_store_si128(d++, d_pixel);
954
955            count -= 8;
956            x += 8;
957        }
958
959        src = reinterpret_cast<const SkPMColor*>(s);
960        dst = reinterpret_cast<uint16_t*>(d);
961    }
962
963    if (count > 0) {
964        DITHER_565_SCAN(y);
965        do {
966            SkPMColor c = *src++;
967            SkPMColorAssert(c);
968            if (c) {
969                unsigned a = SkGetPackedA32(c);
970
971                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
972
973                unsigned sr = SkGetPackedR32(c);
974                unsigned sg = SkGetPackedG32(c);
975                unsigned sb = SkGetPackedB32(c);
976                sr = SkDITHER_R32_FOR_565(sr, d);
977                sg = SkDITHER_G32_FOR_565(sg, d);
978                sb = SkDITHER_B32_FOR_565(sb, d);
979
980                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
981                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
982                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
983                // now src and dst expanded are in g:11 r:10 x:1 b:10
984                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
985            }
986            dst += 1;
987            DITHER_INC_X(x);
988        } while (--count != 0);
989    }
990}
991