1/*
2 * Copyright 2009 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBitmapProcState_opts_SSE2.h"
10#include "SkColorPriv.h"
11#include "SkPaint.h"
12#include "SkUtils.h"
13
14void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
15                                   const uint32_t* xy,
16                                   int count, uint32_t* colors) {
17    SkASSERT(count > 0 && colors != NULL);
18    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
19    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
20    SkASSERT(s.fAlphaScale == 256);
21
22    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
23    size_t rb = s.fBitmap->rowBytes();
24    uint32_t XY = *xy++;
25    unsigned y0 = XY >> 14;
26    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
27    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
28    unsigned subY = y0 & 0xF;
29
30    // ( 0,  0,  0,  0,  0,  0,  0, 16)
31    __m128i sixteen = _mm_cvtsi32_si128(16);
32
33    // ( 0,  0,  0,  0, 16, 16, 16, 16)
34    sixteen = _mm_shufflelo_epi16(sixteen, 0);
35
36    // ( 0,  0,  0,  0,  0,  0,  0,  y)
37    __m128i allY = _mm_cvtsi32_si128(subY);
38
39    // ( 0,  0,  0,  0,  y,  y,  y,  y)
40    allY = _mm_shufflelo_epi16(allY, 0);
41
42    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
43    __m128i negY = _mm_sub_epi16(sixteen, allY);
44
45    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
46    allY = _mm_unpacklo_epi64(allY, negY);
47
48    // (16, 16, 16, 16, 16, 16, 16, 16 )
49    sixteen = _mm_shuffle_epi32(sixteen, 0);
50
51    // ( 0,  0,  0,  0,  0,  0,  0,  0)
52    __m128i zero = _mm_setzero_si128();
53    do {
54        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
55        unsigned x0 = XX >> 18;
56        unsigned x1 = XX & 0x3FFF;
57
58        // (0, 0, 0, 0, 0, 0, 0, x)
59        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
60
61        // (0, 0, 0, 0, x, x, x, x)
62        allX = _mm_shufflelo_epi16(allX, 0);
63
64        // (x, x, x, x, x, x, x, x)
65        allX = _mm_shuffle_epi32(allX, 0);
66
67        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
68        __m128i negX = _mm_sub_epi16(sixteen, allX);
69
70        // Load 4 samples (pixels).
71        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
72        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
73        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
74        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
75
76        // (0, 0, a00, a10)
77        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
78
79        // Expand to 16 bits per component.
80        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
81
82        // ((a00 * (16-y)), (a10 * y)).
83        a00a10 = _mm_mullo_epi16(a00a10, allY);
84
85        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
86        a00a10 = _mm_mullo_epi16(a00a10, negX);
87
88        // (0, 0, a01, a10)
89        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
90
91        // Expand to 16 bits per component.
92        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
93
94        // (a01 * (16-y)), (a11 * y)
95        a01a11 = _mm_mullo_epi16(a01a11, allY);
96
97        // (a01 * (16-y) * x), (a11 * y * x)
98        a01a11 = _mm_mullo_epi16(a01a11, allX);
99
100        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
101        __m128i sum = _mm_add_epi16(a00a10, a01a11);
102
103        // (DC, a00*w00 + a01*w01)
104        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
105
106        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
107        sum = _mm_add_epi16(sum, shifted);
108
109        // Divide each 16 bit component by 256.
110        sum = _mm_srli_epi16(sum, 8);
111
112        // Pack lower 4 16 bit values of sum into lower 4 bytes.
113        sum = _mm_packus_epi16(sum, zero);
114
115        // Extract low int and store.
116        *colors++ = _mm_cvtsi128_si32(sum);
117    } while (--count > 0);
118}
119
120void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
121                                  const uint32_t* xy,
122                                  int count, uint32_t* colors) {
123    SkASSERT(count > 0 && colors != NULL);
124    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
125    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
126    SkASSERT(s.fAlphaScale < 256);
127
128    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
129    size_t rb = s.fBitmap->rowBytes();
130    uint32_t XY = *xy++;
131    unsigned y0 = XY >> 14;
132    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
133    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
134    unsigned subY = y0 & 0xF;
135
136    // ( 0,  0,  0,  0,  0,  0,  0, 16)
137    __m128i sixteen = _mm_cvtsi32_si128(16);
138
139    // ( 0,  0,  0,  0, 16, 16, 16, 16)
140    sixteen = _mm_shufflelo_epi16(sixteen, 0);
141
142    // ( 0,  0,  0,  0,  0,  0,  0,  y)
143    __m128i allY = _mm_cvtsi32_si128(subY);
144
145    // ( 0,  0,  0,  0,  y,  y,  y,  y)
146    allY = _mm_shufflelo_epi16(allY, 0);
147
148    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
149    __m128i negY = _mm_sub_epi16(sixteen, allY);
150
151    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
152    allY = _mm_unpacklo_epi64(allY, negY);
153
154    // (16, 16, 16, 16, 16, 16, 16, 16 )
155    sixteen = _mm_shuffle_epi32(sixteen, 0);
156
157    // ( 0,  0,  0,  0,  0,  0,  0,  0)
158    __m128i zero = _mm_setzero_si128();
159
160    // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
161    __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
162
163    do {
164        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
165        unsigned x0 = XX >> 18;
166        unsigned x1 = XX & 0x3FFF;
167
168        // (0, 0, 0, 0, 0, 0, 0, x)
169        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
170
171        // (0, 0, 0, 0, x, x, x, x)
172        allX = _mm_shufflelo_epi16(allX, 0);
173
174        // (x, x, x, x, x, x, x, x)
175        allX = _mm_shuffle_epi32(allX, 0);
176
177        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
178        __m128i negX = _mm_sub_epi16(sixteen, allX);
179
180        // Load 4 samples (pixels).
181        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
182        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
183        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
184        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
185
186        // (0, 0, a00, a10)
187        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
188
189        // Expand to 16 bits per component.
190        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
191
192        // ((a00 * (16-y)), (a10 * y)).
193        a00a10 = _mm_mullo_epi16(a00a10, allY);
194
195        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
196        a00a10 = _mm_mullo_epi16(a00a10, negX);
197
198        // (0, 0, a01, a10)
199        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
200
201        // Expand to 16 bits per component.
202        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
203
204        // (a01 * (16-y)), (a11 * y)
205        a01a11 = _mm_mullo_epi16(a01a11, allY);
206
207        // (a01 * (16-y) * x), (a11 * y * x)
208        a01a11 = _mm_mullo_epi16(a01a11, allX);
209
210        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
211        __m128i sum = _mm_add_epi16(a00a10, a01a11);
212
213        // (DC, a00*w00 + a01*w01)
214        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
215
216        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
217        sum = _mm_add_epi16(sum, shifted);
218
219        // Divide each 16 bit component by 256.
220        sum = _mm_srli_epi16(sum, 8);
221
222        // Multiply by alpha.
223        sum = _mm_mullo_epi16(sum, alpha);
224
225        // Divide each 16 bit component by 256.
226        sum = _mm_srli_epi16(sum, 8);
227
228        // Pack lower 4 16 bit values of sum into lower 4 bytes.
229        sum = _mm_packus_epi16(sum, zero);
230
231        // Extract low int and store.
232        *colors++ = _mm_cvtsi128_si32(sum);
233    } while (--count > 0);
234}
235
236static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
237                                                 SkFixed one) {
238    unsigned i = SkClampMax(f >> 16, max);
239    i = (i << 4) | ((f >> 12) & 0xF);
240    return (i << 14) | SkClampMax((f + one) >> 16, max);
241}
242
243/*  SSE version of ClampX_ClampY_filter_scale()
244 *  portable version is in core/SkBitmapProcState_matrix.h
245 */
246void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
247                                     int count, int x, int y) {
248    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
249                             SkMatrix::kScale_Mask)) == 0);
250    SkASSERT(s.fInvKy == 0);
251
252    const unsigned maxX = s.fBitmap->width() - 1;
253    const SkFixed one = s.fFilterOneX;
254    const SkFixed dx = s.fInvSx;
255    SkFixed fx;
256
257    SkPoint pt;
258    s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
259                             SkIntToScalar(y) + SK_ScalarHalf, &pt);
260    const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
261    const unsigned maxY = s.fBitmap->height() - 1;
262    // compute our two Y values up front
263    *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
264    // now initialize fx
265    fx = SkScalarToFixed(pt.fX) - (one >> 1);
266
267    // test if we don't need to apply the tile proc
268    if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
269        (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
270        if (count >= 4) {
271            // SSE version of decal_filter_scale
272            while ((size_t(xy) & 0x0F) != 0) {
273                SkASSERT((fx >> (16 + 14)) == 0);
274                *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
275                fx += dx;
276                count--;
277            }
278
279            __m128i wide_1    = _mm_set1_epi32(1);
280            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
281            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
282                                              fx + dx, fx);
283
284            while (count >= 4) {
285                __m128i wide_out;
286
287                wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
288                wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
289                                        _mm_srai_epi32(wide_fx, 16), wide_1));
290
291                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
292
293                xy += 4;
294                fx += dx * 4;
295                wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
296                count -= 4;
297            } // while count >= 4
298        } // if count >= 4
299
300        while (count-- > 0) {
301            SkASSERT((fx >> (16 + 14)) == 0);
302            *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
303            fx += dx;
304        }
305    } else {
306        // SSE2 only support 16bit interger max & min, so only process the case
307        // maxX less than the max 16bit interger. Actually maxX is the bitmap's
308        // height, there should be rare bitmap whose height will be greater
309        // than max 16bit interger in the real world.
310        if ((count >= 4) && (maxX <= 0xFFFF)) {
311            while (((size_t)xy & 0x0F) != 0) {
312                *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
313                fx += dx;
314                count--;
315            }
316
317            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
318                                              fx + dx, fx);
319            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
320            __m128i wide_one  = _mm_set1_epi32(one);
321            __m128i wide_maxX = _mm_set1_epi32(maxX);
322            __m128i wide_mask = _mm_set1_epi32(0xF);
323
324             while (count >= 4) {
325                __m128i wide_i;
326                __m128i wide_lo;
327                __m128i wide_fx1;
328
329                // i = SkClampMax(f>>16,maxX)
330                wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
331                                       _mm_setzero_si128());
332                wide_i = _mm_min_epi16(wide_i, wide_maxX);
333
334                // i<<4 | TILEX_LOW_BITS(fx)
335                wide_lo = _mm_srli_epi32(wide_fx, 12);
336                wide_lo = _mm_and_si128(wide_lo, wide_mask);
337                wide_i  = _mm_slli_epi32(wide_i, 4);
338                wide_i  = _mm_or_si128(wide_i, wide_lo);
339
340                // i<<14
341                wide_i = _mm_slli_epi32(wide_i, 14);
342
343                // SkClampMax(((f+one))>>16,max)
344                wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
345                wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
346                                                        _mm_setzero_si128());
347                wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
348
349                // final combination
350                wide_i = _mm_or_si128(wide_i, wide_fx1);
351                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
352
353                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
354                fx += dx * 4;
355                xy += 4;
356                count -= 4;
357            } // while count >= 4
358        } // if count >= 4
359
360        while (count-- > 0) {
361            *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
362            fx += dx;
363        }
364    }
365}
366
367/*  SSE version of ClampX_ClampY_nofilter_scale()
368 *  portable version is in core/SkBitmapProcState_matrix.h
369 */
370void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
371                                    uint32_t xy[], int count, int x, int y) {
372    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
373                             SkMatrix::kScale_Mask)) == 0);
374
375    // we store y, x, x, x, x, x
376    const unsigned maxX = s.fBitmap->width() - 1;
377    SkFixed fx;
378    SkPoint pt;
379    s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
380                             SkIntToScalar(y) + SK_ScalarHalf, &pt);
381    fx = SkScalarToFixed(pt.fY);
382    const unsigned maxY = s.fBitmap->height() - 1;
383    *xy++ = SkClampMax(fx >> 16, maxY);
384    fx = SkScalarToFixed(pt.fX);
385
386    if (0 == maxX) {
387        // all of the following X values must be 0
388        memset(xy, 0, count * sizeof(uint16_t));
389        return;
390    }
391
392    const SkFixed dx = s.fInvSx;
393
394    // test if we don't need to apply the tile proc
395    if ((unsigned)(fx >> 16) <= maxX &&
396        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
397        // SSE version of decal_nofilter_scale
398        if (count >= 8) {
399            while (((size_t)xy & 0x0F) != 0) {
400                *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
401                fx += 2 * dx;
402                count -= 2;
403            }
404
405            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
406            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
407
408            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
409                                             fx + dx, fx);
410            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
411
412            while (count >= 8) {
413                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
414                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
415
416                __m128i wide_result = _mm_packs_epi32(wide_out_low,
417                                                      wide_out_high);
418                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
419
420                wide_low = _mm_add_epi32(wide_low, wide_dx8);
421                wide_high = _mm_add_epi32(wide_high, wide_dx8);
422
423                xy += 4;
424                fx += dx * 8;
425                count -= 8;
426            }
427        } // if count >= 8
428
429        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
430        while (count-- > 0) {
431            *xx++ = SkToU16(fx >> 16);
432            fx += dx;
433        }
434    } else {
435        // SSE2 only support 16bit interger max & min, so only process the case
436        // maxX less than the max 16bit interger. Actually maxX is the bitmap's
437        // height, there should be rare bitmap whose height will be greater
438        // than max 16bit interger in the real world.
439        if ((count >= 8) && (maxX <= 0xFFFF)) {
440            while (((size_t)xy & 0x0F) != 0) {
441                *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
442                                        SkClampMax(fx >> 16, maxX));
443                fx += 2 * dx;
444                count -= 2;
445            }
446
447            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
448            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
449
450            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
451                                             fx + dx, fx);
452            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
453            __m128i wide_maxX = _mm_set1_epi32(maxX);
454
455            while (count >= 8) {
456                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
457                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
458
459                wide_out_low  = _mm_max_epi16(wide_out_low,
460                                              _mm_setzero_si128());
461                wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
462                wide_out_high = _mm_max_epi16(wide_out_high,
463                                              _mm_setzero_si128());
464                wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
465
466                __m128i wide_result = _mm_packs_epi32(wide_out_low,
467                                                      wide_out_high);
468                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
469
470                wide_low  = _mm_add_epi32(wide_low, wide_dx8);
471                wide_high = _mm_add_epi32(wide_high, wide_dx8);
472
473                xy += 4;
474                fx += dx * 8;
475                count -= 8;
476            }
477        } // if count >= 8
478
479        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
480        while (count-- > 0) {
481            *xx++ = SkClampMax(fx >> 16, maxX);
482            fx += dx;
483        }
484    }
485}
486
487/*  SSE version of ClampX_ClampY_filter_affine()
488 *  portable version is in core/SkBitmapProcState_matrix.h
489 */
490void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
491                                      uint32_t xy[], int count, int x, int y) {
492    SkPoint srcPt;
493    s.fInvProc(s.fInvMatrix,
494               SkIntToScalar(x) + SK_ScalarHalf,
495               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
496
497    SkFixed oneX = s.fFilterOneX;
498    SkFixed oneY = s.fFilterOneY;
499    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
500    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
501    SkFixed dx = s.fInvSx;
502    SkFixed dy = s.fInvKy;
503    unsigned maxX = s.fBitmap->width() - 1;
504    unsigned maxY = s.fBitmap->height() - 1;
505
506    if (count >= 2 && (maxX <= 0xFFFF)) {
507        SkFixed dx2 = dx + dx;
508        SkFixed dy2 = dy + dy;
509
510        __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
511        __m128i wide_d2  = _mm_set_epi32(dx2, dy2, dx2, dy2);
512        __m128i wide_one  = _mm_set_epi32(oneX, oneY, oneX, oneY);
513        __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
514        __m128i wide_mask = _mm_set1_epi32(0xF);
515
516        while (count >= 2) {
517            // i = SkClampMax(f>>16,maxX)
518            __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
519                                           _mm_setzero_si128());
520            wide_i = _mm_min_epi16(wide_i, wide_max);
521
522            // i<<4 | TILEX_LOW_BITS(f)
523            __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
524            wide_lo = _mm_and_si128(wide_lo, wide_mask);
525            wide_i  = _mm_slli_epi32(wide_i, 4);
526            wide_i  = _mm_or_si128(wide_i, wide_lo);
527
528            // i<<14
529            wide_i = _mm_slli_epi32(wide_i, 14);
530
531            // SkClampMax(((f+one))>>16,max)
532            __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
533            wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
534                                                   _mm_setzero_si128());
535            wide_f1 = _mm_min_epi16(wide_f1, wide_max);
536
537            // final combination
538            wide_i = _mm_or_si128(wide_i, wide_f1);
539            _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
540
541            wide_f = _mm_add_epi32(wide_f, wide_d2);
542
543            fx += dx2;
544            fy += dy2;
545            xy += 4;
546            count -= 2;
547        } // while count >= 2
548    } // if count >= 2
549
550    while (count-- > 0) {
551        *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
552        fy += dy;
553        *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
554        fx += dx;
555    }
556}
557
558/*  SSE version of ClampX_ClampY_nofilter_affine()
559 *  portable version is in core/SkBitmapProcState_matrix.h
560 */
561void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
562                                      uint32_t xy[], int count, int x, int y) {
563    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
564    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
565                             SkMatrix::kScale_Mask |
566                             SkMatrix::kAffine_Mask)) == 0);
567
568    SkPoint srcPt;
569    s.fInvProc(s.fInvMatrix,
570               SkIntToScalar(x) + SK_ScalarHalf,
571               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
572
573    SkFixed fx = SkScalarToFixed(srcPt.fX);
574    SkFixed fy = SkScalarToFixed(srcPt.fY);
575    SkFixed dx = s.fInvSx;
576    SkFixed dy = s.fInvKy;
577    int maxX = s.fBitmap->width() - 1;
578    int maxY = s.fBitmap->height() - 1;
579
580    if (count >= 4 && (maxX <= 0xFFFF)) {
581        while (((size_t)xy & 0x0F) != 0) {
582            *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
583                                  SkClampMax(fx >> 16, maxX);
584            fx += dx;
585            fy += dy;
586            count--;
587        }
588
589        SkFixed dx4 = dx * 4;
590        SkFixed dy4 = dy * 4;
591
592        __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
593                                          fx + dx, fx);
594        __m128i wide_fy   = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
595                                          fy + dy, fy);
596        __m128i wide_dx4  = _mm_set1_epi32(dx4);
597        __m128i wide_dy4  = _mm_set1_epi32(dy4);
598
599        __m128i wide_maxX = _mm_set1_epi32(maxX);
600        __m128i wide_maxY = _mm_set1_epi32(maxY);
601
602        while (count >= 4) {
603            // SkClampMax(fx>>16,maxX)
604            __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
605                                            _mm_setzero_si128());
606            wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
607
608            // SkClampMax(fy>>16,maxY)
609            __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
610                                            _mm_setzero_si128());
611            wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
612
613            // final combination
614            __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
615                                          wide_lo);
616            _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
617
618            wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
619            wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
620
621            fx += dx4;
622            fy += dy4;
623            xy += 4;
624            count -= 4;
625        } // while count >= 4
626    } // if count >= 4
627
628    while (count-- > 0) {
629        *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
630                              SkClampMax(fx >> 16, maxX);
631        fx += dx;
632        fy += dy;
633    }
634}
635
636/*  SSE version of S32_D16_filter_DX_SSE2
637 *  Definition is in section of "D16 functions for SRC == 8888" in SkBitmapProcState.cpp
638 *  It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
639 */
640void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
641                            const uint32_t* xy,
642                            int count, uint16_t* colors) {
643    SkASSERT(count > 0 && colors != NULL);
644    SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
645    SkASSERT(kN32_SkColorType == s.fBitmap->colorType());
646    SkASSERT(s.fBitmap->isOpaque());
647
648    SkPMColor dstColor;
649    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
650    size_t rb = s.fBitmap->rowBytes();
651    uint32_t XY = *xy++;
652    unsigned y0 = XY >> 14;
653    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
654    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
655    unsigned subY = y0 & 0xF;
656
657    // ( 0,  0,  0,  0,  0,  0,  0, 16)
658    __m128i sixteen = _mm_cvtsi32_si128(16);
659
660    // ( 0,  0,  0,  0, 16, 16, 16, 16)
661    sixteen = _mm_shufflelo_epi16(sixteen, 0);
662
663    // ( 0,  0,  0,  0,  0,  0,  0,  y)
664    __m128i allY = _mm_cvtsi32_si128(subY);
665
666    // ( 0,  0,  0,  0,  y,  y,  y,  y)
667    allY = _mm_shufflelo_epi16(allY, 0);
668
669    // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
670    __m128i negY = _mm_sub_epi16(sixteen, allY);
671
672    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
673    allY = _mm_unpacklo_epi64(allY, negY);
674
675    // (16, 16, 16, 16, 16, 16, 16, 16 )
676    sixteen = _mm_shuffle_epi32(sixteen, 0);
677
678    // ( 0,  0,  0,  0,  0,  0,  0,  0)
679    __m128i zero = _mm_setzero_si128();
680
681    do {
682        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
683        unsigned x0 = XX >> 18;
684        unsigned x1 = XX & 0x3FFF;
685
686        // (0, 0, 0, 0, 0, 0, 0, x)
687        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
688
689        // (0, 0, 0, 0, x, x, x, x)
690        allX = _mm_shufflelo_epi16(allX, 0);
691
692        // (x, x, x, x, x, x, x, x)
693        allX = _mm_shuffle_epi32(allX, 0);
694
695        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
696        __m128i negX = _mm_sub_epi16(sixteen, allX);
697
698        // Load 4 samples (pixels).
699        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
700        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
701        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
702        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
703
704        // (0, 0, a00, a10)
705        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
706
707        // Expand to 16 bits per component.
708        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
709
710        // ((a00 * (16-y)), (a10 * y)).
711        a00a10 = _mm_mullo_epi16(a00a10, allY);
712
713        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
714        a00a10 = _mm_mullo_epi16(a00a10, negX);
715
716        // (0, 0, a01, a10)
717        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
718
719        // Expand to 16 bits per component.
720        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
721
722        // (a01 * (16-y)), (a11 * y)
723        a01a11 = _mm_mullo_epi16(a01a11, allY);
724
725        // (a01 * (16-y) * x), (a11 * y * x)
726        a01a11 = _mm_mullo_epi16(a01a11, allX);
727
728        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
729        __m128i sum = _mm_add_epi16(a00a10, a01a11);
730
731        // (DC, a00*w00 + a01*w01)
732        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
733
734        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
735        sum = _mm_add_epi16(sum, shifted);
736
737        // Divide each 16 bit component by 256.
738        sum = _mm_srli_epi16(sum, 8);
739
740        // Pack lower 4 16 bit values of sum into lower 4 bytes.
741        sum = _mm_packus_epi16(sum, zero);
742
743        // Extract low int and store.
744        dstColor = _mm_cvtsi128_si32(sum);
745
746        *colors++ = SkPixel32ToPixel16(dstColor);
747    } while (--count > 0);
748}
749