1/*
2 * Copyright 2013 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBitmap.h"
10#include "SkBitmapFilter_opts_SSE2.h"
11#include "SkBitmapProcState.h"
12#include "SkColor.h"
13#include "SkColorPriv.h"
14#include "SkConvolver.h"
15#include "SkShader.h"
16#include "SkUnPreMultiply.h"
17
18#if 0
// Debug helper: prints the four signed 32-bit lanes of |value|, lowest lane
// first.
static inline void print128i(__m128i value) {
    // Spill the register to a plain array rather than reading the vector
    // object through a cast int pointer.
    int lanes[4];
    _mm_storeu_si128((__m128i*) lanes, value);
    printf("% .11d % .11d % .11d % .11d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
23
// Debug helper: prints the eight signed 16-bit lanes of |value|, lowest lane
// first.
static inline void print128i_16(__m128i value) {
    // Spill the register to a plain array rather than reading the vector
    // object through a cast short pointer.
    short lanes[8];
    _mm_storeu_si128((__m128i*) lanes, value);
    printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", lanes[0], lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7]);
}
28
// Debug helper: prints the sixteen unsigned 8-bit lanes of |value|, lowest
// lane first.
static inline void print128i_8(__m128i value) {
    // Spill the register to a byte array with an explicit store, matching the
    // other print128* helpers.
    unsigned char lanes[16];
    _mm_storeu_si128((__m128i*) lanes, value);
    printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
           lanes[0], lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7],
           lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15]
           );
}
36
// Debug helper: prints the four single-precision lanes of |value|, lowest
// lane first.
static inline void print128f(__m128 value) {
    // Spill the register to a plain array rather than reading the vector
    // object through a cast float pointer.
    float lanes[4];
    _mm_storeu_ps(lanes, value);
    printf("%3.4f %3.4f %3.4f %3.4f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
41#endif
42
43// because the border is handled specially, this is guaranteed to have all 16 pixels
44// available to it without running off the bitmap's edge.
45
46void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,
47                            SkPMColor* SK_RESTRICT colors, int count) {
48
49    const int maxX = s.fBitmap->width() - 1;
50    const int maxY = s.fBitmap->height() - 1;
51
52    while (count-- > 0) {
53        SkPoint srcPt;
54        s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
55                    SkIntToScalar(y), &srcPt);
56        srcPt.fX -= SK_ScalarHalf;
57        srcPt.fY -= SK_ScalarHalf;
58
59        int sx = SkScalarFloorToInt(srcPt.fX);
60        int sy = SkScalarFloorToInt(srcPt.fY);
61
62        __m128 weight = _mm_setzero_ps();
63        __m128 accum = _mm_setzero_ps();
64
65        int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f)));
66        int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f)));
67        int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f)));
68        int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f)));
69
70        for (int src_y = y0; src_y <= y1; src_y++) {
71            float yweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fY - src_y));
72
73            for (int src_x = x0; src_x <= x1 ; src_x++) {
74                float xweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fX - src_x));
75
76                float combined_weight = xweight * yweight;
77
78                SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y);
79
80                __m128i c = _mm_cvtsi32_si128( color );
81                c = _mm_unpacklo_epi8(c, _mm_setzero_si128());
82                c = _mm_unpacklo_epi16(c, _mm_setzero_si128());
83
84                __m128 cfloat = _mm_cvtepi32_ps( c );
85
86                __m128 weightVector = _mm_set1_ps(combined_weight);
87
88                accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));
89                weight = _mm_add_ps( weight, weightVector );
90            }
91        }
92
93        accum = _mm_div_ps(accum, weight);
94        accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));
95
96        __m128i accumInt = _mm_cvtps_epi32( accum );
97
98        int localResult[4];
99        _mm_storeu_si128((__m128i *) (localResult), accumInt);
100        int a = SkClampMax(localResult[0], 255);
101        int r = SkClampMax(localResult[1], a);
102        int g = SkClampMax(localResult[2], a);
103        int b = SkClampMax(localResult[3], a);
104
105        *colors++ = SkPackARGB32(a, r, g, b);
106
107        x++;
108    }
109}
110
111void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
112                             SkPMColor *SK_RESTRICT colors, int count) {
113    const int maxX = s.fBitmap->width() - 1;
114    const int maxY = s.fBitmap->height() - 1;
115
116    SkPoint srcPt;
117    s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
118                SkIntToScalar(y), &srcPt);
119    srcPt.fY -= SK_ScalarHalf;
120    int sy = SkScalarFloorToInt(srcPt.fY);
121
122    int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f)));
123    int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f)));
124
125    while (count-- > 0) {
126        srcPt.fX -= SK_ScalarHalf;
127        srcPt.fY -= SK_ScalarHalf;
128
129        int sx = SkScalarFloorToInt(srcPt.fX);
130
131        float weight = 0;
132        __m128 accum = _mm_setzero_ps();
133
134        int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f)));
135        int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f)));
136
137        for (int src_y = y0; src_y <= y1; src_y++) {
138            float yweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fY - src_y));
139
140            for (int src_x = x0; src_x <= x1 ; src_x++) {
141                float xweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fX - src_x));
142
143                float combined_weight = xweight * yweight;
144
145                SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y);
146
147                __m128 c = _mm_set_ps((float)SkGetPackedB32(color),
148                                      (float)SkGetPackedG32(color),
149                                      (float)SkGetPackedR32(color),
150                                      (float)SkGetPackedA32(color));
151
152                __m128 weightVector = _mm_set1_ps(combined_weight);
153
154                accum = _mm_add_ps(accum, _mm_mul_ps(c, weightVector));
155                weight += combined_weight;
156            }
157        }
158
159        __m128 totalWeightVector = _mm_set1_ps(weight);
160        accum = _mm_div_ps(accum, totalWeightVector);
161        accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));
162
163        float localResult[4];
164        _mm_storeu_ps(localResult, accum);
165        int a = SkClampMax(int(localResult[0]), 255);
166        int r = SkClampMax(int(localResult[1]), a);
167        int g = SkClampMax(int(localResult[2]), a);
168        int b = SkClampMax(int(localResult[3]), a);
169
170        *colors++ = SkPackARGB32(a, r, g, b);
171
172        x++;
173
174        s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
175                    SkIntToScalar(y), &srcPt);
176    }
177}
178
179// Convolves horizontally along a single row. The row data is given in
180// |src_data| and continues for the num_values() of the filter.
181void convolveHorizontally_SSE2(const unsigned char* src_data,
182                               const SkConvolutionFilter1D& filter,
183                               unsigned char* out_row,
184                               bool /*has_alpha*/) {
185    int num_values = filter.numValues();
186
187    int filter_offset, filter_length;
188    __m128i zero = _mm_setzero_si128();
189    __m128i mask[4];
190    // |mask| will be used to decimate all extra filter coefficients that are
191    // loaded by SIMD when |filter_length| is not divisible by 4.
192    // mask[0] is not used in following algorithm.
193    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
194    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
195    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
196
197    // Output one pixel each iteration, calculating all channels (RGBA) together.
198    for (int out_x = 0; out_x < num_values; out_x++) {
199        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
200            filter.FilterForValue(out_x, &filter_offset, &filter_length);
201
202        __m128i accum = _mm_setzero_si128();
203
204        // Compute the first pixel in this row that the filter affects. It will
205        // touch |filter_length| pixels (4 bytes each) after this.
206        const __m128i* row_to_filter =
207            reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
208
209        // We will load and accumulate with four coefficients per iteration.
210        for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
211
212            // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
213            __m128i coeff, coeff16;
214            // [16] xx xx xx xx c3 c2 c1 c0
215            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
216            // [16] xx xx xx xx c1 c1 c0 c0
217            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
218            // [16] c1 c1 c1 c1 c0 c0 c0 c0
219            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
220
221            // Load four pixels => unpack the first two pixels to 16 bits =>
222            // multiply with coefficients => accumulate the convolution result.
223            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
224            __m128i src8 = _mm_loadu_si128(row_to_filter);
225            // [16] a1 b1 g1 r1 a0 b0 g0 r0
226            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
227            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
228            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
229            // [32]  a0*c0 b0*c0 g0*c0 r0*c0
230            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
231            accum = _mm_add_epi32(accum, t);
232            // [32]  a1*c1 b1*c1 g1*c1 r1*c1
233            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
234            accum = _mm_add_epi32(accum, t);
235
236            // Duplicate 3rd and 4th coefficients for all channels =>
237            // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
238            // => accumulate the convolution results.
239            // [16] xx xx xx xx c3 c3 c2 c2
240            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
241            // [16] c3 c3 c3 c3 c2 c2 c2 c2
242            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
243            // [16] a3 g3 b3 r3 a2 g2 b2 r2
244            src16 = _mm_unpackhi_epi8(src8, zero);
245            mul_hi = _mm_mulhi_epi16(src16, coeff16);
246            mul_lo = _mm_mullo_epi16(src16, coeff16);
247            // [32]  a2*c2 b2*c2 g2*c2 r2*c2
248            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
249            accum = _mm_add_epi32(accum, t);
250            // [32]  a3*c3 b3*c3 g3*c3 r3*c3
251            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
252            accum = _mm_add_epi32(accum, t);
253
254            // Advance the pixel and coefficients pointers.
255            row_to_filter += 1;
256            filter_values += 4;
257        }
258
259        // When |filter_length| is not divisible by 4, we need to decimate some of
260        // the filter coefficient that was loaded incorrectly to zero; Other than
261        // that the algorithm is same with above, exceot that the 4th pixel will be
262        // always absent.
263        int r = filter_length&3;
264        if (r) {
265            // Note: filter_values must be padded to align_up(filter_offset, 8).
266            __m128i coeff, coeff16;
267            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
268            // Mask out extra filter taps.
269            coeff = _mm_and_si128(coeff, mask[r]);
270            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
271            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
272
273            // Note: line buffer must be padded to align_up(filter_offset, 16).
274            // We resolve this by use C-version for the last horizontal line.
275            __m128i src8 = _mm_loadu_si128(row_to_filter);
276            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
277            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
278            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
279            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
280            accum = _mm_add_epi32(accum, t);
281            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
282            accum = _mm_add_epi32(accum, t);
283
284            src16 = _mm_unpackhi_epi8(src8, zero);
285            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
286            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
287            mul_hi = _mm_mulhi_epi16(src16, coeff16);
288            mul_lo = _mm_mullo_epi16(src16, coeff16);
289            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
290            accum = _mm_add_epi32(accum, t);
291        }
292
293        // Shift right for fixed point implementation.
294        accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
295
296        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
297        accum = _mm_packs_epi32(accum, zero);
298        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
299        accum = _mm_packus_epi16(accum, zero);
300
301        // Store the pixel value of 32 bits.
302        *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
303        out_row += 4;
304    }
305}
306
307// Convolves horizontally along four rows. The row data is given in
308// |src_data| and continues for the num_values() of the filter.
309// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
310// refer to that function for detailed comments.
311void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
312                                    const SkConvolutionFilter1D& filter,
313                                    unsigned char* out_row[4]) {
314    int num_values = filter.numValues();
315
316    int filter_offset, filter_length;
317    __m128i zero = _mm_setzero_si128();
318    __m128i mask[4];
319    // |mask| will be used to decimate all extra filter coefficients that are
320    // loaded by SIMD when |filter_length| is not divisible by 4.
321    // mask[0] is not used in following algorithm.
322    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
323    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
324    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
325
326    // Output one pixel each iteration, calculating all channels (RGBA) together.
327    for (int out_x = 0; out_x < num_values; out_x++) {
328        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
329            filter.FilterForValue(out_x, &filter_offset, &filter_length);
330
331        // four pixels in a column per iteration.
332        __m128i accum0 = _mm_setzero_si128();
333        __m128i accum1 = _mm_setzero_si128();
334        __m128i accum2 = _mm_setzero_si128();
335        __m128i accum3 = _mm_setzero_si128();
336        int start = (filter_offset<<2);
337        // We will load and accumulate with four coefficients per iteration.
338        for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
339            __m128i coeff, coeff16lo, coeff16hi;
340            // [16] xx xx xx xx c3 c2 c1 c0
341            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
342            // [16] xx xx xx xx c1 c1 c0 c0
343            coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
344            // [16] c1 c1 c1 c1 c0 c0 c0 c0
345            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
346            // [16] xx xx xx xx c3 c3 c2 c2
347            coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
348            // [16] c3 c3 c3 c3 c2 c2 c2 c2
349            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
350
351            __m128i src8, src16, mul_hi, mul_lo, t;
352
353#define ITERATION(src, accum)                                                \
354            src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
355            src16 = _mm_unpacklo_epi8(src8, zero);                           \
356            mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
357            mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
358            t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
359            accum = _mm_add_epi32(accum, t);                                 \
360            t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
361            accum = _mm_add_epi32(accum, t);                                 \
362            src16 = _mm_unpackhi_epi8(src8, zero);                           \
363            mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
364            mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
365            t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
366            accum = _mm_add_epi32(accum, t);                                 \
367            t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
368            accum = _mm_add_epi32(accum, t)
369
370            ITERATION(src_data[0] + start, accum0);
371            ITERATION(src_data[1] + start, accum1);
372            ITERATION(src_data[2] + start, accum2);
373            ITERATION(src_data[3] + start, accum3);
374
375            start += 16;
376            filter_values += 4;
377        }
378
379        int r = filter_length & 3;
380        if (r) {
381            // Note: filter_values must be padded to align_up(filter_offset, 8);
382            __m128i coeff;
383            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
384            // Mask out extra filter taps.
385            coeff = _mm_and_si128(coeff, mask[r]);
386
387            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
388            /* c1 c1 c1 c1 c0 c0 c0 c0 */
389            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
390            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
391            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
392
393            __m128i src8, src16, mul_hi, mul_lo, t;
394
395            ITERATION(src_data[0] + start, accum0);
396            ITERATION(src_data[1] + start, accum1);
397            ITERATION(src_data[2] + start, accum2);
398            ITERATION(src_data[3] + start, accum3);
399        }
400
401        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
402        accum0 = _mm_packs_epi32(accum0, zero);
403        accum0 = _mm_packus_epi16(accum0, zero);
404        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
405        accum1 = _mm_packs_epi32(accum1, zero);
406        accum1 = _mm_packus_epi16(accum1, zero);
407        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
408        accum2 = _mm_packs_epi32(accum2, zero);
409        accum2 = _mm_packus_epi16(accum2, zero);
410        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
411        accum3 = _mm_packs_epi32(accum3, zero);
412        accum3 = _mm_packus_epi16(accum3, zero);
413
414        *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
415        *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
416        *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
417        *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
418
419        out_row[0] += 4;
420        out_row[1] += 4;
421        out_row[2] += 4;
422        out_row[3] += 4;
423    }
424}
425
426// Does vertical convolution to produce one output row. The filter values and
427// length are given in the first two parameters. These are applied to each
428// of the rows pointed to in the |source_data_rows| array, with each row
429// being |pixel_width| wide.
430//
431// The output must have room for |pixel_width * 4| bytes.
432template<bool has_alpha>
433void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
434                             int filter_length,
435                             unsigned char* const* source_data_rows,
436                             int pixel_width,
437                             unsigned char* out_row) {
438    int width = pixel_width & ~3;
439
440    __m128i zero = _mm_setzero_si128();
441    __m128i accum0, accum1, accum2, accum3, coeff16;
442    const __m128i* src;
443    // Output four pixels per iteration (16 bytes).
444    for (int out_x = 0; out_x < width; out_x += 4) {
445
446        // Accumulated result for each pixel. 32 bits per RGBA channel.
447        accum0 = _mm_setzero_si128();
448        accum1 = _mm_setzero_si128();
449        accum2 = _mm_setzero_si128();
450        accum3 = _mm_setzero_si128();
451
452        // Convolve with one filter coefficient per iteration.
453        for (int filter_y = 0; filter_y < filter_length; filter_y++) {
454
455            // Duplicate the filter coefficient 8 times.
456            // [16] cj cj cj cj cj cj cj cj
457            coeff16 = _mm_set1_epi16(filter_values[filter_y]);
458
459            // Load four pixels (16 bytes) together.
460            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
461            src = reinterpret_cast<const __m128i*>(
462                &source_data_rows[filter_y][out_x << 2]);
463            __m128i src8 = _mm_loadu_si128(src);
464
465            // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
466            // multiply with current coefficient => accumulate the result.
467            // [16] a1 b1 g1 r1 a0 b0 g0 r0
468            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
469            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
470            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
471            // [32] a0 b0 g0 r0
472            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
473            accum0 = _mm_add_epi32(accum0, t);
474            // [32] a1 b1 g1 r1
475            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
476            accum1 = _mm_add_epi32(accum1, t);
477
478            // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
479            // multiply with current coefficient => accumulate the result.
480            // [16] a3 b3 g3 r3 a2 b2 g2 r2
481            src16 = _mm_unpackhi_epi8(src8, zero);
482            mul_hi = _mm_mulhi_epi16(src16, coeff16);
483            mul_lo = _mm_mullo_epi16(src16, coeff16);
484            // [32] a2 b2 g2 r2
485            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
486            accum2 = _mm_add_epi32(accum2, t);
487            // [32] a3 b3 g3 r3
488            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
489            accum3 = _mm_add_epi32(accum3, t);
490        }
491
492        // Shift right for fixed point implementation.
493        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
494        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
495        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
496        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
497
498        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
499        // [16] a1 b1 g1 r1 a0 b0 g0 r0
500        accum0 = _mm_packs_epi32(accum0, accum1);
501        // [16] a3 b3 g3 r3 a2 b2 g2 r2
502        accum2 = _mm_packs_epi32(accum2, accum3);
503
504        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
505        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
506        accum0 = _mm_packus_epi16(accum0, accum2);
507
508        if (has_alpha) {
509            // Compute the max(ri, gi, bi) for each pixel.
510            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
511            __m128i a = _mm_srli_epi32(accum0, 8);
512            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
513            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
514            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
515            a = _mm_srli_epi32(accum0, 16);
516            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
517            b = _mm_max_epu8(a, b);  // Max of r and g and b.
518            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
519            b = _mm_slli_epi32(b, 24);
520
521            // Make sure the value of alpha channel is always larger than maximum
522            // value of color channels.
523            accum0 = _mm_max_epu8(b, accum0);
524        } else {
525            // Set value of alpha channels to 0xFF.
526            __m128i mask = _mm_set1_epi32(0xff000000);
527            accum0 = _mm_or_si128(accum0, mask);
528        }
529
530        // Store the convolution result (16 bytes) and advance the pixel pointers.
531        _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
532        out_row += 16;
533    }
534
535    // When the width of the output is not divisible by 4, We need to save one
536    // pixel (4 bytes) each time. And also the fourth pixel is always absent.
537    if (pixel_width & 3) {
538        accum0 = _mm_setzero_si128();
539        accum1 = _mm_setzero_si128();
540        accum2 = _mm_setzero_si128();
541        for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
542            coeff16 = _mm_set1_epi16(filter_values[filter_y]);
543            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
544            src = reinterpret_cast<const __m128i*>(
545                &source_data_rows[filter_y][width<<2]);
546            __m128i src8 = _mm_loadu_si128(src);
547            // [16] a1 b1 g1 r1 a0 b0 g0 r0
548            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
549            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
550            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
551            // [32] a0 b0 g0 r0
552            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
553            accum0 = _mm_add_epi32(accum0, t);
554            // [32] a1 b1 g1 r1
555            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
556            accum1 = _mm_add_epi32(accum1, t);
557            // [16] a3 b3 g3 r3 a2 b2 g2 r2
558            src16 = _mm_unpackhi_epi8(src8, zero);
559            mul_hi = _mm_mulhi_epi16(src16, coeff16);
560            mul_lo = _mm_mullo_epi16(src16, coeff16);
561            // [32] a2 b2 g2 r2
562            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
563            accum2 = _mm_add_epi32(accum2, t);
564        }
565
566        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
567        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
568        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
569        // [16] a1 b1 g1 r1 a0 b0 g0 r0
570        accum0 = _mm_packs_epi32(accum0, accum1);
571        // [16] a3 b3 g3 r3 a2 b2 g2 r2
572        accum2 = _mm_packs_epi32(accum2, zero);
573        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
574        accum0 = _mm_packus_epi16(accum0, accum2);
575        if (has_alpha) {
576            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
577            __m128i a = _mm_srli_epi32(accum0, 8);
578            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
579            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
580            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
581            a = _mm_srli_epi32(accum0, 16);
582            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
583            b = _mm_max_epu8(a, b);  // Max of r and g and b.
584            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
585            b = _mm_slli_epi32(b, 24);
586            accum0 = _mm_max_epu8(b, accum0);
587        } else {
588            __m128i mask = _mm_set1_epi32(0xff000000);
589            accum0 = _mm_or_si128(accum0, mask);
590        }
591
592        for (int out_x = width; out_x < pixel_width; out_x++) {
593            *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
594            accum0 = _mm_srli_si128(accum0, 4);
595            out_row += 4;
596        }
597    }
598}
599
600void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
601                             int filter_length,
602                             unsigned char* const* source_data_rows,
603                             int pixel_width,
604                             unsigned char* out_row,
605                             bool has_alpha) {
606    if (has_alpha) {
607        convolveVertically_SSE2<true>(filter_values,
608                                      filter_length,
609                                      source_data_rows,
610                                      pixel_width,
611                                      out_row);
612    } else {
613        convolveVertically_SSE2<false>(filter_values,
614                                       filter_length,
615                                       source_data_rows,
616                                       pixel_width,
617                                       out_row);
618    }
619}
620
621void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
622    // Padding |paddingCount| of more dummy coefficients after the coefficients
623    // of last filter to prevent SIMD instructions which load 8 or 16 bytes
624    // together to access invalid memory areas. We are not trying to align the
625    // coefficients right now due to the opaqueness of <vector> implementation.
626    // This has to be done after all |AddFilter| calls.
627    for (int i = 0; i < 8; ++i) {
628        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
629    }
630}
631