1/*
2 * Copyright 2013 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include <emmintrin.h>
9#include "SkBitmap.h"
10#include "SkBitmapFilter_opts_SSE2.h"
11#include "SkBitmapProcState.h"
12#include "SkColor.h"
13#include "SkColorPriv.h"
14#include "SkConvolver.h"
15#include "SkShader.h"
16#include "SkUnPreMultiply.h"
17
#if 0
// Debug-only helpers that dump the lanes of an SSE register with printf.
// They are compiled out; change the guard to 1 while debugging.
//
// Each helper copies the register into a plain local array with an unaligned
// store instead of reading the vector object through a casted pointer, so the
// lanes are accessed without type-punning the register.
#include <stdio.h>

// Prints |value| as four signed 32-bit lanes, element 0 first.
static inline void print128i(__m128i value) {
    int v[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(v), value);
    printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);
}

// Prints |value| as eight signed 16-bit lanes, element 0 first.
static inline void print128i_16(__m128i value) {
    short v[8];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(v), value);
    printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

// Prints |value| as sixteen unsigned bytes, element 0 first.
static inline void print128i_8(__m128i value) {
    unsigned char v[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(v), value);
    printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
           v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
           v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]
           );
}

// Prints |value| as four single-precision floats, element 0 first.
static inline void print128f(__m128 value) {
    float f[4];
    _mm_storeu_ps(f, value);
    printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]);
}
#endif
42
43// because the border is handled specially, this is guaranteed to have all 16 pixels
44// available to it without running off the bitmap's edge.
45
46void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,
47                            SkPMColor* SK_RESTRICT colors, int count) {
48
49    const int maxX = s.fBitmap->width();
50    const int maxY = s.fBitmap->height();
51    SkAutoTMalloc<SkScalar> xWeights(maxX);
52
53    while (count-- > 0) {
54        SkPoint srcPt;
55        s.fInvProc(s.fInvMatrix, x + 0.5f, y + 0.5f, &srcPt);
56        srcPt.fX -= SK_ScalarHalf;
57        srcPt.fY -= SK_ScalarHalf;
58
59        __m128 weight = _mm_setzero_ps();
60        __m128 accum = _mm_setzero_ps();
61
62        int y0 = SkClampMax(SkScalarCeilToInt(srcPt.fY-s.getBitmapFilter()->width()), maxY);
63        int y1 = SkClampMax(SkScalarFloorToInt(srcPt.fY+s.getBitmapFilter()->width()+1), maxY);
64        int x0 = SkClampMax(SkScalarCeilToInt(srcPt.fX-s.getBitmapFilter()->width()), maxX);
65        int x1 = SkClampMax(SkScalarFloorToInt(srcPt.fX+s.getBitmapFilter()->width())+1, maxX);
66
67        for (int srcX = x0; srcX < x1 ; srcX++) {
68            // Looking these up once instead of each loop is a ~15% speedup.
69            xWeights[srcX - x0] = s.getBitmapFilter()->lookupScalar((srcPt.fX - srcX));
70        }
71
72        for (int srcY = y0; srcY < y1; srcY++) {
73            SkScalar yWeight = s.getBitmapFilter()->lookupScalar((srcPt.fY - srcY));
74
75            for (int srcX = x0; srcX < x1 ; srcX++) {
76                SkScalar xWeight = xWeights[srcX - x0];
77
78                SkScalar combined_weight = SkScalarMul(xWeight, yWeight);
79
80                SkPMColor color = *s.fBitmap->getAddr32(srcX, srcY);
81
82                __m128i c = _mm_cvtsi32_si128(color);
83                c = _mm_unpacklo_epi8(c, _mm_setzero_si128());
84                c = _mm_unpacklo_epi16(c, _mm_setzero_si128());
85                __m128 cfloat = _mm_cvtepi32_ps(c);
86
87                __m128 weightVector = _mm_set1_ps(combined_weight);
88                accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));
89                weight = _mm_add_ps( weight, weightVector );
90            }
91        }
92
93        accum = _mm_div_ps(accum, weight);
94        accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));
95        __m128i accumInt = _mm_cvttps_epi32(accum);
96        accumInt = _mm_packs_epi32(accumInt, _mm_setzero_si128());
97        accumInt = _mm_packus_epi16(accumInt, _mm_setzero_si128());
98        SkPMColor c = _mm_cvtsi128_si32(accumInt);
99
100        int a = SkClampMax(SkGetPackedA32(c), 255);
101        int r = SkClampMax(SkGetPackedR32(c), a);
102        int g = SkClampMax(SkGetPackedG32(c), a);
103        int b = SkClampMax(SkGetPackedB32(c), a);
104
105        *colors++ = SkPackARGB32(a, r, g, b);
106
107        x++;
108    }
109}
110
111// Convolves horizontally along a single row. The row data is given in
112// |src_data| and continues for the num_values() of the filter.
113void convolveHorizontally_SSE2(const unsigned char* src_data,
114                               const SkConvolutionFilter1D& filter,
115                               unsigned char* out_row,
116                               bool /*has_alpha*/) {
117    int num_values = filter.numValues();
118
119    int filter_offset, filter_length;
120    __m128i zero = _mm_setzero_si128();
121    __m128i mask[4];
122    // |mask| will be used to decimate all extra filter coefficients that are
123    // loaded by SIMD when |filter_length| is not divisible by 4.
124    // mask[0] is not used in following algorithm.
125    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
126    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
127    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
128
129    // Output one pixel each iteration, calculating all channels (RGBA) together.
130    for (int out_x = 0; out_x < num_values; out_x++) {
131        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
132            filter.FilterForValue(out_x, &filter_offset, &filter_length);
133
134        __m128i accum = _mm_setzero_si128();
135
136        // Compute the first pixel in this row that the filter affects. It will
137        // touch |filter_length| pixels (4 bytes each) after this.
138        const __m128i* row_to_filter =
139            reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
140
141        // We will load and accumulate with four coefficients per iteration.
142        for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
143
144            // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
145            __m128i coeff, coeff16;
146            // [16] xx xx xx xx c3 c2 c1 c0
147            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
148            // [16] xx xx xx xx c1 c1 c0 c0
149            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
150            // [16] c1 c1 c1 c1 c0 c0 c0 c0
151            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
152
153            // Load four pixels => unpack the first two pixels to 16 bits =>
154            // multiply with coefficients => accumulate the convolution result.
155            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
156            __m128i src8 = _mm_loadu_si128(row_to_filter);
157            // [16] a1 b1 g1 r1 a0 b0 g0 r0
158            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
159            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
160            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
161            // [32]  a0*c0 b0*c0 g0*c0 r0*c0
162            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
163            accum = _mm_add_epi32(accum, t);
164            // [32]  a1*c1 b1*c1 g1*c1 r1*c1
165            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
166            accum = _mm_add_epi32(accum, t);
167
168            // Duplicate 3rd and 4th coefficients for all channels =>
169            // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
170            // => accumulate the convolution results.
171            // [16] xx xx xx xx c3 c3 c2 c2
172            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
173            // [16] c3 c3 c3 c3 c2 c2 c2 c2
174            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
175            // [16] a3 g3 b3 r3 a2 g2 b2 r2
176            src16 = _mm_unpackhi_epi8(src8, zero);
177            mul_hi = _mm_mulhi_epi16(src16, coeff16);
178            mul_lo = _mm_mullo_epi16(src16, coeff16);
179            // [32]  a2*c2 b2*c2 g2*c2 r2*c2
180            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
181            accum = _mm_add_epi32(accum, t);
182            // [32]  a3*c3 b3*c3 g3*c3 r3*c3
183            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
184            accum = _mm_add_epi32(accum, t);
185
186            // Advance the pixel and coefficients pointers.
187            row_to_filter += 1;
188            filter_values += 4;
189        }
190
191        // When |filter_length| is not divisible by 4, we need to decimate some of
192        // the filter coefficient that was loaded incorrectly to zero; Other than
193        // that the algorithm is same with above, exceot that the 4th pixel will be
194        // always absent.
195        int r = filter_length&3;
196        if (r) {
197            // Note: filter_values must be padded to align_up(filter_offset, 8).
198            __m128i coeff, coeff16;
199            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
200            // Mask out extra filter taps.
201            coeff = _mm_and_si128(coeff, mask[r]);
202            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
203            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
204
205            // Note: line buffer must be padded to align_up(filter_offset, 16).
206            // We resolve this by use C-version for the last horizontal line.
207            __m128i src8 = _mm_loadu_si128(row_to_filter);
208            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
209            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
210            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
211            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
212            accum = _mm_add_epi32(accum, t);
213            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
214            accum = _mm_add_epi32(accum, t);
215
216            src16 = _mm_unpackhi_epi8(src8, zero);
217            coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
218            coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
219            mul_hi = _mm_mulhi_epi16(src16, coeff16);
220            mul_lo = _mm_mullo_epi16(src16, coeff16);
221            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
222            accum = _mm_add_epi32(accum, t);
223        }
224
225        // Shift right for fixed point implementation.
226        accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
227
228        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
229        accum = _mm_packs_epi32(accum, zero);
230        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
231        accum = _mm_packus_epi16(accum, zero);
232
233        // Store the pixel value of 32 bits.
234        *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
235        out_row += 4;
236    }
237}
238
239// Convolves horizontally along four rows. The row data is given in
240// |src_data| and continues for the num_values() of the filter.
241// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
242// refer to that function for detailed comments.
243void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
244                                    const SkConvolutionFilter1D& filter,
245                                    unsigned char* out_row[4]) {
246    int num_values = filter.numValues();
247
248    int filter_offset, filter_length;
249    __m128i zero = _mm_setzero_si128();
250    __m128i mask[4];
251    // |mask| will be used to decimate all extra filter coefficients that are
252    // loaded by SIMD when |filter_length| is not divisible by 4.
253    // mask[0] is not used in following algorithm.
254    mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
255    mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
256    mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
257
258    // Output one pixel each iteration, calculating all channels (RGBA) together.
259    for (int out_x = 0; out_x < num_values; out_x++) {
260        const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
261            filter.FilterForValue(out_x, &filter_offset, &filter_length);
262
263        // four pixels in a column per iteration.
264        __m128i accum0 = _mm_setzero_si128();
265        __m128i accum1 = _mm_setzero_si128();
266        __m128i accum2 = _mm_setzero_si128();
267        __m128i accum3 = _mm_setzero_si128();
268        int start = (filter_offset<<2);
269        // We will load and accumulate with four coefficients per iteration.
270        for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
271            __m128i coeff, coeff16lo, coeff16hi;
272            // [16] xx xx xx xx c3 c2 c1 c0
273            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
274            // [16] xx xx xx xx c1 c1 c0 c0
275            coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
276            // [16] c1 c1 c1 c1 c0 c0 c0 c0
277            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
278            // [16] xx xx xx xx c3 c3 c2 c2
279            coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
280            // [16] c3 c3 c3 c3 c2 c2 c2 c2
281            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
282
283            __m128i src8, src16, mul_hi, mul_lo, t;
284
285#define ITERATION(src, accum)                                                \
286            src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
287            src16 = _mm_unpacklo_epi8(src8, zero);                           \
288            mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
289            mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
290            t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
291            accum = _mm_add_epi32(accum, t);                                 \
292            t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
293            accum = _mm_add_epi32(accum, t);                                 \
294            src16 = _mm_unpackhi_epi8(src8, zero);                           \
295            mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
296            mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
297            t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
298            accum = _mm_add_epi32(accum, t);                                 \
299            t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
300            accum = _mm_add_epi32(accum, t)
301
302            ITERATION(src_data[0] + start, accum0);
303            ITERATION(src_data[1] + start, accum1);
304            ITERATION(src_data[2] + start, accum2);
305            ITERATION(src_data[3] + start, accum3);
306
307            start += 16;
308            filter_values += 4;
309        }
310
311        int r = filter_length & 3;
312        if (r) {
313            // Note: filter_values must be padded to align_up(filter_offset, 8);
314            __m128i coeff;
315            coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
316            // Mask out extra filter taps.
317            coeff = _mm_and_si128(coeff, mask[r]);
318
319            __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
320            /* c1 c1 c1 c1 c0 c0 c0 c0 */
321            coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
322            __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
323            coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
324
325            __m128i src8, src16, mul_hi, mul_lo, t;
326
327            ITERATION(src_data[0] + start, accum0);
328            ITERATION(src_data[1] + start, accum1);
329            ITERATION(src_data[2] + start, accum2);
330            ITERATION(src_data[3] + start, accum3);
331        }
332
333        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
334        accum0 = _mm_packs_epi32(accum0, zero);
335        accum0 = _mm_packus_epi16(accum0, zero);
336        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
337        accum1 = _mm_packs_epi32(accum1, zero);
338        accum1 = _mm_packus_epi16(accum1, zero);
339        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
340        accum2 = _mm_packs_epi32(accum2, zero);
341        accum2 = _mm_packus_epi16(accum2, zero);
342        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
343        accum3 = _mm_packs_epi32(accum3, zero);
344        accum3 = _mm_packus_epi16(accum3, zero);
345
346        *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
347        *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
348        *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
349        *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
350
351        out_row[0] += 4;
352        out_row[1] += 4;
353        out_row[2] += 4;
354        out_row[3] += 4;
355    }
356}
357
358// Does vertical convolution to produce one output row. The filter values and
359// length are given in the first two parameters. These are applied to each
360// of the rows pointed to in the |source_data_rows| array, with each row
361// being |pixel_width| wide.
362//
363// The output must have room for |pixel_width * 4| bytes.
364template<bool has_alpha>
365void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
366                             int filter_length,
367                             unsigned char* const* source_data_rows,
368                             int pixel_width,
369                             unsigned char* out_row) {
370    int width = pixel_width & ~3;
371
372    __m128i zero = _mm_setzero_si128();
373    __m128i accum0, accum1, accum2, accum3, coeff16;
374    const __m128i* src;
375    // Output four pixels per iteration (16 bytes).
376    for (int out_x = 0; out_x < width; out_x += 4) {
377
378        // Accumulated result for each pixel. 32 bits per RGBA channel.
379        accum0 = _mm_setzero_si128();
380        accum1 = _mm_setzero_si128();
381        accum2 = _mm_setzero_si128();
382        accum3 = _mm_setzero_si128();
383
384        // Convolve with one filter coefficient per iteration.
385        for (int filter_y = 0; filter_y < filter_length; filter_y++) {
386
387            // Duplicate the filter coefficient 8 times.
388            // [16] cj cj cj cj cj cj cj cj
389            coeff16 = _mm_set1_epi16(filter_values[filter_y]);
390
391            // Load four pixels (16 bytes) together.
392            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
393            src = reinterpret_cast<const __m128i*>(
394                &source_data_rows[filter_y][out_x << 2]);
395            __m128i src8 = _mm_loadu_si128(src);
396
397            // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
398            // multiply with current coefficient => accumulate the result.
399            // [16] a1 b1 g1 r1 a0 b0 g0 r0
400            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
401            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
402            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
403            // [32] a0 b0 g0 r0
404            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
405            accum0 = _mm_add_epi32(accum0, t);
406            // [32] a1 b1 g1 r1
407            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
408            accum1 = _mm_add_epi32(accum1, t);
409
410            // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
411            // multiply with current coefficient => accumulate the result.
412            // [16] a3 b3 g3 r3 a2 b2 g2 r2
413            src16 = _mm_unpackhi_epi8(src8, zero);
414            mul_hi = _mm_mulhi_epi16(src16, coeff16);
415            mul_lo = _mm_mullo_epi16(src16, coeff16);
416            // [32] a2 b2 g2 r2
417            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
418            accum2 = _mm_add_epi32(accum2, t);
419            // [32] a3 b3 g3 r3
420            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
421            accum3 = _mm_add_epi32(accum3, t);
422        }
423
424        // Shift right for fixed point implementation.
425        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
426        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
427        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
428        accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
429
430        // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
431        // [16] a1 b1 g1 r1 a0 b0 g0 r0
432        accum0 = _mm_packs_epi32(accum0, accum1);
433        // [16] a3 b3 g3 r3 a2 b2 g2 r2
434        accum2 = _mm_packs_epi32(accum2, accum3);
435
436        // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
437        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
438        accum0 = _mm_packus_epi16(accum0, accum2);
439
440        if (has_alpha) {
441            // Compute the max(ri, gi, bi) for each pixel.
442            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
443            __m128i a = _mm_srli_epi32(accum0, 8);
444            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
445            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
446            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
447            a = _mm_srli_epi32(accum0, 16);
448            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
449            b = _mm_max_epu8(a, b);  // Max of r and g and b.
450            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
451            b = _mm_slli_epi32(b, 24);
452
453            // Make sure the value of alpha channel is always larger than maximum
454            // value of color channels.
455            accum0 = _mm_max_epu8(b, accum0);
456        } else {
457            // Set value of alpha channels to 0xFF.
458            __m128i mask = _mm_set1_epi32(0xff000000);
459            accum0 = _mm_or_si128(accum0, mask);
460        }
461
462        // Store the convolution result (16 bytes) and advance the pixel pointers.
463        _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
464        out_row += 16;
465    }
466
467    // When the width of the output is not divisible by 4, We need to save one
468    // pixel (4 bytes) each time. And also the fourth pixel is always absent.
469    if (pixel_width & 3) {
470        accum0 = _mm_setzero_si128();
471        accum1 = _mm_setzero_si128();
472        accum2 = _mm_setzero_si128();
473        for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
474            coeff16 = _mm_set1_epi16(filter_values[filter_y]);
475            // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
476            src = reinterpret_cast<const __m128i*>(
477                &source_data_rows[filter_y][width<<2]);
478            __m128i src8 = _mm_loadu_si128(src);
479            // [16] a1 b1 g1 r1 a0 b0 g0 r0
480            __m128i src16 = _mm_unpacklo_epi8(src8, zero);
481            __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
482            __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
483            // [32] a0 b0 g0 r0
484            __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
485            accum0 = _mm_add_epi32(accum0, t);
486            // [32] a1 b1 g1 r1
487            t = _mm_unpackhi_epi16(mul_lo, mul_hi);
488            accum1 = _mm_add_epi32(accum1, t);
489            // [16] a3 b3 g3 r3 a2 b2 g2 r2
490            src16 = _mm_unpackhi_epi8(src8, zero);
491            mul_hi = _mm_mulhi_epi16(src16, coeff16);
492            mul_lo = _mm_mullo_epi16(src16, coeff16);
493            // [32] a2 b2 g2 r2
494            t = _mm_unpacklo_epi16(mul_lo, mul_hi);
495            accum2 = _mm_add_epi32(accum2, t);
496        }
497
498        accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
499        accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
500        accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
501        // [16] a1 b1 g1 r1 a0 b0 g0 r0
502        accum0 = _mm_packs_epi32(accum0, accum1);
503        // [16] a3 b3 g3 r3 a2 b2 g2 r2
504        accum2 = _mm_packs_epi32(accum2, zero);
505        // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
506        accum0 = _mm_packus_epi16(accum0, accum2);
507        if (has_alpha) {
508            // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
509            __m128i a = _mm_srli_epi32(accum0, 8);
510            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
511            __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
512            // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
513            a = _mm_srli_epi32(accum0, 16);
514            // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
515            b = _mm_max_epu8(a, b);  // Max of r and g and b.
516            // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
517            b = _mm_slli_epi32(b, 24);
518            accum0 = _mm_max_epu8(b, accum0);
519        } else {
520            __m128i mask = _mm_set1_epi32(0xff000000);
521            accum0 = _mm_or_si128(accum0, mask);
522        }
523
524        for (int out_x = width; out_x < pixel_width; out_x++) {
525            *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
526            accum0 = _mm_srli_si128(accum0, 4);
527            out_row += 4;
528        }
529    }
530}
531
532void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
533                             int filter_length,
534                             unsigned char* const* source_data_rows,
535                             int pixel_width,
536                             unsigned char* out_row,
537                             bool has_alpha) {
538    if (has_alpha) {
539        convolveVertically_SSE2<true>(filter_values,
540                                      filter_length,
541                                      source_data_rows,
542                                      pixel_width,
543                                      out_row);
544    } else {
545        convolveVertically_SSE2<false>(filter_values,
546                                       filter_length,
547                                       source_data_rows,
548                                       pixel_width,
549                                       out_row);
550    }
551}
552
553void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
554    // Padding |paddingCount| of more dummy coefficients after the coefficients
555    // of last filter to prevent SIMD instructions which load 8 or 16 bytes
556    // together to access invalid memory areas. We are not trying to align the
557    // coefficients right now due to the opaqueness of <vector> implementation.
558    // This has to be done after all |AddFilter| calls.
559    for (int i = 0; i < 8; ++i) {
560        filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
561    }
562}
563