1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <algorithm>
6
7#include "skia/ext/convolver.h"
8#include "skia/ext/convolver_SSE2.h"
9#include "third_party/skia/include/core/SkTypes.h"
10
11#include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h
12
13namespace skia {
14
15// Convolves horizontally along a single row. The row data is given in
16// |src_data| and continues for the num_values() of the filter.
17void ConvolveHorizontally_SSE2(const unsigned char* src_data,
18                               const ConvolutionFilter1D& filter,
19                               unsigned char* out_row,
20                               bool /*has_alpha*/) {
21  int num_values = filter.num_values();
22
23  int filter_offset, filter_length;
24  __m128i zero = _mm_setzero_si128();
25  __m128i mask[4];
26  // |mask| will be used to decimate all extra filter coefficients that are
27  // loaded by SIMD when |filter_length| is not divisible by 4.
28  // mask[0] is not used in following algorithm.
29  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
30  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
31  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
32
33  // Output one pixel each iteration, calculating all channels (RGBA) together.
34  for (int out_x = 0; out_x < num_values; out_x++) {
35    const ConvolutionFilter1D::Fixed* filter_values =
36        filter.FilterForValue(out_x, &filter_offset, &filter_length);
37
38    __m128i accum = _mm_setzero_si128();
39
40    // Compute the first pixel in this row that the filter affects. It will
41    // touch |filter_length| pixels (4 bytes each) after this.
42    const __m128i* row_to_filter =
43        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
44
45    // We will load and accumulate with four coefficients per iteration.
46    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
47
48      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
49      __m128i coeff, coeff16;
50      // [16] xx xx xx xx c3 c2 c1 c0
51      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
52      // [16] xx xx xx xx c1 c1 c0 c0
53      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
54      // [16] c1 c1 c1 c1 c0 c0 c0 c0
55      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
56
57      // Load four pixels => unpack the first two pixels to 16 bits =>
58      // multiply with coefficients => accumulate the convolution result.
59      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
60      __m128i src8 = _mm_loadu_si128(row_to_filter);
61      // [16] a1 b1 g1 r1 a0 b0 g0 r0
62      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
63      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
64      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
65      // [32]  a0*c0 b0*c0 g0*c0 r0*c0
66      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
67      accum = _mm_add_epi32(accum, t);
68      // [32]  a1*c1 b1*c1 g1*c1 r1*c1
69      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
70      accum = _mm_add_epi32(accum, t);
71
72      // Duplicate 3rd and 4th coefficients for all channels =>
73      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
74      // => accumulate the convolution results.
75      // [16] xx xx xx xx c3 c3 c2 c2
76      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
77      // [16] c3 c3 c3 c3 c2 c2 c2 c2
78      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
79      // [16] a3 g3 b3 r3 a2 g2 b2 r2
80      src16 = _mm_unpackhi_epi8(src8, zero);
81      mul_hi = _mm_mulhi_epi16(src16, coeff16);
82      mul_lo = _mm_mullo_epi16(src16, coeff16);
83      // [32]  a2*c2 b2*c2 g2*c2 r2*c2
84      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
85      accum = _mm_add_epi32(accum, t);
86      // [32]  a3*c3 b3*c3 g3*c3 r3*c3
87      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
88      accum = _mm_add_epi32(accum, t);
89
90      // Advance the pixel and coefficients pointers.
91      row_to_filter += 1;
92      filter_values += 4;
93    }
94
95    // When |filter_length| is not divisible by 4, we need to decimate some of
96    // the filter coefficient that was loaded incorrectly to zero; Other than
97    // that the algorithm is same with above, exceot that the 4th pixel will be
98    // always absent.
99    int r = filter_length&3;
100    if (r) {
101      // Note: filter_values must be padded to align_up(filter_offset, 8).
102      __m128i coeff, coeff16;
103      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
104      // Mask out extra filter taps.
105      coeff = _mm_and_si128(coeff, mask[r]);
106      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
107      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
108
109      // Note: line buffer must be padded to align_up(filter_offset, 16).
110      // We resolve this by use C-version for the last horizontal line.
111      __m128i src8 = _mm_loadu_si128(row_to_filter);
112      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
113      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
114      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
115      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
116      accum = _mm_add_epi32(accum, t);
117      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
118      accum = _mm_add_epi32(accum, t);
119
120      src16 = _mm_unpackhi_epi8(src8, zero);
121      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
122      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
123      mul_hi = _mm_mulhi_epi16(src16, coeff16);
124      mul_lo = _mm_mullo_epi16(src16, coeff16);
125      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
126      accum = _mm_add_epi32(accum, t);
127    }
128
129    // Shift right for fixed point implementation.
130    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
131
132    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
133    accum = _mm_packs_epi32(accum, zero);
134    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
135    accum = _mm_packus_epi16(accum, zero);
136
137    // Store the pixel value of 32 bits.
138    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
139    out_row += 4;
140  }
141}
142
143// Convolves horizontally along four rows. The row data is given in
144// |src_data| and continues for the num_values() of the filter.
145// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
146// refer to that function for detailed comments.
147void Convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
148                                    const ConvolutionFilter1D& filter,
149                                    unsigned char* out_row[4]) {
150  int num_values = filter.num_values();
151
152  int filter_offset, filter_length;
153  __m128i zero = _mm_setzero_si128();
154  __m128i mask[4];
155  // |mask| will be used to decimate all extra filter coefficients that are
156  // loaded by SIMD when |filter_length| is not divisible by 4.
157  // mask[0] is not used in following algorithm.
158  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
159  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
160  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
161
162  // Output one pixel each iteration, calculating all channels (RGBA) together.
163  for (int out_x = 0; out_x < num_values; out_x++) {
164    const ConvolutionFilter1D::Fixed* filter_values =
165        filter.FilterForValue(out_x, &filter_offset, &filter_length);
166
167    // four pixels in a column per iteration.
168    __m128i accum0 = _mm_setzero_si128();
169    __m128i accum1 = _mm_setzero_si128();
170    __m128i accum2 = _mm_setzero_si128();
171    __m128i accum3 = _mm_setzero_si128();
172    int start = (filter_offset<<2);
173    // We will load and accumulate with four coefficients per iteration.
174    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
175      __m128i coeff, coeff16lo, coeff16hi;
176      // [16] xx xx xx xx c3 c2 c1 c0
177      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
178      // [16] xx xx xx xx c1 c1 c0 c0
179      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
180      // [16] c1 c1 c1 c1 c0 c0 c0 c0
181      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
182      // [16] xx xx xx xx c3 c3 c2 c2
183      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
184      // [16] c3 c3 c3 c3 c2 c2 c2 c2
185      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
186
187      __m128i src8, src16, mul_hi, mul_lo, t;
188
189#define ITERATION(src, accum)                                          \
190      src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
191      src16 = _mm_unpacklo_epi8(src8, zero);                           \
192      mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
193      mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
194      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
195      accum = _mm_add_epi32(accum, t);                                 \
196      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
197      accum = _mm_add_epi32(accum, t);                                 \
198      src16 = _mm_unpackhi_epi8(src8, zero);                           \
199      mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
200      mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
201      t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
202      accum = _mm_add_epi32(accum, t);                                 \
203      t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
204      accum = _mm_add_epi32(accum, t)
205
206      ITERATION(src_data[0] + start, accum0);
207      ITERATION(src_data[1] + start, accum1);
208      ITERATION(src_data[2] + start, accum2);
209      ITERATION(src_data[3] + start, accum3);
210
211      start += 16;
212      filter_values += 4;
213    }
214
215    int r = filter_length & 3;
216    if (r) {
217      // Note: filter_values must be padded to align_up(filter_offset, 8);
218      __m128i coeff;
219      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
220      // Mask out extra filter taps.
221      coeff = _mm_and_si128(coeff, mask[r]);
222
223      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
224      /* c1 c1 c1 c1 c0 c0 c0 c0 */
225      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
226      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
227      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
228
229      __m128i src8, src16, mul_hi, mul_lo, t;
230
231      ITERATION(src_data[0] + start, accum0);
232      ITERATION(src_data[1] + start, accum1);
233      ITERATION(src_data[2] + start, accum2);
234      ITERATION(src_data[3] + start, accum3);
235    }
236
237    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
238    accum0 = _mm_packs_epi32(accum0, zero);
239    accum0 = _mm_packus_epi16(accum0, zero);
240    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
241    accum1 = _mm_packs_epi32(accum1, zero);
242    accum1 = _mm_packus_epi16(accum1, zero);
243    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
244    accum2 = _mm_packs_epi32(accum2, zero);
245    accum2 = _mm_packus_epi16(accum2, zero);
246    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
247    accum3 = _mm_packs_epi32(accum3, zero);
248    accum3 = _mm_packus_epi16(accum3, zero);
249
250    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
251    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
252    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
253    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
254
255    out_row[0] += 4;
256    out_row[1] += 4;
257    out_row[2] += 4;
258    out_row[3] += 4;
259  }
260}
261
262// Does vertical convolution to produce one output row. The filter values and
263// length are given in the first two parameters. These are applied to each
264// of the rows pointed to in the |source_data_rows| array, with each row
265// being |pixel_width| wide.
266//
267// The output must have room for |pixel_width * 4| bytes.
268template<bool has_alpha>
269void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
270                             int filter_length,
271                             unsigned char* const* source_data_rows,
272                             int pixel_width,
273                             unsigned char* out_row) {
274  int width = pixel_width & ~3;
275
276  __m128i zero = _mm_setzero_si128();
277  __m128i accum0, accum1, accum2, accum3, coeff16;
278  const __m128i* src;
279  // Output four pixels per iteration (16 bytes).
280  for (int out_x = 0; out_x < width; out_x += 4) {
281
282    // Accumulated result for each pixel. 32 bits per RGBA channel.
283    accum0 = _mm_setzero_si128();
284    accum1 = _mm_setzero_si128();
285    accum2 = _mm_setzero_si128();
286    accum3 = _mm_setzero_si128();
287
288    // Convolve with one filter coefficient per iteration.
289    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
290
291      // Duplicate the filter coefficient 8 times.
292      // [16] cj cj cj cj cj cj cj cj
293      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
294
295      // Load four pixels (16 bytes) together.
296      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
297      src = reinterpret_cast<const __m128i*>(
298          &source_data_rows[filter_y][out_x << 2]);
299      __m128i src8 = _mm_loadu_si128(src);
300
301      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
302      // multiply with current coefficient => accumulate the result.
303      // [16] a1 b1 g1 r1 a0 b0 g0 r0
304      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
305      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
306      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
307      // [32] a0 b0 g0 r0
308      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
309      accum0 = _mm_add_epi32(accum0, t);
310      // [32] a1 b1 g1 r1
311      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
312      accum1 = _mm_add_epi32(accum1, t);
313
314      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
315      // multiply with current coefficient => accumulate the result.
316      // [16] a3 b3 g3 r3 a2 b2 g2 r2
317      src16 = _mm_unpackhi_epi8(src8, zero);
318      mul_hi = _mm_mulhi_epi16(src16, coeff16);
319      mul_lo = _mm_mullo_epi16(src16, coeff16);
320      // [32] a2 b2 g2 r2
321      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
322      accum2 = _mm_add_epi32(accum2, t);
323      // [32] a3 b3 g3 r3
324      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
325      accum3 = _mm_add_epi32(accum3, t);
326    }
327
328    // Shift right for fixed point implementation.
329    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
330    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
331    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
332    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
333
334    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
335    // [16] a1 b1 g1 r1 a0 b0 g0 r0
336    accum0 = _mm_packs_epi32(accum0, accum1);
337    // [16] a3 b3 g3 r3 a2 b2 g2 r2
338    accum2 = _mm_packs_epi32(accum2, accum3);
339
340    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
341    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
342    accum0 = _mm_packus_epi16(accum0, accum2);
343
344    if (has_alpha) {
345      // Compute the max(ri, gi, bi) for each pixel.
346      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
347      __m128i a = _mm_srli_epi32(accum0, 8);
348      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
349      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
350      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
351      a = _mm_srli_epi32(accum0, 16);
352      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
353      b = _mm_max_epu8(a, b);  // Max of r and g and b.
354      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
355      b = _mm_slli_epi32(b, 24);
356
357      // Make sure the value of alpha channel is always larger than maximum
358      // value of color channels.
359      accum0 = _mm_max_epu8(b, accum0);
360    } else {
361      // Set value of alpha channels to 0xFF.
362      __m128i mask = _mm_set1_epi32(0xff000000);
363      accum0 = _mm_or_si128(accum0, mask);
364    }
365
366    // Store the convolution result (16 bytes) and advance the pixel pointers.
367    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
368    out_row += 16;
369  }
370
371  // When the width of the output is not divisible by 4, We need to save one
372  // pixel (4 bytes) each time. And also the fourth pixel is always absent.
373  if (pixel_width & 3) {
374    accum0 = _mm_setzero_si128();
375    accum1 = _mm_setzero_si128();
376    accum2 = _mm_setzero_si128();
377    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
378      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
379      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
380      src = reinterpret_cast<const __m128i*>(
381          &source_data_rows[filter_y][width<<2]);
382      __m128i src8 = _mm_loadu_si128(src);
383      // [16] a1 b1 g1 r1 a0 b0 g0 r0
384      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
385      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
386      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
387      // [32] a0 b0 g0 r0
388      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
389      accum0 = _mm_add_epi32(accum0, t);
390      // [32] a1 b1 g1 r1
391      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
392      accum1 = _mm_add_epi32(accum1, t);
393      // [16] a3 b3 g3 r3 a2 b2 g2 r2
394      src16 = _mm_unpackhi_epi8(src8, zero);
395      mul_hi = _mm_mulhi_epi16(src16, coeff16);
396      mul_lo = _mm_mullo_epi16(src16, coeff16);
397      // [32] a2 b2 g2 r2
398      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
399      accum2 = _mm_add_epi32(accum2, t);
400    }
401
402    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
403    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
404    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
405    // [16] a1 b1 g1 r1 a0 b0 g0 r0
406    accum0 = _mm_packs_epi32(accum0, accum1);
407    // [16] a3 b3 g3 r3 a2 b2 g2 r2
408    accum2 = _mm_packs_epi32(accum2, zero);
409    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
410    accum0 = _mm_packus_epi16(accum0, accum2);
411    if (has_alpha) {
412      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
413      __m128i a = _mm_srli_epi32(accum0, 8);
414      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
415      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
416      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
417      a = _mm_srli_epi32(accum0, 16);
418      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
419      b = _mm_max_epu8(a, b);  // Max of r and g and b.
420      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
421      b = _mm_slli_epi32(b, 24);
422      accum0 = _mm_max_epu8(b, accum0);
423    } else {
424      __m128i mask = _mm_set1_epi32(0xff000000);
425      accum0 = _mm_or_si128(accum0, mask);
426    }
427
428    for (int out_x = width; out_x < pixel_width; out_x++) {
429      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
430      accum0 = _mm_srli_si128(accum0, 4);
431      out_row += 4;
432    }
433  }
434}
435
436void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
437                             int filter_length,
438                             unsigned char* const* source_data_rows,
439                             int pixel_width,
440                             unsigned char* out_row,
441                             bool has_alpha) {
442  if (has_alpha) {
443    ConvolveVertically_SSE2<true>(filter_values,
444                                  filter_length,
445                                  source_data_rows,
446                                  pixel_width,
447                                  out_row);
448  } else {
449    ConvolveVertically_SSE2<false>(filter_values,
450                                   filter_length,
451                                   source_data_rows,
452                                   pixel_width,
453                                   out_row);
454  }
455}
456
457}  // namespace skia
458