/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <tmmintrin.h>  // SSSE3
#include "SkBitmapProcState_opts_SSSE3.h"
#include "SkUtils.h"

// Adding an anonymous namespace seemed to force gcc to inline the
// instantiations directly, instead of creating the functions
// S32_generic_D32_filter_DX_SSSE3<true> and
// S32_generic_D32_filter_DX_SSSE3<false>, which were then called by the
// external functions.
namespace {
// In this file, the variations for the alpha and non-alpha versions are
// implemented with a template, as it makes the code more compact and a bit
// easier to maintain, while making the compiler generate the same exact code
// as with two functions that only differ by a few lines.


// Prepare all necessary constants for a round of processing for two pixel
// pairs.
// @param xy is the location where the xy parameters for four pixels should be
//           read from. It is identical in concept to argument two of the
//           S32_{opaque}_D32_filter_DX methods.
// @param mask_3FFF vector of 32 bit constants containing 3FFF,
//                  suitable to mask the bottom 14 bits of an XY value.
// @param mask_000F vector of 32 bit constants containing 000F,
//                  suitable to mask the bottom 4 bits of an XY value.
// @param sixteen_8bit vector of 8 bit components containing the value 16.
// @param mask_dist_select vector of 8 bit components containing the shuffling
//                         parameters to reorder x[0-3] parameters.
// @param all_x_result vector of 8 bit components that will contain
//              (4x(x3), 4x(x2), 4x(x1), 4x(x0)) upon return.
// @param sixteen_minus_x vector of 8 bit components, containing
//              (4x(16 - x3), 4x(16 - x2), 4x(16 - x1), 4x(16 - x0))
inline void PrepareConstantsTwoPixelPairs(const uint32_t* xy,
                                          const __m128i& mask_3FFF,
                                          const __m128i& mask_000F,
                                          const __m128i& sixteen_8bit,
                                          const __m128i& mask_dist_select,
                                          __m128i* all_x_result,
                                          __m128i* sixteen_minus_x,
                                          int* x0,
                                          int* x1) {
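    // Each 32 bit entry of xy packs one sample as x0:14 | 4 bit x fraction | x1:14,
    // which is decoded below.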
    const __m128i xx = _mm_loadu_si128(reinterpret_cast<const __m128i *>(xy));

    // 4 delta X
    // (x03, x02, x01, x00)
    const __m128i x0_wide = _mm_srli_epi32(xx, 18);
    // (x13, x12, x11, x10)
    const __m128i x1_wide = _mm_and_si128(xx, mask_3FFF);

    _mm_storeu_si128(reinterpret_cast<__m128i *>(x0), x0_wide);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(x1), x1_wide);

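    // Extract the 4 bit x fractions from bits 14..17 of each entry.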
    __m128i all_x = _mm_and_si128(_mm_srli_epi32(xx, 14), mask_000F);

    // (4x(x3), 4x(x2), 4x(x1), 4x(x0))
    all_x = _mm_shuffle_epi8(all_x, mask_dist_select);

    *all_x_result = all_x;
    // (4x(16-x3), 4x(16-x2), 4x(16-x1), 4x(16-x0))
    *sixteen_minus_x = _mm_sub_epi8(sixteen_8bit, all_x);
}

// Helper function used when processing one pixel pair.
// @param pixel0..3 are the four input pixels
// @param scale_x vector of 8 bit components to multiply pixel[0:3] by. This
//                will contain (4x(x1, 16-x1), 4x(x0, 16-x0))
//                or (4x(x3, 16-x3), 4x(x2, 16-x2))
// @return a vector of 16 bit components containing:
// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
inline __m128i ProcessPixelPairHelper(uint32_t pixel0,
                                      uint32_t pixel1,
                                      uint32_t pixel2,
                                      uint32_t pixel3,
                                      const __m128i& scale_x) {
    __m128i a0, a1, a2, a3;
    // Load 2 pairs of pixels
    a0 = _mm_cvtsi32_si128(pixel0);
    a1 = _mm_cvtsi32_si128(pixel1);

    // Interleave pixels.
    // (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
    a0 = _mm_unpacklo_epi8(a0, a1);

    a2 = _mm_cvtsi32_si128(pixel2);
    a3 = _mm_cvtsi32_si128(pixel3);
    // (0, 0, 0, 0, 0, 0, 0, 0, Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2)
    a2 = _mm_unpacklo_epi8(a2, a3);

    // The two pixel pairs, interleaved.
    // (Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2,
    //  Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
    a0 = _mm_unpacklo_epi64(a0, a2);

    // Multiply and sum to 16 bit components.
    // (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
    // At this point, we use up a bit less than 12 bits for each 16 bit
    // component:
    // All components are at most 255, so
    // C0 * (16 - x) + C1 * x <= 255 * (16 - x) + 255 * x = 255 * 16.
    return _mm_maddubs_epi16(a0, scale_x);
}

// Scale back the results after multiplications to the [0:255] range, and scale
// by alpha when has_alpha is true.
// Depending on whether one set or two sets of multiplications had been applied,
// the results have to be shifted by four places (dividing by 16), or shifted
// by eight places (dividing by 256), since each multiplication is by a quantity
// in the range [0:16].
template<bool has_alpha, int scale>
inline __m128i ScaleFourPixels(__m128i* pixels,
                               const __m128i& alpha) {
    // Divide each 16 bit component by 16 (or 256 depending on scale).
    *pixels = _mm_srli_epi16(*pixels, scale);

    if (has_alpha) {
        // Multiply by alpha.
        *pixels = _mm_mullo_epi16(*pixels, alpha);

        // Divide each 16 bit component by 256.
        *pixels = _mm_srli_epi16(*pixels, 8);
    }
    return *pixels;
}

// Wrapper to calculate two output pixels from four input pixels. The
// arguments are the same as ProcessPixelPairHelper. Technically, there are
// eight input pixels, but since sub_y == 0, the factor applied to half of the
// pixels is zero (sub_y), so those pixels are omitted here to save on some
// processing.
// @param alpha when has_alpha is true, scale all resulting components by this
//              value.
// @return a vector of 16 bit components containing:
// ((Aa2 * (16 - x1) + Aa3 * x1) * alpha, ...,
// (Ra0 * (16 - x0) + Ra1 * x0) * alpha) (when has_alpha is true)
// otherwise
// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
// In both cases, the results are renormalized (divided by 16) to match the
// expected formats when storing back the results into memory.
template<bool has_alpha>
inline __m128i ProcessPixelPairZeroSubY(uint32_t pixel0,
                                        uint32_t pixel1,
                                        uint32_t pixel2,
                                        uint32_t pixel3,
                                        const __m128i& scale_x,
                                        const __m128i& alpha) {
    __m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3,
                                         scale_x);
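    // Only one multiplication by a [0:16] weight (x) has been applied, so
    // renormalize by 4 bits here rather than 8.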
    return ScaleFourPixels<has_alpha, 4>(&sum, alpha);
}

// Same as ProcessPixelPairZeroSubY, except processing one output pixel at a
// time instead of two. As in the above function, only two pixels are needed
// to generate a single pixel since sub_y == 0.
// @return same as ProcessPixelPairZeroSubY, except that only the bottom 4
// 16 bit components are set.
template<bool has_alpha>
inline __m128i ProcessOnePixelZeroSubY(uint32_t pixel0,
                                       uint32_t pixel1,
                                       __m128i scale_x,
                                       __m128i alpha) {
    __m128i a0 = _mm_cvtsi32_si128(pixel0);
    __m128i a1 = _mm_cvtsi32_si128(pixel1);

    // Interleave
    a0 = _mm_unpacklo_epi8(a0, a1);

    // (a0 * (16-x) + a1 * x)
    __m128i sum = _mm_maddubs_epi16(a0, scale_x);

    return ScaleFourPixels<has_alpha, 4>(&sum, alpha);
}

// Methods when sub_y != 0


// Same as ProcessPixelPairHelper, except that the values are scaled by y.
// @param y vector of 16 bit components containing 'y' values. In practice
//        there are two cases: y will contain either the sub_y constant or
//        the 16 - sub_y constant.
// @return vector of 16 bit components containing:
// (y * (Aa2 * (16 - x1) + Aa3 * x1), ... , y * (Ra0 * (16 - x0) + Ra1 * x0))
inline __m128i ProcessPixelPair(uint32_t pixel0,
                                uint32_t pixel1,
                                uint32_t pixel2,
                                uint32_t pixel3,
                                const __m128i& scale_x,
                                const __m128i& y) {
    __m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3,
                                         scale_x);

    // first row times 16-y or y depending on whether 'y' represents one or
    // the other.
    // Values will be up to 255 * 16 * 16 = 65280.
    // (y * (Aa2 * (16 - x1) + Aa3 * x1), ... ,
    //  y * (Ra0 * (16 - x0) + Ra1 * x0))
    sum = _mm_mullo_epi16(sum, y);

    return sum;
}

// Process two pixel pairs out of eight input pixels.
// In other methods, the distinct pixels are passed one by one, but in this
// case, the rows and the index offsets into the rows are passed in and used
// to fetch the eight input pixels.
// @param row0..1 top and bottom rows in which to find the input pixels.
// @param x0..1 offsets into the rows for all eight input pixels.
// @param all_y vector of 16 bit components containing the constant sub_y
// @param neg_y vector of 16 bit components containing the constant 16 - sub_y
// @param alpha vector of 16 bit components containing the alpha value to scale
//        the results by, when has_alpha is true.
// @return
// (alpha * ((16-y) * (Aa2  * (16-x1) + Aa3  * x1) +
//             y    * (Aa2' * (16-x1) + Aa3' * x1)),
// ...
//  alpha * ((16-y) * (Ra0  * (16-x0) + Ra1 * x0) +
//             y    * (Ra0' * (16-x0) + Ra1' * x0))
// With the factor alpha removed when has_alpha is false.
// The values are scaled back to 16 bit components, but with only the bottom
// 8 bits being set.
template<bool has_alpha>
inline __m128i ProcessTwoPixelPairs(const uint32_t* row0,
                                    const uint32_t* row1,
                                    const int* x0,
                                    const int* x1,
                                    const __m128i& scale_x,
                                    const __m128i& all_y,
                                    const __m128i& neg_y,
                                    const __m128i& alpha) {
    __m128i sum0 = ProcessPixelPair(
        row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]],
        scale_x, neg_y);
    __m128i sum1 = ProcessPixelPair(
        row1[x0[0]], row1[x1[0]], row1[x0[1]], row1[x1[1]],
        scale_x, all_y);

    // 2 samples fully summed.
    // ((16-y) * (Aa2 * (16-x1) + Aa3 * x1) +
    //    y    * (Aa2' * (16-x1) + Aa3' * x1),
    // ...
    //  (16-y) * (Ra0 * (16-x0) + Ra1 * x0) +
    //    y    * (Ra0' * (16-x0) + Ra1' * x0))
    // Each component, again, can be at most 256 * 255 = 65280, so no overflow.
    sum0 = _mm_add_epi16(sum0, sum1);

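    // Both a [0:16] x weight and a [0:16] y weight have been applied, so
    // renormalize by 8 bits.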
    return ScaleFourPixels<has_alpha, 8>(&sum0, alpha);
}


// Same as ProcessPixelPair, except that it performs the math one output pixel
// at a time. This means that only the bottom four 16 bit components are set.
inline __m128i ProcessOnePixel(uint32_t pixel0, uint32_t pixel1,
                               const __m128i& scale_x, const __m128i& y) {
    __m128i a0 = _mm_cvtsi32_si128(pixel0);
    __m128i a1 = _mm_cvtsi32_si128(pixel1);

    // Interleave
    // (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
    a0 = _mm_unpacklo_epi8(a0, a1);

    // (a0 * (16-x) + a1 * x)
    a0 = _mm_maddubs_epi16(a0, scale_x);

    // scale row by y
    return _mm_mullo_epi16(a0, y);
}

// Notes about the various tricks that are used in this implementation:
// - specialization for sub_y == 0.
// Statistically, 1/16th of the samples will have sub_y == 0. When this
// happens, the math goes from:
// (16 - x)*(16 - y)*a00 + x*(16 - y)*a01 + (16 - x)*y*a10 + x*y*a11
// to:
// (16 - x)*a00 + x*a01
// which is much simpler. The simplification makes for an easy boost in
// performance.
// - calculating 4 output pixels at a time.
//  This allows loading the coefficients x0 and x1 and shuffling them to the
// optimum location only once per loop, instead of twice per loop.
// This also allows us to store the four pixels with a single store.
// - Use of 2 special SSSE3 instructions (compared to the SSE2 version):
// _mm_shuffle_epi8 : this allows us to spread the coefficients x[0-3] loaded
// in 32 bit values to 8 bit values repeated four times.
// _mm_maddubs_epi16 : this allows us to perform multiplications and additions
// in one swoop on 8 bit values, storing the results in 16 bit values. This
// instruction is actually crucial for the speed of the implementation since,
// as one can see in the SSE2 implementation, all inputs have to be used as
// 16 bit values because the results are 16 bits. This basically allows us to
// process twice as many pixel components per iteration.
//
// As a result, this method performs faster than the traditional SSE2 version.
// The actual boost varies greatly with the underlying architecture.
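// As a concrete example of the sub_y == 0 case: with x = 4, the weights
// (16 - x, x) = (12, 4) give 12*a00 + 4*a01 per channel, renormalized by a
// shift of 4 instead of the shift of 8 used in the general case.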
template<bool has_alpha>
void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
                                     const uint32_t* xy,
                                     int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != NULL);
    SkASSERT(s.fDoFilter);
    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
    if (has_alpha) {
        SkASSERT(s.fAlphaScale < 256);
    } else {
        SkASSERT(s.fAlphaScale == 256);
    }

    const uint8_t* src_addr =
            static_cast<const uint8_t*>(s.fBitmap->getPixels());
    const unsigned rb = s.fBitmap->rowBytes();
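    // The first entry of xy packs the vertical parameters (the two row
    // indices and the 4 bit sub-pixel y) in the same 14 | 4 | 14 bit layout
    // as the x entries decoded in the loops below.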
    const uint32_t XY = *xy++;
    const unsigned y0 = XY >> 14;
    const uint32_t* row0 =
            reinterpret_cast<const uint32_t*>(src_addr + (y0 >> 4) * rb);
    const uint32_t* row1 =
            reinterpret_cast<const uint32_t*>(src_addr + (XY & 0x3FFF) * rb);
    const unsigned sub_y = y0 & 0xF;

    // vector constants
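    // mask_dist_select replicates bytes 0, 4, 8 and 12 of its source four
    // times each, spreading the four x fractions into
    // (4x(x3), 4x(x2), 4x(x1), 4x(x0)).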
    const __m128i mask_dist_select = _mm_set_epi8(12, 12, 12, 12,
                                                  8,  8,  8,  8,
                                                  4,  4,  4,  4,
                                                  0,  0,  0,  0);
    const __m128i mask_3FFF = _mm_set1_epi32(0x3FFF);
    const __m128i mask_000F = _mm_set1_epi32(0x000F);
    const __m128i sixteen_8bit = _mm_set1_epi8(16);
    // (0, 0, 0, 0, 0, 0, 0, 0)
    const __m128i zero = _mm_setzero_si128();

    __m128i alpha;
    if (has_alpha)
        // 8x(alpha)
        alpha = _mm_set1_epi16(s.fAlphaScale);

    if (sub_y == 0) {
        // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
        while (count > 3) {
            count -= 4;

            int x0[4];
            int x1[4];
            __m128i all_x, sixteen_minus_x;
            PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
                                          sixteen_8bit, mask_dist_select,
                                          &all_x, &sixteen_minus_x, x0, x1);
            xy += 4;

            // First pair of pixel pairs.
            // (4x(x1, 16-x1), 4x(x0, 16-x0))
            __m128i scale_x;
            scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);

            __m128i sum0 = ProcessPixelPairZeroSubY<has_alpha>(
                row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]],
                scale_x, alpha);

            // Second pair of pixel pairs.
            // (4x(x3, 16-x3), 4x(x2, 16-x2))
            scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);

            __m128i sum1 = ProcessPixelPairZeroSubY<has_alpha>(
                row0[x0[2]], row0[x1[2]], row0[x0[3]], row0[x1[3]],
                scale_x, alpha);

            // Pack the 16 bit components of sum0 and sum1 to 8 bit,
            // giving the four output pixels.
            sum0 = _mm_packus_epi16(sum0, sum1);

            // Store the four pixels with a single store.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0);

            colors += 4;
        }

        // handle remainder
        while (count-- > 0) {
            uint32_t xx = *xy++;  // x0:14 | 4 | x1:14
            unsigned x0 = xx >> 18;
            unsigned x1 = xx & 0x3FFF;

            // 16x(x)
            const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F);

            // (16x(16-x))
            __m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);

            scale_x = _mm_unpacklo_epi8(scale_x, all_x);

            __m128i sum = ProcessOnePixelZeroSubY<has_alpha>(
                row0[x0], row0[x1],
                scale_x, alpha);

            // Pack lower 4 16 bit values of sum into lower 4 bytes.
            sum = _mm_packus_epi16(sum, zero);

            // Extract low int and store.
            *colors++ = _mm_cvtsi128_si32(sum);
        }
    } else {  // more general case, y != 0
        // 8x(16)
        const __m128i sixteen_16bit = _mm_set1_epi16(16);

        // 8x (y)
        const __m128i all_y = _mm_set1_epi16(sub_y);

        // 8x (16-y)
        const __m128i neg_y = _mm_sub_epi16(sixteen_16bit, all_y);

        // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
        while (count > 3) {
            count -= 4;

            int x0[4];
            int x1[4];
            __m128i all_x, sixteen_minus_x;
            PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
                                          sixteen_8bit, mask_dist_select,
                                          &all_x, &sixteen_minus_x, x0, x1);
            xy += 4;

            // First pair of pixel pairs
            // (4x(x1, 16-x1), 4x(x0, 16-x0))
            __m128i scale_x;
            scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);

            __m128i sum0 = ProcessTwoPixelPairs<has_alpha>(
                row0, row1, x0, x1,
                scale_x, all_y, neg_y, alpha);

            // Second pair of pixel pairs
            // (4x(x3, 16-x3), 4x(x2, 16-x2))
            scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);

            __m128i sum1 = ProcessTwoPixelPairs<has_alpha>(
                row0, row1, x0 + 2, x1 + 2,
                scale_x, all_y, neg_y, alpha);

            // Do the final packing of the two results.

            // Pack the 16 bit components of sum0 and sum1 to 8 bit,
            // giving the four output pixels.
            sum0 = _mm_packus_epi16(sum0, sum1);

            // Store the four pixels with a single store.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0);

            colors += 4;
        }

        // Left over.
        while (count-- > 0) {
            const uint32_t xx = *xy++;  // x0:14 | 4 | x1:14
            const unsigned x0 = xx >> 18;
            const unsigned x1 = xx & 0x3FFF;

            // 16x(x)
            const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F);

            // 16x (16-x)
            __m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);

            // (8x (x, 16-x))
            scale_x = _mm_unpacklo_epi8(scale_x, all_x);

            // first row.
            __m128i sum0 = ProcessOnePixel(row0[x0], row0[x1], scale_x, neg_y);
            // second row.
            __m128i sum1 = ProcessOnePixel(row1[x0], row1[x1], scale_x, all_y);

            // Add both rows for full sample
            sum0 = _mm_add_epi16(sum0, sum1);

            sum0 = ScaleFourPixels<has_alpha, 8>(&sum0, alpha);

            // Pack lower 4 16 bit values of sum into lower 4 bytes.
            sum0 = _mm_packus_epi16(sum0, zero);

            // Extract low int and store.
            *colors++ = _mm_cvtsi128_si32(sum0);
        }
    }
}
}  // namespace

void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
                                    const uint32_t* xy,
                                    int count, uint32_t* colors) {
    S32_generic_D32_filter_DX_SSSE3<false>(s, xy, count, colors);
}

void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
                                   const uint32_t* xy,
                                   int count, uint32_t* colors) {
    S32_generic_D32_filter_DX_SSSE3<true>(s, xy, count, colors);
}