1/*
2 * Copyright 2015 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef SkBlurImageFilter_opts_DEFINED
9#define SkBlurImageFilter_opts_DEFINED
10
11#include "SkColorPriv.h"
12#include "SkRect.h"
13
14#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
15    #include <immintrin.h>
16#endif
17
18namespace SK_OPTS_NS {
19
// Names the axis a blur pass walks along.  box_blur takes one value for how the source is
// read and one for how the destination is written: kX advances one pixel at a time, kY
// advances by a whole stride (which lets a pass write its output transposed).
enum class BlurDirection { kX, kY };
21
22#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
23#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
24// ARGB -> 000A 000R 000G 000B
25static inline __m128i expand(SkPMColor p) {
26    return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
27};
28// Axxx Rxxx Gxxx Bxxx -> ARGB
29static inline SkPMColor repack(__m128i p) {
30    const char _ = ~0;  // Don't care what ends up in these bytes.  This zeros them.
31    p = _mm_shuffle_epi8(p, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 15,11,7,3));
32    return _mm_cvtsi128_si32(p);
33};
// At this SSE level the 32-bit low multiply is available as an intrinsic, so the shared name
// used by STORE_SUMS maps straight onto it.
#define mullo_epi32 _mm_mullo_epi32
35
36#else
// ARGB -> 000A 000R 000G 000B
// Zero-extends each 8-bit channel of one packed pixel into its own 32-bit lane by
// interleaving with zero twice: bytes -> 16-bit words -> 32-bit dwords.
static inline __m128i expand(int p) {
    const __m128i zero  = _mm_setzero_si128();
    const __m128i bytes = _mm_cvtsi32_si128(p);
    const __m128i words = _mm_unpacklo_epi8(bytes, zero);
    return _mm_unpacklo_epi16(words, zero);
}
44// Axxx Rxxx Gxxx Bxxx -> ARGB
45static inline SkPMColor repack(__m128i p) {
46    p = _mm_srli_epi32(p, 24);  // 000A 000R 000G 000B
47    p = _mm_packs_epi32(p, p);  // xxxx xxxx 0A0R 0G0B
48    p = _mm_packus_epi16(p, p); // xxxx xxxx xxxx ARGB
49    return _mm_cvtsi128_si32(p);
50};
51
// _mm_mullo_epi32 is not available, so use the standard trick to emulate it:
// multiply the even and the odd lanes separately as 32x32->64, then interleave the
// low 32 bits of each product back into lane order.
static inline __m128i mullo_epi32(__m128i a, __m128i b) {
    const __m128i evenProducts = _mm_mul_epu32(a, b);
    const __m128i oddProducts  = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    const __m128i evenLow = _mm_shuffle_epi32(evenProducts, _MM_SHUFFLE(0,0,2,0));
    const __m128i oddLow  = _mm_shuffle_epi32(oddProducts,  _MM_SHUFFLE(0,0,2,0));
    return _mm_unpacklo_epi32(evenLow, oddLow);
}
60#endif
// Building blocks expanded inside box_blur on SSE2 and newer.  They expect `kernelSize` and
// `dptr` to be in scope where they are used, and they communicate through the locals
// `scale`, `half` and `sum` that they declare.
//
// scale: (1 << 24) / kernelSize in every lane, i.e. 1/kernelSize with 24 fractional bits.
// half:  1 << 23 in every lane, i.e. one half in that format; added before the top byte is kept.
// sum:   four 32-bit running totals, one per channel of the pixel (see expand()).
#define INIT_SCALE const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize);
#define INIT_HALF const __m128i half = _mm_set1_epi32(1 << 23);
#define INIT_SUMS __m128i sum = _mm_setzero_si128();
// Add / remove one pixel's channels to / from the running totals.
#define INCREMENT_SUMS(c) sum = _mm_add_epi32(sum, expand(c))
#define DECREMENT_SUMS(c) sum = _mm_sub_epi32(sum, expand(c))
// Writes (sum * scale + half) to *dptr, keeping only the top byte of each lane via repack().
#define STORE_SUMS \
    auto result = mullo_epi32(sum, scale); \
    result = _mm_add_epi32(result, half); \
    *dptr = repack(result);
// Intentionally empty: this path has no two-rows-at-a-time variant.
#define DOUBLE_ROW_OPTIMIZATION
71
72#elif defined(SK_ARM_HAS_NEON)
73
// Two-row building blocks expanded inside box_blur_double.  `sum` holds eight 16-bit lanes:
// the four channels of each of two pixels taken from adjacent rows.  The macros expect `sum`,
// `scale`, `dptr`, `width`, `dstDirection` and `load_2_pixels` to be in scope where expanded.
//
// val = (sum * scale * 2 + 0x8000) >> 16
// When the destination is written in X, the two result pixels go to dptr and dptr + width;
// otherwise both pixels are stored as eight contiguous bytes starting at dptr.
#define STORE_SUMS_DOUBLE \
    uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16( \
        vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale))); \
    if (dstDirection == BlurDirection::kX) { \
        uint32x2_t px2 = vreinterpret_u32_u8(vmovn_u16(resultPixels)); \
        vst1_lane_u32(dptr +     0, px2, 0); \
        vst1_lane_u32(dptr + width, px2, 1); \
    } else { \
        vst1_u8((uint8_t*)dptr, vmovn_u16(resultPixels)); \
    }

// Add / remove the two pixels found at p (one per row) to / from the 16-bit running totals.
#define INCREMENT_SUMS_DOUBLE(p) sum = vaddw_u8(sum, load_2_pixels(p))
#define DECREMENT_SUMS_DOUBLE(p) sum = vsubw_u8(sum, load_2_pixels(p))
88
// Fast path for kernel sizes between 2 and 127, working on two rows at a time.
//
// Runs the same sliding-window passes as box_blur, but keeps the running totals for two
// adjacent rows in the eight 16-bit lanes of a single vector.  Rows are consumed in pairs
// while at least two remain between srcBounds.top() and srcBounds.bottom(); *src and *dst
// are advanced past every pair that was processed.
//
// Returns the index of the first row that was not processed, so the caller can finish any
// leftover row with the single-row loop.
template<BlurDirection srcDirection, BlurDirection dstDirection>
static int box_blur_double(const SkPMColor** src, int srcStride, const SkIRect& srcBounds,
                           SkPMColor** dst, int kernelSize,
                           int leftOffset, int rightOffset, int width, int height) {
    // Load 2 pixels from adjacent rows.
    auto load_2_pixels = [&](const SkPMColor* s) {
        if (srcDirection == BlurDirection::kX) {
            // 10% faster by adding these 2 prefetches
            SK_PREFETCH(s + 16);
            SK_PREFETCH(s + 16 + srcStride);
            // The second row lives srcStride pixels away, so gather the pair lane by lane.
            auto one = vld1_lane_u32(s +         0, vdup_n_u32(0), 0),
                 two = vld1_lane_u32(s + srcStride,           one, 1);
            return vreinterpret_u8_u32(two);
        } else {
            // Reading in Y: the two pixels are adjacent in memory, so one 8-byte load suffices.
            return vld1_u8((uint8_t*)s);
        }
    };
    int left = srcBounds.left();
    int right = srcBounds.right();
    int top = srcBounds.top();
    int bottom = srcBounds.bottom();
    // x ranges over which pixels enter (increment) and leave (decrement) the running sum.
    int incrementStart = SkMax32(left - rightOffset - 1, left - right);
    int incrementEnd = SkMax32(right - rightOffset - 1, 0);
    int decrementStart = SkMin32(left + leftOffset, width);
    int decrementEnd = SkMin32(right + leftOffset, width);
    // "X" strides step along the blur axis, "Y" strides step to the next row being blurred.
    const int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride;
    const int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height;
    const int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1;
    const int dstStrideY = dstDirection == BlurDirection::kX ? width : 1;
    const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize);

    for (; bottom - top >= 2; top += 2) {
        uint16x8_t sum = vdupq_n_u16(0);
        const SkPMColor* lptr = *src;  // next pixel pair to leave the window
        const SkPMColor* rptr = *src;  // next pixel pair to enter the window
        SkPMColor* dptr = *dst;
        int x;
        // Pre-load the pixels that are already inside the window before the first output.
        for (x = incrementStart; x < 0; ++x) {
            INCREMENT_SUMS_DOUBLE(rptr);
            rptr += srcStrideX;
        }
        // Clear to zero when sampling to the left of our domain. "sum" is zero here because we
        // initialized it above, and the preceding loop has no effect in this case.
        for (x = 0; x < incrementStart; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
        }
        // Window is still growing: pixels enter, none leave yet.
        for (; x < decrementStart && x < incrementEnd; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
            INCREMENT_SUMS_DOUBLE(rptr);
            rptr += srcStrideX;
        }
        // Steady state: one pixel pair enters and one leaves per output.
        for (x = decrementStart; x < incrementEnd; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
            INCREMENT_SUMS_DOUBLE(rptr);
            rptr += srcStrideX;
            DECREMENT_SUMS_DOUBLE(lptr);
            lptr += srcStrideX;
        }
        // Nothing left to add and nothing to remove yet: the sum is constant.
        for (x = incrementEnd; x < decrementStart; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
        }
        // Window is shrinking: pixels leave, none enter.
        for (; x < decrementEnd; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
            DECREMENT_SUMS_DOUBLE(lptr);
            lptr += srcStrideX;
        }
        // Clear to zero when sampling to the right of our domain. "sum" is zero here because we
        // added on then subtracted off all of the pixels, leaving zero.
        for (; x < width; ++x) {
            STORE_SUMS_DOUBLE
            dptr += dstStrideX;
        }
        *src += srcStrideY * 2;
        *dst += dstStrideY * 2;
    }
    return top;
}
172
// ARGB -> 0A0R 0G0B
// Zero-extends each 8-bit channel of one packed pixel into its own 16-bit lane.
static inline uint16x4_t expand(SkPMColor p) {
    const uint8x8_t  bytes   = vreinterpret_u8_u32(vdup_n_u32(p));
    const uint16x8_t widened = vmovl_u8(bytes);
    return vget_low_u16(widened);
}
177
// Single-row NEON building blocks expanded inside box_blur.  They expect `kernelSize` and
// `dptr` to be in scope where they are used, and they communicate through the locals
// `scale`, `half` and `sum` that they declare.
//
// scale: (1 << 24) / kernelSize in every lane, i.e. 1/kernelSize with 24 fractional bits.
// half:  1 << 23 in every lane, i.e. one half in that format.
// sum:   four 32-bit running totals, one per channel of the pixel (see expand()).
#define INIT_SCALE const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);
#define INIT_HALF const uint32x4_t half = vdupq_n_u32(1 << 23);
#define INIT_SUMS uint32x4_t sum = vdupq_n_u32(0);
// Add / remove one pixel's channels to / from the running totals.  Like the SSE and portable
// variants these expand to a single expression with no trailing semicolon; the call site
// supplies it, so no stray empty statement is produced.
#define INCREMENT_SUMS(c) sum = vaddw_u16(sum, expand(c))
#define DECREMENT_SUMS(c) sum = vsubw_u16(sum, expand(c))

// Computes half + sum * scale, then narrows 32 -> 16 -> 8 bits with right shifts of 16 and 8
// (24 in total) so only the top byte of each lane survives, and stores the pixel to *dptr.
#define STORE_SUMS \
    uint32x4_t result = vmlaq_u32(half, sum, scale); \
    uint16x4_t result16 = vqshrn_n_u32(result, 16); \
    uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8); \
    vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0);

// For kernel sizes 2..127, let box_blur_double handle the rows in pairs first.  It advances
// `src` and `dst` itself and returns the first row it did not process, which becomes `top`.
#define DOUBLE_ROW_OPTIMIZATION \
    if (1 < kernelSize && kernelSize < 128) { \
        top = box_blur_double<srcDirection, dstDirection>(&src, srcStride, srcBounds, &dst, \
                                                          kernelSize, leftOffset, rightOffset, \
                                                          width, height); \
    }
196
197#else  // Neither NEON nor >=SSE2.
198
// Portable scalar building blocks expanded inside box_blur.  They expect `kernelSize` and
// `dptr` to be in scope where they are used, and they communicate through the locals
// `scale`, `half` and `sumA`/`sumR`/`sumG`/`sumB` that they declare.
//
// scale: (1 << 24) / kernelSize, i.e. 1/kernelSize with 24 fractional bits.
// half:  1 << 23, i.e. one half in that format; added before the >> 24.
#define INIT_SCALE uint32_t scale = (1 << 24) / kernelSize;
#define INIT_HALF  uint32_t half = 1 << 23;
#define INIT_SUMS int sumA = 0, sumR = 0, sumG = 0, sumB = 0;
// Add / remove one pixel's channels to / from the running totals.  Wrapped in
// do { } while (0) so each expands to exactly one statement (safe under an unbraced if/else),
// and the argument is read once into a local rather than being evaluated four times.
#define INCREMENT_SUMS(c) \
    do { \
        const SkPMColor incPixel_ = (c); \
        sumA += SkGetPackedA32(incPixel_); \
        sumR += SkGetPackedR32(incPixel_); \
        sumG += SkGetPackedG32(incPixel_); \
        sumB += SkGetPackedB32(incPixel_); \
    } while (0)
#define DECREMENT_SUMS(c) \
    do { \
        const SkPMColor decPixel_ = (c); \
        sumA -= SkGetPackedA32(decPixel_); \
        sumR -= SkGetPackedR32(decPixel_); \
        sumG -= SkGetPackedG32(decPixel_); \
        sumB -= SkGetPackedB32(decPixel_); \
    } while (0)
// Scales each channel total by 1/kernelSize with rounding and writes the packed pixel to *dptr.
#define STORE_SUMS \
    *dptr = SkPackARGB32((sumA * scale + half) >> 24, \
                         (sumR * scale + half) >> 24, \
                         (sumG * scale + half) >> 24, \
                         (sumB * scale + half) >> 24);
// Intentionally empty: this path has no two-rows-at-a-time variant.
#define DOUBLE_ROW_OPTIMIZATION
218
219#endif
220
// Issues a prefetch for the pixel rptr points at, but only when the source is being walked
// in Y (where each step of rptr jumps by srcStride rather than by one pixel).  Expanded in
// box_blur right after rptr is advanced, so it expects `rptr` and `srcDirection` in scope.
#define PREFETCH_RPTR \
    if (srcDirection == BlurDirection::kY) { \
        SK_PREFETCH(rptr); \
    }
225
226template<BlurDirection srcDirection, BlurDirection dstDirection>
227static void box_blur(const SkPMColor* src, int srcStride, const SkIRect& srcBounds, SkPMColor* dst,
228                     int kernelSize, int leftOffset, int rightOffset, int width, int height) {
229    int left = srcBounds.left();
230    int right = srcBounds.right();
231    int top = srcBounds.top();
232    int bottom = srcBounds.bottom();
233    int incrementStart = SkMax32(left - rightOffset - 1, left - right);
234    int incrementEnd = SkMax32(right - rightOffset - 1, 0);
235    int decrementStart = SkMin32(left + leftOffset, width);
236    int decrementEnd = SkMin32(right + leftOffset, width);
237    int srcStrideX = srcDirection == BlurDirection::kX ? 1 : srcStride;
238    int dstStrideX = dstDirection == BlurDirection::kX ? 1 : height;
239    int srcStrideY = srcDirection == BlurDirection::kX ? srcStride : 1;
240    int dstStrideY = dstDirection == BlurDirection::kX ? width : 1;
241    INIT_SCALE
242    INIT_HALF
243
244    // Clear to zero when sampling above our domain.
245    for (int y = 0; y < top; y++) {
246        SkColor* dptr = dst;
247        for (int x = 0; x < width; ++x) {
248            *dptr = 0;
249            dptr += dstStrideX;
250        }
251        dst += dstStrideY;
252    }
253
254    DOUBLE_ROW_OPTIMIZATION
255
256    for (int y = top; y < bottom; ++y) {
257        INIT_SUMS
258        const SkPMColor* lptr = src;
259        const SkPMColor* rptr = src;
260        SkColor* dptr = dst;
261        int x;
262        for (x = incrementStart; x < 0; ++x) {
263            INCREMENT_SUMS(*rptr);
264            rptr += srcStrideX;
265            PREFETCH_RPTR
266        }
267        // Clear to zero when sampling to the left of our domain.
268        for (x = 0; x < incrementStart; ++x) {
269            *dptr = 0;
270            dptr += dstStrideX;
271        }
272        for (; x < decrementStart && x < incrementEnd; ++x) {
273            STORE_SUMS
274            dptr += dstStrideX;
275            INCREMENT_SUMS(*rptr);
276            rptr += srcStrideX;
277            PREFETCH_RPTR
278        }
279        for (x = decrementStart; x < incrementEnd; ++x) {
280            STORE_SUMS
281            dptr += dstStrideX;
282            INCREMENT_SUMS(*rptr);
283            rptr += srcStrideX;
284            PREFETCH_RPTR
285            DECREMENT_SUMS(*lptr);
286            lptr += srcStrideX;
287        }
288        for (x = incrementEnd; x < decrementStart; ++x) {
289            STORE_SUMS
290            dptr += dstStrideX;
291        }
292        for (; x < decrementEnd; ++x) {
293            STORE_SUMS
294            dptr += dstStrideX;
295            DECREMENT_SUMS(*lptr);
296            lptr += srcStrideX;
297        }
298        // Clear to zero when sampling to the right of our domain.
299        for (; x < width; ++x) {
300            *dptr = 0;
301            dptr += dstStrideX;
302        }
303        src += srcStrideY;
304        dst += dstStrideY;
305    }
306    // Clear to zero when sampling below our domain.
307    for (int y = bottom; y < height; ++y) {
308        SkColor* dptr = dst;
309        for (int x = 0; x < width; ++x) {
310            *dptr = 0;
311            dptr += dstStrideX;
312        }
313        dst += dstStrideY;
314    }
315}
316
317static auto box_blur_xx = &box_blur<BlurDirection::kX, BlurDirection::kX>,
318            box_blur_xy = &box_blur<BlurDirection::kX, BlurDirection::kY>,
319            box_blur_yx = &box_blur<BlurDirection::kY, BlurDirection::kX>;
320
321}  // namespace SK_OPTS_NS
322
323#endif
324