SkBlitRow_D32.cpp revision 376e9bc206b69d9190f38dfebb132a8769bbd72b
1/*
2 * Copyright 2011 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBlitRow.h"
9#include "SkBlitMask.h"
10#include "SkColorPriv.h"
11#include "SkUtils.h"
12
13#define UNROLL
14
15static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
16                                 const SkPMColor* SK_RESTRICT src,
17                                 int count, U8CPU alpha) {
18    SkASSERT(255 == alpha);
19    sk_memcpy32(dst, src, count);
20}
21
22static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
23                                const SkPMColor* SK_RESTRICT src,
24                                int count, U8CPU alpha) {
25    SkASSERT(alpha <= 255);
26    if (count > 0) {
27        unsigned src_scale = SkAlpha255To256(alpha);
28        unsigned dst_scale = 256 - src_scale;
29
30#ifdef UNROLL
31        if (count & 1) {
32            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
33            dst += 1;
34            count -= 1;
35        }
36
37        const SkPMColor* SK_RESTRICT srcEnd = src + count;
38        while (src != srcEnd) {
39            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
40            dst += 1;
41            *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
42            dst += 1;
43        }
44#else
45        do {
46            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
47            src += 1;
48            dst += 1;
49        } while (--count > 0);
50#endif
51    }
52}
53
54static void S32A_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
55                                  const SkPMColor* SK_RESTRICT src,
56                                  int count, U8CPU alpha) {
57    SkASSERT(255 == alpha);
58    if (count > 0) {
59#ifdef UNROLL
60        if (count & 1) {
61            *dst = SkPMSrcOver(*(src++), *dst);
62            dst += 1;
63            count -= 1;
64        }
65
66        const SkPMColor* SK_RESTRICT srcEnd = src + count;
67        while (src != srcEnd) {
68            *dst = SkPMSrcOver(*(src++), *dst);
69            dst += 1;
70            *dst = SkPMSrcOver(*(src++), *dst);
71            dst += 1;
72        }
73#else
74        do {
75            *dst = SkPMSrcOver(*src, *dst);
76            src += 1;
77            dst += 1;
78        } while (--count > 0);
79#endif
80    }
81}
82
83static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
84                                 const SkPMColor* SK_RESTRICT src,
85                                 int count, U8CPU alpha) {
86    SkASSERT(alpha <= 255);
87    if (count > 0) {
88#ifdef UNROLL
89        if (count & 1) {
90            *dst = SkBlendARGB32(*(src++), *dst, alpha);
91            dst += 1;
92            count -= 1;
93        }
94
95        const SkPMColor* SK_RESTRICT srcEnd = src + count;
96        while (src != srcEnd) {
97            *dst = SkBlendARGB32(*(src++), *dst, alpha);
98            dst += 1;
99            *dst = SkBlendARGB32(*(src++), *dst, alpha);
100            dst += 1;
101        }
102#else
103        do {
104            *dst = SkBlendARGB32(*src, *dst, alpha);
105            src += 1;
106            dst += 1;
107        } while (--count > 0);
108#endif
109    }
110}
111
112///////////////////////////////////////////////////////////////////////////////
113
114static const SkBlitRow::Proc32 gDefault_Procs32[] = {
115    S32_Opaque_BlitRow32,
116    S32_Blend_BlitRow32,
117    S32A_Opaque_BlitRow32,
118    S32A_Blend_BlitRow32
119};
120
121SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
122    SkASSERT(flags < SK_ARRAY_COUNT(gDefault_Procs32));
123    // just so we don't crash
124    flags &= kFlags32_Mask;
125
126    SkBlitRow::Proc32 proc = PlatformProcs32(flags);
127    if (NULL == proc) {
128        proc = gDefault_Procs32[flags];
129    }
130    SkASSERT(proc);
131    return proc;
132}
133
134// Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp.
135// It's not quite perfect, but it's never wrong in the interesting edge cases,
136// and it's quite a bit faster than blend_perfect.
137//
138// blend_256_round_alt is our currently blessed algorithm.  Please use it or an analogous one.
139void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
140    switch (SkGetPackedA32(color)) {
141        case   0: memmove(dst, src, count * sizeof(SkPMColor)); return;
142        case 255: sk_memset32(dst, color, count);               return;
143    }
144
145    unsigned invA = 255 - SkGetPackedA32(color);
146    invA += invA >> 7;
147    SkASSERT(invA < 256);  // We've already handled alpha == 0 above.
148
149#if defined(SK_ARM_HAS_NEON)
150    uint16x8_t colorHigh = vshll_n_u8((uint8x8_t)vdup_n_u32(color), 8);
151    uint16x8_t colorAndRound = vaddq_u16(colorHigh, vdupq_n_u16(128));
152    uint8x8_t invA8 = vdup_n_u8(invA);
153
154    // Does the core work of blending color onto 4 pixels, returning the resulting 4 pixels.
155    auto kernel = [&](const uint32x4_t& src4) -> uint32x4_t {
156        uint16x8_t lo = vmull_u8(vget_low_u8( (uint8x16_t)src4), invA8),
157                   hi = vmull_u8(vget_high_u8((uint8x16_t)src4), invA8);
158        return (uint32x4_t)
159            vcombine_u8(vaddhn_u16(colorAndRound, lo), vaddhn_u16(colorAndRound, hi));
160    };
161
162    while (count >= 8) {
163        uint32x4_t dst0 = kernel(vld1q_u32(src+0)),
164                   dst4 = kernel(vld1q_u32(src+4));
165        vst1q_u32(dst+0, dst0);
166        vst1q_u32(dst+4, dst4);
167        src   += 8;
168        dst   += 8;
169        count -= 8;
170    }
171    if (count >= 4) {
172        vst1q_u32(dst, kernel(vld1q_u32(src)));
173        src   += 4;
174        dst   += 4;
175        count -= 4;
176    }
177    if (count >= 2) {
178        uint32x2_t src2 = vld1_u32(src);
179        vst1_u32(dst, vget_low_u32(kernel(vcombine_u32(src2, src2))));
180        src   += 2;
181        dst   += 2;
182        count -= 2;
183    }
184    if (count >= 1) {
185        vst1q_lane_u32(dst, kernel(vdupq_n_u32(*src)), 0);
186    }
187
188#elif SK_CPU_SSE_LEVEL >= SK_CPU_LEVEL_SSE2
189    __m128i colorHigh = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_set1_epi32(color));
190    __m128i colorAndRound = _mm_add_epi16(colorHigh, _mm_set1_epi16(128));
191    __m128i invA16 = _mm_set1_epi16(invA);
192
193    // Does the core work of blending color onto 4 pixels, returning the resulting 4 pixels.
194    auto kernel = [&](const __m128i& src4) -> __m128i {
195        __m128i lo = _mm_mullo_epi16(invA16, _mm_unpacklo_epi8(src4, _mm_setzero_si128())),
196                hi = _mm_mullo_epi16(invA16, _mm_unpackhi_epi8(src4, _mm_setzero_si128()));
197        return _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(colorAndRound, lo), 8),
198                                _mm_srli_epi16(_mm_add_epi16(colorAndRound, hi), 8));
199    };
200
201    while (count >= 8) {
202        __m128i dst0 = kernel(_mm_loadu_si128((const __m128i*)(src+0))),
203                dst4 = kernel(_mm_loadu_si128((const __m128i*)(src+4)));
204        _mm_storeu_si128((__m128i*)(dst+0), dst0);
205        _mm_storeu_si128((__m128i*)(dst+4), dst4);
206        src   += 8;
207        dst   += 8;
208        count -= 8;
209    }
210    if (count >= 4) {
211        _mm_storeu_si128((__m128i*)dst, kernel(_mm_loadu_si128((const __m128i*)src)));
212        src   += 4;
213        dst   += 4;
214        count -= 4;
215    }
216    if (count >= 2) {
217        _mm_storel_epi64((__m128i*)dst, kernel(_mm_loadl_epi64((const __m128i*)src)));
218        src   += 2;
219        dst   += 2;
220        count -= 2;
221    }
222    if (count >= 1) {
223        *dst = _mm_cvtsi128_si32(kernel(_mm_cvtsi32_si128(*src)));
224    }
225#else  // Neither NEON nor SSE2.
226    unsigned round = (128 << 16) + (128 << 0);
227
228    while (count --> 0) {
229        // Our math is 16-bit, so we can do a little bit of SIMD in 32-bit registers.
230        const uint32_t mask = 0x00FF00FF;
231        uint32_t rb = (((*src >> 0) & mask) * invA + round) >> 8,  // _r_b
232                 ag = (((*src >> 8) & mask) * invA + round) >> 0;  // a_g_
233        *dst = color + ((rb & mask) | (ag & ~mask));
234        src++;
235        dst++;
236    }
237#endif
238}
239
240