1/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkBlitRow_opts_arm_neon.h"
9
10#include "SkBlitMask.h"
11#include "SkBlitRow.h"
12#include "SkColorData.h"
13#include "SkDither.h"
14#include "SkMathPriv.h"
15#include "SkUtils.h"
16
17#include "SkColor_opts_neon.h"
18#include <arm_neon.h>
19
/* NEON implementation of S32_Blend_BlitRow32().
 * The portable version is in src/core/SkBlitRow_D32.cpp.
 *
 * Blends `count` 32-bit pixels from `src` into `dst` using one constant
 * alpha for the whole row. Every 8-bit channel of the result is
 *
 *     (src * src_scale + dst * dst_scale) >> 8
 *
 * with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
 * Pixels are processed in pairs (one 64-bit vector); a trailing odd pixel
 * goes through single-lane load/store so nothing past it is touched.
 * A count <= 0 is a no-op.
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    const uint16_t src_scale = SkAlpha255To256(alpha);
    const uint16_t dst_scale = 256 - src_scale;

    // Neither factor changes across the row, so splat each of them once.
    // The dst factor is narrowed to 8 bits because it feeds vmull_u8.
    const uint16x8_t src_scale_q = vdupq_n_u16(src_scale);
    const uint8x8_t  dst_scale_d = vdup_n_u8(dst_scale);

    /* No prefetching here on purpose: __builtin_prefetch(src+32) and
     * __builtin_prefetch(dst+32) were a big win for count values > 64 on
     * an A9 (Pandaboard), but hurt by 10% for count = 4 and also hurt a
     * little (<5%) on an A15.
     */
    for (; count >= 2; count -= 2, src += 2, dst += 2) {
        // Two pixels per iteration, viewed as eight byte channels.
        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_u32(dst));

        // Widen src to 16 bits first: its scale does not fit in a byte lane.
        const uint16x8_t src_term = vmulq_u16(vmovl_u8(src_px), src_scale_q);
        // dst uses the widening 8x8 -> 16 bit multiply directly.
        const uint16x8_t dst_term = vmull_u8(dst_px, dst_scale_d);

        // Sum both contributions and drop the 8 fractional bits.
        const uint8x8_t blended = vshrn_n_u16(dst_term + src_term, 8);
        vst1_u32(dst, vreinterpret_u32_u8(blended));
    }

    if (count == 1) {
        // Leftover pixel: load it into lane 0 of a zeroed vector, run the
        // same arithmetic, and write back lane 0 only.
        const uint32x2_t zero = vreinterpret_u32_u8(vdup_n_u8(0));
        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));

        const uint16x8_t src_term = vmulq_u16(vmovl_u8(src_px), src_scale_q);
        const uint16x8_t dst_term = vmull_u8(dst_px, dst_scale_d);

        const uint8x8_t blended = vshrn_n_u16(dst_term + src_term, 8);
        vst1_lane_u32(dst, vreinterpret_u32_u8(blended), 0);
    }
}
88
89#ifdef SK_CPU_ARM32
/* NEON row blend where the dst weight depends on each source pixel.
 *
 * For every pixel, each 8-bit channel of the result is
 *
 *     (src * alpha256 + dst * dst_scale) >> 8
 *
 * with alpha256 = SkAlpha255To256(alpha) and
 * dst_scale = SkAlphaMulInv256(byte 3 of the src pixel, alpha256).
 *
 * An odd leading pixel is handled on its own first, then the remaining
 * (even) count is processed two pixels at a time. A count <= 0 is a no-op.
 * Callers must pass alpha < 255 (asserted).
 */
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {

    SkASSERT(255 > alpha);

    if (count <= 0) {
        return;
    }

    const unsigned alpha256 = SkAlpha255To256(alpha);

    // Peel off one pixel when the count is odd so the main loop can always
    // work on pairs.
    if (count & 1) {
        const uint32x2_t zero = vreinterpret_u32_u8(vdup_n_u8(0));
        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));

        // Byte 3 of the source pixel drives the dst weight (scalar path).
        const unsigned src_alpha = vget_lane_u8(src_px, 3);
        const unsigned dst_scale = SkAlphaMulInv256(src_alpha, alpha256);

        // Widen both pixels to 16-bit lanes and apply their weights.
        const uint16x8_t src_term = vmulq_n_u16(vmovl_u8(src_px), alpha256);
        const uint16x8_t dst_term = vmulq_n_u16(vmovl_u8(dst_px), dst_scale);

        // Sum and drop the 8 fractional bits; store lane 0 only.
        const uint8x8_t blended = vshrn_n_u16(dst_term + src_term, 8);
        vst1_lane_u32(dst, vreinterpret_u32_u8(blended), 0);

        ++dst;
        ++src;
        --count;
    }

    if (count == 0) {
        return;
    }

    // Table-lookup indices that replicate byte 3 over the first pixel's four
    // lanes and byte 7 over the second pixel's four lanes.
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    const uint8x8_t alpha_mask = vld1_u8(alpha_mask_setup);

    // Row-invariant vectors.
    const uint16x8_t src_scale_q = vdupq_n_u16(alpha256);
    const uint16x8_t bias_ff00   = vdupq_n_u16(0xFF00);
    const uint16x8_t one_q       = vdupq_n_u16(1);

    // count is even and non-zero here, so stepping by 2 lands exactly on 0.
    while (count != 0) {
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        // Two pixels per iteration, viewed as eight byte channels.
        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_u32(dst));

        // Per-lane source alpha, widened to 16 bits.
        const uint8x8_t src_alphas = vtbl1_u8(src_px, alpha_mask);

        // Vector form of SkAlphaMulInv256(src_alpha, alpha256).
        // Starting from 0xFFFF would overflow a 16-bit lane, so start from
        // 0xFF00 instead, which is off by 1, and add that 1 back in the last
        // step to get the correct value.
        // This is valid if alpha256 <= 255.
        uint16x8_t dst_scale_q = vmlsq_u16(bias_ff00, vmovl_u8(src_alphas), src_scale_q);
        dst_scale_q = vsraq_n_u16(dst_scale_q, dst_scale_q, 8);
        dst_scale_q = vsraq_n_u16(one_q, dst_scale_q, 8);

        // Weight the widened src and dst channels.
        const uint16x8_t src_term = vmovl_u8(src_px) * src_scale_q;
        const uint16x8_t dst_term = vmovl_u8(dst_px) * dst_scale_q;

        // Sum and drop the 8 fractional bits.
        const uint8x8_t blended = vshrn_n_u16(dst_term + src_term, 8);
        vst1_u32(dst, vreinterpret_u32_u8(blended));

        src += 2;
        dst += 2;
        count -= 2;
    }
}
186
187///////////////////////////////////////////////////////////////////////////////
188
189#endif // #ifdef SK_CPU_ARM32
190
191///////////////////////////////////////////////////////////////////////////////
192
193const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
194    nullptr,   // S32_Opaque,
195    S32_Blend_BlitRow32_neon,        // S32_Blend,
196    nullptr,  // Ported to SkOpts
197#ifdef SK_CPU_ARM32
198    S32A_Blend_BlitRow32_neon        // S32A_Blend
199#else
200    nullptr
201#endif
202};
203