/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <arm_neon.h>
#include "SkColorPriv.h"

/*
 * Filter_32_opaque
 *
 * There is no hard-and-fast rule that the filtering must produce
 * exact results for the color components, but if the 4 incoming colors are
 * all opaque, then the output color must also be opaque. Subsequent parts of
 * the drawing pipeline may rely on this (e.g. which blitrow proc to use).
 *
 */
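// For reference, a worked check of that invariant: the four bilinear weights
// used below always sum to 16 * 16 = 256, since
//
//   (16-x)*(16-y) + x*(16-y) + (16-x)*y + x*y
//     = ((16-x) + x) * ((16-y) + y)
//     = 16 * 16 = 256.
//
// So when every input alpha is 0xFF, the alpha channel accumulates
// 0xFF * 256, and the final >> 8 yields exactly 0xFF: opaque in, opaque out.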
// Chrome on Android uses -Os so we need to force these inline. Otherwise
// calling the function in the inner loops will cause significant overhead on
// some platforms.
static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y,
                                                   SkPMColor a00, SkPMColor a01,
                                                   SkPMColor a10, SkPMColor a11,
                                                   SkPMColor *dst) {
    uint8x8_t vy, vconst16_8, v16_y, vres;
    uint16x4_t vx, vconst16_16, v16_x, tmp;
    uint32x2_t va0, va1;
    uint16x8_t tmp1, tmp2;

    vy = vdup_n_u8(y);                // duplicate y into vy
    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

    va0 = vdup_n_u32(a00);            // duplicate a00
    va1 = vdup_n_u32(a10);            // duplicate a10
    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    va1 = vset_lane_u32(a11, va1, 1); // set top to a11

    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

    vx = vdup_n_u16(x);                // duplicate x into vx
    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * (16-y) * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * y * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-y) * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * y * (16-x)

    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // >> 8 divides by the total weight (16*16)
    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
}
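
// For comparison, a scalar sketch of the same computation (an illustrative
// helper with a hypothetical name; it is not part of this file's API). It
// applies the identical weights one 8-bit channel at a time, where the NEON
// code above handles all four packed channels at once.
static inline void Filter_32_opaque_scalar_sketch(unsigned x, unsigned y,
                                                  SkPMColor a00, SkPMColor a01,
                                                  SkPMColor a10, SkPMColor a11,
                                                  SkPMColor *dst) {
    SkPMColor result = 0;
    for (int shift = 0; shift < 32; shift += 8) {  // one channel per pass
        uint32_t c00 = (a00 >> shift) & 0xFF;
        uint32_t c01 = (a01 >> shift) & 0xFF;
        uint32_t c10 = (a10 >> shift) & 0xFF;
        uint32_t c11 = (a11 >> shift) & 0xFF;
        uint32_t c = (c00 * (16 - x) * (16 - y) +  // assumes x, y in [0, 16],
                      c01 * x        * (16 - y) +  // as the NEON code does
                      c10 * (16 - x) * y        +
                      c11 * x        * y) >> 8;    // weights sum to 256
        result |= c << shift;
    }
    *dst = result;
}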

static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
                                                  SkPMColor a00, SkPMColor a01,
                                                  SkPMColor a10, SkPMColor a11,
                                                  SkPMColor *dst,
                                                  uint16_t scale) {
    uint8x8_t vy, vconst16_8, v16_y, vres;
    uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
    uint32x2_t va0, va1;
    uint16x8_t tmp1, tmp2;

    vy = vdup_n_u8(y);                // duplicate y into vy
    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

    va0 = vdup_n_u32(a00);            // duplicate a00
    va1 = vdup_n_u32(a10);            // duplicate a10
    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    va1 = vset_lane_u32(a11, va1, 1); // set top to a11

    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

    vx = vdup_n_u16(x);                // duplicate x into vx
    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * (16-y) * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * y * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-y) * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * y * (16-x)

    vscale = vdup_n_u16(scale);        // duplicate scale
    tmp = vshr_n_u16(tmp, 8);          // >> 8 divides by the total weight (16*16)
    tmp = vmul_u16(tmp, vscale);       // multiply result by scale

    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // second >> 8 renormalizes after the scale
    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
}
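
// Relative to the opaque path, the only new work is the tail: the bilinear
// sum is normalized, multiplied by scale, and renormalized. A scalar sketch
// of that tail (illustrative; judging from the math, scale acts as a 0..256
// factor, since scale == 256 maps a 0xFF channel back to 0xFF):
//
//     uint32_t c = bilinear_sum;    // per channel, as in the opaque sketch
//     c = ((c >> 8) * scale) >> 8;  // (0xFF * 256) >> 8 == 0xFF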