1typedef short int16_t;
2typedef int int32_t;
3typedef unsigned char uint8_t;
4typedef unsigned int uintptr_t;
5
6typedef __builtin_neon_hi int16x4_t __attribute__ ((__vector_size__ (8)));
7typedef __builtin_neon_uqi uint8x8_t __attribute__ ((__vector_size__ (8)));
8typedef __builtin_neon_uhi uint16x8_t __attribute__ ((__vector_size__ (16)));
9typedef __builtin_neon_si int32x4_t __attribute__ ((__vector_size__ (16)));
10typedef __builtin_neon_hi int16x8_t __attribute__ ((__vector_size__ (16)));
11typedef __builtin_neon_qi int8x8_t __attribute__ ((__vector_size__ (8)));
12typedef __builtin_neon_si int32x2_t __attribute__ ((__vector_size__ (8)));
13
14typedef struct uint8x8x2_t
15{
16  uint8x8_t val[2];
17} uint8x8x2_t;
18typedef struct uint8x8x4_t
19{
20  uint8x8_t val[4];
21} uint8x8x4_t;
22
23__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
24vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
25{
26  return (uint16x8_t)__builtin_neon_vaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
27}
28__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
29vaddl_s16 (int16x4_t __a, int16x4_t __b)
30{
31  return (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1);
32}
33__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
34vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
35{
36  return (uint16x8_t)__builtin_neon_vaddlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
37}
38__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
39vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
40{
41  return (uint16x8_t)__builtin_neon_vaddwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
42}
43__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
44vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
45{
46  return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
47}
48__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
49vsubl_s16 (int16x4_t __a, int16x4_t __b)
50{
51  return (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1);
52}
53__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
54vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
55{
56  return (uint16x8_t)__builtin_neon_vsublv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
57}
58__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
59vshrn_n_u16 (uint16x8_t __a, const int __b)
60{
61  return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 0);
62}
63__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
64vrshrn_n_s32 (int32x4_t __a, const int __b)
65{
66  return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 5);
67}
68__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
69vshlq_n_s16 (int16x8_t __a, const int __b)
70{
71  return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
72}
73__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
74vshll_n_s16 (int16x4_t __a, const int __b)
75{
76  return (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1);
77}
78__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
79vshll_n_u8 (uint8x8_t __a, const int __b)
80{
81  return (uint16x8_t)__builtin_neon_vshll_nv8qi ((int8x8_t) __a, __b, 0);
82}
83__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
84vmov_n_s32 (int32_t __a)
85{
86  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
87}
88__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
89vmov_n_u8 (uint8_t __a)
90{
91  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
92}
93__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
94vcombine_s16 (int16x4_t __a, int16x4_t __b)
95{
96  return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
97}
98__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
99vget_high_s16 (int16x8_t __a)
100{
101  return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
102}
103__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
104vget_low_s16 (int16x8_t __a)
105{
106  return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
107}
108__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
109vqmovun_s16 (int16x8_t __a)
110{
111  return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1);
112}
113__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
114vmovl_s16 (int16x4_t __a)
115{
116  return (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1);
117}
118__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
119vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
120{
121  return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1);
122}
123__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
124vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
125{
126  return (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1);
127}
128__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
129vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
130{
131  return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1);
132}
133__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
134vld1_s16 (const int16_t * __a)
135{
136  return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
137}
138__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
139vld1_u8 (const uint8_t * __a)
140{
141  return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
142}
143__extension__ static __inline void __attribute__ ((__always_inline__))
144vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
145{
146  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
147  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
148}
149__extension__ static __inline void __attribute__ ((__always_inline__))
150vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
151{
152  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
153  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
154}
155__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
156vreinterpretq_s16_u16 (uint16x8_t __a)
157{
158  return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
159}
160
161static const int16_t coef[4] = { 89858 / 4, 22014, 45773 / 2, 113618 / 4 };
162
163void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y, const uint8_t *top_u, const uint8_t *top_v, const uint8_t *cur_u, const uint8_t *cur_v, uint8_t *top_dst, uint8_t *bottom_dst, int len)
164{
165    int block;
166    uint8_t uv_buf[2 * 32 + 15];
167    uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);
168    const int uv_len = (len + 1) >> 1;
169    const int num_blocks = (uv_len - 1) >> 3;
170    const int leftover = uv_len - num_blocks * 8;
171    const int last_pos = 1 + 16 * num_blocks;
172    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;
173    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;
174    const int16x4_t cf16 = vld1_s16(coef);
175    const int32x2_t cf32 = vmov_n_s32(76283);
176    const uint8x8_t u16 = vmov_n_u8(16);
177    const uint8x8_t u128 = vmov_n_u8(128);
178    for (block = 0; block < num_blocks; ++block) {
179        {
180            uint8x8_t a = vld1_u8(top_u);
181            uint8x8_t b = vld1_u8(top_u + 1);
182            uint8x8_t c = vld1_u8(cur_u);
183            uint8x8_t d = vld1_u8(cur_u + 1);
184            uint16x8_t al = vshll_n_u8(a, 1);
185            uint16x8_t bl = vshll_n_u8(b, 1);
186            uint16x8_t cl = vshll_n_u8(c, 1);
187            uint16x8_t dl = vshll_n_u8(d, 1);
188            uint8x8_t diag1, diag2;
189            uint16x8_t sl;
190            sl = vaddl_u8(a, b);
191            sl = vaddw_u8(sl, c);
192            sl = vaddw_u8(sl, d);
193            al = vaddq_u16(sl, al);
194            bl = vaddq_u16(sl, bl);
195            al = vaddq_u16(al, dl);
196            bl = vaddq_u16(bl, cl);
197            diag2 = vshrn_n_u16(al, 3);
198            diag1 = vshrn_n_u16(bl, 3);
199            a = vrhadd_u8(a, diag1);
200            b = vrhadd_u8(b, diag2);
201            c = vrhadd_u8(c, diag2);
202            d = vrhadd_u8(d, diag1);
203            {
204                const uint8x8x2_t a_b = {{ a, b }};
205                const uint8x8x2_t c_d = {{ c, d }};
206                vst2_u8(r_uv, a_b);
207                vst2_u8(r_uv + 32, c_d);
208            }
209        }
210        {
211            uint8x8_t a = vld1_u8(top_v);
212            uint8x8_t b = vld1_u8(top_v + 1);
213            uint8x8_t c = vld1_u8(cur_v);
214            uint8x8_t d = vld1_u8(cur_v + 1);
215            uint16x8_t al = vshll_n_u8(a, 1);
216            uint16x8_t bl = vshll_n_u8(b, 1);
217            uint16x8_t cl = vshll_n_u8(c, 1);
218            uint16x8_t dl = vshll_n_u8(d, 1);
219            uint8x8_t diag1, diag2;
220            uint16x8_t sl;
221            sl = vaddl_u8(a, b);
222            sl = vaddw_u8(sl, c);
223            sl = vaddw_u8(sl, d);
224            al = vaddq_u16(sl, al);
225            bl = vaddq_u16(sl, bl);
226            al = vaddq_u16(al, dl);
227            bl = vaddq_u16(bl, cl);
228            diag2 = vshrn_n_u16(al, 3);
229            diag1 = vshrn_n_u16(bl, 3);
230            a = vrhadd_u8(a, diag1);
231            b = vrhadd_u8(b, diag2);
232            c = vrhadd_u8(c, diag2);
233            d = vrhadd_u8(d, diag1);
234            {
235                const uint8x8x2_t a_b = {{ a, b }};
236                const uint8x8x2_t c_d = {{ c, d }};
237                vst2_u8(r_uv + 16, a_b);
238                vst2_u8(r_uv + 16 + 32, c_d);
239            }
240        }
241        {
242            if (top_y) {
243                {
244                    int i;
245                    for (i = 0; i < 16; i += 8) {
246                        int off = ((16 * block + 1) + i) * 4;
247                        uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i);
248                        uint8x8_t u = vld1_u8((r_uv) + i);
249                        uint8x8_t v = vld1_u8((r_uv) + i + 16);
250                        int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
251                        int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
252                        int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
253                        int16x8_t ud = vshlq_n_s16(uu, 1);
254                        int16x8_t vd = vshlq_n_s16(vv, 1);
255                        int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
256                        int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
257                        int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
258                        int32x4_t vl = vmovl_s16(vget_low_s16(vv));
259                        int32x4_t vh = vmovl_s16(vget_high_s16(vv));
260                        int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
261                        int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
262                        int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
263                        int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
264                        int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
265                        int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
266                        int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
267                        int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
268                        int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
269                        int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
270                        int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
271                        int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
272                        int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
273                        int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
274                        rl = vmulq_lane_s32(rl, cf32, 0);
275                        rh = vmulq_lane_s32(rh, cf32, 0);
276                        gl = vmulq_lane_s32(gl, cf32, 0);
277                        gh = vmulq_lane_s32(gh, cf32, 0);
278                        bl = vmulq_lane_s32(bl, cf32, 0);
279                        bh = vmulq_lane_s32(bh, cf32, 0);
280                        y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
281                        u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
282                        v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
283                        do {
284                            const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
285                            vst4_u8(top_dst + off, r_g_b_v255);
286                        } while (0);
287                    }
288                }
289            }
290            if (bottom_y) {
291                {
292                    int i;
293                    for (i = 0; i < 16; i += 8) {
294                        int off = ((16 * block + 1) + i) * 4;
295                        uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i);
296                        uint8x8_t u = vld1_u8(((r_uv) + 32) + i);
297                        uint8x8_t v = vld1_u8(((r_uv) + 32) + i + 16);
298                        int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
299                        int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
300                        int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
301                        int16x8_t ud = vshlq_n_s16(uu, 1);
302                        int16x8_t vd = vshlq_n_s16(vv, 1);
303                        int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
304                        int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
305                        int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
306                        int32x4_t vl = vmovl_s16(vget_low_s16(vv));
307                        int32x4_t vh = vmovl_s16(vget_high_s16(vv));
308                        int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
309                        int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
310                        int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
311                        int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
312                        int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
313                        int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
314                        int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
315                        int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
316                        int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
317                        int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
318                        int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
319                        int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
320                        int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
321                        int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
322                        rl = vmulq_lane_s32(rl, cf32, 0);
323                        rh = vmulq_lane_s32(rh, cf32, 0);
324                        gl = vmulq_lane_s32(gl, cf32, 0);
325                        gh = vmulq_lane_s32(gh, cf32, 0);
326                        bl = vmulq_lane_s32(bl, cf32, 0);
327                        bh = vmulq_lane_s32(bh, cf32, 0);
328                        y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
329                        u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
330                        v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
331                        do {
332                            const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
333                            vst4_u8(bottom_dst + off, r_g_b_v255);
334                        } while (0);
335                    }
336                }
337            }
338        }
339    }
340}
341