1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <arm_neon.h>
12
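/* MSVC has no __builtin_prefetch(); define it to nothing so the prefetch
 * hints used below compile away under that toolchain. */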
13#ifdef _MSC_VER
14#define __builtin_prefetch(x)
15#endif
16
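/* VP8 six-tap sub-pixel interpolation filters, indexed by the fractional
 * offset (0..7, in 1/8-pel steps). Coefficients are Q7 fixed point and sum
 * to 128; taps 1 and 4 are never positive, which is why the code below takes
 * vabs() of every tap and pairs vmlsl (subtract) with vmlal/vmull (add). */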
17static const int8_t vp8_sub_pel_filters[8][8] = {
18    {0,  0,  128,   0,   0, 0, 0, 0},  /* note that 1/8 pel positions are */
19    {0, -6,  123,  12,  -1, 0, 0, 0},  /*    just as per alpha -0.5 bicubic */
20    {2, -11, 108,  36,  -8, 1, 0, 0},  /* New 1/4 pel 6 tap filter */
21    {0, -9,   93,  50,  -6, 0, 0, 0},
22    {3, -16,  77,  77, -16, 3, 0, 0},  /* New 1/2 pel 6 tap filter */
23    {0, -6,   50,  93,  -9, 0, 0, 0},
24    {1, -8,   36, 108, -11, 2, 0, 0},  /* New 1/4 pel 6 tap filter */
25    {0, -1,   12, 123,  -6, 0, 0, 0},
26};
27
28void vp8_sixtap_predict4x4_neon(
29        unsigned char *src_ptr,
30        int src_pixels_per_line,
31        int xoffset,
32        int yoffset,
33        unsigned char *dst_ptr,
34        int dst_pitch) {
35    unsigned char *src;
36    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
37    uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
38    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
39    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
40    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
41    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
42    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
43    uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
44    uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
45    uint32x2x2_t d0u32x2, d1u32x2;
46
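    /* Three paths: xoffset == 0 runs only the vertical (second-pass) filter,
     * yoffset == 0 runs only the horizontal (first-pass) filter, and the
     * general case filters horizontally into registers, then vertically
     * into the destination. */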
47    if (xoffset == 0) {  // secondpass_filter4x4_only
48        uint32x2_t d27u32 = vdup_n_u32(0);
49        uint32x2_t d28u32 = vdup_n_u32(0);
50        uint32x2_t d29u32 = vdup_n_u32(0);
51        uint32x2_t d30u32 = vdup_n_u32(0);
52        uint32x2_t d31u32 = vdup_n_u32(0);
53
54        // load second_pass filter
55        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
56        d0s8 = vdup_lane_s8(dtmps8, 0);
57        d1s8 = vdup_lane_s8(dtmps8, 1);
58        d2s8 = vdup_lane_s8(dtmps8, 2);
59        d3s8 = vdup_lane_s8(dtmps8, 3);
60        d4s8 = vdup_lane_s8(dtmps8, 4);
61        d5s8 = vdup_lane_s8(dtmps8, 5);
62        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
63        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
64        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
65        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
66        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
67        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
68
69        // load src data
70        src = src_ptr - src_pixels_per_line * 2;
71        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
72        src += src_pixels_per_line;
73        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
74        src += src_pixels_per_line;
75        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
76        src += src_pixels_per_line;
77        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
78        src += src_pixels_per_line;
79        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
80        src += src_pixels_per_line;
81        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
82        src += src_pixels_per_line;
83        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
84        src += src_pixels_per_line;
85        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
86        src += src_pixels_per_line;
87        d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
88
89        d27u8 = vreinterpret_u8_u32(d27u32);
90        d28u8 = vreinterpret_u8_u32(d28u32);
91        d29u8 = vreinterpret_u8_u32(d29u32);
92        d30u8 = vreinterpret_u8_u32(d30u32);
93        d31u8 = vreinterpret_u8_u32(d31u32);
94
95        d23u8 = vext_u8(d27u8, d28u8, 4);
96        d24u8 = vext_u8(d28u8, d29u8, 4);
97        d25u8 = vext_u8(d29u8, d30u8, 4);
98        d26u8 = vext_u8(d30u8, d31u8, 4);
99
100        q3u16 = vmull_u8(d27u8, d0u8);
101        q4u16 = vmull_u8(d28u8, d0u8);
102        q5u16 = vmull_u8(d25u8, d5u8);
103        q6u16 = vmull_u8(d26u8, d5u8);
104
105        q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
106        q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
107        q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
108        q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
109
110        q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
111        q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
112        q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
113        q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
114
115        q3s16 = vreinterpretq_s16_u16(q3u16);
116        q4s16 = vreinterpretq_s16_u16(q4u16);
117        q5s16 = vreinterpretq_s16_u16(q5u16);
118        q6s16 = vreinterpretq_s16_u16(q6u16);
119
120        q5s16 = vqaddq_s16(q5s16, q3s16);
121        q6s16 = vqaddq_s16(q6s16, q4s16);
122
123        d3u8 = vqrshrun_n_s16(q5s16, 7);
124        d4u8 = vqrshrun_n_s16(q6s16, 7);
125
126        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
127        dst_ptr += dst_pitch;
128        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
129        dst_ptr += dst_pitch;
130        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
131        dst_ptr += dst_pitch;
132        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
133        return;
134    }
135
136    // load first_pass filter
137    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
138    d0s8 = vdup_lane_s8(dtmps8, 0);
139    d1s8 = vdup_lane_s8(dtmps8, 1);
140    d2s8 = vdup_lane_s8(dtmps8, 2);
141    d3s8 = vdup_lane_s8(dtmps8, 3);
142    d4s8 = vdup_lane_s8(dtmps8, 4);
143    d5s8 = vdup_lane_s8(dtmps8, 5);
144    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
145    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
146    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
147    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
148    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
149    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
150
151    // First pass: output_height lines x output_width columns (9x4)
152
153    if (yoffset == 0)  // firstpass_filter4x4_only
154        src = src_ptr - 2;
155    else
156        src = src_ptr - 2 - (src_pixels_per_line * 2);
157
158    q3u8 = vld1q_u8(src);
159    src += src_pixels_per_line;
160    q4u8 = vld1q_u8(src);
161    src += src_pixels_per_line;
162    q5u8 = vld1q_u8(src);
163    src += src_pixels_per_line;
164    q6u8 = vld1q_u8(src);
165    src += src_pixels_per_line;
166
167    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
168    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
169    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
170    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
171
172    // vswp in the original assembly; pack the low halves of each row pair instead
173    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
174    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
175
176    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
177                       vreinterpret_u32_u8(d19u8));
178    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
179                       vreinterpret_u32_u8(d21u8));
180    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
181    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
182
183    // keep original src data in q4 q6
184    q4u64 = vreinterpretq_u64_u8(q3u8);
185    q6u64 = vreinterpretq_u64_u8(q5u8);
186
187    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
188                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
189    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
190                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
191    q9u64 = vshrq_n_u64(q4u64, 8);
192    q10u64 = vshrq_n_u64(q6u64, 8);
193    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
194    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
195
196    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
197                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
198    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
199                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
200    q3u64 = vshrq_n_u64(q4u64, 32);
201    q5u64 = vshrq_n_u64(q6u64, 32);
202    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
203    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
204
205    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
206                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
207    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
208                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
209    q9u64 = vshrq_n_u64(q4u64, 16);
210    q10u64 = vshrq_n_u64(q6u64, 16);
211    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
212    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
213
214    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
215                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
216    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
217                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
218    q3u64 = vshrq_n_u64(q4u64, 24);
219    q5u64 = vshrq_n_u64(q6u64, 24);
220    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
221    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
222
223    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
224                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
225    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
226                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
227    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
228    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
229
230    q7s16 = vreinterpretq_s16_u16(q7u16);
231    q8s16 = vreinterpretq_s16_u16(q8u16);
232    q9s16 = vreinterpretq_s16_u16(q9u16);
233    q10s16 = vreinterpretq_s16_u16(q10u16);
234    q7s16 = vqaddq_s16(q7s16, q9s16);
235    q8s16 = vqaddq_s16(q8s16, q10s16);
236
237    d27u8 = vqrshrun_n_s16(q7s16, 7);
238    d28u8 = vqrshrun_n_s16(q8s16, 7);
239
240    if (yoffset == 0) {  // firstpass_filter4x4_only
241        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
242        dst_ptr += dst_pitch;
243        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
244        dst_ptr += dst_pitch;
245        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
246        dst_ptr += dst_pitch;
247        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
248        return;
249    }
250
251    // First pass on the remaining 5 lines of data
252    q3u8 = vld1q_u8(src);
253    src += src_pixels_per_line;
254    q4u8 = vld1q_u8(src);
255    src += src_pixels_per_line;
256    q5u8 = vld1q_u8(src);
257    src += src_pixels_per_line;
258    q6u8 = vld1q_u8(src);
259    src += src_pixels_per_line;
260    q11u8 = vld1q_u8(src);
261
262    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
263    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
264    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
265    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
266
267    // vswp in the original assembly; pack the low halves of each row pair instead
268    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
269    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
270
271    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
272                       vreinterpret_u32_u8(d19u8));
273    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
274                       vreinterpret_u32_u8(d21u8));
275    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
276    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
277    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
278    q12u16 = vmull_u8(d31u8, d5u8);
279
280    q4u64 = vreinterpretq_u64_u8(q3u8);
281    q6u64 = vreinterpretq_u64_u8(q5u8);
282
283    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
284                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
285    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
286                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
287    q9u64 = vshrq_n_u64(q4u64, 8);
288    q10u64 = vshrq_n_u64(q6u64, 8);
289    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
290    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
291    q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
292
293    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
294                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
295    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
296                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
297    q3u64 = vshrq_n_u64(q4u64, 32);
298    q5u64 = vshrq_n_u64(q6u64, 32);
299    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
300    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
301    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
302    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
303
304    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
305                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
306    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
307                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
308    q9u64 = vshrq_n_u64(q4u64, 16);
309    q10u64 = vshrq_n_u64(q6u64, 16);
310    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
311    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
312    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
313    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
314
315    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
316                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
317    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d21
318                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
319    q3u64 = vshrq_n_u64(q4u64, 24);
320    q5u64 = vshrq_n_u64(q6u64, 24);
321    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
322    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
323    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
324    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
325
326    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
327                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
328    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
329                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
330    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
331    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
332    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
333    q11u16 = vmull_u8(d31u8, d3u8);
334
335    q7s16 = vreinterpretq_s16_u16(q7u16);
336    q8s16 = vreinterpretq_s16_u16(q8u16);
337    q9s16 = vreinterpretq_s16_u16(q9u16);
338    q10s16 = vreinterpretq_s16_u16(q10u16);
339    q11s16 = vreinterpretq_s16_u16(q11u16);
340    q12s16 = vreinterpretq_s16_u16(q12u16);
341    q7s16 = vqaddq_s16(q7s16, q9s16);
342    q8s16 = vqaddq_s16(q8s16, q10s16);
343    q12s16 = vqaddq_s16(q12s16, q11s16);
344
345    d29u8 = vqrshrun_n_s16(q7s16, 7);
346    d30u8 = vqrshrun_n_s16(q8s16, 7);
347    d31u8 = vqrshrun_n_s16(q12s16, 7);
348
349    // Second pass: 4x4
350    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
351    d0s8 = vdup_lane_s8(dtmps8, 0);
352    d1s8 = vdup_lane_s8(dtmps8, 1);
353    d2s8 = vdup_lane_s8(dtmps8, 2);
354    d3s8 = vdup_lane_s8(dtmps8, 3);
355    d4s8 = vdup_lane_s8(dtmps8, 4);
356    d5s8 = vdup_lane_s8(dtmps8, 5);
357    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
358    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
359    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
360    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
361    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
362    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
363
364    d23u8 = vext_u8(d27u8, d28u8, 4);
365    d24u8 = vext_u8(d28u8, d29u8, 4);
366    d25u8 = vext_u8(d29u8, d30u8, 4);
367    d26u8 = vext_u8(d30u8, d31u8, 4);
368
369    q3u16 = vmull_u8(d27u8, d0u8);
370    q4u16 = vmull_u8(d28u8, d0u8);
371    q5u16 = vmull_u8(d25u8, d5u8);
372    q6u16 = vmull_u8(d26u8, d5u8);
373
374    q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
375    q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
376    q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
377    q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
378
379    q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
380    q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
381    q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
382    q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
383
384    q3s16 = vreinterpretq_s16_u16(q3u16);
385    q4s16 = vreinterpretq_s16_u16(q4u16);
386    q5s16 = vreinterpretq_s16_u16(q5u16);
387    q6s16 = vreinterpretq_s16_u16(q6u16);
388
389    q5s16 = vqaddq_s16(q5s16, q3s16);
390    q6s16 = vqaddq_s16(q6s16, q4s16);
391
392    d3u8 = vqrshrun_n_s16(q5s16, 7);
393    d4u8 = vqrshrun_n_s16(q6s16, 7);
394
395    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
396    dst_ptr += dst_pitch;
397    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
398    dst_ptr += dst_pitch;
399    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
400    dst_ptr += dst_pitch;
401    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
402    return;
403}
404
405void vp8_sixtap_predict8x4_neon(
406        unsigned char *src_ptr,
407        int src_pixels_per_line,
408        int xoffset,
409        int yoffset,
410        unsigned char *dst_ptr,
411        int dst_pitch) {
412    unsigned char *src;
413    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
414    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
415    uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
416    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
417    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
418    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
419    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
420    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
421    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
422
423    if (xoffset == 0) {  // secondpass_filter8x4_only
424        // load second_pass filter
425        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
426        d0s8 = vdup_lane_s8(dtmps8, 0);
427        d1s8 = vdup_lane_s8(dtmps8, 1);
428        d2s8 = vdup_lane_s8(dtmps8, 2);
429        d3s8 = vdup_lane_s8(dtmps8, 3);
430        d4s8 = vdup_lane_s8(dtmps8, 4);
431        d5s8 = vdup_lane_s8(dtmps8, 5);
432        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
433        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
434        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
435        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
436        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
437        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
438
439        // load src data
440        src = src_ptr - src_pixels_per_line * 2;
441        d22u8 = vld1_u8(src);
442        src += src_pixels_per_line;
443        d23u8 = vld1_u8(src);
444        src += src_pixels_per_line;
445        d24u8 = vld1_u8(src);
446        src += src_pixels_per_line;
447        d25u8 = vld1_u8(src);
448        src += src_pixels_per_line;
449        d26u8 = vld1_u8(src);
450        src += src_pixels_per_line;
451        d27u8 = vld1_u8(src);
452        src += src_pixels_per_line;
453        d28u8 = vld1_u8(src);
454        src += src_pixels_per_line;
455        d29u8 = vld1_u8(src);
456        src += src_pixels_per_line;
457        d30u8 = vld1_u8(src);
458
459        q3u16 = vmull_u8(d22u8, d0u8);
460        q4u16 = vmull_u8(d23u8, d0u8);
461        q5u16 = vmull_u8(d24u8, d0u8);
462        q6u16 = vmull_u8(d25u8, d0u8);
463
464        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
465        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
466        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
467        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
468
469        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
470        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
471        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
472        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
473
474        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
475        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
476        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
477        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
478
479        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
480        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
481        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
482        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
483
484        q7u16 = vmull_u8(d25u8, d3u8);
485        q8u16 = vmull_u8(d26u8, d3u8);
486        q9u16 = vmull_u8(d27u8, d3u8);
487        q10u16 = vmull_u8(d28u8, d3u8);
488
489        q3s16 = vreinterpretq_s16_u16(q3u16);
490        q4s16 = vreinterpretq_s16_u16(q4u16);
491        q5s16 = vreinterpretq_s16_u16(q5u16);
492        q6s16 = vreinterpretq_s16_u16(q6u16);
493        q7s16 = vreinterpretq_s16_u16(q7u16);
494        q8s16 = vreinterpretq_s16_u16(q8u16);
495        q9s16 = vreinterpretq_s16_u16(q9u16);
496        q10s16 = vreinterpretq_s16_u16(q10u16);
497
498        q7s16 = vqaddq_s16(q7s16, q3s16);
499        q8s16 = vqaddq_s16(q8s16, q4s16);
500        q9s16 = vqaddq_s16(q9s16, q5s16);
501        q10s16 = vqaddq_s16(q10s16, q6s16);
502
503        d6u8 = vqrshrun_n_s16(q7s16, 7);
504        d7u8 = vqrshrun_n_s16(q8s16, 7);
505        d8u8 = vqrshrun_n_s16(q9s16, 7);
506        d9u8 = vqrshrun_n_s16(q10s16, 7);
507
508        vst1_u8(dst_ptr, d6u8);
509        dst_ptr += dst_pitch;
510        vst1_u8(dst_ptr, d7u8);
511        dst_ptr += dst_pitch;
512        vst1_u8(dst_ptr, d8u8);
513        dst_ptr += dst_pitch;
514        vst1_u8(dst_ptr, d9u8);
515        return;
516    }
517
518    // load first_pass filter
519    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
520    d0s8 = vdup_lane_s8(dtmps8, 0);
521    d1s8 = vdup_lane_s8(dtmps8, 1);
522    d2s8 = vdup_lane_s8(dtmps8, 2);
523    d3s8 = vdup_lane_s8(dtmps8, 3);
524    d4s8 = vdup_lane_s8(dtmps8, 4);
525    d5s8 = vdup_lane_s8(dtmps8, 5);
526    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
527    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
528    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
529    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
530    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
531    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
532
533    // First pass: output_height lines x output_width columns (9x8)
534    if (yoffset == 0)  // firstpass_filter8x4_only
535        src = src_ptr - 2;
536    else
537        src = src_ptr - 2 - (src_pixels_per_line * 2);
538    q3u8 = vld1q_u8(src);
539    src += src_pixels_per_line;
540    q4u8 = vld1q_u8(src);
541    src += src_pixels_per_line;
542    q5u8 = vld1q_u8(src);
543    src += src_pixels_per_line;
544    q6u8 = vld1q_u8(src);
545
546    q7u16  = vmull_u8(vget_low_u8(q3u8), d0u8);
547    q8u16  = vmull_u8(vget_low_u8(q4u8), d0u8);
548    q9u16  = vmull_u8(vget_low_u8(q5u8), d0u8);
549    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
550
551    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
552    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
553    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
554    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
555
556    q7u16  = vmlsl_u8(q7u16, d28u8, d1u8);
557    q8u16  = vmlsl_u8(q8u16, d29u8, d1u8);
558    q9u16  = vmlsl_u8(q9u16, d30u8, d1u8);
559    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
560
561    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
562    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
563    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
564    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
565
566    q7u16  = vmlsl_u8(q7u16, d28u8, d4u8);
567    q8u16  = vmlsl_u8(q8u16, d29u8, d4u8);
568    q9u16  = vmlsl_u8(q9u16, d30u8, d4u8);
569    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
570
571    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
572    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
573    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
574    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
575
576    q7u16  = vmlal_u8(q7u16, d28u8, d2u8);
577    q8u16  = vmlal_u8(q8u16, d29u8, d2u8);
578    q9u16  = vmlal_u8(q9u16, d30u8, d2u8);
579    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
580
581    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
582    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
583    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
584    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
585
586    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
587    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
588    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
589    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
590
591    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
592    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
593    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
594    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
595
596    q3u16 = vmull_u8(d28u8, d3u8);
597    q4u16 = vmull_u8(d29u8, d3u8);
598    q5u16 = vmull_u8(d30u8, d3u8);
599    q6u16 = vmull_u8(d31u8, d3u8);
600
601    q3s16 = vreinterpretq_s16_u16(q3u16);
602    q4s16 = vreinterpretq_s16_u16(q4u16);
603    q5s16 = vreinterpretq_s16_u16(q5u16);
604    q6s16 = vreinterpretq_s16_u16(q6u16);
605    q7s16 = vreinterpretq_s16_u16(q7u16);
606    q8s16 = vreinterpretq_s16_u16(q8u16);
607    q9s16 = vreinterpretq_s16_u16(q9u16);
608    q10s16 = vreinterpretq_s16_u16(q10u16);
609
610    q7s16 = vqaddq_s16(q7s16, q3s16);
611    q8s16 = vqaddq_s16(q8s16, q4s16);
612    q9s16 = vqaddq_s16(q9s16, q5s16);
613    q10s16 = vqaddq_s16(q10s16, q6s16);
614
615    d22u8 = vqrshrun_n_s16(q7s16, 7);
616    d23u8 = vqrshrun_n_s16(q8s16, 7);
617    d24u8 = vqrshrun_n_s16(q9s16, 7);
618    d25u8 = vqrshrun_n_s16(q10s16, 7);
619
620    if (yoffset == 0) {  // firstpass_filter8x4_only
621        vst1_u8(dst_ptr, d22u8);
622        dst_ptr += dst_pitch;
623        vst1_u8(dst_ptr, d23u8);
624        dst_ptr += dst_pitch;
625        vst1_u8(dst_ptr, d24u8);
626        dst_ptr += dst_pitch;
627        vst1_u8(dst_ptr, d25u8);
628        return;
629    }
630
631    // First pass on the remaining 5 lines of data
632    src += src_pixels_per_line;
633    q3u8 = vld1q_u8(src);
634    src += src_pixels_per_line;
635    q4u8 = vld1q_u8(src);
636    src += src_pixels_per_line;
637    q5u8 = vld1q_u8(src);
638    src += src_pixels_per_line;
639    q6u8 = vld1q_u8(src);
640    src += src_pixels_per_line;
641    q7u8 = vld1q_u8(src);
642
643    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
644    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
645    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
646    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
647    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
648
649    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
650    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
651    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
652    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
653    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
654
655    q8u16  = vmlsl_u8(q8u16, d27u8, d1u8);
656    q9u16  = vmlsl_u8(q9u16, d28u8, d1u8);
657    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
658    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
659    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
660
661    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
662    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
663    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
664    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
665    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
666
667    q8u16  = vmlsl_u8(q8u16, d27u8, d4u8);
668    q9u16  = vmlsl_u8(q9u16, d28u8, d4u8);
669    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
670    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
671    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
672
673    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
674    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
675    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
676    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
677    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
678
679    q8u16  = vmlal_u8(q8u16, d27u8, d2u8);
680    q9u16  = vmlal_u8(q9u16, d28u8, d2u8);
681    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
682    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
683    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
684
685    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
686    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
687    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
688    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
689    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
690
691    q8u16  = vmlal_u8(q8u16, d27u8, d5u8);
692    q9u16  = vmlal_u8(q9u16, d28u8, d5u8);
693    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
694    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
695    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
696
697    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
698    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
699    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
700    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
701    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
702
703    q3u16 = vmull_u8(d27u8, d3u8);
704    q4u16 = vmull_u8(d28u8, d3u8);
705    q5u16 = vmull_u8(d29u8, d3u8);
706    q6u16 = vmull_u8(d30u8, d3u8);
707    q7u16 = vmull_u8(d31u8, d3u8);
708
709    q3s16 = vreinterpretq_s16_u16(q3u16);
710    q4s16 = vreinterpretq_s16_u16(q4u16);
711    q5s16 = vreinterpretq_s16_u16(q5u16);
712    q6s16 = vreinterpretq_s16_u16(q6u16);
713    q7s16 = vreinterpretq_s16_u16(q7u16);
714    q8s16 = vreinterpretq_s16_u16(q8u16);
715    q9s16 = vreinterpretq_s16_u16(q9u16);
716    q10s16 = vreinterpretq_s16_u16(q10u16);
717    q11s16 = vreinterpretq_s16_u16(q11u16);
718    q12s16 = vreinterpretq_s16_u16(q12u16);
719
720    q8s16 = vqaddq_s16(q8s16, q3s16);
721    q9s16 = vqaddq_s16(q9s16, q4s16);
722    q10s16 = vqaddq_s16(q10s16, q5s16);
723    q11s16 = vqaddq_s16(q11s16, q6s16);
724    q12s16 = vqaddq_s16(q12s16, q7s16);
725
726    d26u8 = vqrshrun_n_s16(q8s16, 7);
727    d27u8 = vqrshrun_n_s16(q9s16, 7);
728    d28u8 = vqrshrun_n_s16(q10s16, 7);
729    d29u8 = vqrshrun_n_s16(q11s16, 7);
730    d30u8 = vqrshrun_n_s16(q12s16, 7);
731
732    // Second pass: 8x4
733    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
734    d0s8 = vdup_lane_s8(dtmps8, 0);
735    d1s8 = vdup_lane_s8(dtmps8, 1);
736    d2s8 = vdup_lane_s8(dtmps8, 2);
737    d3s8 = vdup_lane_s8(dtmps8, 3);
738    d4s8 = vdup_lane_s8(dtmps8, 4);
739    d5s8 = vdup_lane_s8(dtmps8, 5);
740    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
741    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
742    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
743    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
744    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
745    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
746
747    q3u16 = vmull_u8(d22u8, d0u8);
748    q4u16 = vmull_u8(d23u8, d0u8);
749    q5u16 = vmull_u8(d24u8, d0u8);
750    q6u16 = vmull_u8(d25u8, d0u8);
751
752    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
753    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
754    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
755    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
756
757    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
758    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
759    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
760    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
761
762    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
763    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
764    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
765    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
766
767    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
768    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
769    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
770    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
771
772    q7u16 = vmull_u8(d25u8, d3u8);
773    q8u16 = vmull_u8(d26u8, d3u8);
774    q9u16 = vmull_u8(d27u8, d3u8);
775    q10u16 = vmull_u8(d28u8, d3u8);
776
777    q3s16 = vreinterpretq_s16_u16(q3u16);
778    q4s16 = vreinterpretq_s16_u16(q4u16);
779    q5s16 = vreinterpretq_s16_u16(q5u16);
780    q6s16 = vreinterpretq_s16_u16(q6u16);
781    q7s16 = vreinterpretq_s16_u16(q7u16);
782    q8s16 = vreinterpretq_s16_u16(q8u16);
783    q9s16 = vreinterpretq_s16_u16(q9u16);
784    q10s16 = vreinterpretq_s16_u16(q10u16);
785
786    q7s16 = vqaddq_s16(q7s16, q3s16);
787    q8s16 = vqaddq_s16(q8s16, q4s16);
788    q9s16 = vqaddq_s16(q9s16, q5s16);
789    q10s16 = vqaddq_s16(q10s16, q6s16);
790
791    d6u8 = vqrshrun_n_s16(q7s16, 7);
792    d7u8 = vqrshrun_n_s16(q8s16, 7);
793    d8u8 = vqrshrun_n_s16(q9s16, 7);
794    d9u8 = vqrshrun_n_s16(q10s16, 7);
795
796    vst1_u8(dst_ptr, d6u8);
797    dst_ptr += dst_pitch;
798    vst1_u8(dst_ptr, d7u8);
799    dst_ptr += dst_pitch;
800    vst1_u8(dst_ptr, d8u8);
801    dst_ptr += dst_pitch;
802    vst1_u8(dst_ptr, d9u8);
803    return;
804}
805
806void vp8_sixtap_predict8x8_neon(
807        unsigned char *src_ptr,
808        int src_pixels_per_line,
809        int xoffset,
810        int yoffset,
811        unsigned char *dst_ptr,
812        int dst_pitch) {
813    unsigned char *src, *tmpp;
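    /* tmp holds the first 8 rows of 8-wide first-pass output; the last 5
     * rows needed by the vertical filter are kept in registers (d26-d30). */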
814    unsigned char tmp[64];
815    int i;
816    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
817    uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
818    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
819    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
820    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
821    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
822    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
823    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
824    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
825
826    if (xoffset == 0) {  // secondpass_filter8x8_only
827        // load second_pass filter
828        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
829        d0s8 = vdup_lane_s8(dtmps8, 0);
830        d1s8 = vdup_lane_s8(dtmps8, 1);
831        d2s8 = vdup_lane_s8(dtmps8, 2);
832        d3s8 = vdup_lane_s8(dtmps8, 3);
833        d4s8 = vdup_lane_s8(dtmps8, 4);
834        d5s8 = vdup_lane_s8(dtmps8, 5);
835        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
836        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
837        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
838        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
839        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
840        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
841
842        // load src data
843        src = src_ptr - src_pixels_per_line * 2;
844        d18u8 = vld1_u8(src);
845        src += src_pixels_per_line;
846        d19u8 = vld1_u8(src);
847        src += src_pixels_per_line;
848        d20u8 = vld1_u8(src);
849        src += src_pixels_per_line;
850        d21u8 = vld1_u8(src);
851        src += src_pixels_per_line;
852        d22u8 = vld1_u8(src);
853        src += src_pixels_per_line;
854        d23u8 = vld1_u8(src);
855        src += src_pixels_per_line;
856        d24u8 = vld1_u8(src);
857        src += src_pixels_per_line;
858        d25u8 = vld1_u8(src);
859        src += src_pixels_per_line;
860        d26u8 = vld1_u8(src);
861        src += src_pixels_per_line;
862        d27u8 = vld1_u8(src);
863        src += src_pixels_per_line;
864        d28u8 = vld1_u8(src);
865        src += src_pixels_per_line;
866        d29u8 = vld1_u8(src);
867        src += src_pixels_per_line;
868        d30u8 = vld1_u8(src);
869
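        // Each iteration filters 4 output rows vertically, then slides the
        // 9-row source window (d18-d26) down by 4 rows.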
870        for (i = 2; i > 0; i--) {
871            q3u16 = vmull_u8(d18u8, d0u8);
872            q4u16 = vmull_u8(d19u8, d0u8);
873            q5u16 = vmull_u8(d20u8, d0u8);
874            q6u16 = vmull_u8(d21u8, d0u8);
875
876            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
877            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
878            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
879            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
880
881            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
882            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
883            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
884            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
885
886            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
887            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
888            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
889            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
890
891            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
892            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
893            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
894            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
895
896            q7u16 = vmull_u8(d21u8, d3u8);
897            q8u16 = vmull_u8(d22u8, d3u8);
898            q9u16 = vmull_u8(d23u8, d3u8);
899            q10u16 = vmull_u8(d24u8, d3u8);
900
901            q3s16 = vreinterpretq_s16_u16(q3u16);
902            q4s16 = vreinterpretq_s16_u16(q4u16);
903            q5s16 = vreinterpretq_s16_u16(q5u16);
904            q6s16 = vreinterpretq_s16_u16(q6u16);
905            q7s16 = vreinterpretq_s16_u16(q7u16);
906            q8s16 = vreinterpretq_s16_u16(q8u16);
907            q9s16 = vreinterpretq_s16_u16(q9u16);
908            q10s16 = vreinterpretq_s16_u16(q10u16);
909
910            q7s16 = vqaddq_s16(q7s16, q3s16);
911            q8s16 = vqaddq_s16(q8s16, q4s16);
912            q9s16 = vqaddq_s16(q9s16, q5s16);
913            q10s16 = vqaddq_s16(q10s16, q6s16);
914
915            d6u8 = vqrshrun_n_s16(q7s16, 7);
916            d7u8 = vqrshrun_n_s16(q8s16, 7);
917            d8u8 = vqrshrun_n_s16(q9s16, 7);
918            d9u8 = vqrshrun_n_s16(q10s16, 7);
919
920            d18u8 = d22u8;
921            d19u8 = d23u8;
922            d20u8 = d24u8;
923            d21u8 = d25u8;
924            d22u8 = d26u8;
925            d23u8 = d27u8;
926            d24u8 = d28u8;
927            d25u8 = d29u8;
928            d26u8 = d30u8;
929
930            vst1_u8(dst_ptr, d6u8);
931            dst_ptr += dst_pitch;
932            vst1_u8(dst_ptr, d7u8);
933            dst_ptr += dst_pitch;
934            vst1_u8(dst_ptr, d8u8);
935            dst_ptr += dst_pitch;
936            vst1_u8(dst_ptr, d9u8);
937            dst_ptr += dst_pitch;
938        }
939        return;
940    }
941
942    // load first_pass filter
943    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
944    d0s8 = vdup_lane_s8(dtmps8, 0);
945    d1s8 = vdup_lane_s8(dtmps8, 1);
946    d2s8 = vdup_lane_s8(dtmps8, 2);
947    d3s8 = vdup_lane_s8(dtmps8, 3);
948    d4s8 = vdup_lane_s8(dtmps8, 4);
949    d5s8 = vdup_lane_s8(dtmps8, 5);
950    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
951    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
952    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
953    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
954    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
955    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
956
957    // First pass: output_height lines x output_width columns (13x8)
958    if (yoffset == 0)  // firstpass_filter8x8_only
959        src = src_ptr - 2;
960    else
961        src = src_ptr - 2 - (src_pixels_per_line * 2);
962
963    tmpp = tmp;
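    /* First-pass loop: two iterations of 4 rows of horizontal filtering.
     * When yoffset == 0 the rows go straight to dst_ptr; otherwise they are
     * staged in tmp for the vertical pass. */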
964    for (i = 2; i > 0; i--) {
965        q3u8 = vld1q_u8(src);
966        src += src_pixels_per_line;
967        q4u8 = vld1q_u8(src);
968        src += src_pixels_per_line;
969        q5u8 = vld1q_u8(src);
970        src += src_pixels_per_line;
971        q6u8 = vld1q_u8(src);
972        src += src_pixels_per_line;
973
974        __builtin_prefetch(src);
975        __builtin_prefetch(src + src_pixels_per_line);
976        __builtin_prefetch(src + src_pixels_per_line * 2);
977
978        q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
979        q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
980        q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
981        q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
982
983        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
984        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
985        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
986        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
987
988        q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
989        q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
990        q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
991        q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
992
993        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
994        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
995        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
996        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
997
998        q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
999        q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
1000        q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
1001        q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
1002
1003        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1004        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1005        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1006        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1007
1008        q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
1009        q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
1010        q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
1011        q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
1012
1013        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1014        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1015        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1016        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1017
1018        q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
1019        q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
1020        q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
1021        q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
1022
1023        d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1024        d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1025        d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1026        d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1027
1028        q3u16 = vmull_u8(d28u8, d3u8);
1029        q4u16 = vmull_u8(d29u8, d3u8);
1030        q5u16 = vmull_u8(d30u8, d3u8);
1031        q6u16 = vmull_u8(d31u8, d3u8);
1032
1033        q3s16 = vreinterpretq_s16_u16(q3u16);
1034        q4s16 = vreinterpretq_s16_u16(q4u16);
1035        q5s16 = vreinterpretq_s16_u16(q5u16);
1036        q6s16 = vreinterpretq_s16_u16(q6u16);
1037        q7s16 = vreinterpretq_s16_u16(q7u16);
1038        q8s16 = vreinterpretq_s16_u16(q8u16);
1039        q9s16 = vreinterpretq_s16_u16(q9u16);
1040        q10s16 = vreinterpretq_s16_u16(q10u16);
1041
1042        q7s16 = vqaddq_s16(q7s16, q3s16);
1043        q8s16 = vqaddq_s16(q8s16, q4s16);
1044        q9s16 = vqaddq_s16(q9s16, q5s16);
1045        q10s16 = vqaddq_s16(q10s16, q6s16);
1046
1047        d22u8 = vqrshrun_n_s16(q7s16, 7);
1048        d23u8 = vqrshrun_n_s16(q8s16, 7);
1049        d24u8 = vqrshrun_n_s16(q9s16, 7);
1050        d25u8 = vqrshrun_n_s16(q10s16, 7);
1051
1052        if (yoffset == 0) {  // firstpass_filter8x8_only
1053            vst1_u8(dst_ptr, d22u8);
1054            dst_ptr += dst_pitch;
1055            vst1_u8(dst_ptr, d23u8);
1056            dst_ptr += dst_pitch;
1057            vst1_u8(dst_ptr, d24u8);
1058            dst_ptr += dst_pitch;
1059            vst1_u8(dst_ptr, d25u8);
1060            dst_ptr += dst_pitch;
1061        } else {
1062            vst1_u8(tmpp, d22u8);
1063            tmpp += 8;
1064            vst1_u8(tmpp, d23u8);
1065            tmpp += 8;
1066            vst1_u8(tmpp, d24u8);
1067            tmpp += 8;
1068            vst1_u8(tmpp, d25u8);
1069            tmpp += 8;
1070        }
1071    }
1072    if (yoffset == 0)
1073        return;
1074
1075    // First pass on the remaining 5 lines of data
1076    q3u8 = vld1q_u8(src);
1077    src += src_pixels_per_line;
1078    q4u8 = vld1q_u8(src);
1079    src += src_pixels_per_line;
1080    q5u8 = vld1q_u8(src);
1081    src += src_pixels_per_line;
1082    q6u8 = vld1q_u8(src);
1083    src += src_pixels_per_line;
1084    q7u8 = vld1q_u8(src);
1085
1086    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
1087    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
1088    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
1089    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
1090    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
1091
1092    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
1093    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
1094    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
1095    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
1096    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
1097
1098    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
1099    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1100    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1101    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
1102    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
1103
1104    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
1105    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
1106    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
1107    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
1108    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
1109
1110    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
1111    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1112    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1113    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
1114    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
1115
1116    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1117    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1118    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1119    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1120    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
1121
1122    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
1123    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1124    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1125    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
1126    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
1127
1128    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1129    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1130    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1131    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1132    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
1133
1134    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
1135    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1136    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1137    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
1138    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
1139
1140    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1141    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1142    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1143    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1144    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
1145
1146    q3u16 = vmull_u8(d27u8, d3u8);
1147    q4u16 = vmull_u8(d28u8, d3u8);
1148    q5u16 = vmull_u8(d29u8, d3u8);
1149    q6u16 = vmull_u8(d30u8, d3u8);
1150    q7u16 = vmull_u8(d31u8, d3u8);
1151
1152    q3s16 = vreinterpretq_s16_u16(q3u16);
1153    q4s16 = vreinterpretq_s16_u16(q4u16);
1154    q5s16 = vreinterpretq_s16_u16(q5u16);
1155    q6s16 = vreinterpretq_s16_u16(q6u16);
1156    q7s16 = vreinterpretq_s16_u16(q7u16);
1157    q8s16 = vreinterpretq_s16_u16(q8u16);
1158    q9s16 = vreinterpretq_s16_u16(q9u16);
1159    q10s16 = vreinterpretq_s16_u16(q10u16);
1160    q11s16 = vreinterpretq_s16_u16(q11u16);
1161    q12s16 = vreinterpretq_s16_u16(q12u16);
1162
1163    q8s16 = vqaddq_s16(q8s16, q3s16);
1164    q9s16 = vqaddq_s16(q9s16, q4s16);
1165    q10s16 = vqaddq_s16(q10s16, q5s16);
1166    q11s16 = vqaddq_s16(q11s16, q6s16);
1167    q12s16 = vqaddq_s16(q12s16, q7s16);
1168
1169    d26u8 = vqrshrun_n_s16(q8s16, 7);
1170    d27u8 = vqrshrun_n_s16(q9s16, 7);
1171    d28u8 = vqrshrun_n_s16(q10s16, 7);
1172    d29u8 = vqrshrun_n_s16(q11s16, 7);
1173    d30u8 = vqrshrun_n_s16(q12s16, 7);
1174
1175    // Second pass: 8x8
1176    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1177    d0s8 = vdup_lane_s8(dtmps8, 0);
1178    d1s8 = vdup_lane_s8(dtmps8, 1);
1179    d2s8 = vdup_lane_s8(dtmps8, 2);
1180    d3s8 = vdup_lane_s8(dtmps8, 3);
1181    d4s8 = vdup_lane_s8(dtmps8, 4);
1182    d5s8 = vdup_lane_s8(dtmps8, 5);
1183    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1184    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1185    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1186    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1187    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1188    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1189
1190    tmpp = tmp;
1191    q9u8 = vld1q_u8(tmpp);
1192    tmpp += 16;
1193    q10u8 = vld1q_u8(tmpp);
1194    tmpp += 16;
1195    q11u8 = vld1q_u8(tmpp);
1196    tmpp += 16;
1197    q12u8 = vld1q_u8(tmpp);
1198
1199    d18u8 = vget_low_u8(q9u8);
1200    d19u8 = vget_high_u8(q9u8);
1201    d20u8 = vget_low_u8(q10u8);
1202    d21u8 = vget_high_u8(q10u8);
1203    d22u8 = vget_low_u8(q11u8);
1204    d23u8 = vget_high_u8(q11u8);
1205    d24u8 = vget_low_u8(q12u8);
1206    d25u8 = vget_high_u8(q12u8);
1207
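    // Second-pass loop: each iteration filters 4 rows vertically and slides
    // the 9-row window through the 13 buffered rows (d18-d30).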
1208    for (i = 2; i > 0; i--) {
1209        q3u16 = vmull_u8(d18u8, d0u8);
1210        q4u16 = vmull_u8(d19u8, d0u8);
1211        q5u16 = vmull_u8(d20u8, d0u8);
1212        q6u16 = vmull_u8(d21u8, d0u8);
1213
1214        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1215        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1216        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1217        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1218
1219        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1220        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1221        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1222        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1223
1224        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1225        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1226        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1227        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1228
1229        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1230        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1231        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1232        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1233
1234        q7u16 = vmull_u8(d21u8, d3u8);
1235        q8u16 = vmull_u8(d22u8, d3u8);
1236        q9u16 = vmull_u8(d23u8, d3u8);
1237        q10u16 = vmull_u8(d24u8, d3u8);
1238
1239        q3s16 = vreinterpretq_s16_u16(q3u16);
1240        q4s16 = vreinterpretq_s16_u16(q4u16);
1241        q5s16 = vreinterpretq_s16_u16(q5u16);
1242        q6s16 = vreinterpretq_s16_u16(q6u16);
1243        q7s16 = vreinterpretq_s16_u16(q7u16);
1244        q8s16 = vreinterpretq_s16_u16(q8u16);
1245        q9s16 = vreinterpretq_s16_u16(q9u16);
1246        q10s16 = vreinterpretq_s16_u16(q10u16);
1247
1248        q7s16 = vqaddq_s16(q7s16, q3s16);
1249        q8s16 = vqaddq_s16(q8s16, q4s16);
1250        q9s16 = vqaddq_s16(q9s16, q5s16);
1251        q10s16 = vqaddq_s16(q10s16, q6s16);
1252
1253        d6u8 = vqrshrun_n_s16(q7s16, 7);
1254        d7u8 = vqrshrun_n_s16(q8s16, 7);
1255        d8u8 = vqrshrun_n_s16(q9s16, 7);
1256        d9u8 = vqrshrun_n_s16(q10s16, 7);
1257
1258        d18u8 = d22u8;
1259        d19u8 = d23u8;
1260        d20u8 = d24u8;
1261        d21u8 = d25u8;
1262        d22u8 = d26u8;
1263        d23u8 = d27u8;
1264        d24u8 = d28u8;
1265        d25u8 = d29u8;
1266        d26u8 = d30u8;
1267
1268        vst1_u8(dst_ptr, d6u8);
1269        dst_ptr += dst_pitch;
1270        vst1_u8(dst_ptr, d7u8);
1271        dst_ptr += dst_pitch;
1272        vst1_u8(dst_ptr, d8u8);
1273        dst_ptr += dst_pitch;
1274        vst1_u8(dst_ptr, d9u8);
1275        dst_ptr += dst_pitch;
1276    }
1277    return;
1278}
1279
1280void vp8_sixtap_predict16x16_neon(
1281        unsigned char *src_ptr,
1282        int src_pixels_per_line,
1283        int xoffset,
1284        int yoffset,
1285        unsigned char *dst_ptr,
1286        int dst_pitch) {
1287    unsigned char *src, *src_tmp, *dst, *tmpp;
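    /* 336 = 21 * 16: the 16-wide first-pass output needs 16 rows plus the
     * 5 extra rows consumed by the six-tap vertical filter. */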
1288    unsigned char tmp[336];
1289    int i, j;
1290    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
1291    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
1292    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
1293    uint8x8_t d28u8, d29u8, d30u8, d31u8;
1294    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
1295    uint8x16_t q3u8, q4u8;
1296    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
1297    uint16x8_t q11u16, q12u16, q13u16, q15u16;
1298    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
1299    int16x8_t q11s16, q12s16, q13s16, q15s16;
1300
1301    if (xoffset == 0) {  // secondpass_filter16x16_only
1302        // load second_pass filter
1303        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1304        d0s8 = vdup_lane_s8(dtmps8, 0);
1305        d1s8 = vdup_lane_s8(dtmps8, 1);
1306        d2s8 = vdup_lane_s8(dtmps8, 2);
1307        d3s8 = vdup_lane_s8(dtmps8, 3);
1308        d4s8 = vdup_lane_s8(dtmps8, 4);
1309        d5s8 = vdup_lane_s8(dtmps8, 5);
1310        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1311        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1312        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1313        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1314        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1315        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1316
1317        // load src data
1318        src_tmp = src_ptr - src_pixels_per_line * 2;
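        /* Process the 16-wide block as two 8-pixel columns (outer loop),
         * filtering 4 output rows per inner-loop iteration. */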
1319        for (i = 0; i < 2; i++) {
1320            src = src_tmp + i * 8;
1321            dst = dst_ptr + i * 8;
1322            d18u8 = vld1_u8(src);
1323            src += src_pixels_per_line;
1324            d19u8 = vld1_u8(src);
1325            src += src_pixels_per_line;
1326            d20u8 = vld1_u8(src);
1327            src += src_pixels_per_line;
1328            d21u8 = vld1_u8(src);
1329            src += src_pixels_per_line;
1330            d22u8 = vld1_u8(src);
1331            src += src_pixels_per_line;
1332            for (j = 0; j < 4; j++) {
1333                d23u8 = vld1_u8(src);
1334                src += src_pixels_per_line;
1335                d24u8 = vld1_u8(src);
1336                src += src_pixels_per_line;
1337                d25u8 = vld1_u8(src);
1338                src += src_pixels_per_line;
1339                d26u8 = vld1_u8(src);
1340                src += src_pixels_per_line;
1341
                q3u16 = vmull_u8(d18u8, d0u8);
                q4u16 = vmull_u8(d19u8, d0u8);
                q5u16 = vmull_u8(d20u8, d0u8);
                q6u16 = vmull_u8(d21u8, d0u8);

                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

                q7u16 = vmull_u8(d21u8, d3u8);
                q8u16 = vmull_u8(d22u8, d3u8);
                q9u16 = vmull_u8(d23u8, d3u8);
                q10u16 = vmull_u8(d24u8, d3u8);

                q3s16 = vreinterpretq_s16_u16(q3u16);
                q4s16 = vreinterpretq_s16_u16(q4u16);
                q5s16 = vreinterpretq_s16_u16(q5u16);
                q6s16 = vreinterpretq_s16_u16(q6u16);
                q7s16 = vreinterpretq_s16_u16(q7u16);
                q8s16 = vreinterpretq_s16_u16(q8u16);
                q9s16 = vreinterpretq_s16_u16(q9u16);
                q10s16 = vreinterpretq_s16_u16(q10u16);

                q7s16 = vqaddq_s16(q7s16, q3s16);
                q8s16 = vqaddq_s16(q8s16, q4s16);
                q9s16 = vqaddq_s16(q9s16, q5s16);
                q10s16 = vqaddq_s16(q10s16, q6s16);

                d6u8 = vqrshrun_n_s16(q7s16, 7);
                d7u8 = vqrshrun_n_s16(q8s16, 7);
                d8u8 = vqrshrun_n_s16(q9s16, 7);
                d9u8 = vqrshrun_n_s16(q10s16, 7);

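                // Slide the five-row window down by four rows for the next
                // group of outputs.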
                d18u8 = d22u8;
                d19u8 = d23u8;
                d20u8 = d24u8;
                d21u8 = d25u8;
                d22u8 = d26u8;

                vst1_u8(dst, d6u8);
                dst += dst_pitch;
                vst1_u8(dst, d7u8);
                dst += dst_pitch;
                vst1_u8(dst, d8u8);
                dst += dst_pitch;
                vst1_u8(dst, d9u8);
                dst += dst_pitch;
            }
        }
        return;
    }

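    // The filter taps at positions 1 and 4 are negative, so the taps are
    // stored as absolute values and applied with vmlsl_u8 (subtract) or
    // vmlal_u8 (add), keeping all multiplies in unsigned 8-bit arithmetic.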
    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // First pass: horizontal filter only when yoffset is 0 (16x16 straight
    // to dst); otherwise 21 lines x 16 columns are filtered into tmp below.
    if (yoffset == 0) {  // firstpass_filter16x16_only
        src = src_ptr - 2;
        dst = dst_ptr;
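        // Each iteration filters two source rows (24 bytes loaded to cover
        // the 16 outputs plus the 5-pixel filter overhang) and stores two
        // 16-byte output rows.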
        for (i = 0; i < 8; i++) {
            d6u8 = vld1_u8(src);
            d7u8 = vld1_u8(src + 8);
            d8u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;
            d9u8 = vld1_u8(src);
            d10u8 = vld1_u8(src + 8);
            d11u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;

            __builtin_prefetch(src);
            __builtin_prefetch(src + src_pixels_per_line);

            q6u16 = vmull_u8(d6u8, d0u8);
            q7u16 = vmull_u8(d7u8, d0u8);
            q8u16 = vmull_u8(d9u8, d0u8);
            q9u16 = vmull_u8(d10u8, d0u8);

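            // vext_u8 builds the source vectors shifted by 1..5 pixels so
            // each filter tap multiplies the correctly offset bytes.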
            d20u8 = vext_u8(d6u8, d7u8, 1);
            d21u8 = vext_u8(d9u8, d10u8, 1);
            d22u8 = vext_u8(d7u8, d8u8, 1);
            d23u8 = vext_u8(d10u8, d11u8, 1);
            d24u8 = vext_u8(d6u8, d7u8, 4);
            d25u8 = vext_u8(d9u8, d10u8, 4);
            d26u8 = vext_u8(d7u8, d8u8, 4);
            d27u8 = vext_u8(d10u8, d11u8, 4);
            d28u8 = vext_u8(d6u8, d7u8, 5);
            d29u8 = vext_u8(d9u8, d10u8, 5);

            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

            d20u8 = vext_u8(d7u8, d8u8, 5);
            d21u8 = vext_u8(d10u8, d11u8, 5);
            d22u8 = vext_u8(d6u8, d7u8, 2);
            d23u8 = vext_u8(d9u8, d10u8, 2);
            d24u8 = vext_u8(d7u8, d8u8, 2);
            d25u8 = vext_u8(d10u8, d11u8, 2);
            d26u8 = vext_u8(d6u8, d7u8, 3);
            d27u8 = vext_u8(d9u8, d10u8, 3);
            d28u8 = vext_u8(d7u8, d8u8, 3);
            d29u8 = vext_u8(d10u8, d11u8, 3);

            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

            q10u16 = vmull_u8(d26u8, d3u8);
            q11u16 = vmull_u8(d27u8, d3u8);
            q12u16 = vmull_u8(d28u8, d3u8);
            q15u16 = vmull_u8(d29u8, d3u8);

            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);
            q11s16 = vreinterpretq_s16_u16(q11u16);
            q12s16 = vreinterpretq_s16_u16(q12u16);
            q15s16 = vreinterpretq_s16_u16(q15u16);

            q6s16 = vqaddq_s16(q6s16, q10s16);
            q8s16 = vqaddq_s16(q8s16, q11s16);
            q7s16 = vqaddq_s16(q7s16, q12s16);
            q9s16 = vqaddq_s16(q9s16, q15s16);

            d6u8 = vqrshrun_n_s16(q6s16, 7);
            d7u8 = vqrshrun_n_s16(q7s16, 7);
            d8u8 = vqrshrun_n_s16(q8s16, 7);
            d9u8 = vqrshrun_n_s16(q9s16, 7);

            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);
            vst1q_u8(dst, q3u8);
            dst += dst_pitch;
            vst1q_u8(dst, q4u8);
            dst += dst_pitch;
        }
        return;
    }

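    // Two-pass path: the first pass filters 21 rows of 16 pixels horizontally
    // into the intermediate tmp buffer (7 iterations of 3 rows each); the
    // second pass below then filters tmp vertically into dst.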
    src = src_ptr - 2 - src_pixels_per_line * 2;
    tmpp = tmp;
    for (i = 0; i < 7; i++) {
        d6u8 = vld1_u8(src);
        d7u8 = vld1_u8(src + 8);
        d8u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d9u8 = vld1_u8(src);
        d10u8 = vld1_u8(src + 8);
        d11u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d12u8 = vld1_u8(src);
        d13u8 = vld1_u8(src + 8);
        d14u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;

        __builtin_prefetch(src);
        __builtin_prefetch(src + src_pixels_per_line);
        __builtin_prefetch(src + src_pixels_per_line * 2);

        q8u16 = vmull_u8(d6u8, d0u8);
        q9u16 = vmull_u8(d7u8, d0u8);
        q10u16 = vmull_u8(d9u8, d0u8);
        q11u16 = vmull_u8(d10u8, d0u8);
        q12u16 = vmull_u8(d12u8, d0u8);
        q13u16 = vmull_u8(d13u8, d0u8);

        d28u8 = vext_u8(d6u8, d7u8, 1);
        d29u8 = vext_u8(d9u8, d10u8, 1);
        d30u8 = vext_u8(d12u8, d13u8, 1);
        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
        d28u8 = vext_u8(d7u8, d8u8, 1);
        d29u8 = vext_u8(d10u8, d11u8, 1);
        d30u8 = vext_u8(d13u8, d14u8, 1);
        q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

        d28u8 = vext_u8(d6u8, d7u8, 4);
        d29u8 = vext_u8(d9u8, d10u8, 4);
        d30u8 = vext_u8(d12u8, d13u8, 4);
        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
        d28u8 = vext_u8(d7u8, d8u8, 4);
        d29u8 = vext_u8(d10u8, d11u8, 4);
        d30u8 = vext_u8(d13u8, d14u8, 4);
        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

        d28u8 = vext_u8(d6u8, d7u8, 5);
        d29u8 = vext_u8(d9u8, d10u8, 5);
        d30u8 = vext_u8(d12u8, d13u8, 5);
        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
        d28u8 = vext_u8(d7u8, d8u8, 5);
        d29u8 = vext_u8(d10u8, d11u8, 5);
        d30u8 = vext_u8(d13u8, d14u8, 5);
        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

        d28u8 = vext_u8(d6u8, d7u8, 2);
        d29u8 = vext_u8(d9u8, d10u8, 2);
        d30u8 = vext_u8(d12u8, d13u8, 2);
        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
        d28u8 = vext_u8(d7u8, d8u8, 2);
        d29u8 = vext_u8(d10u8, d11u8, 2);
        d30u8 = vext_u8(d13u8, d14u8, 2);
        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

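        // Tap 3 is multiplied on its own and combined with vqaddq_s16 so the
        // final sum saturates instead of wrapping.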
        d28u8 = vext_u8(d6u8, d7u8, 3);
        d29u8 = vext_u8(d9u8, d10u8, 3);
        d30u8 = vext_u8(d12u8, d13u8, 3);
        d15u8 = vext_u8(d7u8, d8u8, 3);
        d31u8 = vext_u8(d10u8, d11u8, 3);
        d6u8 = vext_u8(d13u8, d14u8, 3);
        q4u16 = vmull_u8(d28u8, d3u8);
        q5u16 = vmull_u8(d29u8, d3u8);
        q6u16 = vmull_u8(d30u8, d3u8);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);
        q12s16 = vreinterpretq_s16_u16(q12u16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q10s16 = vqaddq_s16(q10s16, q5s16);
        q12s16 = vqaddq_s16(q12s16, q6s16);

        q6u16 = vmull_u8(d15u8, d3u8);
        q7u16 = vmull_u8(d31u8, d3u8);
        q3u16 = vmull_u8(d6u8, d3u8);
        q3s16 = vreinterpretq_s16_u16(q3u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q11s16 = vreinterpretq_s16_u16(q11u16);
        q13s16 = vreinterpretq_s16_u16(q13u16);
        q9s16 = vqaddq_s16(q9s16, q6s16);
        q11s16 = vqaddq_s16(q11s16, q7s16);
        q13s16 = vqaddq_s16(q13s16, q3s16);

        d6u8 = vqrshrun_n_s16(q8s16, 7);
        d7u8 = vqrshrun_n_s16(q9s16, 7);
        d8u8 = vqrshrun_n_s16(q10s16, 7);
        d9u8 = vqrshrun_n_s16(q11s16, 7);
        d10u8 = vqrshrun_n_s16(q12s16, 7);
        d11u8 = vqrshrun_n_s16(q13s16, 7);

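        // Store the three filtered rows (six 8-byte halves, 48 bytes)
        // contiguously into tmp.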
        vst1_u8(tmpp, d6u8);
        tmpp += 8;
        vst1_u8(tmpp, d7u8);
        tmpp += 8;
        vst1_u8(tmpp, d8u8);
        tmpp += 8;
        vst1_u8(tmpp, d9u8);
        tmpp += 8;
        vst1_u8(tmpp, d10u8);
        tmpp += 8;
        vst1_u8(tmpp, d11u8);
        tmpp += 8;
    }

    // Second pass: 16x16
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

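    // Filter tmp vertically, again as two 8-wide halves; rows in tmp are
    // 16 bytes apart, hence the stride of 16 on tmpp.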
    for (i = 0; i < 2; i++) {
        dst = dst_ptr + 8 * i;
        tmpp = tmp + 8 * i;
        d18u8 = vld1_u8(tmpp);
        tmpp += 16;
        d19u8 = vld1_u8(tmpp);
        tmpp += 16;
        d20u8 = vld1_u8(tmpp);
        tmpp += 16;
        d21u8 = vld1_u8(tmpp);
        tmpp += 16;
        d22u8 = vld1_u8(tmpp);
        tmpp += 16;
        for (j = 0; j < 4; j++) {
            d23u8 = vld1_u8(tmpp);
            tmpp += 16;
            d24u8 = vld1_u8(tmpp);
            tmpp += 16;
            d25u8 = vld1_u8(tmpp);
            tmpp += 16;
            d26u8 = vld1_u8(tmpp);
            tmpp += 16;

            q3u16 = vmull_u8(d18u8, d0u8);
            q4u16 = vmull_u8(d19u8, d0u8);
            q5u16 = vmull_u8(d20u8, d0u8);
            q6u16 = vmull_u8(d21u8, d0u8);

            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

            q7u16 = vmull_u8(d21u8, d3u8);
            q8u16 = vmull_u8(d22u8, d3u8);
            q9u16 = vmull_u8(d23u8, d3u8);
            q10u16 = vmull_u8(d24u8, d3u8);

            q3s16 = vreinterpretq_s16_u16(q3u16);
            q4s16 = vreinterpretq_s16_u16(q4u16);
            q5s16 = vreinterpretq_s16_u16(q5u16);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);

            q7s16 = vqaddq_s16(q7s16, q3s16);
            q8s16 = vqaddq_s16(q8s16, q4s16);
            q9s16 = vqaddq_s16(q9s16, q5s16);
            q10s16 = vqaddq_s16(q10s16, q6s16);

            d6u8 = vqrshrun_n_s16(q7s16, 7);
            d7u8 = vqrshrun_n_s16(q8s16, 7);
            d8u8 = vqrshrun_n_s16(q9s16, 7);
            d9u8 = vqrshrun_n_s16(q10s16, 7);

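            // Advance the five-row window by four rows for the next group.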
            d18u8 = d22u8;
            d19u8 = d23u8;
            d20u8 = d24u8;
            d21u8 = d25u8;
            d22u8 = d26u8;

            vst1_u8(dst, d6u8);
            dst += dst_pitch;
            vst1_u8(dst, d7u8);
            dst += dst_pitch;
            vst1_u8(dst, d8u8);
            dst += dst_pitch;
            vst1_u8(dst, d9u8);
            dst += dst_pitch;
        }
    }
    return;
}