1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <arm_neon.h>
12#include "vpx_ports/mem.h"
13#include "vpx/vpx_integer.h"
14
/* Bilinear interpolation taps for the 8 eighth-pel sub-pixel positions.
 * Row [k] holds the {this-pixel, next-pixel} weights for offset k/8.
 * Each pair sums to 128, i.e. 1.0 in Q7 fixed point; the filtered value
 * is rounded back down by the vqrshrn_n_u16(..., 7) calls below.
 * Values all fit in 8 bits, as required by the vdup_n_u8() users. */
static const uint16_t bilinear_taps_coeff[8][2] = {
    {128,   0},
    {112,  16},
    { 96,  32},
    { 80,  48},
    { 64,  64},
    { 48,  80},
    { 32,  96},
    { 16, 112}
};
25
/* Sub-pixel variance of a 16x16 block.
 *
 * Bilinearly interpolates the source at the eighth-pel position given by
 * (xoffset, yoffset), then computes the variance of the interpolated block
 * against dst_ptr.
 *
 * src_ptr / src_pixels_per_line : source pixels and stride.
 * xoffset, yoffset              : sub-pixel position, 0..7 (index into
 *                                 bilinear_taps_coeff).
 * dst_ptr / dst_pixels_per_line : reference pixels and stride.
 * sse                           : out; receives the sum of squared
 *                                 differences.
 * Returns the variance: sse - (sum*sum)/256 (256 = 16*16 pixels).
 *
 * tmp buffer layout (528 bytes total):
 *   tmp[0..271]   : first-pass output, 17 rows x 16 bytes (the vertical
 *                   second pass needs one extra row).
 *   tmp[272..527] : final filtered block, 16 rows x 16 bytes, which is
 *                   what the variance loop at the bottom reads.
 */
unsigned int vp8_sub_pixel_variance16x16_neon_func(
        const unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        const unsigned char *dst_ptr,
        int dst_pixels_per_line,
        unsigned int *sse) {
    int i;
    /* 16-byte aligned scratch: 17*16 first-pass rows + 16*16 final rows. */
    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
    unsigned char *tmpp;
    unsigned char *tmpp2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
    uint8x8_t d19u8, d20u8, d21u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    tmpp2 = tmp + 272;  /* final (vertically filtered) block goes here */
    tmpp = tmp;
    if (xoffset == 0) {  // secondpass_bfilter16x16_only
        /* Vertical-only filtering: apply the yoffset taps directly to the
         * source rows; no intermediate first-pass buffer is needed. */
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        /* Prime with row 0; each loop iteration consumes 4 more rows and
         * produces 4 output rows (17 rows read for 16 rows written). */
        q11u8 = vld1q_u8(src_ptr);
        src_ptr += src_pixels_per_line;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q13u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q14u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q15u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            /* out[r] = (row[r]*tap0 + row[r+1]*tap1 + 64) >> 7 */
            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            /* Rounding narrow back to 8 bits (Q7 -> integer). */
            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            /* Last row loaded becomes the first row of the next group. */
            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
        /* Horizontal-only filtering: each output row depends on one source
         * row only, so 16 rows are read and written straight to tmpp2. */
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        for (i = 4; i > 0 ; i--) {
            /* 24 bytes per row: 16 output pixels need pixel [16] for the
             * shifted tap, hence the third 8-byte load. */
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            /* out[x] = (src[x]*tap0 + src[x+1]*tap1 + 64) >> 7 */
            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            /* vext by 1 builds the "src + 1" shifted vectors in-register. */
            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            q7u8  = vcombine_u8(d14u8, d15u8);
            q8u8  = vcombine_u8(d16u8, d17u8);
            q9u8  = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp2, q7u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q8u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q9u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q10u8);
            tmpp2 += 16;
        }
    } else {
        /* General case: horizontal first pass into tmp[0..271], then a
         * vertical second pass into tmp[272..527]. */
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        /* Pre-load the first 4 source rows (24 bytes each). */
        d2u8 = vld1_u8(src_ptr);
        d3u8 = vld1_u8(src_ptr + 8);
        d4u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr);
        d6u8 = vld1_u8(src_ptr + 8);
        d7u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d8u8 = vld1_u8(src_ptr);
        d9u8 = vld1_u8(src_ptr + 8);
        d10u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d11u8 = vld1_u8(src_ptr);
        d12u8 = vld1_u8(src_ptr + 8);
        d13u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        // First Pass: output_height lines x output_width columns (17x16)
        /* 3 iterations x 4 rows = 12 rows here; the remaining 5 of the 17
         * first-pass rows are handled after the loop. */
        for (i = 3; i > 0; i--) {
            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            /* Load the next 4 rows before storing, to overlap memory
             * traffic with the combines below (software pipelining). */
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp, q7u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q8u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q9u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q10u8);
            tmpp += 16;
        }

        // First-pass filtering for rest 5 lines
        /* Rows 13..16 are already in d2..d13; load row 17 here. */
        d14u8 = vld1_u8(src_ptr);
        d15u8 = vld1_u8(src_ptr + 8);
        d16u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        q9u16  = vmull_u8(d2u8, d0u8);
        q10u16 = vmull_u8(d3u8, d0u8);
        q11u16 = vmull_u8(d5u8, d0u8);
        q12u16 = vmull_u8(d6u8, d0u8);
        q13u16 = vmull_u8(d8u8, d0u8);
        q14u16 = vmull_u8(d9u8, d0u8);

        d2u8  = vext_u8(d2u8, d3u8, 1);
        d5u8  = vext_u8(d5u8, d6u8, 1);
        d8u8  = vext_u8(d8u8, d9u8, 1);

        q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);

        d3u8  = vext_u8(d3u8, d4u8, 1);
        d6u8  = vext_u8(d6u8, d7u8, 1);
        d9u8  = vext_u8(d9u8, d10u8, 1);

        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);

        q1u16 = vmull_u8(d11u8, d0u8);
        q2u16 = vmull_u8(d12u8, d0u8);
        q3u16 = vmull_u8(d14u8, d0u8);
        q4u16 = vmull_u8(d15u8, d0u8);

        d11u8 = vext_u8(d11u8, d12u8, 1);
        d14u8 = vext_u8(d14u8, d15u8, 1);

        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);

        d12u8 = vext_u8(d12u8, d13u8, 1);
        d15u8 = vext_u8(d15u8, d16u8, 1);

        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);

        d10u8 = vqrshrn_n_u16(q9u16, 7);
        d11u8 = vqrshrn_n_u16(q10u16, 7);
        d12u8 = vqrshrn_n_u16(q11u16, 7);
        d13u8 = vqrshrn_n_u16(q12u16, 7);
        d14u8 = vqrshrn_n_u16(q13u16, 7);
        d15u8 = vqrshrn_n_u16(q14u16, 7);
        d16u8 = vqrshrn_n_u16(q1u16, 7);
        d17u8 = vqrshrn_n_u16(q2u16, 7);
        d18u8 = vqrshrn_n_u16(q3u16, 7);
        d19u8 = vqrshrn_n_u16(q4u16, 7);

        q5u8 = vcombine_u8(d10u8, d11u8);
        q6u8 = vcombine_u8(d12u8, d13u8);
        q7u8 = vcombine_u8(d14u8, d15u8);
        q8u8 = vcombine_u8(d16u8, d17u8);
        q9u8 = vcombine_u8(d18u8, d19u8);

        /* Store first-pass rows 13..17 (completes all 17 rows in tmp). */
        vst1q_u8((uint8_t *)tmpp, q5u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q6u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q7u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q8u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q9u8);

        // secondpass_filter
        /* Vertical pass over the 17 first-pass rows, same structure as the
         * xoffset == 0 branch but reading from tmp instead of src. */
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        tmpp = tmp;
        tmpp2 = tmpp + 272;
        q11u8 = vld1q_u8(tmpp);
        tmpp += 16;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q13u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q14u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q15u8 = vld1q_u8(tmpp);
            tmpp += 16;

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            /* Carry the last row into the next group of 4. */
            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    }

    // sub_pixel_variance16x16_neon
    /* Accumulators: q8s32 = sum of differences (via pairwise add-accumulate),
     * q9s32/q10s32 = partial sums of squared differences. */
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    tmpp = tmp + 272;  /* read the final filtered block */
    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
        /* 2 filtered rows and 2 reference rows per iteration. */
        q0u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q1u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q2u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;
        q3u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);

        /* Unsigned widening subtract; when a < b the u16 result wraps, and
         * the s16 reinterpretation below yields the correct signed diff. */
        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    /* Horizontal reduction: d0s64 = total sum, d1s64 = total sse. */
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    /* sum*sum as a 64-bit product; |sum| <= 256*255 so the product fits in
     * the low 32 bits used below. */
    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum*sum/256 (>> 8 because the block has 256 px). */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
494
/* Half-pel horizontal variance of a 16x16 block.
 *
 * Specialization of sub-pixel variance for xoffset == 4, yoffset == 0:
 * each predicted pixel is the rounded average of src[x] and src[x+1]
 * (vrhaddq_u8), which is exactly the {64, 64} bilinear tap pair.
 *
 * src_ptr / source_stride : source pixels and stride.
 * ref_ptr / recon_stride  : reference pixels and stride.
 * sse                     : out; sum of squared differences.
 * Returns sse - (sum*sum)/256.
 */
unsigned int vp8_variance_halfpixvar16x16_h_neon(
        const unsigned char *src_ptr,
        int  source_stride,
        const unsigned char *ref_ptr,
        int  recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
    uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
    uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    /* q8s32 = sum of diffs, q9s32/q10s32 = partial sums of squares. */
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
        /* 4 source rows per iteration; 32 bytes each so pixel [16] is
         * available for the +1 shift. */
        q0u8 = vld1q_u8(src_ptr);
        q1u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q2u8 = vld1q_u8(src_ptr);
        q3u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q4u8 = vld1q_u8(src_ptr);
        q5u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q6u8 = vld1q_u8(src_ptr);
        q7u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;

        q11u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q12u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q13u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q14u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        /* Build src+1 vectors, then rounding-average with src:
         * pred[x] = (src[x] + src[x+1] + 1) >> 1. */
        q1u8 = vextq_u8(q0u8, q1u8, 1);
        q3u8 = vextq_u8(q2u8, q3u8, 1);
        q5u8 = vextq_u8(q4u8, q5u8, 1);
        q7u8 = vextq_u8(q6u8, q7u8, 1);

        q0u8 = vrhaddq_u8(q0u8, q1u8);
        q1u8 = vrhaddq_u8(q2u8, q3u8);
        q2u8 = vrhaddq_u8(q4u8, q5u8);
        q3u8 = vrhaddq_u8(q6u8, q7u8);

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);
        d4u8 = vget_low_u8(q2u8);
        d5u8 = vget_high_u8(q2u8);
        d6u8 = vget_low_u8(q3u8);
        d7u8 = vget_high_u8(q3u8);

        /* Widening subtract vs reference; the u16 wraparound for negative
         * diffs is corrected by the s16 reinterpretation below. */
        q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
        q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
        q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
        q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
        q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
        q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
        q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
        q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));

        /* Accumulate sum (vpadalq) and sum of squares (vmlal) per row. */
        d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
        d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
        q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
        q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
        q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
        q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
        q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
        q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
        d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
        d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
        q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
        q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
    }

    /* Horizontal reduction: d0s64 = total sum, d1s64 = total sse. */
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum*sum/256 (256 pixels in a 16x16 block). */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
630
/* Half-pel vertical variance of a 16x16 block.
 *
 * Specialization of sub-pixel variance for xoffset == 0, yoffset == 4:
 * each predicted pixel is the rounded average of src[row] and src[row+1]
 * (vrhaddq_u8). Reads 17 source rows in total (one pre-loaded, four per
 * loop iteration) to produce the 16 averaged rows.
 *
 * src_ptr / source_stride : source pixels and stride.
 * ref_ptr / recon_stride  : reference pixels and stride.
 * sse                     : out; sum of squared differences.
 * Returns sse - (sum*sum)/256.
 */
unsigned int vp8_variance_halfpixvar16x16_v_neon(
        const unsigned char *src_ptr,
        int  source_stride,
        const unsigned char *ref_ptr,
        int  recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
    uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    /* q8s32 = sum of diffs, q9s32/q10s32 = partial sums of squares. */
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    /* Prime with row 0; the loop consumes 4 new rows per iteration. */
    q0u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
        q2u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q4u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q6u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q15u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;

        q1u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q5u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q7u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        /* pred[r] = (row[r] + row[r+1] + 1) >> 1 for 4 output rows. */
        q0u8 = vrhaddq_u8(q0u8, q2u8);
        q2u8 = vrhaddq_u8(q2u8, q4u8);
        q4u8 = vrhaddq_u8(q4u8, q6u8);
        q6u8 = vrhaddq_u8(q6u8, q15u8);

        d0u8  = vget_low_u8(q0u8);
        d1u8  = vget_high_u8(q0u8);
        d4u8  = vget_low_u8(q2u8);
        d5u8  = vget_high_u8(q2u8);
        d8u8  = vget_low_u8(q4u8);
        d9u8  = vget_high_u8(q4u8);
        d12u8 = vget_low_u8(q6u8);
        d13u8 = vget_high_u8(q6u8);

        /* Widening subtract vs reference; u16 wraparound for negative
         * diffs is corrected by the s16 reinterpretation below. */
        q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
        q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
        q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
        q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
        q0u16  = vsubl_u8(d8u8, vget_low_u8(q5u8));
        q1u16  = vsubl_u8(d9u8, vget_high_u8(q5u8));
        q2u16  = vsubl_u8(d12u8, vget_low_u8(q7u8));
        q3u16  = vsubl_u8(d13u8, vget_high_u8(q7u8));

        /* Accumulate sum (vpadalq) and sum of squares (vmlal) per row. */
        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);

        /* Last row loaded becomes the first row of the next group. */
        q0u8 = q15u8;
    }

    /* Horizontal reduction: d0s64 = total sum, d1s64 = total sse. */
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum*sum/256 (256 pixels in a 16x16 block). */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
760
/*
 * Variance of a 16x16 block whose prediction is the source interpolated at
 * a half-pixel offset both horizontally and vertically: each predicted
 * pixel is vrhadd(vrhadd(p, p_right), vrhadd(p_below, p_below_right))
 * computed as two stages of rounding averages (horizontal first, then
 * vertical between consecutive horizontally-averaged rows).
 *
 * src_ptr / source_stride : source pixels (reads 17 rows x 17 columns)
 * ref_ptr / recon_stride  : reference block compared against
 * sse                     : out-param, receives sum of squared differences
 * returns                 : variance = sse - (sum*sum >> 8)   (>> 8 == /256)
 *
 * NOTE(review): differences are accumulated with signed 16-bit lanes; for a
 * 16x16 block (256 diffs, |diff| <= 255 folded via vpadalq) this cannot
 * overflow the 32-bit accumulators.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_neon(
        const unsigned char *src_ptr,
        int  source_stride,
        const unsigned char *ref_ptr,
        int  recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
    int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
    int32x4_t q13s32, q14s32, q15s32;
    int64x2_t q0s64, q1s64, q5s64;

    /* q13: running sum of differences; q14/q15: running sums of squares. */
    q13s32 = vdupq_n_s32(0);
    q14s32 = vdupq_n_s32(0);
    q15s32 = vdupq_n_s32(0);

    /* Prime the pipeline: q0u8 holds the horizontally averaged row 0. */
    q0u8 = vld1q_u8(src_ptr);
    q1u8 = vld1q_u8(src_ptr + 16);
    src_ptr += source_stride;
    q1u8 = vextq_u8(q0u8, q1u8, 1);   /* same row shifted left one pixel */
    q0u8 = vrhaddq_u8(q0u8, q1u8);    /* rounding average with right neighbour */
    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
        /* Load the next four source rows (each with one extra column). */
        q2u8 = vld1q_u8(src_ptr);
        q3u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q4u8 = vld1q_u8(src_ptr);
        q5u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q6u8 = vld1q_u8(src_ptr);
        q7u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q8u8 = vld1q_u8(src_ptr);
        q9u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;

        /* Shift each row left by one pixel for the horizontal average. */
        q3u8 = vextq_u8(q2u8, q3u8, 1);
        q5u8 = vextq_u8(q4u8, q5u8, 1);
        q7u8 = vextq_u8(q6u8, q7u8, 1);
        q9u8 = vextq_u8(q8u8, q9u8, 1);

        /* Horizontal half-pixel averages of rows n+1..n+4. */
        q1u8 = vrhaddq_u8(q2u8, q3u8);
        q2u8 = vrhaddq_u8(q4u8, q5u8);
        q3u8 = vrhaddq_u8(q6u8, q7u8);
        q4u8 = vrhaddq_u8(q8u8, q9u8);
        /* Vertical average between consecutive horizontally-averaged rows:
         * q0..q3 now hold the four finished prediction rows. */
        q0u8 = vrhaddq_u8(q0u8, q1u8);
        q1u8 = vrhaddq_u8(q1u8, q2u8);
        q2u8 = vrhaddq_u8(q2u8, q3u8);
        q3u8 = vrhaddq_u8(q3u8, q4u8);

        /* Load the four matching reference rows. */
        q5u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q6u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q7u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q8u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);
        d4u8 = vget_low_u8(q2u8);
        d5u8 = vget_high_u8(q2u8);
        d6u8 = vget_low_u8(q3u8);
        d7u8 = vget_high_u8(q3u8);

        /* Widening differences prediction - reference (8 lanes each). */
        q9u16  = vsubl_u8(d0u8, vget_low_u8(q5u8));
        q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
        q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
        q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
        q0u16  = vsubl_u8(d4u8, vget_low_u8(q7u8));
        q1u16  = vsubl_u8(d5u8, vget_high_u8(q7u8));
        q5u16  = vsubl_u8(d6u8, vget_low_u8(q8u8));
        q6u16  = vsubl_u8(d7u8, vget_high_u8(q8u8));

        /* Accumulate sum (vpadalq) and sum of squares (vmlal) per 8 lanes;
         * the u16 difference bit patterns are two's-complement s16 values. */
        d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
        d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
        q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
        q15s32 = vmlal_s16(q15s32, d19s16, d19s16);

        d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
        d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
        q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
        q15s32 = vmlal_s16(q15s32, d21s16, d21s16);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
        q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
        q15s32 = vmlal_s16(q15s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
        q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
        q15s32 = vmlal_s16(q15s32, d25s16, d25s16);

        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
        q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
        q15s32 = vmlal_s16(q15s32, d1s16, d1s16);

        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
        q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
        q15s32 = vmlal_s16(q15s32, d3s16, d3s16);

        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
        q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
        q15s32 = vmlal_s16(q15s32, d11s16, d11s16);

        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
        q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
        q15s32 = vmlal_s16(q15s32, d13s16, d13s16);

        /* Carry the last horizontally-averaged row into the next iteration. */
        q0u8 = q4u8;
    }

    /* Fold the accumulators: q13 -> sum, q14+q15 -> sse. */
    q15s32 = vaddq_s32(q14s32, q15s32);
    q0s64 = vpaddlq_s32(q13s32);
    q1s64 = vpaddlq_s32(q15s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);   /* d0s64 = total sum        */
    d1s64 = vadd_s64(d2s64, d3s64);   /* d1s64 = total sse        */

    /* q5s64 = sum * sum; write sse to the out-param. */
    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum*sum/256 (256 = pixel count of a 16x16 block). */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
913
/* Constants for the 8x8 sub-pixel variance path below. */
enum { kWidth8 = 8 };          /* block width in pixels */
enum { kHeight8 = 8 };         /* block height in pixels */
enum { kHeight8PlusOne = 9 };  /* rows produced by the first bilinear pass */
enum { kPixelStepOne = 1 };    /* adjacent-pixel step for horizontal filtering */
enum { kAlign16 = 16 };        /* byte alignment for the intermediate buffers */

/* Bilinear filter taps sum to 128 = 1 << FILTER_BITS (see
 * bilinear_taps_coeff); vrshrn_n_u16(..., FILTER_BITS) renormalizes. */
#define FILTER_BITS 7
921
/* Sum all eight signed 16-bit lanes of a vector into a scalar. */
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
  /* Widen with pairwise adds until one 64-bit total per half remains,
   * then fold the halves together and return the (exact) 32-bit sum. */
  const int32x4_t pair_sums = vpaddlq_s16(v_16x8);
  const int64x2_t quad_sums = vpaddlq_s32(pair_sums);
  const int64x1_t total =
      vadd_s64(vget_low_s64(quad_sums), vget_high_s64(quad_sums));
  return (int)vget_lane_s64(total, 0);
}
929
/* Sum all four signed 32-bit lanes of a vector into a scalar. */
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
  /* Pairwise-widen to two 64-bit lanes, fold them, return the low 32 bits
   * (identical modulo 2^32 to adding the truncated halves). */
  const int64x2_t pair_sums = vpaddlq_s32(v_32x4);
  const int64x1_t total =
      vadd_s64(vget_low_s64(pair_sums), vget_high_s64(pair_sums));
  return (int)vget_lane_s64(total, 0);
}
936
/*
 * Accumulate sum of differences and sum of squared differences between
 * two w x h pixel blocks (w must be a multiple of 8).
 *
 * a, a_stride : first block and its row stride
 * b, b_stride : second block and its row stride
 * sse         : out-param, sum of squared differences
 * sum         : out-param, sum of differences (a - b)
 *
 * NOTE: the 16-bit sum accumulator is safe for the 8x8 blocks used here
 * (64 diffs of magnitude <= 255 cannot overflow an int16 lane).
 */
static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, unsigned int *sse, int *sum) {
  int row, col;
  int16x8_t sum_acc = vdupq_n_s16(0);
  int32x4_t sse_lo = vdupq_n_s32(0);
  int32x4_t sse_hi = vdupq_n_s32(0);

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; col += 8) {
      /* Widening subtract gives the signed difference as s16 lanes. */
      const int16x8_t diff = vreinterpretq_s16_u16(
          vsubl_u8(vld1_u8(a + col), vld1_u8(b + col)));
      sum_acc = vaddq_s16(sum_acc, diff);
      sse_lo = vmlal_s16(sse_lo, vget_low_s16(diff), vget_low_s16(diff));
      sse_hi = vmlal_s16(sse_hi, vget_high_s16(diff), vget_high_s16(diff));
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = horizontal_add_s16x8(sum_acc);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo, sse_hi));
}
966
967static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
968                                     const uint8_t *b, int b_stride,
969                                     unsigned int *sse) {
970  int sum;
971  variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
972  return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
973}
974
/*
 * Apply a 2-tap bilinear filter to an 8-pixel-wide block.
 *
 * src_ptr             : top-left of the source block
 * output_ptr          : destination (rows packed at output_width stride)
 * src_pixels_per_line : source row stride
 * pixel_step          : offset of the second tap pixel (1 = horizontal
 *                       filtering, row stride = vertical filtering)
 * output_height       : number of rows to produce
 * output_width        : destination row stride
 * vpx_filter          : the two taps; they sum to 1 << FILTER_BITS
 */
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const uint16_t *vpx_filter) {
  /* Broadcast each tap across all eight lanes. */
  const uint8x8_t tap0 = vdup_n_u8((uint8_t)vpx_filter[0]);
  const uint8x8_t tap1 = vdup_n_u8((uint8_t)vpx_filter[1]);
  unsigned int rows_left;

  for (rows_left = output_height; rows_left > 0; --rows_left) {
    const uint8x8_t pix0 = vld1_u8(src_ptr);
    const uint8x8_t pix1 = vld1_u8(src_ptr + pixel_step);
    /* acc = pix0*tap0 + pix1*tap1; round-shift back to 8 bits. */
    uint16x8_t acc = vmull_u8(pix0, tap0);
    acc = vmlal_u8(acc, pix1, tap1);
    vst1_u8(output_ptr, vrshrn_n_u16(acc, FILTER_BITS));
    /* Advance to the next row of source and destination. */
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
997
998unsigned int vp8_sub_pixel_variance8x8_neon(
999        const unsigned char *src,
1000        int src_stride,
1001        int xoffset,
1002        int yoffset,
1003        const unsigned char *dst,
1004        int dst_stride,
1005        unsigned int *sse) {
1006  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
1007  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
1008  if (xoffset == 0) {
1009    var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8,
1010                              kWidth8, bilinear_taps_coeff[yoffset]);
1011  } else if (yoffset == 0) {
1012    var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
1013                              kHeight8PlusOne, kWidth8,
1014                              bilinear_taps_coeff[xoffset]);
1015  } else {
1016    var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
1017                              kHeight8PlusOne, kWidth8,
1018                              bilinear_taps_coeff[xoffset]);
1019    var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
1020                              kWidth8, bilinear_taps_coeff[yoffset]);
1021  }
1022  return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
1023}
1024
1025