1d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org/*
2d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *
4d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *  Use of this source code is governed by a BSD-style license
5d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *  that can be found in the LICENSE file in the root of the source
6d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *  tree. An additional intellectual property rights grant can be found
7d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *  in the file PATENTS.  All contributing project authors may
8d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org *  be found in the AUTHORS file in the root of the source tree.
9d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org */
10d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
11d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org#include <arm_neon.h>
12d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org#include "vpx_ports/mem.h"
13d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org#include "vpx/vpx_integer.h"
14d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
/*
 * Two-tap bilinear filter weights, one row per sub-pixel offset (0..7).
 *
 * Row k is {128 - 16 * k, 16 * k}: the first weight multiplies the current
 * sample and the second multiplies its neighbour (the next pixel for the
 * horizontal pass, the next row for the vertical pass). Every row sums to
 * 128, which is why the filtered sums in this file are narrowed with a
 * rounding right shift by 7.
 */
static const uint16_t bilinear_taps_coeff[8][2] = {
    { 128 - 16 * 0, 16 * 0 },  /* offset 0: 128,   0 */
    { 128 - 16 * 1, 16 * 1 },  /* offset 1: 112,  16 */
    { 128 - 16 * 2, 16 * 2 },  /* offset 2:  96,  32 */
    { 128 - 16 * 3, 16 * 3 },  /* offset 3:  80,  48 */
    { 128 - 16 * 4, 16 * 4 },  /* offset 4:  64,  64 */
    { 128 - 16 * 5, 16 * 5 },  /* offset 5:  48,  80 */
    { 128 - 16 * 6, 16 * 6 },  /* offset 6:  32,  96 */
    { 128 - 16 * 7, 16 * 7 }   /* offset 7:  16, 112 */
};
25d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
26d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp8_sub_pixel_variance16x16_neon_func(
27d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        const unsigned char *src_ptr,
28d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int src_pixels_per_line,
29d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int xoffset,
30d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int yoffset,
31d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        const unsigned char *dst_ptr,
32d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int dst_pixels_per_line,
33d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        unsigned int *sse) {
34d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int i;
35d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
36d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    unsigned char *tmpp;
37d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    unsigned char *tmpp2;
38d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
39d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
40d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint8x8_t d19u8, d20u8, d21u8;
41d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
42d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint32x2_t d0u32, d10u32;
43d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int64x1_t d0s64, d1s64, d2s64, d3s64;
44d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
45d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
46d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
47d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
48d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int32x4_t q8s32, q9s32, q10s32;
49d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int64x2_t q0s64, q1s64, q5s64;
50d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
51d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    tmpp2 = tmp + 272;
52d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    tmpp = tmp;
53d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    if (xoffset == 0) {  // secondpass_bfilter16x16_only
54d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
55d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
56d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
57d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q11u8 = vld1q_u8(src_ptr);
58d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += src_pixels_per_line;
59d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        for (i = 4; i > 0; i--) {
60d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q12u8 = vld1q_u8(src_ptr);
61d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
62d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q13u8 = vld1q_u8(src_ptr);
63d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
64d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q14u8 = vld1q_u8(src_ptr);
65d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
66d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q15u8 = vld1q_u8(src_ptr);
67d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
68d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
69d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            __builtin_prefetch(src_ptr);
70d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            __builtin_prefetch(src_ptr + src_pixels_per_line);
71d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
72d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
73d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
74d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
75d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
76d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
77d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
78d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
79d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
80d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
81d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
82d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
83d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
84d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
85d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
86d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
87d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
88d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
89d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
90d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
91d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d2u8 = vqrshrn_n_u16(q1u16, 7);
92d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d3u8 = vqrshrn_n_u16(q2u16, 7);
93d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d4u8 = vqrshrn_n_u16(q3u16, 7);
94d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d5u8 = vqrshrn_n_u16(q4u16, 7);
95d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d6u8 = vqrshrn_n_u16(q5u16, 7);
96d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d7u8 = vqrshrn_n_u16(q6u16, 7);
97d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d8u8 = vqrshrn_n_u16(q7u16, 7);
98d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d9u8 = vqrshrn_n_u16(q8u16, 7);
99d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
100d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q1u8 = vcombine_u8(d2u8, d3u8);
101d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q2u8 = vcombine_u8(d4u8, d5u8);
102d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q3u8 = vcombine_u8(d6u8, d7u8);
103d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q4u8 = vcombine_u8(d8u8, d9u8);
104d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
105d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q11u8 = q15u8;
106d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
107d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q1u8);
108d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
109d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q2u8);
110d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
111d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q3u8);
112d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
113d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q4u8);
114d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
115d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        }
116d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
117d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
118d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
119d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
120d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        for (i = 4; i > 0 ; i--) {
121d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d2u8 = vld1_u8(src_ptr);
122d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d3u8 = vld1_u8(src_ptr + 8);
123d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d4u8 = vld1_u8(src_ptr + 16);
124d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
125d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d5u8 = vld1_u8(src_ptr);
126d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d6u8 = vld1_u8(src_ptr + 8);
127d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d7u8 = vld1_u8(src_ptr + 16);
128d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
129d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d8u8 = vld1_u8(src_ptr);
130d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d9u8 = vld1_u8(src_ptr + 8);
131d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d10u8 = vld1_u8(src_ptr + 16);
132d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
133d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d11u8 = vld1_u8(src_ptr);
134d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d12u8 = vld1_u8(src_ptr + 8);
135d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d13u8 = vld1_u8(src_ptr + 16);
136d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
137d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
138d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            __builtin_prefetch(src_ptr);
139d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            __builtin_prefetch(src_ptr + src_pixels_per_line);
140d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
141d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
142d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u16  = vmull_u8(d2u8, d0u8);
143d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u16  = vmull_u8(d3u8, d0u8);
144d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q9u16  = vmull_u8(d5u8, d0u8);
145d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q10u16 = vmull_u8(d6u8, d0u8);
146d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q11u16 = vmull_u8(d8u8, d0u8);
147d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q12u16 = vmull_u8(d9u8, d0u8);
148d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q13u16 = vmull_u8(d11u8, d0u8);
149d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q14u16 = vmull_u8(d12u8, d0u8);
150d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
151d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d2u8  = vext_u8(d2u8, d3u8, 1);
152d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d5u8  = vext_u8(d5u8, d6u8, 1);
153d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d8u8  = vext_u8(d8u8, d9u8, 1);
154d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d11u8 = vext_u8(d11u8, d12u8, 1);
155d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
156d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
157d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
158d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
159d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
160d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
161d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d3u8  = vext_u8(d3u8, d4u8, 1);
162d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d6u8  = vext_u8(d6u8, d7u8, 1);
163d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d9u8  = vext_u8(d9u8, d10u8, 1);
164d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d12u8 = vext_u8(d12u8, d13u8, 1);
165d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
166d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
167d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
168d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
169d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
170d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
171d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d14u8 = vqrshrn_n_u16(q7u16, 7);
172d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d15u8 = vqrshrn_n_u16(q8u16, 7);
173d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d16u8 = vqrshrn_n_u16(q9u16, 7);
174d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d17u8 = vqrshrn_n_u16(q10u16, 7);
175d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d18u8 = vqrshrn_n_u16(q11u16, 7);
176d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d19u8 = vqrshrn_n_u16(q12u16, 7);
177d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d20u8 = vqrshrn_n_u16(q13u16, 7);
178d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d21u8 = vqrshrn_n_u16(q14u16, 7);
179d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
180d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u8  = vcombine_u8(d14u8, d15u8);
181d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u8  = vcombine_u8(d16u8, d17u8);
182d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q9u8  = vcombine_u8(d18u8, d19u8);
183d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q10u8 = vcombine_u8(d20u8, d21u8);
184d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
185d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q7u8);
186d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
187d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q8u8);
188d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
189d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q9u8);
190d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
191d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q10u8);
192d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
193d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        }
194d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    } else {
195d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
196d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
197d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
198d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d2u8 = vld1_u8(src_ptr);
199d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d3u8 = vld1_u8(src_ptr + 8);
200d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d4u8 = vld1_u8(src_ptr + 16);
201d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += src_pixels_per_line;
202d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d5u8 = vld1_u8(src_ptr);
203d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d6u8 = vld1_u8(src_ptr + 8);
204d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d7u8 = vld1_u8(src_ptr + 16);
205d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += src_pixels_per_line;
206d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d8u8 = vld1_u8(src_ptr);
207d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d9u8 = vld1_u8(src_ptr + 8);
208d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d10u8 = vld1_u8(src_ptr + 16);
209d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += src_pixels_per_line;
210d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d11u8 = vld1_u8(src_ptr);
211d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d12u8 = vld1_u8(src_ptr + 8);
212d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d13u8 = vld1_u8(src_ptr + 16);
213d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += src_pixels_per_line;
214d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
215d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        // First Pass: output_height lines x output_width columns (17x16)
216d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        for (i = 3; i > 0; i--) {
217d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u16  = vmull_u8(d2u8, d0u8);
218d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u16  = vmull_u8(d3u8, d0u8);
219d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q9u16  = vmull_u8(d5u8, d0u8);
220d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q10u16 = vmull_u8(d6u8, d0u8);
221d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q11u16 = vmull_u8(d8u8, d0u8);
222d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q12u16 = vmull_u8(d9u8, d0u8);
223d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q13u16 = vmull_u8(d11u8, d0u8);
224d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q14u16 = vmull_u8(d12u8, d0u8);
225d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
226d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d2u8  = vext_u8(d2u8, d3u8, 1);
227d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d5u8  = vext_u8(d5u8, d6u8, 1);
228d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d8u8  = vext_u8(d8u8, d9u8, 1);
229d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d11u8 = vext_u8(d11u8, d12u8, 1);
230d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
231d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
232d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
233d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
234d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
235d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
236d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d3u8  = vext_u8(d3u8, d4u8, 1);
237d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d6u8  = vext_u8(d6u8, d7u8, 1);
238d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d9u8  = vext_u8(d9u8, d10u8, 1);
239d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d12u8 = vext_u8(d12u8, d13u8, 1);
240d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
241d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
242d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
243d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
244d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
245d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
246d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d14u8 = vqrshrn_n_u16(q7u16, 7);
247d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d15u8 = vqrshrn_n_u16(q8u16, 7);
248d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d16u8 = vqrshrn_n_u16(q9u16, 7);
249d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d17u8 = vqrshrn_n_u16(q10u16, 7);
250d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d18u8 = vqrshrn_n_u16(q11u16, 7);
251d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d19u8 = vqrshrn_n_u16(q12u16, 7);
252d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d20u8 = vqrshrn_n_u16(q13u16, 7);
253d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d21u8 = vqrshrn_n_u16(q14u16, 7);
254d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
255d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d2u8 = vld1_u8(src_ptr);
256d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d3u8 = vld1_u8(src_ptr + 8);
257d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d4u8 = vld1_u8(src_ptr + 16);
258d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
259d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d5u8 = vld1_u8(src_ptr);
260d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d6u8 = vld1_u8(src_ptr + 8);
261d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d7u8 = vld1_u8(src_ptr + 16);
262d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
263d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d8u8 = vld1_u8(src_ptr);
264d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d9u8 = vld1_u8(src_ptr + 8);
265d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d10u8 = vld1_u8(src_ptr + 16);
266d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
267d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d11u8 = vld1_u8(src_ptr);
268d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d12u8 = vld1_u8(src_ptr + 8);
269d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d13u8 = vld1_u8(src_ptr + 16);
270d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            src_ptr += src_pixels_per_line;
271d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
272d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u8 = vcombine_u8(d14u8, d15u8);
273d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u8 = vcombine_u8(d16u8, d17u8);
274d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q9u8 = vcombine_u8(d18u8, d19u8);
275d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q10u8 = vcombine_u8(d20u8, d21u8);
276d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
277d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp, q7u8);
278d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp += 16;
279d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp, q8u8);
280d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp += 16;
281d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp, q9u8);
282d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp += 16;
283d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp, q10u8);
284d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp += 16;
285d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        }
286d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
287d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        // First-pass filtering for rest 5 lines
288d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d14u8 = vld1_u8(src_ptr);
289d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d15u8 = vld1_u8(src_ptr + 8);
290d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d16u8 = vld1_u8(src_ptr + 16);
291d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += src_pixels_per_line;
292d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
293d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9u16  = vmull_u8(d2u8, d0u8);
294d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10u16 = vmull_u8(d3u8, d0u8);
295d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q11u16 = vmull_u8(d5u8, d0u8);
296d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q12u16 = vmull_u8(d6u8, d0u8);
297d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q13u16 = vmull_u8(d8u8, d0u8);
298d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q14u16 = vmull_u8(d9u8, d0u8);
299d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
300d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d2u8  = vext_u8(d2u8, d3u8, 1);
301d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d5u8  = vext_u8(d5u8, d6u8, 1);
302d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d8u8  = vext_u8(d8u8, d9u8, 1);
303d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
304d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
305d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
306d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
307d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
308d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d3u8  = vext_u8(d3u8, d4u8, 1);
309d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d6u8  = vext_u8(d6u8, d7u8, 1);
310d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d9u8  = vext_u8(d9u8, d10u8, 1);
311d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
312d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
313d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
314d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
315d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
316d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q1u16 = vmull_u8(d11u8, d0u8);
317d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q2u16 = vmull_u8(d12u8, d0u8);
318d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q3u16 = vmull_u8(d14u8, d0u8);
319d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q4u16 = vmull_u8(d15u8, d0u8);
320d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
321d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d11u8 = vext_u8(d11u8, d12u8, 1);
322d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d14u8 = vext_u8(d14u8, d15u8, 1);
323d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
324d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
325d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
326d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
327d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d12u8 = vext_u8(d12u8, d13u8, 1);
328d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d15u8 = vext_u8(d15u8, d16u8, 1);
329d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
330d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
331d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
332d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
333d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d10u8 = vqrshrn_n_u16(q9u16, 7);
334d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d11u8 = vqrshrn_n_u16(q10u16, 7);
335d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d12u8 = vqrshrn_n_u16(q11u16, 7);
336d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d13u8 = vqrshrn_n_u16(q12u16, 7);
337d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d14u8 = vqrshrn_n_u16(q13u16, 7);
338d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d15u8 = vqrshrn_n_u16(q14u16, 7);
339d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d16u8 = vqrshrn_n_u16(q1u16, 7);
340d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d17u8 = vqrshrn_n_u16(q2u16, 7);
341d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d18u8 = vqrshrn_n_u16(q3u16, 7);
342d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d19u8 = vqrshrn_n_u16(q4u16, 7);
343d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
344d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q5u8 = vcombine_u8(d10u8, d11u8);
345d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q6u8 = vcombine_u8(d12u8, d13u8);
346d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q7u8 = vcombine_u8(d14u8, d15u8);
347d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8u8 = vcombine_u8(d16u8, d17u8);
348d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9u8 = vcombine_u8(d18u8, d19u8);
349d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
350d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        vst1q_u8((uint8_t *)tmpp, q5u8);
351d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp += 16;
352d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        vst1q_u8((uint8_t *)tmpp, q6u8);
353d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp += 16;
354d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        vst1q_u8((uint8_t *)tmpp, q7u8);
355d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp += 16;
356d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        vst1q_u8((uint8_t *)tmpp, q8u8);
357d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp += 16;
358d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        vst1q_u8((uint8_t *)tmpp, q9u8);
359d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
360d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        // secondpass_filter
361d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
362d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
363d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
364d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp = tmp;
365d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp2 = tmpp + 272;
366d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q11u8 = vld1q_u8(tmpp);
367d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp += 16;
368d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        for (i = 4; i > 0; i--) {
369d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q12u8 = vld1q_u8(tmpp);
370d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp += 16;
371d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q13u8 = vld1q_u8(tmpp);
372d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp += 16;
373d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q14u8 = vld1q_u8(tmpp);
374d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp += 16;
375d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q15u8 = vld1q_u8(tmpp);
376d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp += 16;
377d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
378d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
379d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
380d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
381d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
382d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
383d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
384d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
385d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
386d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
387d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
388d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
389d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
390d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
391d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
392d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
393d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
394d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
395d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
396d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d2u8 = vqrshrn_n_u16(q1u16, 7);
397d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d3u8 = vqrshrn_n_u16(q2u16, 7);
398d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d4u8 = vqrshrn_n_u16(q3u16, 7);
399d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d5u8 = vqrshrn_n_u16(q4u16, 7);
400d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d6u8 = vqrshrn_n_u16(q5u16, 7);
401d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d7u8 = vqrshrn_n_u16(q6u16, 7);
402d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d8u8 = vqrshrn_n_u16(q7u16, 7);
403d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            d9u8 = vqrshrn_n_u16(q8u16, 7);
404d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
405d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q1u8 = vcombine_u8(d2u8, d3u8);
406d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q2u8 = vcombine_u8(d4u8, d5u8);
407d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q3u8 = vcombine_u8(d6u8, d7u8);
408d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q4u8 = vcombine_u8(d8u8, d9u8);
409d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
410d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            q11u8 = q15u8;
411d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
412d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q1u8);
413d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
414d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q2u8);
415d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
416d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q3u8);
417d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
418d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            vst1q_u8((uint8_t *)tmpp2, q4u8);
419d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org            tmpp2 += 16;
420d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        }
421d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    }
422d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
423d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    // sub_pixel_variance16x16_neon
424d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q8s32 = vdupq_n_s32(0);
425d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q9s32 = vdupq_n_s32(0);
426d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q10s32 = vdupq_n_s32(0);
427d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
428d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    tmpp = tmp + 272;
429d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
430d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q0u8 = vld1q_u8(tmpp);
431d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp += 16;
432d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q1u8 = vld1q_u8(tmpp);
433d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        tmpp += 16;
434d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q2u8 = vld1q_u8(dst_ptr);
435d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        dst_ptr += dst_pixels_per_line;
436d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q3u8 = vld1q_u8(dst_ptr);
437d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        dst_ptr += dst_pixels_per_line;
438d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
439d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d0u8 = vget_low_u8(q0u8);
440d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d1u8 = vget_high_u8(q0u8);
441d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d2u8 = vget_low_u8(q1u8);
442d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d3u8 = vget_high_u8(q1u8);
443d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
444d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
445d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
446d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
447d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));
448d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
449d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
450d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
451d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
452d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
453d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
454d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
455d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
456d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
457d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
458d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
459d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
460d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
461d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
462d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
463d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
464d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
465d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
466d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
467d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
468d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
469d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
470d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
471d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
472d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    }
473d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
474d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q10s32 = vaddq_s32(q10s32, q9s32);
475d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q0s64 = vpaddlq_s32(q8s32);
476d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q1s64 = vpaddlq_s32(q10s32);
477d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
478d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d0s64 = vget_low_s64(q0s64);
479d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d1s64 = vget_high_s64(q0s64);
480d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d2s64 = vget_low_s64(q1s64);
481d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d3s64 = vget_high_s64(q1s64);
482d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d0s64 = vadd_s64(d0s64, d1s64);
483d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d1s64 = vadd_s64(d2s64, d3s64);
484d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
485d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
486d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                      vreinterpret_s32_s64(d0s64));
487d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
488d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
489d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
490d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
491d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
492d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    return vget_lane_u32(d0u32, 0);
493d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
494d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
495d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp8_variance_halfpixvar16x16_h_neon(
496d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        const unsigned char *src_ptr,
497d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int  source_stride,
498d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        const unsigned char *ref_ptr,
499d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int  recon_stride,
500d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        unsigned int *sse) {
501d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int i;
502d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
503d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
504d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
505d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint32x2_t d0u32, d10u32;
506d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int64x1_t d0s64, d1s64, d2s64, d3s64;
507d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
508d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
509d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
510d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int32x4_t q8s32, q9s32, q10s32;
511d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    int64x2_t q0s64, q1s64, q5s64;
512d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
513d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q8s32 = vdupq_n_s32(0);
514d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q9s32 = vdupq_n_s32(0);
515d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q10s32 = vdupq_n_s32(0);
516d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
517d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
518d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q0u8 = vld1q_u8(src_ptr);
519d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q1u8 = vld1q_u8(src_ptr + 16);
520d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += source_stride;
521d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q2u8 = vld1q_u8(src_ptr);
522d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q3u8 = vld1q_u8(src_ptr + 16);
523d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += source_stride;
524d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q4u8 = vld1q_u8(src_ptr);
525d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q5u8 = vld1q_u8(src_ptr + 16);
526d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += source_stride;
527d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q6u8 = vld1q_u8(src_ptr);
528d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q7u8 = vld1q_u8(src_ptr + 16);
529d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        src_ptr += source_stride;
530d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
531d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q11u8 = vld1q_u8(ref_ptr);
532d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        ref_ptr += recon_stride;
533d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q12u8 = vld1q_u8(ref_ptr);
534d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        ref_ptr += recon_stride;
535d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q13u8 = vld1q_u8(ref_ptr);
536d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        ref_ptr += recon_stride;
537d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q14u8 = vld1q_u8(ref_ptr);
538d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        ref_ptr += recon_stride;
539d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
540d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q1u8 = vextq_u8(q0u8, q1u8, 1);
541d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q3u8 = vextq_u8(q2u8, q3u8, 1);
542d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q5u8 = vextq_u8(q4u8, q5u8, 1);
543d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q7u8 = vextq_u8(q6u8, q7u8, 1);
544d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
545d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q0u8 = vrhaddq_u8(q0u8, q1u8);
546d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q1u8 = vrhaddq_u8(q2u8, q3u8);
547d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q2u8 = vrhaddq_u8(q4u8, q5u8);
548d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q3u8 = vrhaddq_u8(q6u8, q7u8);
549d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
550d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d0u8 = vget_low_u8(q0u8);
551d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d1u8 = vget_high_u8(q0u8);
552d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d2u8 = vget_low_u8(q1u8);
553d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d3u8 = vget_high_u8(q1u8);
554d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d4u8 = vget_low_u8(q2u8);
555d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d5u8 = vget_high_u8(q2u8);
556d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d6u8 = vget_low_u8(q3u8);
557d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d7u8 = vget_high_u8(q3u8);
558d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
559d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
560d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
561d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
562d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
563d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
564d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
565d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
566d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));
567d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
568d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
569d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
570d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
571d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
572d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
573d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
574d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
575d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
576d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
577d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
578d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
579d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
580d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
581d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
582d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
583d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
584d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
585d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
586d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
587d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
588d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
589d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
590d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
591d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
592d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
593d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
594d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
595d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
596d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
597d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
598d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
599d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
600d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
601d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
602d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
603d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
604d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
605d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
606d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
607d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
608d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    }
609d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
610d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q10s32 = vaddq_s32(q10s32, q9s32);
611d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q0s64 = vpaddlq_s32(q8s32);
612d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q1s64 = vpaddlq_s32(q10s32);
613d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
614d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d0s64 = vget_low_s64(q0s64);
615d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d1s64 = vget_high_s64(q0s64);
616d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d2s64 = vget_low_s64(q1s64);
617d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d3s64 = vget_high_s64(q1s64);
618d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d0s64 = vadd_s64(d0s64, d1s64);
619d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d1s64 = vadd_s64(d2s64, d3s64);
620d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
621d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
622d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                      vreinterpret_s32_s64(d0s64));
623d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
624d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
625d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
626d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
627d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
628d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    return vget_lane_u32(d0u32, 0);
629d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
630d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
/*
 * Variance of a 16x16 block whose prediction is the rounded average of each
 * source row with the row directly below it.
 *
 * src_ptr       first of the 17 source rows that are read (16 bytes each)
 * source_stride byte distance between consecutive source rows
 * ref_ptr       first of the 16 reference rows (16 bytes each)
 * recon_stride  byte distance between consecutive reference rows
 * sse           receives the sum of squared differences
 *
 * Returns *sse - ((sum * sum) >> 8), where sum is the sum of the 256
 * (prediction - reference) differences.
 */
unsigned int vp8_variance_halfpixvar16x16_v_neon(
        const unsigned char *src_ptr,
        int  source_stride,
        const unsigned char *ref_ptr,
        int  recon_stride,
        unsigned int *sse) {
    int32x4_t diff_acc = vdupq_n_s32(0);
    int32x4_t sq_acc_lo = vdupq_n_s32(0);
    int32x4_t sq_acc_hi = vdupq_n_s32(0);
    uint8x16_t row_above = vld1q_u8(src_ptr);
    int64x2_t diff_pairs, sq_pairs;
    int32_t diff_total;
    uint32_t sq_total, mean_term;
    int row;

    src_ptr += source_stride;

    for (row = 0; row < 16; ++row) {
        const uint8x16_t row_below = vld1q_u8(src_ptr);
        const uint8x16_t ref_row = vld1q_u8(ref_ptr);
        /* vrhaddq_u8 computes (a + b + 1) >> 1 per byte. */
        const uint8x16_t pred = vrhaddq_u8(row_above, row_below);
        /* The widening subtract wraps in u16; viewed as s16 it is the
         * signed difference. */
        const int16x8_t diff_left = vreinterpretq_s16_u16(
                vsubl_u8(vget_low_u8(pred), vget_low_u8(ref_row)));
        const int16x8_t diff_right = vreinterpretq_s16_u16(
                vsubl_u8(vget_high_u8(pred), vget_high_u8(ref_row)));

        diff_acc = vpadalq_s16(diff_acc, diff_left);
        sq_acc_lo = vmlal_s16(sq_acc_lo, vget_low_s16(diff_left),
                              vget_low_s16(diff_left));
        sq_acc_hi = vmlal_s16(sq_acc_hi, vget_high_s16(diff_left),
                              vget_high_s16(diff_left));

        diff_acc = vpadalq_s16(diff_acc, diff_right);
        sq_acc_lo = vmlal_s16(sq_acc_lo, vget_low_s16(diff_right),
                              vget_low_s16(diff_right));
        sq_acc_hi = vmlal_s16(sq_acc_hi, vget_high_s16(diff_right),
                              vget_high_s16(diff_right));

        /* The lower row of this pair is the upper row of the next one. */
        row_above = row_below;
        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    /* Collapse the vector accumulators to scalars. */
    diff_pairs = vpaddlq_s32(diff_acc);
    sq_pairs = vpaddlq_s32(vaddq_s32(sq_acc_hi, sq_acc_lo));
    diff_total = (int32_t)(vgetq_lane_s64(diff_pairs, 0) +
                           vgetq_lane_s64(diff_pairs, 1));
    sq_total = (uint32_t)(vgetq_lane_s64(sq_pairs, 0) +
                          vgetq_lane_s64(sq_pairs, 1));

    /* Low 32 bits of sum^2, then divided by the 256 pixels of the block. */
    mean_term = (uint32_t)((int64_t)diff_total * diff_total) >> 8;

    *sse = sq_total;
    return sq_total - mean_term;
}
760d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
/*
 * Variance of a 16x16 block whose prediction is the source averaged first
 * horizontally (each pixel with its right neighbour) and then vertically
 * (each filtered row with the filtered row below it), with rounding at both
 * steps.
 *
 * src_ptr       first of the 17 source rows that are read; 17 bytes of each
 *               row are accessed
 * source_stride byte distance between consecutive source rows
 * ref_ptr       first of the 16 reference rows (16 bytes each)
 * recon_stride  byte distance between consecutive reference rows
 * sse           receives the sum of squared differences
 *
 * Returns *sse - ((sum * sum) >> 8), where sum is the sum of the 256
 * (prediction - reference) differences.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_neon(
        const unsigned char *src_ptr,
        int  source_stride,
        const unsigned char *ref_ptr,
        int  recon_stride,
        unsigned int *sse) {
    int32x4_t diff_acc = vdupq_n_s32(0);
    int32x4_t sq_acc_lo = vdupq_n_s32(0);
    int32x4_t sq_acc_hi = vdupq_n_s32(0);
    /*
     * Horizontal filter: average bytes 0..15 with bytes 1..16. Loading from
     * src_ptr + 1 gives the shifted vector directly, so only 17 bytes of the
     * row are touched instead of a full second 16-byte vector at +16.
     */
    uint8x16_t filtered_above = vrhaddq_u8(vld1q_u8(src_ptr),
                                           vld1q_u8(src_ptr + 1));
    int64x2_t diff_pairs, sq_pairs;
    int32_t diff_total;
    uint32_t sq_total, mean_term;
    int row;

    src_ptr += source_stride;

    for (row = 0; row < 16; ++row) {
        const uint8x16_t filtered_below = vrhaddq_u8(vld1q_u8(src_ptr),
                                                     vld1q_u8(src_ptr + 1));
        const uint8x16_t ref_row = vld1q_u8(ref_ptr);
        /* Vertical filter on the two horizontally filtered rows. */
        const uint8x16_t pred = vrhaddq_u8(filtered_above, filtered_below);
        /* The widening subtract wraps in u16; viewed as s16 it is the
         * signed difference. */
        const int16x8_t diff_left = vreinterpretq_s16_u16(
                vsubl_u8(vget_low_u8(pred), vget_low_u8(ref_row)));
        const int16x8_t diff_right = vreinterpretq_s16_u16(
                vsubl_u8(vget_high_u8(pred), vget_high_u8(ref_row)));

        diff_acc = vpadalq_s16(diff_acc, diff_left);
        sq_acc_lo = vmlal_s16(sq_acc_lo, vget_low_s16(diff_left),
                              vget_low_s16(diff_left));
        sq_acc_hi = vmlal_s16(sq_acc_hi, vget_high_s16(diff_left),
                              vget_high_s16(diff_left));

        diff_acc = vpadalq_s16(diff_acc, diff_right);
        sq_acc_lo = vmlal_s16(sq_acc_lo, vget_low_s16(diff_right),
                              vget_low_s16(diff_right));
        sq_acc_hi = vmlal_s16(sq_acc_hi, vget_high_s16(diff_right),
                              vget_high_s16(diff_right));

        /* Reuse the lower filtered row as the upper one of the next pair. */
        filtered_above = filtered_below;
        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    /* Collapse the vector accumulators to scalars. */
    diff_pairs = vpaddlq_s32(diff_acc);
    sq_pairs = vpaddlq_s32(vaddq_s32(sq_acc_lo, sq_acc_hi));
    diff_total = (int32_t)(vgetq_lane_s64(diff_pairs, 0) +
                           vgetq_lane_s64(diff_pairs, 1));
    sq_total = (uint32_t)(vgetq_lane_s64(sq_pairs, 0) +
                          vgetq_lane_s64(sq_pairs, 1));

    /* Low 32 bits of sum^2, then divided by the 256 pixels of the block. */
    mean_term = (uint32_t)((int64_t)diff_total * diff_total) >> 8;

    *sse = sq_total;
    return sq_total - mean_term;
}
913d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Dimensions and helper constants for the 8x8 sub-pixel variance path.
enum {
  kWidth8 = 8,          // Block width in pixels.
  kHeight8 = 8,         // Block height in pixels.
  kHeight8PlusOne = 9,  // Block height plus one extra row.
  kPixelStepOne = 1,    // Tap distance of one pixel (horizontal neighbour).
  kAlign16 = 16         // Alignment requested for temporary buffers.
};

// Right shift applied (with rounding) to the 2-tap filter accumulator.
#define FILTER_BITS 7
921d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
922d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgstatic INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
923d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const int32x4_t a = vpaddlq_s16(v_16x8);
924d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const int64x2_t b = vpaddlq_s32(a);
925d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
926d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                               vreinterpret_s32_s64(vget_high_s64(b)));
927d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return vget_lane_s32(c, 0);
928d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
929d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
930d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgstatic INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
931d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const int64x2_t b = vpaddlq_s32(v_32x4);
932d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
933d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                               vreinterpret_s32_s64(vget_high_s64(b)));
934d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return vget_lane_s32(c, 0);
935d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
936d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
937d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgstatic void variance_neon_w8(const uint8_t *a, int a_stride,
938d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                             const uint8_t *b, int b_stride,
939d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                             int w, int h, unsigned int *sse, int *sum) {
940d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int i, j;
941d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int16x8_t v_sum = vdupq_n_s16(0);
942d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int32x4_t v_sse_lo = vdupq_n_s32(0);
943d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int32x4_t v_sse_hi = vdupq_n_s32(0);
944d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
945d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  for (i = 0; i < h; ++i) {
946d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    for (j = 0; j < w; j += 8) {
947d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      const uint8x8_t v_a = vld1_u8(&a[j]);
948d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      const uint8x8_t v_b = vld1_u8(&b[j]);
949d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
950d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
951d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      v_sum = vaddq_s16(v_sum, sv_diff);
952d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      v_sse_lo = vmlal_s16(v_sse_lo,
953d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                           vget_low_s16(sv_diff),
954d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                           vget_low_s16(sv_diff));
955d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      v_sse_hi = vmlal_s16(v_sse_hi,
956d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                           vget_high_s16(sv_diff),
957d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                           vget_high_s16(sv_diff));
958d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    }
959d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    a += a_stride;
960d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    b += b_stride;
961d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  }
962d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
963d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  *sum = horizontal_add_s16x8(v_sum);
964d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
965d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
966d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
967d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgstatic unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
968d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                     const uint8_t *b, int b_stride,
969d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                     unsigned int *sse) {
970d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int sum;
971d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
972d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
973d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
974d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
975d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgstatic void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
976d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                      uint8_t *output_ptr,
977d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                      unsigned int src_pixels_per_line,
978d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                      int pixel_step,
979d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                      unsigned int output_height,
980d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                      unsigned int output_width,
981d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                      const uint16_t *vpx_filter) {
982d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
983d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
984d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  unsigned int i;
985d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  for (i = 0; i < output_height; ++i) {
986d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
987d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
988d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    const uint16x8_t a = vmull_u8(src_0, f0);
989d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    const uint16x8_t b = vmlal_u8(a, src_1, f1);
990d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
991d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    vst1_u8(&output_ptr[0], out);
992d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    // Next row...
993d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    src_ptr += src_pixels_per_line;
994d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    output_ptr += output_width;
995d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  }
996d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
997d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
998d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp8_sub_pixel_variance8x8_neon(
999d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        const unsigned char *src,
1000d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int src_stride,
1001d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int xoffset,
1002d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int yoffset,
1003d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        const unsigned char *dst,
1004d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        int dst_stride,
1005d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org        unsigned int *sse) {
1006d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
1007d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
1008d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  if (xoffset == 0) {
1009d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8,
1010d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                              kWidth8, bilinear_taps_coeff[yoffset]);
1011d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  } else if (yoffset == 0) {
1012d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
1013d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                              kHeight8PlusOne, kWidth8,
1014d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                              bilinear_taps_coeff[xoffset]);
1015d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  } else {
1016d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
1017d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                              kHeight8PlusOne, kWidth8,
1018d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                              bilinear_taps_coeff[xoffset]);
1019d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org    var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
1020d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                              kWidth8, bilinear_taps_coeff[yoffset]);
1021d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  }
1022d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
1023d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
1024d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
1025