;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


;-----------------

14233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp8_sub_pixel_variance16x16_neon_func|
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ARM
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan    REQUIRE8
17233d2500723e5594f3e7c70896ffeeef32b9c950ywan    PRESERVE8
18233d2500723e5594f3e7c70896ffeeef32b9c950ywan
19233d2500723e5594f3e7c70896ffeeef32b9c950ywan    AREA ||.text||, CODE, READONLY, ALIGN=2
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0    unsigned char  *src_ptr,
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r1    int  src_pixels_per_line,
22233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r2    int  xoffset,
23233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r3    int  yoffset,
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan; stack(r4) unsigned char *dst_ptr,
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan; stack(r5) int dst_pixels_per_line,
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan; stack(r6) unsigned int *sse
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan
29233d2500723e5594f3e7c70896ffeeef32b9c950ywanbilinear_taps_coeff
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; vp8_sub_pixel_variance16x16_neon_func
;
; Builds a 16x16 bilinear-filtered copy of the source block in a stack
; scratch buffer (horizontal pass selected by xoffset, vertical pass
; selected by yoffset), then accumulates the sum and the sum of squared
; differences between that block and the dst block.
;
; In:    r0 = src_ptr, r1 = src_pixels_per_line
;        r2 = xoffset (0 skips the horizontal pass)
;        r3 = yoffset (0 skips the vertical pass when xoffset != 0)
;        stack: dst_ptr, dst_pixels_per_line, sse (loaded into r4, r5, r6)
; Out:   r0   = sse - ((sum * sum) >> 8)
;        *sse = sum of squared differences
; Saved: r4-r6, lr and d8-d15 are pushed on entry and restored on exit.
;        d8-d15 (q4-q7) must be preserved for the caller under AAPCS and
;        every path below writes to them.
; Clobb: r0, r2, r3, r12, q0-q3, q8-q15, flags
; Stack: 80 bytes of saved registers + 528 bytes of scratch
;-----------------------------------------------------------------------
STACK_ARGS_OFFSET   EQU     80              ;r4-r6, lr (16 bytes) + d8-d15 (64 bytes)
FIRSTPASS_BUF_SIZE  EQU     272             ;17 rows x 16 bytes of first-pass output
FILTERED_BLK_SIZE   EQU     256             ;16 rows x 16 bytes of final filtered block
SCRATCH_SIZE        EQU     528             ;FIRSTPASS_BUF_SIZE + FILTERED_BLK_SIZE

|vp8_sub_pixel_variance16x16_neon_func| PROC
    push            {r4-r6, lr}
    vpush           {d8-d15}                ;callee-saved NEON registers used below

    adr             r12, bilinear_taps_coeff
    ldr             r4, [sp, #STACK_ARGS_OFFSET]        ;load *dst_ptr from stack
    ldr             r5, [sp, #(STACK_ARGS_OFFSET + 4)]  ;load dst_pixels_per_line from stack
    ldr             r6, [sp, #(STACK_ARGS_OFFSET + 8)]  ;load *sse from stack

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             secondpass_bfilter16x16_only

    add             r2, r12, r2, lsl #3     ;calculate filter location (8 bytes per tap pair)

    cmp             r3, #0                  ;skip second_pass filter if yoffset=0

    vld1.s32        {d31}, [r2]             ;load first_pass filter (does not alter flags)

    beq             firstpass_bfilter16x16_only

    sub             sp, sp, #FIRSTPASS_BUF_SIZE ;reserve space on stack for first-pass output
    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data (24 bytes per row)
    mov             lr, sp                      ;lr = first-pass output write pointer
    vld1.u8         {d5, d6, d7}, [r0], r1

    mov             r2, #3                  ;loop counter: 3 iterations x 4 rows
    vld1.u8         {d8, d9, d10}, [r0], r1

    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1): d0 = Filter[0]
    vld1.u8         {d11, d12, d13}, [r0], r1

    vdup.8          d1, d31[4]              ;d1 = Filter[1]

;First Pass: output_height lines x output_width columns (17x16)
;Each iteration filters the 4 rows already in d2-d13 and loads the next 4.
vp8e_filt_blk2d_fp16x16_loop_neon
    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q8, d3, d0
    vmull.u8        q9, d5, d0
    vmull.u8        q10, d6, d0
    vmull.u8        q11, d8, d0
    vmull.u8        q12, d9, d0
    vmull.u8        q13, d11, d0
    vmull.u8        q14, d12, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1] (left 8 pixels)
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1
    vext.8          d11, d11, d12, #1

    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q9, d5, d1
    vmlal.u8        q11, d8, d1
    vmlal.u8        q13, d11, d1

    vext.8          d3, d3, d4, #1          ;construct src_ptr[1] (right 8 pixels)
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1
    vext.8          d12, d12, d13, #1

    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q10, d6, d1
    vmlal.u8        q12, d9, d1
    vmlal.u8        q14, d12, d1

    subs            r2, r2, #1              ;flags consumed by the bne at loop end

    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
    vqrshrn.u16    d15, q8, #7
    vqrshrn.u16    d16, q9, #7
    vqrshrn.u16    d17, q10, #7
    vqrshrn.u16    d18, q11, #7
    vqrshrn.u16    d19, q12, #7
    vqrshrn.u16    d20, q13, #7

    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data for next iteration
    vqrshrn.u16    d21, q14, #7
    vld1.u8         {d5, d6, d7}, [r0], r1

    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result (2 rows)
    vld1.u8         {d8, d9, d10}, [r0], r1
    vst1.u8         {d18, d19, d20, d21}, [lr]!     ;store result (2 rows)
    vld1.u8         {d11, d12, d13}, [r0], r1

    bne             vp8e_filt_blk2d_fp16x16_loop_neon

;First-pass filtering for the remaining 5 lines:
;4 rows are already in d2-d13, the 17th row is loaded here.
    vld1.u8         {d14, d15, d16}, [r0], r1

    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q10, d3, d0
    vmull.u8        q11, d5, d0
    vmull.u8        q12, d6, d0
    vmull.u8        q13, d8, d0
    vmull.u8        q14, d9, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1

    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q11, d5, d1
    vmlal.u8        q13, d8, d1

    vext.8          d3, d3, d4, #1
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1

    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * Filter[1])
    vmlal.u8        q12, d6, d1
    vmlal.u8        q14, d9, d1

    vmull.u8        q1, d11, d0             ;(src_ptr[0] * Filter[0]), last two rows
    vmull.u8        q2, d12, d0
    vmull.u8        q3, d14, d0
    vmull.u8        q4, d15, d0

    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
    vext.8          d14, d14, d15, #1

    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * Filter[1])
    vmlal.u8        q3, d14, d1

    vext.8          d12, d12, d13, #1
    vext.8          d15, d15, d16, #1

    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * Filter[1])
    vmlal.u8        q4, d15, d1

    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
    vqrshrn.u16    d11, q10, #7
    vqrshrn.u16    d12, q11, #7
    vqrshrn.u16    d13, q12, #7
    vqrshrn.u16    d14, q13, #7
    vqrshrn.u16    d15, q14, #7
    vqrshrn.u16    d16, q1, #7
    vqrshrn.u16    d17, q2, #7
    vqrshrn.u16    d18, q3, #7
    vqrshrn.u16    d19, q4, #7

    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
    vst1.u8         {d14, d15, d16, d17}, [lr]!
    vst1.u8         {d18, d19}, [lr]!

;Second pass: 16x16
;secondpass_filter
    add             r3, r12, r3, lsl #3             ;second_pass filter location
    sub             lr, lr, #FIRSTPASS_BUF_SIZE     ;rewind lr to start of first-pass output

    vld1.u32        {d31}, [r3]             ;load second_pass filter

    sub             sp, sp, #FILTERED_BLK_SIZE      ;scratch now totals SCRATCH_SIZE bytes
    mov             r3, sp                  ;r3 = filtered block write pointer

    vld1.u8         {d22, d23}, [lr]!       ;load src data (first row)

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]
    mov             r12, #4                 ;loop counter: 4 iterations x 4 rows

vp8e_filt_blk2d_sp16x16_loop_neon
    vld1.u8         {d24, d25}, [lr]!
    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
    vld1.u8         {d26, d27}, [lr]!
    vmull.u8        q2, d23, d0
    vld1.u8         {d28, d29}, [lr]!
    vmull.u8        q3, d24, d0
    vld1.u8         {d30, d31}, [lr]!

    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8        q2, d25, d1
    vmlal.u8        q3, d26, d1
    vmlal.u8        q4, d27, d1
    vmlal.u8        q5, d28, d1
    vmlal.u8        q6, d29, d1
    vmlal.u8        q7, d30, d1
    vmlal.u8        q8, d31, d1

    subs            r12, r12, #1            ;flags consumed by the bne at loop end

    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
    vqrshrn.u16    d3, q2, #7
    vqrshrn.u16    d4, q3, #7
    vqrshrn.u16    d5, q4, #7
    vqrshrn.u16    d6, q5, #7
    vqrshrn.u16    d7, q6, #7
    vqrshrn.u16    d8, q7, #7
    vqrshrn.u16    d9, q8, #7

    vst1.u8         {d2, d3}, [r3]!         ;store result
    vst1.u8         {d4, d5}, [r3]!
    vst1.u8         {d6, d7}, [r3]!
    vmov            q11, q15                ;last loaded row becomes first row of next iteration
    vst1.u8         {d8, d9}, [r3]!

    bne             vp8e_filt_blk2d_sp16x16_loop_neon

    b               sub_pixel_variance16x16_neon

;--------------------
;xoffset != 0, yoffset == 0: horizontal filter only, output goes straight
;to the filtered block at the bottom of the scratch area.
firstpass_bfilter16x16_only
    mov             r2, #4                      ;loop counter: 4 iterations x 4 rows
    sub             sp, sp, #SCRATCH_SIZE       ;reserve space on stack for temporary storage
    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
    vdup.8          d1, d31[4]
    mov             r3, sp                      ;r3 = filtered block write pointer

;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16_loop_neon
    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
    vld1.u8         {d5, d6, d7}, [r0], r1
    vld1.u8         {d8, d9, d10}, [r0], r1
    vld1.u8         {d11, d12, d13}, [r0], r1

    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q8, d3, d0
    vmull.u8        q9, d5, d0
    vmull.u8        q10, d6, d0
    vmull.u8        q11, d8, d0
    vmull.u8        q12, d9, d0
    vmull.u8        q13, d11, d0
    vmull.u8        q14, d12, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1
    vext.8          d11, d11, d12, #1

    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q9, d5, d1
    vmlal.u8        q11, d8, d1
    vmlal.u8        q13, d11, d1

    vext.8          d3, d3, d4, #1
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1
    vext.8          d12, d12, d13, #1

    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q10, d6, d1
    vmlal.u8        q12, d9, d1
    vmlal.u8        q14, d12, d1

    subs            r2, r2, #1              ;flags consumed by the bne at loop end

    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
    vqrshrn.u16    d15, q8, #7
    vqrshrn.u16    d16, q9, #7
    vqrshrn.u16    d17, q10, #7
    vqrshrn.u16    d18, q11, #7
    vqrshrn.u16    d19, q12, #7
    vqrshrn.u16    d20, q13, #7
    vst1.u8         {d14, d15}, [r3]!       ;store result
    vqrshrn.u16    d21, q14, #7

    vst1.u8         {d16, d17}, [r3]!
    vst1.u8         {d18, d19}, [r3]!
    vst1.u8         {d20, d21}, [r3]!

    bne             vp8e_filt_blk2d_fpo16x16_loop_neon

    b               sub_pixel_variance16x16_neon

;---------------------
;xoffset == 0: vertical filter only, reading rows directly from src.
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
    sub             sp, sp, #SCRATCH_SIZE       ;reserve space on stack for temporary storage
    add             r3, r12, r3, lsl #3         ;second_pass filter location
    mov             r12, #4                     ;loop counter: 4 iterations x 4 rows
    vld1.u32        {d31}, [r3]                 ;load second_pass filter
    vld1.u8         {d22, d23}, [r0], r1        ;load src data (first row)
    mov             r3, sp                      ;r3 = filtered block write pointer

    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

vp8e_filt_blk2d_spo16x16_loop_neon
    vld1.u8         {d24, d25}, [r0], r1
    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
    vld1.u8         {d26, d27}, [r0], r1
    vmull.u8        q2, d23, d0
    vld1.u8         {d28, d29}, [r0], r1
    vmull.u8        q3, d24, d0
    vld1.u8         {d30, d31}, [r0], r1

    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8        q2, d25, d1
    vmlal.u8        q3, d26, d1
    vmlal.u8        q4, d27, d1
    vmlal.u8        q5, d28, d1
    vmlal.u8        q6, d29, d1
    vmlal.u8        q7, d30, d1
    vmlal.u8        q8, d31, d1

    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
    vqrshrn.u16    d3, q2, #7
    vqrshrn.u16    d4, q3, #7
    vqrshrn.u16    d5, q4, #7
    vqrshrn.u16    d6, q5, #7
    vqrshrn.u16    d7, q6, #7
    vqrshrn.u16    d8, q7, #7
    vqrshrn.u16    d9, q8, #7

    vst1.u8         {d2, d3}, [r3]!         ;store result
    subs            r12, r12, #1            ;flags consumed by the bne at loop end
    vst1.u8         {d4, d5}, [r3]!
    vmov            q11, q15                ;last loaded row becomes first row of next iteration
    vst1.u8         {d6, d7}, [r3]!
    vst1.u8         {d8, d9}, [r3]!

    bne             vp8e_filt_blk2d_spo16x16_loop_neon

    b               sub_pixel_variance16x16_neon

;----------------------------
;variance16x16
;On entry r3 points one past the end of the 256-byte filtered block and
;sp has been lowered by SCRATCH_SIZE in every path that reaches here.
sub_pixel_variance16x16_neon
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    sub             r3, r3, #FILTERED_BLK_SIZE  ;rewind r3 to start of filtered block
    mov             r12, #8                     ;loop counter: 8 iterations x 2 rows

sub_pixel_variance16x16_neon_loop
    vld1.8          {q0}, [r3]!                 ;filtered row
    vld1.8          {q2}, [r4], r5              ;dst row
    vld1.8          {q1}, [r3]!
    vld1.8          {q3}, [r4], r5

    vsubl.u8        q11, d0, d4                 ;diff (widened to 16 bits)
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;sum
    vmlal.s16       q9, d22, d22                ;sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;flags consumed by the bne at loop end

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             sub_pixel_variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = sum
    vadd.u64        d1, d2, d3                  ;d1 = sse

    vmull.s32       q5, d0, d0                  ;d10 = sum * sum
    vst1.32         {d1[0]}, [r6]               ;store sse
    vshr.u32        d10, d10, #8                ;(sum * sum) >> 8
    vsub.u32        d0, d1, d10                 ;sse - ((sum * sum) >> 8)

    add             sp, sp, #SCRATCH_SIZE       ;release scratch buffers
    vmov.32         r0, d0[0]                   ;return

    vpop            {d8-d15}                    ;restore callee-saved NEON registers
    pop             {r4-r6,pc}

    ENDP

    END
424