1233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
2233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
4233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Use of this source code is governed by a BSD-style license
5233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  that can be found in the LICENSE file in the root of the source
6233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  tree. An additional intellectual property rights grant can be found
7233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  in the file PATENTS.  All contributing project authors may
8233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  be found in the AUTHORS file in the root of the source tree.
9233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp8_sixtap_predict16x16_neon|
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ARM
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan    REQUIRE8
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan    PRESERVE8
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan
17233d2500723e5594f3e7c70896ffeeef32b9c950ywan    AREA ||.text||, CODE, READONLY, ALIGN=2
18233d2500723e5594f3e7c70896ffeeef32b9c950ywan
19233d2500723e5594f3e7c70896ffeeef32b9c950ywanfilter16_coeff                             ; six-tap filter table: 8 rows x 8 words (32 bytes/row), row address = filter16_coeff + (offset << 5)
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     0,  0,  128,    0,   0,  0,   0,  0   ; offset 0: only vp8_filter[2] is set (128 -> x1 after the >>7), i.e. pass-through
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     0, -6,  123,   12,  -1,  0,   0,  0   ; offset 1: words 0-5 are vp8_filter[0..5]; words 6-7 are zero padding
22233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     2, -11, 108,   36,  -8,  1,   0,  0   ; offset 2: every row's taps sum to 128, matching the rounding shift by 7
23233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     0, -9,   93,   50,  -6,  0,   0,  0   ; offset 3
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     3, -16,  77,   77, -16,  3,   0,  0   ; offset 4: symmetric (half-sample) filter
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     0, -6,   50,   93,  -9,  0,   0,  0   ; offset 5
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     1, -8,   36,  108, -11,  2,   0,  0   ; offset 6
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan    DCD     0, -1,   12,  123,  -6,   0,  0,  0   ; offset 7: signs are kept here; the code takes vabs.s32 and uses vmlsl for the negative taps
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan
29233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0    unsigned char  *src_ptr,
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r1    int  src_pixels_per_line,
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r2    int  xoffset,
32233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r3    int  yoffset,
33233d2500723e5594f3e7c70896ffeeef32b9c950ywan; stack(r4) unsigned char *dst_ptr,
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan; stack(r5) int  dst_pitch
35233d2500723e5594f3e7c70896ffeeef32b9c950ywan
36233d2500723e5594f3e7c70896ffeeef32b9c950ywan;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan; abs() to the filter coeffs to make them u8, then use vmlsl for the negative coeffs. After
38233d2500723e5594f3e7c70896ffeeef32b9c950ywan; multiplication the result can be negative, so it is treated as s16. However, the result
39233d2500723e5594f3e7c70896ffeeef32b9c950ywan; can also be a large positive number (> 2^15-1), which could be mistaken for a negative
40233d2500723e5594f3e7c70896ffeeef32b9c950ywan; number. To avoid that error, apply the filter coeffs in the order 0, 1, 4, 5, 2, which
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan; ensures that the result stays in s16 range. Finally, add the vp8_filter[3] product with
42233d2500723e5594f3e7c70896ffeeef32b9c950ywan; a saturating add (vqadd). The same applies to the other filter functions.
43233d2500723e5594f3e7c70896ffeeef32b9c950ywan
44233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp8_sixtap_predict16x16_neon| PROC
45233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push            {r4-r5, lr}
46233d2500723e5594f3e7c70896ffeeef32b9c950ywan
47233d2500723e5594f3e7c70896ffeeef32b9c950ywan    adr             r12, filter16_coeff
48233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr             r4, [sp, #12]           ;load parameters from stack
49233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr             r5, [sp, #16]           ;load parameters from stack
50233d2500723e5594f3e7c70896ffeeef32b9c950ywan
51233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
52233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq             secondpass_filter16x16_only
53233d2500723e5594f3e7c70896ffeeef32b9c950ywan
54233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r2, r12, r2, lsl #5     ;calculate filter location
55233d2500723e5594f3e7c70896ffeeef32b9c950ywan
56233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
57233d2500723e5594f3e7c70896ffeeef32b9c950ywan
58233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
59233d2500723e5594f3e7c70896ffeeef32b9c950ywan
60233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq             firstpass_filter16x16_only
61233d2500723e5594f3e7c70896ffeeef32b9c950ywan
62233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             sp, sp, #336            ;reserve space on stack for temporary storage
63233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             lr, sp
64233d2500723e5594f3e7c70896ffeeef32b9c950ywan
65233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabs.s32        q12, q14
66233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabs.s32        q13, q15
67233d2500723e5594f3e7c70896ffeeef32b9c950ywan
68233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r2, #7                  ;loop counter
69233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r0, r0, r1, lsl #1
71233d2500723e5594f3e7c70896ffeeef32b9c950ywan
72233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
73233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d1, d24[4]
74233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d2, d25[0]
75233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d3, d25[4]
76233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d4, d26[0]
77233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d5, d26[4]
78233d2500723e5594f3e7c70896ffeeef32b9c950ywan
79233d2500723e5594f3e7c70896ffeeef32b9c950ywan;First Pass: output_height lines x output_width columns (21x16)
80233d2500723e5594f3e7c70896ffeeef32b9c950ywanfilt_blk2d_fp16x16_loop_neon
81233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
82233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d9, d10, d11}, [r0], r1
83233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d12, d13, d14}, [r0], r1
84233d2500723e5594f3e7c70896ffeeef32b9c950ywan
85233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld             [r0]
86233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld             [r0, r1]
87233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld             [r0, r1, lsl #1]
88233d2500723e5594f3e7c70896ffeeef32b9c950ywan
89233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
90233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q9, d7, d0
91233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q10, d9, d0
92233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q11, d10, d0
93233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q12, d12, d0
94233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q13, d13, d0
95233d2500723e5594f3e7c70896ffeeef32b9c950ywan
96233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d9, d10, #1
98233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d12, d13, #1
99233d2500723e5594f3e7c70896ffeeef32b9c950ywan
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q10, d29, d1
102233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q12, d30, d1
103233d2500723e5594f3e7c70896ffeeef32b9c950ywan
104233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d7, d8, #1
105233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d10, d11, #1
106233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d13, d14, #1
107233d2500723e5594f3e7c70896ffeeef32b9c950ywan
108233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
109233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q11, d29, d1
110233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q13, d30, d1
111233d2500723e5594f3e7c70896ffeeef32b9c950ywan
112233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
113233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d9, d10, #4
114233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d12, d13, #4
115233d2500723e5594f3e7c70896ffeeef32b9c950ywan
116233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
117233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q10, d29, d4
118233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q12, d30, d4
119233d2500723e5594f3e7c70896ffeeef32b9c950ywan
120233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d7, d8, #4
121233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d10, d11, #4
122233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d13, d14, #4
123233d2500723e5594f3e7c70896ffeeef32b9c950ywan
124233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
125233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q11, d29, d4
126233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q13, d30, d4
127233d2500723e5594f3e7c70896ffeeef32b9c950ywan
128233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
129233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d9, d10, #5
130233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d12, d13, #5
131233d2500723e5594f3e7c70896ffeeef32b9c950ywan
132233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp8_filter[5])
133233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q10, d29, d5
134233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q12, d30, d5
135233d2500723e5594f3e7c70896ffeeef32b9c950ywan
136233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d7, d8, #5
137233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d10, d11, #5
138233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d13, d14, #5
139233d2500723e5594f3e7c70896ffeeef32b9c950ywan
140233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp8_filter[5])
141233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q11, d29, d5
142233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q13, d30, d5
143233d2500723e5594f3e7c70896ffeeef32b9c950ywan
144233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
145233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d9, d10, #2
146233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d12, d13, #2
147233d2500723e5594f3e7c70896ffeeef32b9c950ywan
148233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp8_filter[2])
149233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q10, d29, d2
150233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q12, d30, d2
151233d2500723e5594f3e7c70896ffeeef32b9c950ywan
152233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d7, d8, #2
153233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d10, d11, #2
154233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d13, d14, #2
155233d2500723e5594f3e7c70896ffeeef32b9c950ywan
156233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp8_filter[2])
157233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q11, d29, d2
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q13, d30, d2
159233d2500723e5594f3e7c70896ffeeef32b9c950ywan
160233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
161233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d9, d10, #3
162233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d30, d12, d13, #3
163233d2500723e5594f3e7c70896ffeeef32b9c950ywan
164233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d15, d7, d8, #3
165233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d31, d10, d11, #3
166233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d6, d13, d14, #3
167233d2500723e5594f3e7c70896ffeeef32b9c950ywan
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp8_filter[3])
169233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q5, d29, d3
170233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q6, d30, d3
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
173233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q10, q5
174233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q12, q6
175233d2500723e5594f3e7c70896ffeeef32b9c950ywan
176233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp8_filter[3])
177233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q7, d31, d3
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q3, d6, d3
179233d2500723e5594f3e7c70896ffeeef32b9c950ywan
180233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs            r2, r2, #1
181233d2500723e5594f3e7c70896ffeeef32b9c950ywan
182233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q9, q6
183233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q11, q7
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q13, q3
185233d2500723e5594f3e7c70896ffeeef32b9c950ywan
186233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
187233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d7, q9, #7
188233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d8, q10, #7
189233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d9, q11, #7
190233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d10, q12, #7
191233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d11, q13, #7
192233d2500723e5594f3e7c70896ffeeef32b9c950ywan
193233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d6, d7, d8}, [lr]!     ;store result
194233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d9, d10, d11}, [lr]!
195233d2500723e5594f3e7c70896ffeeef32b9c950ywan
196233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne             filt_blk2d_fp16x16_loop_neon
197233d2500723e5594f3e7c70896ffeeef32b9c950ywan
198233d2500723e5594f3e7c70896ffeeef32b9c950ywan;Second pass: 16x16
199233d2500723e5594f3e7c70896ffeeef32b9c950ywan;secondpass_filter - do first 8-columns and then second 8-columns
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r3, r12, r3, lsl #5
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             lr, lr, #336
202233d2500723e5594f3e7c70896ffeeef32b9c950ywan
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r3, #2                  ;loop counter
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan
206233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabs.s32        q7, q5
207233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabs.s32        q8, q6
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan
209233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r2, #16
210233d2500723e5594f3e7c70896ffeeef32b9c950ywan
211233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
212233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d1, d14[4]
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d2, d15[0]
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d3, d15[4]
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d4, d16[0]
216233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d5, d16[4]
217233d2500723e5594f3e7c70896ffeeef32b9c950ywan
218233d2500723e5594f3e7c70896ffeeef32b9c950ywanfilt_blk2d_sp16x16_outloop_neon
219233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d18}, [lr], r2         ;load src data
220233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d19}, [lr], r2
221233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d20}, [lr], r2
222233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d21}, [lr], r2
223233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r12, #4                 ;loop counter
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d22}, [lr], r2
225233d2500723e5594f3e7c70896ffeeef32b9c950ywan
226233d2500723e5594f3e7c70896ffeeef32b9c950ywansecondpass_inner_loop_neon
227233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d23}, [lr], r2         ;load src data
228233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d24}, [lr], r2
229233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d25}, [lr], r2
230233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d26}, [lr], r2
231233d2500723e5594f3e7c70896ffeeef32b9c950ywan
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q4, d19, d0
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q5, d20, d0
235233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q6, d21, d0
236233d2500723e5594f3e7c70896ffeeef32b9c950ywan
237233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q4, d20, d1
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q5, d21, d1
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q6, d22, d1
241233d2500723e5594f3e7c70896ffeeef32b9c950ywan
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
243233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q4, d23, d4
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q5, d24, d4
245233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q6, d25, d4
246233d2500723e5594f3e7c70896ffeeef32b9c950ywan
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
248233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q4, d21, d2
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q5, d22, d2
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q6, d23, d2
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q4, d24, d5
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q5, d25, d5
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q6, d26, d5
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q8, d22, d3
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q9, d23, d3
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q10, d24, d3
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs            r12, r12, #1
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q8, q4
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q9, q5
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q10, q6
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d7, q8, #7
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d8, q9, #7
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d9, q10, #7
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d6}, [r4], r5          ;store result
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov            q9, q11
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d7}, [r4], r5
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov            q10, q12
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d8}, [r4], r5
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov            d22, d26
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d9}, [r4], r5
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne             secondpass_inner_loop_neon
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs            r3, r3, #1
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             lr, lr, #336
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             lr, lr, #8
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r4, r4, r5, lsl #4
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r4, r4, #8
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne filt_blk2d_sp16x16_outloop_neon
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             sp, sp, #336
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop             {r4-r5,pc}
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan;--------------------
297233d2500723e5594f3e7c70896ffeeef32b9c950ywanfirstpass_filter16x16_only
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabs.s32        q12, q14
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabs.s32        q13, q15
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r2, #8                  ;loop counter
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r0, r0, #2              ;move srcptr back to (column-2)
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d1, d24[4]
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d2, d25[0]
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d3, d25[4]
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d4, d26[0]
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d5, d26[4]
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan;First Pass: output_height lines x output_width columns (16x16)
312233d2500723e5594f3e7c70896ffeeef32b9c950ywanfilt_blk2d_fpo16x16_loop_neon
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d9, d10, d11}, [r0], r1
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld             [r0]
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld             [r0, r1]
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q7, d7, d0
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q8, d9, d0
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q9, d10, d0
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d21, d9, d10, #1
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d22, d7, d8, #1
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d23, d10, d11, #1
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d25, d9, d10, #4
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d26, d7, d8, #4
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d27, d10, d11, #4
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d9, d10, #5
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp8_filter[1])
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q8, d21, d1
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp8_filter[1])
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q9, d23, d1
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp8_filter[4])
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q8, d25, d4
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp8_filter[4])
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q9, d27, d4
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp8_filter[5])
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q8, d29, d5
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d20, d7, d8, #5
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d21, d10, d11, #5
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d23, d9, d10, #2
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d24, d7, d8, #2
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d25, d10, d11, #2
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d27, d9, d10, #3
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d28, d7, d8, #3
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vext.8          d29, d10, d11, #3
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp8_filter[5])
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q9, d21, d5
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp8_filter[2])
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q8, d23, d2
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp8_filter[2])
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q9, d25, d2
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan
365233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp8_filter[3])
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q11, d27, d3
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp8_filter[3])
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q15, d29, d3
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q8, q11
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q7, q12
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q9, q15
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs            r2, r2, #1
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d7, q7, #7
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d8, q8, #7
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d9, q9, #7
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {q3}, [r4], r5              ;store result
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {q4}, [r4], r5
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne             filt_blk2d_fpo16x16_loop_neon
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop             {r4-r5,pc}
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan;--------------------
390233d2500723e5594f3e7c70896ffeeef32b9c950ywansecondpass_filter16x16_only
;Second-pass-only path: applies the 6-tap filter vertically (down the rows)
;and writes a 16x16 block. The block is produced as two 8-pixel-wide strips
;of 16 rows each; every pass of the inner loop emits 4 rows of one strip.
;
;Register roles as used in this section:
;   r0  = source pointer, advanced by r1 after every row load
;   r1  = source stride
;   r4  = destination pointer, advanced by r5 after every row store
;   r5  = destination stride
;   r3  = on entry: filter row index; afterwards reused as the strip counter
;   r12 = on entry: base the scaled index is added to (presumably the
;         filter16_coeff table - it is set outside this section, confirm);
;         afterwards reused as the inner loop counter
;   d0-d5   = absolute values of taps 0..5, one tap per register, every byte lane
;   d18-d26 = nine consecutive source rows (8 pixels each) of the current strip
;
;NOTE(review): this path writes q4-q7 (d8-d15) and the exit below restores only
;r4-r5/pc. The AAPCS lists d8-d15 as callee-saved - confirm whether they are
;preserved elsewhere or whether callers tolerate the clobber.
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan;Second pass: 16x16
;r3 = r12 + r3*32 : one filter16_coeff row is 8 words (32 bytes)
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r3, r12, r3, lsl #5
;back the source pointer up two rows so the first row loaded is src_ptr[-2]
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r0, r0, r1, lsl #1
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.s32        {q5, q6}, [r3]          ;load the 8 words of the second_pass filter row
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r3, #2                  ;outer loop counter: two 8-pixel-wide strips
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;take |tap| so the taps can be used as unsigned byte multipliers; the sign is
;re-applied below by choosing vmlsl (taps 1 and 4) or vmull/vmlal (the rest)
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabs.s32        q7, q5
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabs.s32        q8, q6
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;byte lanes 0 and 4 of a d register are the low bytes of its two 32-bit words,
;so each vdup broadcasts the low byte of one |tap| to all 8 byte lanes
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d1, d14[4]
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d2, d15[0]
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d3, d15[4]
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d4, d16[0]
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vdup.8          d5, d16[4]
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;Outer loop: one pass per 8-pixel-wide strip. Prime the row window with the
;first five source rows (d18-d22).
408233d2500723e5594f3e7c70896ffeeef32b9c950ywanfilt_blk2d_spo16x16_outloop_neon
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d18}, [r0], r1         ;load src data
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d19}, [r0], r1
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d20}, [r0], r1
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d21}, [r0], r1
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov             r12, #4                 ;inner loop counter: 4 passes x 4 rows = 16 rows
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d22}, [r0], r1
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;Inner loop: load four more rows (d23-d26), giving nine rows d18-d26, and
;compute four output rows. Output row k uses source rows d(18+k)..d(23+k).
416233d2500723e5594f3e7c70896ffeeef32b9c950ywansecondpass_only_inner_loop_neon
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d23}, [r0], r1         ;load src data
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d24}, [r0], r1
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d25}, [r0], r1
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8         {d26}, [r0], r1
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;q3-q6 accumulate taps 0,1,2,4,5 for output rows 0-3 (16-bit lanes)
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q4, d19, d0
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q5, d20, d0
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q6, d21, d0
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q4, d20, d1
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q5, d21, d1
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q6, d22, d1
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q4, d23, d4
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q5, d24, d4
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlsl.u8        q6, d25, d4
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q4, d21, d2
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q5, d22, d2
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q6, d23, d2
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q4, d24, d5
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q5, d25, d5
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8        q6, d26, d5
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;The tap-3 product goes into separate accumulators q7-q10 and is merged with a
;saturating add below (presumably so the 16-bit sum cannot wrap - confirm).
;q9/q10 alias d18-d21; those rows have all been consumed by this point
;(d21 is read for q7 before q10 is written), so overwriting them is safe.
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q8, d22, d3
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q9, d23, d3
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.u8        q10, d24, d3
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;decrement the inner counter here; the flags are consumed by the bne at the
;end of the loop (no instruction in between uses the "s" flag-setting form)
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs            r12, r12, #1
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q8, q4
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q9, q5
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s16       q10, q6
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d6, q7, #7              ;rounding shift right by 7, narrow and saturate to u8
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d7, q8, #7
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d8, q9, #7
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrun.s16    d9, q10, #7
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;Store the four 8-pixel output rows. Interleaved with the stores, slide the
;row window down by four rows for the next pass:
;   d18,d19 <- d22,d23    d20,d21 <- d24,d25    d22 <- d26
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d6}, [r4], r5          ;store result
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov            q9, q11
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d7}, [r4], r5
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov            q10, q12
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d8}, [r4], r5
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov            d22, d26
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8         {d9}, [r4], r5
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne             secondpass_only_inner_loop_neon
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;Strip finished. Decrement the strip counter first; the sub/add instructions
;that follow do not set flags, so the final bne still sees this result.
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs            r3, r3, #1
;rewind the source pointer by the 21 rows read for this strip (5 + 4*4 =
;16 + 4 + 1), then step 8 bytes right to the next strip
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r0, r0, r1, lsl #4
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r0, r0, r1, lsl #2
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r0, r0, r1
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r0, r0, #8
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;rewind the destination pointer by the 16 rows written, then step 8 bytes right
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub             r4, r4, r5, lsl #4
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add             r4, r4, #8
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne filt_blk2d_spo16x16_outloop_neon
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;restore r4-r5 and return by loading pc (the matching push is outside this
;section - presumably {r4-r5, lr} at function entry, confirm)
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop             {r4-r5,pc}
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan;-----------------
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan    END
491