;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    ; These functions are only valid when:
    ; x_step_q4 == 16 (horizontal) / y_step_q4 == 16 (vertical)
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
    ; VP9_FILTER_WEIGHT == 128
    ; VP9_FILTER_SHIFT == 7

20233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_convolve8_horiz_neon|
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_convolve8_vert_neon|
22233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IMPORT  |vp9_convolve8_horiz_c|
23233d2500723e5594f3e7c70896ffeeef32b9c950ywan    IMPORT  |vp9_convolve8_vert_c|
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ARM
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan    REQUIRE8
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan    PRESERVE8
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan    AREA ||.text||, CODE, READONLY, ALIGN=2
29233d2500723e5594f3e7c70896ffeeef32b9c950ywan
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Multiply and accumulate by q0
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MACRO
32233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
33233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s16 $dst, $src0, d0[0]
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16 $dst, $src1, d0[1]
35233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16 $dst, $src2, d0[2]
36233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16 $dst, $src3, d0[3]
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16 $dst, $src4, d1[0]
38233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16 $dst, $src5, d1[1]
39233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16 $dst, $src6, d1[2]
40233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.s16 $dst, $src7, d1[3]
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan    MEND
42233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; vp9_convolve8_horiz_neon
;
; 8-tap horizontal filter. Works on 4x4 output blocks: the outer loop
; steps down 4 rows at a time, the inner loop steps right 4 pixels at a
; time. Branches to vp9_convolve8_horiz_c when x_step_q4 != 16.
;
; Arguments (stack offsets are relative to sp on entry):
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst
; r3      int dst_stride
; sp[0]   const int16_t *filter_x
; sp[4]   int x_step_q4
; sp[8]   const int16_t *filter_y ; unused
; sp[12]  int y_step_q4           ; unused
; sp[16]  int w
; sp[20]  int h
;
; Register roles after the prologue:
; r4   4 - 3 * dst_stride   (from 4th dst row back to 1st, 4 pixels right)
; r5   filter_x pointer, then prefetch base inside the inner loop
; r6   columns left in the current band of 4 rows
; r7   rows left
; r8   4 - 3 * src_stride   (from 4th src row back to 1st, 4 bytes right)
; r9   4 * src_stride - w - 7   (src step to the next band of rows)
; r10  saved w
; r12  4 * dst_stride - w       (dst step to the next band of rows)
; q0   the eight filter taps
; q8-q11   sliding window of 7 source columns (4 rows each, as s16)
; q12-q15  newly loaded columns / temporaries
; q1, q2   results
;-----------------------------------------------------------------------

|vp9_convolve8_horiz_neon| PROC
    ldr             r12, [sp, #4]           ; x_step_q4
    cmp             r12, #16
    bne             vp9_convolve8_horiz_c   ; arguments are still untouched

    push            {r4-r10, lr}            ; 8 registers: stack args move up 32 bytes

    sub             r0, r0, #3              ; adjust for taps

    ldr             r5, [sp, #32]           ; filter_x
    ldr             r6, [sp, #48]           ; w
    ldr             r7, [sp, #52]           ; h

    vld1.s16        {q0}, [r5]              ; filter_x

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4

    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
    add             r4, r4, #4              ; -dst_stride * 3 + 4

    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
    sub             r9, r9, #7              ; src_stride * 4 - w - 7
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop

    mov             r10, r6                 ; w loop counter

loop_horiz_v
    ; Prime the window: 8 bytes from each of 4 rows, starting at src - 3.
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
    vld1.8          {d27}, [r0], r8         ; back to the first row, +4 bytes

    ; Transpose so that each 4-byte half of a d register is one source
    ; column across the 4 rows:
    ;   d24 = col0|col4  d25 = col1|col5  d26 = col2|col6  d27 = col3|col7
    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

    pld             [r0, r1, lsl #2]        ; prefetch hint: 4 rows further down

    ; Widen to 16 bits: d16..d23 = col0, col4, col1, col5, col2, col6, col3, col7
    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27

    ; save a few instructions in the inner loop
    ; Window order becomes d16, d17, d20, d22, d18, d19, d23 = col0..col6,
    ; the same layout the end of the inner loop produces.
    vswp            d17, d18
    vmov            d23, d21

    add             r0, r0, #3              ; r0 = first column not yet loaded

loop_horiz
    add             r5, r0, #64             ; prefetch base, 64 bytes ahead

    ; 4 new bytes from each of 4 rows, duplicated into both halves of the
    ; d register.
    vld1.32         {d28[]}, [r0], r1
    vld1.32         {d29[]}, [r0], r1
    vld1.32         {d31[]}, [r0], r1
    vld1.32         {d30[]}, [r0], r8       ; back to the first row, +4 bytes

    pld             [r5]

    ; Transpose the 4x4 block of new bytes into columns.
    vtrn.16         d28, d31
    vtrn.16         d29, d30
    vtrn.8          d28, d29
    vtrn.8          d31, d30

    pld             [r5, r1]

    ; extract to s16
    ; After this: d24 = col7, d26 = col8, d27 = col9, d25 = col10
    vtrn.32         q14, q15
    vmovl.u8        q12, d28
    vmovl.u8        q13, d29

    pld             [r5, r1, lsl #1]

    ; src[] * filter_x
    ; q1, q2, q14, q15 = output columns 0, 1, 2, 3 (4 rows each)
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25

    pld             [r5, -r8]               ; r5 + 3 * src_stride - 4

    ; (sum + 64) >> 7, saturated to unsigned 16 bits
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate to unsigned 8 bits
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; transpose back to rows:
    ;   d2 = row0|row2, d3 = row1|row3 (4 pixels each)
    vtrn.16         d2, d3
    vtrn.32         d2, d3
    vtrn.8          d2, d3

    ; The @32 qualifier states that each dst row address is 32-bit aligned.
    vst1.u32        {d2[0]}, [r2@32], r3
    vst1.u32        {d3[0]}, [r2@32], r3
    vst1.u32        {d2[1]}, [r2@32], r3
    vst1.u32        {d3[1]}, [r2@32], r4    ; back to the first row, +4 pixels

    ; Slide the window 4 columns: d16, d17, d20, d22, d18, d19, d23
    ; now hold col4..col10.
    vmov            q8,  q9
    vmov            d20, d23
    vmov            q11, q12
    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
    bgt             loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w - 7
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
    bgt loop_horiz_v

    pop             {r4-r10, pc}            ; saved lr is popped into pc (return)

    ENDP

;-----------------------------------------------------------------------
; vp9_convolve8_vert_neon
;
; 8-tap vertical filter. Works on 4x4 output blocks: the outer loop
; steps right 4 pixels at a time, the inner loop steps down 4 rows at a
; time. Branches to vp9_convolve8_vert_c when y_step_q4 != 16.
;
; Arguments (stack offsets are relative to sp on entry):
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst
; r3      int dst_stride
; sp[0]   const int16_t *filter_x ; not read here
; sp[4]   int x_step_q4           ; not read here
; sp[8]   const int16_t *filter_y
; sp[12]  int y_step_q4
; sp[16]  int w
; sp[20]  int h
;
; Register roles after the prologue:
; r1, r3   2 * src_stride, 2 * dst_stride
; r4, r7   src pointers for even / odd rows of the current column strip
; r5, r8   dst pointers for even / odd rows of the current column strip
; r6       columns left
; r12      rows left in the current column strip
; lr       h (the return address is on the stack)
; q0       the eight filter taps
; q8-q11   sliding window of 7 source rows (4 pixels each, as s16)
; q12, q13 the 4 newly loaded rows
; q1, q2, q14, q15  sums / results
;-----------------------------------------------------------------------

|vp9_convolve8_vert_neon| PROC
    ldr             r12, [sp, #12]          ; y_step_q4
    cmp             r12, #16
    bne             vp9_convolve8_vert_c    ; arguments are still untouched

    push            {r4-r8, lr}             ; 6 registers: stack args move up 24 bytes

    ; adjust for taps: src -= 3 * src_stride
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

    ldr             r4, [sp, #32]           ; filter_y
    ldr             r6, [sp, #40]           ; w
    ldr             lr, [sp, #44]           ; h

    vld1.s16        {q0}, [r4]              ; filter_y

    ; Two pointers walk alternate rows, so each one advances by two strides.
    lsl             r1, r1, #1
    lsl             r3, r3, #1

loop_vert_h
    mov             r4, r0                  ; even source rows
    add             r7, r0, r1, asr #1      ; odd source rows (r0 + src_stride)
    mov             r5, r2                  ; even destination rows
    add             r8, r2, r3, asr #1      ; odd destination rows (r2 + dst_stride)
    mov             r12, lr                 ; h loop counter

    ; Prime the window with source rows 0..6, 4 pixels per row.
    vld1.u32        {d16[0]}, [r4], r1
    vld1.u32        {d16[1]}, [r7], r1
    vld1.u32        {d18[0]}, [r4], r1
    vld1.u32        {d18[1]}, [r7], r1
    vld1.u32        {d20[0]}, [r4], r1
    vld1.u32        {d20[1]}, [r7], r1
    vld1.u32        {d22[0]}, [r4], r1

    ; Widen to 16 bits: d16..d22 = rows 0..6. d23 comes from the lane of
    ; d22 that was not loaded and is never used as a filter input.
    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

loop_vert
    ; always process a 4x4 block at a time
    ; Load rows 7..10: d24 = row7|row10, d26 = row8|row9
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
    vld1.u32        {d26[1]}, [r7], r1
    vld1.u32        {d24[1]}, [r4], r1

    ; extract to s16
    ; d24 = row7, d25 = row10, d26 = row8, d27 = row9
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

    pld             [r5]                    ; prefetch hints on the dst rows
    pld             [r8]

    ; src[] * filter_y
    ; q1, q2, q14, q15 = output rows 0, 1, 2, 3; the prefetch hints are
    ; interleaved between the multiply sequences.
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r5, r3]
    pld             [r8, r3]

    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26

    pld             [r7]                    ; prefetch hints on the next src rows
    pld             [r4]

    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld             [r7, r1]
    pld             [r4, r1]

    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; (sum + 64) >> 7, saturated to unsigned 16 bits
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate to unsigned 8 bits: d2 = row0|row1, d3 = row2|row3
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; The @32 qualifier states that each dst row address is 32-bit aligned.
    vst1.u32        {d2[0]}, [r5@32], r3
    vst1.u32        {d2[1]}, [r8@32], r3
    vst1.u32        {d3[0]}, [r5@32], r3
    vst1.u32        {d3[1]}, [r8@32], r3

    ; Slide the window down 4 rows: d16..d22 now hold rows 4..10.
    vmov            q8, q10
    vmov            d18, d22
    vmov            d19, d24
    vmov            q10, q13
    vmov            d22, d25

    subs            r12, r12, #4            ; h -= 4
    bgt             loop_vert

    ; outer loop: move to the next strip of 4 columns
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
    bgt             loop_vert_h

    pop             {r4-r8, pc}             ; saved lr is popped into pc (return)

    ENDP
    END