191037db265ecdd914a26e056cf69207b4f50924ehkuang;
291037db265ecdd914a26e056cf69207b4f50924ehkuang;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
391037db265ecdd914a26e056cf69207b4f50924ehkuang;
491037db265ecdd914a26e056cf69207b4f50924ehkuang;  Use of this source code is governed by a BSD-style license
591037db265ecdd914a26e056cf69207b4f50924ehkuang;  that can be found in the LICENSE file in the root of the source
691037db265ecdd914a26e056cf69207b4f50924ehkuang;  tree. An additional intellectual property rights grant can be found
791037db265ecdd914a26e056cf69207b4f50924ehkuang;  in the file PATENTS.  All contributing project authors may
891037db265ecdd914a26e056cf69207b4f50924ehkuang;  be found in the AUTHORS file in the root of the source tree.
991037db265ecdd914a26e056cf69207b4f50924ehkuang;
1091037db265ecdd914a26e056cf69207b4f50924ehkuang
1191037db265ecdd914a26e056cf69207b4f50924ehkuang
1291037db265ecdd914a26e056cf69207b4f50924ehkuang    ; These functions are only valid when:
1391037db265ecdd914a26e056cf69207b4f50924ehkuang    ; x_step_q4 == 16
1491037db265ecdd914a26e056cf69207b4f50924ehkuang    ; w%4 == 0
1591037db265ecdd914a26e056cf69207b4f50924ehkuang    ; h%4 == 0
1691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; taps == 8
1791037db265ecdd914a26e056cf69207b4f50924ehkuang    ; VP9_FILTER_WEIGHT == 128
1891037db265ecdd914a26e056cf69207b4f50924ehkuang    ; VP9_FILTER_SHIFT == 7
1991037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Make both entry points visible to the linker.
20da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    EXPORT  |vpx_convolve8_avg_horiz_neon|
21da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    EXPORT  |vpx_convolve8_avg_vert_neon|
    ; Assemble what follows as ARM (A32) instructions rather than Thumb.
2291037db265ecdd914a26e056cf69207b4f50924ehkuang    ARM
    ; Build attributes: this code requires, and preserves, 8-byte stack
    ; alignment. Both functions push an even number of 4-byte registers.
2391037db265ecdd914a26e056cf69207b4f50924ehkuang    REQUIRE8
2491037db265ecdd914a26e056cf69207b4f50924ehkuang    PRESERVE8
2591037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Read-only code section, aligned to 2^2 = 4 bytes.
2691037db265ecdd914a26e056cf69207b4f50924ehkuang    AREA ||.text||, CODE, READONLY, ALIGN=2
2791037db265ecdd914a26e056cf69207b4f50924ehkuang
2891037db265ecdd914a26e056cf69207b4f50924ehkuang    ; Multiply and accumulate by q0
    ;
    ; MULTIPLY_BY_Q0 $dst, $src0 .. $src7
    ;   $dst         q register; receives 4 lanes of s32:
    ;                $dst[lane] = sum over i of $src<i>[lane] * tap<i>
    ;   $src0..$src7 d registers, each holding 4 lanes of s16
    ;   taps 0-3 are read from d0[0..3] and taps 4-7 from d1[0..3], so the
    ;   caller must have the 8 filter taps loaded in q0 (d0:d1).
    ;
    ; The leading vmull overwrites $dst, so it does not need to be cleared
    ; beforehand. Nothing other than $dst is written.
2991037db265ecdd914a26e056cf69207b4f50924ehkuang    MACRO
3091037db265ecdd914a26e056cf69207b4f50924ehkuang    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
3191037db265ecdd914a26e056cf69207b4f50924ehkuang    vmull.s16 $dst, $src0, d0[0]
3291037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src1, d0[1]
3391037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src2, d0[2]
3491037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src3, d0[3]
3591037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src4, d1[0]
3691037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src5, d1[1]
3791037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src6, d1[2]
3891037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src7, d1[3]
3991037db265ecdd914a26e056cf69207b4f50924ehkuang    MEND
4091037db265ecdd914a26e056cf69207b4f50924ehkuang
4191037db265ecdd914a26e056cf69207b4f50924ehkuang; r0    const uint8_t *src
4291037db265ecdd914a26e056cf69207b4f50924ehkuang; r1    int src_stride
4391037db265ecdd914a26e056cf69207b4f50924ehkuang; r2    uint8_t *dst
4491037db265ecdd914a26e056cf69207b4f50924ehkuang; r3    int dst_stride
45df37111358d02836cb29bbcb9c6e4c95dff90a16Johann; sp[0]  const int16_t *filter   (stack offsets are at function entry, before any push)
46df37111358d02836cb29bbcb9c6e4c95dff90a16Johann; sp[4]  int x0_q4
47df37111358d02836cb29bbcb9c6e4c95dff90a16Johann; sp[8]  int x_step_q4 ; unused
48df37111358d02836cb29bbcb9c6e4c95dff90a16Johann; sp[12] int y0_q4
49df37111358d02836cb29bbcb9c6e4c95dff90a16Johann; sp[16] int y_step_q4 ; unused
5091037db265ecdd914a26e056cf69207b4f50924ehkuang; sp[20] int w
5191037db265ecdd914a26e056cf69207b4f50924ehkuang; sp[24] int h
5291037db265ecdd914a26e056cf69207b4f50924ehkuang
; ----------------------------------------------------------------------
; vpx_convolve8_avg_horiz_neon
;
; Horizontal 8-tap filter with averaging. For every output pixel the 8
; source pixels starting 3 to the left are multiplied by the 8 int16 taps
; found at filter + x0_q4 * 16 bytes, the sum is rounded, shifted right by
; 7 and saturated to 8 bits, and the rounding average of that value and
; the pixel already in dst is written back to dst.
;
; Work is done in 4x4 output tiles: the outer loop steps down 4 rows, the
; inner loop steps right 4 columns. Data is transposed so that each d
; register holds one source column for the 4 rows being processed.
;
; Register roles after setup:
;   r0  src cursor              r1  src_stride
;   r2  dst cursor              r3  dst_stride
;   r4  -3 * dst_stride + 4     r8  -3 * src_stride + 4
;   r5  prefetch base           r6  columns left in this row band
;   r7  rows left               r10 saved w
;   r9  src step to next band   r12 dst step to next band
;   q0  filter taps             q3  existing dst pixels
;   d16,d17,d20,d22,d18,d19,d23 = window columns 0..6 (in that order)
; ----------------------------------------------------------------------
53da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian|vpx_convolve8_avg_horiz_neon| PROC
    ; 8 registers = 32 bytes pushed, so stack arguments start at [sp, #32].
5491037db265ecdd914a26e056cf69207b4f50924ehkuang    push            {r4-r10, lr}
5591037db265ecdd914a26e056cf69207b4f50924ehkuang
5691037db265ecdd914a26e056cf69207b4f50924ehkuang    sub             r0, r0, #3              ; adjust for taps
5791037db265ecdd914a26e056cf69207b4f50924ehkuang
58df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4
    ; r4 = filter + x0_q4 * 16: address of the 8 x int16 kernel to use.
59df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    add             r4, r5, lsl #4
60df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ldrd            r6, r7, [sp, #52]       ; w, h
6191037db265ecdd914a26e056cf69207b4f50924ehkuang
62df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    vld1.s16        {q0}, [r4]              ; filter
6391037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; r8 is the post-increment used on the 4th row access: it steps back up
    ; three rows and 4 bytes to the right, ready for the next 4 columns.
643df0563f1b24dac6c0bd122fc922a48211269061hkuang    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
653df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r8, r8, #4              ; -src_stride * 3 + 4
6691037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Same idea for dst; r4 is free again now that the taps are in q0.
673df0563f1b24dac6c0bd122fc922a48211269061hkuang    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
683df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r4, r4, #4              ; -dst_stride * 3 + 4
6991037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; r9 = src_stride * 4 - w - 7. At the end of a band r0 sits at
    ; (band start) + 7 + w, so adding r9 lands on the next band's start.
703df0563f1b24dac6c0bd122fc922a48211269061hkuang    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
713df0563f1b24dac6c0bd122fc922a48211269061hkuang    sub             r9, r9, #7
    ; r12 = dst_stride * 4 - w
7291037db265ecdd914a26e056cf69207b4f50924ehkuang    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
7391037db265ecdd914a26e056cf69207b4f50924ehkuang
7491037db265ecdd914a26e056cf69207b4f50924ehkuang    mov             r10, r6                 ; w loop counter
7591037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; ---- outer loop: one band of 4 rows per iteration ----
76da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvpx_convolve8_avg_loop_horiz_v
    ; Prime the window: read the first 8 bytes of each of the 4 rows.
    ; After the last load r0 is back on the first row, 4 bytes to the right.
773df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.8          {d24}, [r0], r1
783df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.8          {d25}, [r0], r1
793df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.8          {d26}, [r0], r1
803df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.8          {d27}, [r0], r8
8191037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Transpose rows into columns. Afterwards each d register holds two
    ; columns of 4 bytes (one byte per row):
    ;   d24 = col0|col4, d25 = col1|col5, d26 = col2|col6, d27 = col3|col7
82f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vtrn.16         q12, q13
83f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vtrn.8          d24, d25
84f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vtrn.8          d26, d27
85f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
    ; Prefetch hint for the rows 4 lines further down.
863df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r0, r1, lsl #2]
873df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Widen u8 -> 16 bit so the columns can feed the s16 multiplies:
    ;   d16=col0 d17=col4 d18=col1 d19=col5 d20=col2 d21=col6 d22=col3 d23=col7
8891037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q8, d24
8991037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q9, d25
9091037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q10, d26
9191037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q11, d27
923df0563f1b24dac6c0bd122fc922a48211269061hkuang
933df0563f1b24dac6c0bd122fc922a48211269061hkuang    ; save a few instructions in the inner loop
    ; Rearrange into the layout the inner loop maintains:
    ;   d16=col0 d17=col1 d20=col2 d22=col3 d18=col4 d19=col5 d23=col6
    ; Column 7 is overwritten here; the inner loop reloads it as the first
    ; of its 4 new columns.
943df0563f1b24dac6c0bd122fc922a48211269061hkuang    vswp            d17, d18
953df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d23, d21
963df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; r0 was 4 bytes into the band; +3 puts it on column 7.
973df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r0, r0, #3
983df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; ---- inner loop: one 4x4 output tile per iteration ----
99da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvpx_convolve8_avg_loop_horiz
1003df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r5, r0, #64
1013df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Load the next 4 source bytes of each row (window columns 7..10),
    ; duplicated into both halves of the d register. Note the destination
    ; order d28, d29, d31, d30; the transposes below rely on it.
1023df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.32         {d28[]}, [r0], r1
1033df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.32         {d29[]}, [r0], r1
1043df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.32         {d31[]}, [r0], r1
1053df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.32         {d30[]}, [r0], r8
1063df0563f1b24dac6c0bd122fc922a48211269061hkuang
1073df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5]
1083df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; 4x4 byte transpose: rows -> columns.
1093df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.16         d28, d31
1103df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.16         d29, d30
1113df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.8          d28, d29
1123df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.8          d31, d30
1133df0563f1b24dac6c0bd122fc922a48211269061hkuang
1143df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5, r1]
1153df0563f1b24dac6c0bd122fc922a48211269061hkuang
1163df0563f1b24dac6c0bd122fc922a48211269061hkuang    ; extract to s16
    ; After the vtrn.32: d28 = col7|col10, d29 = col8|col9, hence after
    ; widening: d24=col7 d25=col10 d26=col8 d27=col9.
1173df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.32         q14, q15
11891037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q12, d28
1193df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q13, d29
1203df0563f1b24dac6c0bd122fc922a48211269061hkuang
1213df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5, r1, lsl #1]
12291037db265ecdd914a26e056cf69207b4f50924ehkuang
12391037db265ecdd914a26e056cf69207b4f50924ehkuang    ; slightly out of order load to match the existing data
    ; q3 ends up as d6 = dst row0|row2, d7 = dst row1|row3, the same lane
    ; layout the filtered result has after its transpose below.
12491037db265ecdd914a26e056cf69207b4f50924ehkuang    vld1.u32        {d6[0]}, [r2], r3
12591037db265ecdd914a26e056cf69207b4f50924ehkuang    vld1.u32        {d7[0]}, [r2], r3
12691037db265ecdd914a26e056cf69207b4f50924ehkuang    vld1.u32        {d6[1]}, [r2], r3
12791037db265ecdd914a26e056cf69207b4f50924ehkuang    vld1.u32        {d7[1]}, [r2], r3
12891037db265ecdd914a26e056cf69207b4f50924ehkuang
12991037db265ecdd914a26e056cf69207b4f50924ehkuang    sub             r2, r2, r3, lsl #2      ; reset for store
13091037db265ecdd914a26e056cf69207b4f50924ehkuang
131df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ; src[] * filter
    ; One accumulator per output column (4 rows each):
    ;   q1 = columns 0..7, q2 = 1..8, q14 = 2..9, q15 = 3..10
1323df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
1333df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
1343df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
1353df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
1363df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; -r8 = 3 * src_stride - 4
1373df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5, -r8]
13891037db265ecdd914a26e056cf69207b4f50924ehkuang
13991037db265ecdd914a26e056cf69207b4f50924ehkuang    ; (sum + 64) >> 7, narrowed s32 -> u16 with unsigned saturation
14091037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d2, q1, #7
14191037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d3, q2, #7
14291037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d4, q14, #7
14391037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d5, q15, #7
14491037db265ecdd914a26e056cf69207b4f50924ehkuang
14591037db265ecdd914a26e056cf69207b4f50924ehkuang    ; saturating narrow u16 -> u8: d2 = out col0|col1, d3 = out col2|col3
146f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vqmovn.u16      d2, q1
147f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vqmovn.u16      d3, q2
14891037db265ecdd914a26e056cf69207b4f50924ehkuang
14991037db265ecdd914a26e056cf69207b4f50924ehkuang    ; transpose columns back to rows: d2 = row0|row2, d3 = row1|row3
15091037db265ecdd914a26e056cf69207b4f50924ehkuang    vtrn.16         d2, d3
15191037db265ecdd914a26e056cf69207b4f50924ehkuang    vtrn.32         d2, d3
15291037db265ecdd914a26e056cf69207b4f50924ehkuang    vtrn.8          d2, d3
1533df0563f1b24dac6c0bd122fc922a48211269061hkuang
15491037db265ecdd914a26e056cf69207b4f50924ehkuang    ; average the new value and the dst value
    ; vrhadd rounds up: (a + b + 1) >> 1 per byte.
155f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vrhadd.u8       q1, q1, q3
15691037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Store rows 0..3. The @32 qualifier asserts 4-byte aligned addresses.
    ; The final post-increment (r4) moves r2 to the next tile's first row.
1573df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d2[0]}, [r2@32], r3
1583df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d3[0]}, [r2@32], r3
1593df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d2[1]}, [r2@32], r3
1603df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d3[1]}, [r2@32], r4
1613df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Slide the window 4 columns right; old columns 4..10 become 0..6:
    ;   d16<-col4 d17<-col5 d20<-col6 d22<-col7 d18<-col8 d19<-col9 d23<-col10
    ; The order matters: q9 and d23 are read before they are overwritten.
1623df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q8,  q9
1633df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d20, d23
1643df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q11, q12
1653df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q9,  q13
16691037db265ecdd914a26e056cf69207b4f50924ehkuang
16791037db265ecdd914a26e056cf69207b4f50924ehkuang    subs            r6, r6, #4              ; w -= 4
168da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    bgt             vpx_convolve8_avg_loop_horiz
16991037db265ecdd914a26e056cf69207b4f50924ehkuang
17091037db265ecdd914a26e056cf69207b4f50924ehkuang    ; outer loop
17191037db265ecdd914a26e056cf69207b4f50924ehkuang    mov             r6, r10                 ; restore w counter
1723df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r0, r0, r9              ; src += src_stride * 4 - w - 7
17391037db265ecdd914a26e056cf69207b4f50924ehkuang    add             r2, r2, r12             ; dst += dst_stride * 4 - w
17491037db265ecdd914a26e056cf69207b4f50924ehkuang    subs            r7, r7, #4              ; h -= 4
175da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    bgt vpx_convolve8_avg_loop_horiz_v
17691037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Restore callee-saved registers and return (lr is popped into pc).
17791037db265ecdd914a26e056cf69207b4f50924ehkuang    pop             {r4-r10, pc}
17891037db265ecdd914a26e056cf69207b4f50924ehkuang
17991037db265ecdd914a26e056cf69207b4f50924ehkuang    ENDP
18091037db265ecdd914a26e056cf69207b4f50924ehkuang
; ----------------------------------------------------------------------
; vpx_convolve8_avg_vert_neon
;
; Vertical 8-tap filter with averaging. For every output pixel the 8
; source pixels in the same column, starting 3 rows above, are multiplied
; by the 8 int16 taps found at filter + y0_q4 * 16 bytes, the sum is
; rounded, shifted right by 7 and saturated to 8 bits, and the rounding
; average of that value and the pixel already in dst is written to dst.
;
; Work is done in 4x4 output tiles: the outer loop steps right 4 columns,
; the inner loop steps down 4 rows. No transpose is needed because each
; 4-byte row segment is already one filter input per lane.
;
; Register roles after setup:
;   r0  top of current src column strip   r2  top of current dst strip
;   r1  2 * src_stride                    r3  2 * dst_stride
;   r4  src cursor, even rows             r7  src cursor, odd rows
;   r5  dst cursor, even rows             r8  dst cursor, odd rows
;   r6  columns left                      lr  h
;   r12 rows left in this strip           q0  filter taps
;   d16..d22 = window rows 0..6           q3  existing dst pixels
; ----------------------------------------------------------------------
181da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian|vpx_convolve8_avg_vert_neon| PROC
    ; 6 registers = 24 bytes pushed, so stack arguments start at [sp, #24].
1823df0563f1b24dac6c0bd122fc922a48211269061hkuang    push            {r4-r8, lr}
18391037db265ecdd914a26e056cf69207b4f50924ehkuang
18491037db265ecdd914a26e056cf69207b4f50924ehkuang    ; adjust for taps
    ; src -= 3 * src_stride: the window starts 3 rows above the output row.
18591037db265ecdd914a26e056cf69207b4f50924ehkuang    sub             r0, r0, r1
18691037db265ecdd914a26e056cf69207b4f50924ehkuang    sub             r0, r0, r1, lsl #1
18791037db265ecdd914a26e056cf69207b4f50924ehkuang
188df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ldr             r4, [sp, #24]           ; filter
189df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ldr             r5, [sp, #36]           ; y0_q4
    ; r4 = filter + y0_q4 * 16: address of the 8 x int16 kernel to use.
190df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    add             r4, r5, lsl #4
191df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ldr             r6, [sp, #44]           ; w
    ; lr is used as a general register for h; the return address was saved
    ; by the push above and is restored into pc by the final pop.
192df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ldr             lr, [sp, #48]           ; h
19391037db265ecdd914a26e056cf69207b4f50924ehkuang
194df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    vld1.s16        {q0}, [r4]              ; filter
19591037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Double both strides: two cursors each walk every other row.
1963df0563f1b24dac6c0bd122fc922a48211269061hkuang    lsl             r1, r1, #1
1973df0563f1b24dac6c0bd122fc922a48211269061hkuang    lsl             r3, r3, #1
19891037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; ---- outer loop: one strip of 4 columns per iteration ----
199da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvpx_convolve8_avg_loop_vert_h
    ; r4/r5 start on row 0; r7/r8 start one original stride below, on row 1
    ; (r1 and r3 hold doubled strides, so asr #1 recovers the original).
2003df0563f1b24dac6c0bd122fc922a48211269061hkuang    mov             r4, r0
2013df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r7, r0, r1, asr #1
2023df0563f1b24dac6c0bd122fc922a48211269061hkuang    mov             r5, r2
2033df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r8, r2, r3, asr #1
2043df0563f1b24dac6c0bd122fc922a48211269061hkuang    mov             r12, lr                 ; h loop counter
20591037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Prime the window with rows 0..6, 4 bytes each, two rows per d register:
    ;   d16 = row0|row1, d18 = row2|row3, d20 = row4|row5, d22 = row6|(unset)
2063df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d16[0]}, [r4], r1
2073df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d16[1]}, [r7], r1
2083df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d18[0]}, [r4], r1
2093df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d18[1]}, [r7], r1
2103df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d20[0]}, [r4], r1
2113df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d20[1]}, [r7], r1
2123df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d22[0]}, [r4], r1
21391037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Widen u8 -> 16 bit: d16=row0 d17=row1 d18=row2 d19=row3 d20=row4
    ; d21=row5 d22=row6. d23 comes from the lane that was never loaded and
    ; is not used as a filter input.
2143df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q8, d16
2153df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q9, d18
2163df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q10, d20
2173df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q11, d22
21891037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; ---- inner loop: one 4x4 output tile per iteration ----
219da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvpx_convolve8_avg_loop_vert
22091037db265ecdd914a26e056cf69207b4f50924ehkuang    ; always process a 4x4 block at a time
    ; Load rows 7..10: d24 = row7|row10, d26 = row8|row9. Odd rows come
    ; from r7 and even rows from r4.
2213df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d24[0]}, [r7], r1
2223df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d26[0]}, [r4], r1
2233df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d26[1]}, [r7], r1
2243df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d24[1]}, [r4], r1
22591037db265ecdd914a26e056cf69207b4f50924ehkuang
22691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; extract to s16
    ; d24=row7 d25=row10 d26=row8 d27=row9
22791037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q12, d24
22891037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q13, d26
22991037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Existing dst pixels: d6 = row0|row1, d7 = row2|row3. The @32
    ; qualifier asserts 4-byte aligned addresses.
2303df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d6[0]}, [r5@32], r3
2313df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d6[1]}, [r8@32], r3
2323df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d7[0]}, [r5@32], r3
2333df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d7[1]}, [r8@32], r3
23491037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Prefetch hints are interleaved with the multiplies below.
2353df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r7]
2363df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r4]
23791037db265ecdd914a26e056cf69207b4f50924ehkuang
238df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    ; src[] * filter
    ; One accumulator per output row (4 columns each):
    ;   q1 = rows 0..7, q2 = 1..8, q14 = 2..9, q15 = 3..10
2393df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
2403df0563f1b24dac6c0bd122fc922a48211269061hkuang
2413df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r7, r1]
2423df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r4, r1]
2433df0563f1b24dac6c0bd122fc922a48211269061hkuang
2443df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
2453df0563f1b24dac6c0bd122fc922a48211269061hkuang
2463df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5]
2473df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r8]
2483df0563f1b24dac6c0bd122fc922a48211269061hkuang
2493df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
2503df0563f1b24dac6c0bd122fc922a48211269061hkuang
2513df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5, r3]
2523df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r8, r3]
2533df0563f1b24dac6c0bd122fc922a48211269061hkuang
2543df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
25591037db265ecdd914a26e056cf69207b4f50924ehkuang
25691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; (sum + 64) >> 7, narrowed s32 -> u16 with unsigned saturation
25791037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d2, q1, #7
25891037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d3, q2, #7
25991037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d4, q14, #7
26091037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d5, q15, #7
26191037db265ecdd914a26e056cf69207b4f50924ehkuang
26291037db265ecdd914a26e056cf69207b4f50924ehkuang    ; saturating narrow u16 -> u8: d2 = out row0|row1, d3 = out row2|row3
263f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vqmovn.u16      d2, q1
264f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vqmovn.u16      d3, q2
26591037db265ecdd914a26e056cf69207b4f50924ehkuang
26691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; average the new value and the dst value
    ; vrhadd rounds up: (a + b + 1) >> 1 per byte.
267f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vrhadd.u8       q1, q1, q3
26891037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Each dst cursor advanced by 2 * r3 during the loads above; step back.
2693df0563f1b24dac6c0bd122fc922a48211269061hkuang    sub             r5, r5, r3, lsl #1      ; reset for store
2703df0563f1b24dac6c0bd122fc922a48211269061hkuang    sub             r8, r8, r3, lsl #1
27191037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Store rows 0..3; afterwards r5/r8 point 4 rows further down.
2723df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d2[0]}, [r5@32], r3
2733df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d2[1]}, [r8@32], r3
2743df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d3[0]}, [r5@32], r3
2753df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d3[1]}, [r8@32], r3
2763df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Slide the window 4 rows down; old rows 4..10 become rows 0..6:
    ;   d16<-row4 d17<-row5 d18<-row6 d19<-row7 d20<-row8 d21<-row9 d22<-row10
    ; The order matters: q10 and d22 are read before they are overwritten.
2773df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q8, q10
2783df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d18, d22
2793df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d19, d24
2803df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q10, q13
2813df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d22, d25
2823df0563f1b24dac6c0bd122fc922a48211269061hkuang
2833df0563f1b24dac6c0bd122fc922a48211269061hkuang    subs            r12, r12, #4            ; h -= 4
284da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    bgt             vpx_convolve8_avg_loop_vert
28591037db265ecdd914a26e056cf69207b4f50924ehkuang
28691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; outer loop
    ; Move both strip origins 4 bytes (4 pixels) to the right.
2873df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r0, r0, #4
2883df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r2, r2, #4
2893df0563f1b24dac6c0bd122fc922a48211269061hkuang    subs            r6, r6, #4              ; w -= 4
290da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    bgt             vpx_convolve8_avg_loop_vert_h
29191037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Restore callee-saved registers and return (lr is popped into pc).
2923df0563f1b24dac6c0bd122fc922a48211269061hkuang    pop             {r4-r8, pc}
29391037db265ecdd914a26e056cf69207b4f50924ehkuang
29491037db265ecdd914a26e056cf69207b4f50924ehkuang    ENDP
29591037db265ecdd914a26e056cf69207b4f50924ehkuang    END
296