; vpx_convolve8_avg_neon_asm.asm revision 3df0563f1b24dac6c0bd122fc922a48211269061
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; Syntax: ARM armasm (RVCT-style directives), ARM-state code using NEON.
;
; These functions are only valid when:
;   x_step_q4 == 16   (each entry point tests its own step argument and
;                      branches to the imported C version otherwise)
;   w%4 == 0
;   h%4 == 0
;   taps == 8
;   VP9_FILTER_WEIGHT == 128
;   VP9_FILTER_SHIFT == 7

    EXPORT  |vp9_convolve8_avg_horiz_neon|
    EXPORT  |vp9_convolve8_avg_vert_neon|
    IMPORT  |vp9_convolve8_avg_horiz_c|
    IMPORT  |vp9_convolve8_avg_vert_c|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; MULTIPLY_BY_Q0 $dst, $src0 .. $src7
;
; Multiply and accumulate by q0:
;   $dst = $src0*d0[0] + $src1*d0[1] + $src2*d0[2] + $src3*d0[3]
;        + $src4*d1[0] + $src5*d1[1] + $src6*d1[2] + $src7*d1[3]
;
; $dst        quad register, receives four signed 32-bit sums; it is
;             overwritten (vmull), not accumulated into.
; $src0..7    double registers, each holding four signed 16-bit values.
; q0 (d0,d1)  the eight 16-bit multipliers; both functions in this file
;             load it from the filter pointer before using the macro.
; Clobbers:   $dst only.
;-----------------------------------------------------------------------
    MACRO
    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
    vmull.s16 $dst, $src0, d0[0]
    vmlal.s16 $dst, $src1, d0[1]
    vmlal.s16 $dst, $src2, d0[2]
    vmlal.s16 $dst, $src3, d0[3]
    vmlal.s16 $dst, $src4, d1[0]
    vmlal.s16 $dst, $src5, d1[1]
    vmlal.s16 $dst, $src6, d1[2]
    vmlal.s16 $dst, $src7, d1[3]
    MEND

;-----------------------------------------------------------------------
; vp9_convolve8_avg_horiz_neon
;
; Applies the 8-tap filter_x horizontally to src, rounds and shifts the
; sums right by 7, saturates to 8 bits and stores the rounded average of
; that result and the bytes already in dst.  Works on 4x4 tiles.
; If x_step_q4 != 16 it tail-branches to vp9_convolve8_avg_horiz_c with
; all arguments untouched.
;
; Arguments (stack offsets are relative to sp at entry):
; r0      const uint8_t *src
; r1      int src_stride
; r2      uint8_t *dst
; r3      int dst_stride
; sp[0]   const int16_t *filter_x
; sp[4]   int x_step_q4
; sp[8]   const int16_t *filter_y ; unused
; sp[12]  int y_step_q4           ; unused
; sp[16]  int w
; sp[20]  int h
;
; Register roles after setup:
; r0   running src pointer (starts at src - 3)
; r2   running dst pointer
; r4   -3 * dst_stride + 4 : from row 3 back to row 0, 4 columns right
; r5   filter_x pointer, then reused as the prefetch address (r0 + 64)
; r6   remaining width in the current band of 4 rows
; r7   remaining height
; r8   -3 * src_stride + 4 : same step as r4, for src
; r9   4 * src_stride - w - 7 : moves r0 to the start of the next band
; r10  saved copy of w
; r12  4 * dst_stride - w     : moves r2 to the start of the next band
; q0   the eight filter taps
;
; Saves/restores r4-r10 and lr.  Uses q0-q3 and q8-q15.
; Dst stores use the @32 address qualifier, i.e. they state that the
; address is 32-bit aligned.
;-----------------------------------------------------------------------

|vp9_convolve8_avg_horiz_neon| PROC
    ldr             r12, [sp, #4]           ; x_step_q4
    cmp             r12, #16
    bne             vp9_convolve8_avg_horiz_c ; nothing pushed yet: plain tail branch

    push            {r4-r10, lr}            ; 8 registers = 32 bytes

    sub             r0, r0, #3              ; adjust for taps

    ; stack arguments are now 32 bytes further away
    ldr             r5, [sp, #32]           ; filter_x
    ldr             r6, [sp, #48]           ; w
    ldr             r7, [sp, #52]           ; h

    vld1.s16        {q0}, [r5]              ; filter_x

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4

    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
    add             r4, r4, #4              ; -dst_stride * 3 + 4

    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
    sub             r9, r9, #7              ; r9 = 4 * src_stride - w - 7
    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop

    mov             r10, r6                 ; w loop counter

    ; Outer loop: one band of 4 rows per iteration.
loop_horiz_v
    ; Load 8 bytes from each of the 4 rows; the last load steps back to
    ; row 0 and 4 bytes to the right.
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
    vld1.8          {d27}, [r0], r8

    ; Partial transpose so that each half register holds one source
    ; column (4 rows): d24 = cols 0|4, d25 = 1|5, d26 = 2|6, d27 = 3|7.
    vtrn.16         q12, q13
    vtrn.8          d24, d25
    vtrn.8          d26, d27

    pld             [r0, r1, lsl #2]        ; prefetch 4 rows further down

    ; Widen to 16 bit: d16=c0 d17=c4 d18=c1 d19=c5 d20=c2 d21=c6
    ; d22=c3 d23=c7.
    vmovl.u8        q8, d24
    vmovl.u8        q9, d25
    vmovl.u8        q10, d26
    vmovl.u8        q11, d27

    ; save a few instructions in the inner loop
    ; After these two moves columns 0..6 live in
    ; d16, d17, d20, d22, d18, d19, d23 - the layout the rotation at the
    ; bottom of the inner loop reproduces.  Column 7 is dropped here and
    ; re-read by the first inner-loop load.
    vswp            d17, d18
    vmov            d23, d21

    add             r0, r0, #3              ; r0 = band start + 7

    ; Inner loop: one 4x4 tile per iteration.
loop_horiz
    add             r5, r0, #64             ; prefetch address

    ; 4 new bytes per row, replicated into both halves of the register.
    vld1.32         {d28[]}, [r0], r1
    vld1.32         {d29[]}, [r0], r1
    vld1.32         {d31[]}, [r0], r1
    vld1.32         {d30[]}, [r0], r8

    pld             [r5]

    vtrn.16         d28, d31
    vtrn.16         d29, d30
    vtrn.8          d28, d29
    vtrn.8          d31, d30

    pld             [r5, r1]

    ; extract to s16
    ; New columns n0..n3 end up as d24=n0, d26=n1, d27=n2, d25=n3.
    vtrn.32         q14, q15
    vmovl.u8        q12, d28
    vmovl.u8        q13, d29

    pld             [r5, r1, lsl #1]

    ; slightly out of order load to match the existing data
    ; (d6 = dst rows 0|2, d7 = dst rows 1|3, as produced by the
    ; transpose of the filtered tile below)
    vld1.u32        {d6[0]}, [r2], r3
    vld1.u32        {d7[0]}, [r2], r3
    vld1.u32        {d6[1]}, [r2], r3
    vld1.u32        {d7[1]}, [r2], r3

    sub             r2, r2, r3, lsl #2      ; reset for store

    ; src[] * filter_x
    ; q1, q2, q14, q15 = output columns 0..3 (four rows each)
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25

    pld             [r5, -r8]               ; offset is 3 * src_stride - 4

    ; (sum + 64) >> 7, saturated to unsigned 16 bit
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; transpose
    vtrn.16         d2, d3
    vtrn.32         d2, d3
    vtrn.8          d2, d3

    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

    ; Store rows 0..3; the last store steps back to row 0 of the next
    ; tile (r4 = -3 * dst_stride + 4).
    vst1.u32        {d2[0]}, [r2@32], r3
    vst1.u32        {d3[0]}, [r2@32], r3
    vst1.u32        {d2[1]}, [r2@32], r3
    vst1.u32        {d3[1]}, [r2@32], r4

    ; Slide the 7-column window 4 columns to the right, keeping the
    ; d16, d17, d20, d22, d18, d19, d23 ordering.
    vmov            q8,  q9
    vmov            d20, d23
    vmov            q11, q12
    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
    bgt             loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w - 7
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
    bgt             loop_horiz_v

    pop             {r4-r10, pc}

    ENDP

;-----------------------------------------------------------------------
; vp9_convolve8_avg_vert_neon
;
; Applies the 8-tap filter_y vertically to src, rounds and shifts the
; sums right by 7, saturates to 8 bits and stores the rounded average of
; that result and the bytes already in dst.  Walks the image in strips
; 4 pixels wide, producing 4 rows per inner iteration.
; If the word at sp[12] (y_step_q4) is not 16 it tail-branches to
; vp9_convolve8_avg_vert_c with all arguments untouched.
;
; Arguments read here (stack offsets are relative to sp at entry):
; r0      src
; r1      src_stride
; r2      dst
; r3      dst_stride
; sp[8]   filter_y
; sp[12]  y_step_q4
; sp[16]  w
; sp[20]  h
;
; Register roles after setup:
; r0   top of the current source strip (starts at src - 3 * src_stride)
; r1   2 * src_stride
; r2   top of the current destination strip
; r3   2 * dst_stride
; r4   source pointer for even rows,  r7  source pointer for odd rows
; r5   dst pointer for even rows,     r8  dst pointer for odd rows
; r6   remaining width
; r12  remaining height in the current strip
; lr   h (reloaded into r12 for every strip)
; q0   the eight filter taps
;
; Saves/restores r4-r8 and lr (6 registers = 24 bytes; no call is made
; while they are pushed).  Uses q0-q3 and q8-q15.
; Dst loads and stores use the @32 address qualifier, i.e. they state
; that the address is 32-bit aligned.
;-----------------------------------------------------------------------
|vp9_convolve8_avg_vert_neon| PROC
    ldr             r12, [sp, #12]          ; y_step_q4
    cmp             r12, #16
    bne             vp9_convolve8_avg_vert_c ; nothing pushed yet: plain tail branch

    push            {r4-r8, lr}             ; 24 bytes

    ; adjust for taps
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1      ; r0 = src - 3 * src_stride

    ; stack arguments are now 24 bytes further away
    ldr             r4, [sp, #32]           ; filter_y
    ldr             r6, [sp, #40]           ; w
    ldr             lr, [sp, #44]           ; h

    vld1.s16        {q0}, [r4]              ; filter_y

    ; Two interleaved pointers (even / odd rows) each step two rows.
    lsl             r1, r1, #1
    lsl             r3, r3, #1

    ; Outer loop: one strip of 4 columns per iteration.
loop_vert_h
    mov             r4, r0                  ; even source rows
    add             r7, r0, r1, asr #1      ; odd source rows (r0 + src_stride)
    mov             r5, r2                  ; even dst rows
    add             r8, r2, r3, asr #1      ; odd dst rows (r2 + dst_stride)
    mov             r12, lr                 ; h loop counter

    ; Prime the window with rows 0..6, 4 bytes each.
    vld1.u32        {d16[0]}, [r4], r1
    vld1.u32        {d16[1]}, [r7], r1
    vld1.u32        {d18[0]}, [r4], r1
    vld1.u32        {d18[1]}, [r7], r1
    vld1.u32        {d20[0]}, [r4], r1
    vld1.u32        {d20[1]}, [r7], r1
    vld1.u32        {d22[0]}, [r4], r1

    ; Widen to 16 bit: d16..d22 = rows 0..6 (d23 is not used).
    vmovl.u8        q8, d16
    vmovl.u8        q9, d18
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

loop_vert
    ; always process a 4x4 block at a time
    ; Four more rows: d24[0]=row 7, d26[0]=row 8, d26[1]=row 9,
    ; d24[1]=row 10.
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
    vld1.u32        {d26[1]}, [r7], r1
    vld1.u32        {d24[1]}, [r4], r1

    ; extract to s16
    ; d24=row 7, d25=row 10, d26=row 8, d27=row 9
    vmovl.u8        q12, d24
    vmovl.u8        q13, d26

    ; Existing dst: d6 = rows 0|1, d7 = rows 2|3.
    vld1.u32        {d6[0]}, [r5@32], r3
    vld1.u32        {d6[1]}, [r8@32], r3
    vld1.u32        {d7[0]}, [r5@32], r3
    vld1.u32        {d7[1]}, [r8@32], r3

    pld             [r7]
    pld             [r4]

    ; src[] * filter_y
    ; q1, q2, q14, q15 = output rows 0..3; prefetches are interleaved
    ; between the multiply groups.
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r7, r1]
    pld             [r4, r1]

    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26

    pld             [r5]
    pld             [r8]

    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27

    pld             [r5, r3]
    pld             [r8, r3]

    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25

    ; (sum + 64) >> 7, saturated to unsigned 16 bit
    vqrshrun.s32    d2, q1, #7
    vqrshrun.s32    d3, q2, #7
    vqrshrun.s32    d4, q14, #7
    vqrshrun.s32    d5, q15, #7

    ; saturate
    vqmovn.u16      d2, q1
    vqmovn.u16      d3, q2

    ; average the new value and the dst value
    vrhadd.u8       q1, q1, q3

    sub             r5, r5, r3, lsl #1      ; reset for store
    sub             r8, r8, r3, lsl #1

    ; d2 = rows 0|1, d3 = rows 2|3, same layout as the dst loads above.
    vst1.u32        {d2[0]}, [r5@32], r3
    vst1.u32        {d2[1]}, [r8@32], r3
    vst1.u32        {d3[0]}, [r5@32], r3
    vst1.u32        {d3[1]}, [r8@32], r3

    ; Slide the window down 4 rows: d16..d22 = old rows 4..10.
    vmov            q8, q10
    vmov            d18, d22
    vmov            d19, d24
    vmov            q10, q13
    vmov            d22, d25

    subs            r12, r12, #4            ; h -= 4
    bgt             loop_vert

    ; outer loop
    add             r0, r0, #4              ; next strip of 4 columns
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
    bgt             loop_vert_h

    pop             {r4-r8, pc}

    ENDP
    END