191037db265ecdd914a26e056cf69207b4f50924ehkuang;
291037db265ecdd914a26e056cf69207b4f50924ehkuang;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
391037db265ecdd914a26e056cf69207b4f50924ehkuang;
491037db265ecdd914a26e056cf69207b4f50924ehkuang;  Use of this source code is governed by a BSD-style license
591037db265ecdd914a26e056cf69207b4f50924ehkuang;  that can be found in the LICENSE file in the root of the source
691037db265ecdd914a26e056cf69207b4f50924ehkuang;  tree. An additional intellectual property rights grant can be found
791037db265ecdd914a26e056cf69207b4f50924ehkuang;  in the file PATENTS.  All contributing project authors may
891037db265ecdd914a26e056cf69207b4f50924ehkuang;  be found in the AUTHORS file in the root of the source tree.
991037db265ecdd914a26e056cf69207b4f50924ehkuang;
1091037db265ecdd914a26e056cf69207b4f50924ehkuang
1191037db265ecdd914a26e056cf69207b4f50924ehkuang
1291037db265ecdd914a26e056cf69207b4f50924ehkuang    ; These functions are only valid when:
1391037db265ecdd914a26e056cf69207b4f50924ehkuang    ; x_step_q4 == 16 (horizontal) / y_step_q4 == 16 (vertical); any other step is forwarded to the C version
1491037db265ecdd914a26e056cf69207b4f50924ehkuang    ; w%4 == 0
1591037db265ecdd914a26e056cf69207b4f50924ehkuang    ; h%4 == 0
1691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; taps == 8
1791037db265ecdd914a26e056cf69207b4f50924ehkuang    ; VP9_FILTER_WEIGHT == 128
1891037db265ecdd914a26e056cf69207b4f50924ehkuang    ; VP9_FILTER_SHIFT == 7
1991037db265ecdd914a26e056cf69207b4f50924ehkuang
; Assembler set-up: exported entry points, imported C fallbacks, code section.
2091037db265ecdd914a26e056cf69207b4f50924ehkuang    EXPORT  |vp9_convolve8_horiz_neon|
2191037db265ecdd914a26e056cf69207b4f50924ehkuang    EXPORT  |vp9_convolve8_vert_neon|
    ; C implementations that the two entry points branch to when the
    ; relevant step argument is not 16.
2291037db265ecdd914a26e056cf69207b4f50924ehkuang    IMPORT  |vp9_convolve8_horiz_c|
2391037db265ecdd914a26e056cf69207b4f50924ehkuang    IMPORT  |vp9_convolve8_vert_c|
    ; Assemble as ARM (not Thumb) instructions.
2491037db265ecdd914a26e056cf69207b4f50924ehkuang    ARM
    ; Declare that this code requires, and preserves, 8-byte stack alignment.
2591037db265ecdd914a26e056cf69207b4f50924ehkuang    REQUIRE8
2691037db265ecdd914a26e056cf69207b4f50924ehkuang    PRESERVE8
2791037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Read-only code section; ALIGN=2 requests 2^2 = 4-byte alignment.
2891037db265ecdd914a26e056cf69207b4f50924ehkuang    AREA ||.text||, CODE, READONLY, ALIGN=2
2991037db265ecdd914a26e056cf69207b4f50924ehkuang
3091037db265ecdd914a26e056cf69207b4f50924ehkuang    ; Multiply and accumulate by q0
    ;
    ; 8-tap dot product against the filter held in q0 (d0 = taps 0..3,
    ; d1 = taps 4..7, signed 16-bit each).
    ;   $dst          q register that receives four signed 32-bit sums
    ;   $src0..$src7  d registers of four 16-bit samples; $srcN is scaled
    ;                 by tap N, lane by lane
    ; $dst is overwritten (vmull for tap 0, vmlal for taps 1..7); q0 and the
    ; $src registers are only read.
3191037db265ecdd914a26e056cf69207b4f50924ehkuang    MACRO
3291037db265ecdd914a26e056cf69207b4f50924ehkuang    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
3391037db265ecdd914a26e056cf69207b4f50924ehkuang    vmull.s16 $dst, $src0, d0[0]
3491037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src1, d0[1]
3591037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src2, d0[2]
3691037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src3, d0[3]
3791037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src4, d1[0]
3891037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src5, d1[1]
3991037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src6, d1[2]
4091037db265ecdd914a26e056cf69207b4f50924ehkuang    vmlal.s16 $dst, $src7, d1[3]
4191037db265ecdd914a26e056cf69207b4f50924ehkuang    MEND
4291037db265ecdd914a26e056cf69207b4f50924ehkuang
; vp9_convolve8_horiz_neon
; 8-tap horizontal convolution. The image is processed in bands of four rows;
; within a band each inner-loop pass produces a 4x4 block of output pixels.
; w and h are both consumed in steps of 4.
; When x_step_q4 != 16 the call is forwarded, arguments untouched, to
; vp9_convolve8_horiz_c.
; Saves and restores r4-r10 and lr; also writes r0, r2, r12, q0-q2 and q8-q15.
; Stack offsets below are relative to sp at entry.
;
4391037db265ecdd914a26e056cf69207b4f50924ehkuang; r0    const uint8_t *src
4491037db265ecdd914a26e056cf69207b4f50924ehkuang; r1    int src_stride
4591037db265ecdd914a26e056cf69207b4f50924ehkuang; r2    uint8_t *dst
4691037db265ecdd914a26e056cf69207b4f50924ehkuang; r3    int dst_stride
4791037db265ecdd914a26e056cf69207b4f50924ehkuang; sp+0  const int16_t *filter_x
4891037db265ecdd914a26e056cf69207b4f50924ehkuang; sp+4  int x_step_q4
4991037db265ecdd914a26e056cf69207b4f50924ehkuang; sp+8  const int16_t *filter_y ; unused
5091037db265ecdd914a26e056cf69207b4f50924ehkuang; sp+12 int y_step_q4           ; unused
5191037db265ecdd914a26e056cf69207b4f50924ehkuang; sp+16 int w
5291037db265ecdd914a26e056cf69207b4f50924ehkuang; sp+20 int h
5391037db265ecdd914a26e056cf69207b4f50924ehkuang
5491037db265ecdd914a26e056cf69207b4f50924ehkuang|vp9_convolve8_horiz_neon| PROC
    ; Nothing has been pushed yet, so the branch below reaches the C version
    ; with the caller's r0-r3, lr and stack arguments intact.
55f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    ldr             r12, [sp, #4]           ; x_step_q4
56f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    cmp             r12, #16
57f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    bne             vp9_convolve8_horiz_c
58f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
    ; Eight registers = 32 bytes, so every stack argument is now 32 bytes
    ; further from sp than at entry.
5991037db265ecdd914a26e056cf69207b4f50924ehkuang    push            {r4-r10, lr}
6091037db265ecdd914a26e056cf69207b4f50924ehkuang
6191037db265ecdd914a26e056cf69207b4f50924ehkuang    sub             r0, r0, #3              ; adjust for taps
6291037db265ecdd914a26e056cf69207b4f50924ehkuang
6391037db265ecdd914a26e056cf69207b4f50924ehkuang    ldr             r5, [sp, #32]           ; filter_x
6491037db265ecdd914a26e056cf69207b4f50924ehkuang    ldr             r6, [sp, #48]           ; w
6591037db265ecdd914a26e056cf69207b4f50924ehkuang    ldr             r7, [sp, #52]           ; h
6691037db265ecdd914a26e056cf69207b4f50924ehkuang
6791037db265ecdd914a26e056cf69207b4f50924ehkuang    vld1.s16        {q0}, [r5]              ; filter_x
6891037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; r8 / r4: post-increment that steps from row 3 back to row 0 and four
    ; bytes to the right, for src and dst respectively.
693df0563f1b24dac6c0bd122fc922a48211269061hkuang    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
703df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r8, r8, #4              ; -src_stride * 3 + 4
7191037db265ecdd914a26e056cf69207b4f50924ehkuang
723df0563f1b24dac6c0bd122fc922a48211269061hkuang    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
733df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r4, r4, #4              ; -dst_stride * 3 + 4
7491037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; r9 = 4 * src_stride - w - 7. The extra 7 undoes the +4 left by the
    ; band preload and the +3 applied before the inner loop, so r0 lands on
    ; (band start + 4 rows) with the -3 tap adjustment still in place.
753df0563f1b24dac6c0bd122fc922a48211269061hkuang    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
763df0563f1b24dac6c0bd122fc922a48211269061hkuang    sub             r9, r9, #7
7791037db265ecdd914a26e056cf69207b4f50924ehkuang    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
7891037db265ecdd914a26e056cf69207b4f50924ehkuang
7991037db265ecdd914a26e056cf69207b4f50924ehkuang    mov             r10, r6                 ; w loop counter
8091037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Outer loop: one band of four rows per pass (r7 = rows remaining).
813df0563f1b24dac6c0bd122fc922a48211269061hkuangloop_horiz_v
    ; Preload 8 bytes from each of the four rows. The last post-increment
    ; (r8) returns r0 to row 0, four bytes to the right.
823df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.8          {d24}, [r0], r1
833df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.8          {d25}, [r0], r1
843df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.8          {d26}, [r0], r1
853df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.8          {d27}, [r0], r8
8691037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Transpose rows to columns: afterwards each 4-byte half of d24..d27
    ; holds one source column for the four rows.
87f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vtrn.16         q12, q13
88f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vtrn.8          d24, d25
89f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vtrn.8          d26, d27
90f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
913df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r0, r1, lsl #2]
923df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Widen the bytes to 16 bits so they can feed the s16 multiplies.
9391037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q8, d24
9491037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q9, d25
9591037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q10, d26
9691037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q11, d27
973df0563f1b24dac6c0bd122fc922a48211269061hkuang
983df0563f1b24dac6c0bd122fc922a48211269061hkuang    ; save a few instructions in the inner loop
    ; After these two moves, window columns 0..6 live in
    ; d16, d17, d20, d22, d18, d19, d23 (the operand order used below).
993df0563f1b24dac6c0bd122fc922a48211269061hkuang    vswp            d17, d18
1003df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d23, d21
1013df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; r0 now points at window column 7, the first one the inner loop loads.
1023df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r0, r0, #3
1033df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Inner loop: one 4x4 output block per pass (r6 = columns remaining).
1043df0563f1b24dac6c0bd122fc922a48211269061hkuangloop_horiz
    ; r5 = prefetch base, 64 bytes ahead of the current row-0 position.
1053df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r5, r0, #64
1063df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Four new bytes from each row, replicated into both halves of the
    ; destination d register.
1073df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.32         {d28[]}, [r0], r1
1083df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.32         {d29[]}, [r0], r1
1093df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.32         {d31[]}, [r0], r1
1103df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.32         {d30[]}, [r0], r8
1113df0563f1b24dac6c0bd122fc922a48211269061hkuang
1123df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5]
1133df0563f1b24dac6c0bd122fc922a48211269061hkuang
1143df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.16         d28, d31
1153df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.16         d29, d30
1163df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.8          d28, d29
1173df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.8          d31, d30
1183df0563f1b24dac6c0bd122fc922a48211269061hkuang
1193df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5, r1]
1203df0563f1b24dac6c0bd122fc922a48211269061hkuang
1213df0563f1b24dac6c0bd122fc922a48211269061hkuang    ; extract to s16
    ; The four new columns (window columns 7..10) end up in
    ; d24, d26, d27, d25, in that order.
1223df0563f1b24dac6c0bd122fc922a48211269061hkuang    vtrn.32         q14, q15
12391037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q12, d28
1243df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q13, d29
1253df0563f1b24dac6c0bd122fc922a48211269061hkuang
1263df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5, r1, lsl #1]
12791037db265ecdd914a26e056cf69207b4f50924ehkuang
12891037db265ecdd914a26e056cf69207b4f50924ehkuang    ; src[] * filter_x
    ; q1, q2, q14, q15 = output columns 0..3, one 32-bit sum per row. Each
    ; call shifts the 8-column window one column to the right.
1293df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
1303df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
1313df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
1323df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
1333df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; -r8 = 3 * src_stride - 4, i.e. the fourth row of the band.
1343df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5, -r8]
13591037db265ecdd914a26e056cf69207b4f50924ehkuang
13691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; (sum + 64) >> 7, saturated to unsigned 16 bits
13791037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d2, q1, #7
13891037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d3, q2, #7
13991037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d4, q14, #7
14091037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d5, q15, #7
14191037db265ecdd914a26e056cf69207b4f50924ehkuang
14291037db265ecdd914a26e056cf69207b4f50924ehkuang    ; saturating narrow to 8 bits: d2 = output columns 0,1; d3 = columns 2,3
143f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vqmovn.u16      d2, q1
144f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vqmovn.u16      d3, q2
14591037db265ecdd914a26e056cf69207b4f50924ehkuang
14691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; transpose back to rows: d2[0], d3[0], d2[1], d3[1] = rows 0, 1, 2, 3
14791037db265ecdd914a26e056cf69207b4f50924ehkuang    vtrn.16         d2, d3
14891037db265ecdd914a26e056cf69207b4f50924ehkuang    vtrn.32         d2, d3
14991037db265ecdd914a26e056cf69207b4f50924ehkuang    vtrn.8          d2, d3
15091037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Four pixels per row. [r2@32] declares the store address 32-bit aligned.
    ; The final post-increment (r4) returns to row 0, four pixels right.
1513df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d2[0]}, [r2@32], r3
1523df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d3[0]}, [r2@32], r3
1533df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d2[1]}, [r2@32], r3
1543df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d3[1]}, [r2@32], r4
1553df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Slide the window four columns: old columns 4..10 become the new
    ; columns 0..6, in the same d16, d17, d20, d22, d18, d19, d23 layout.
1563df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q8,  q9
1573df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d20, d23
1583df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q11, q12
1593df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q9,  q13
16091037db265ecdd914a26e056cf69207b4f50924ehkuang
16191037db265ecdd914a26e056cf69207b4f50924ehkuang    subs            r6, r6, #4              ; w -= 4
16291037db265ecdd914a26e056cf69207b4f50924ehkuang    bgt             loop_horiz
16391037db265ecdd914a26e056cf69207b4f50924ehkuang
16491037db265ecdd914a26e056cf69207b4f50924ehkuang    ; outer loop
16591037db265ecdd914a26e056cf69207b4f50924ehkuang    mov             r6, r10                 ; restore w counter
1663df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r0, r0, r9              ; src += src_stride * 4 - w
16791037db265ecdd914a26e056cf69207b4f50924ehkuang    add             r2, r2, r12             ; dst += dst_stride * 4 - w
16891037db265ecdd914a26e056cf69207b4f50924ehkuang    subs            r7, r7, #4              ; h -= 4
1693df0563f1b24dac6c0bd122fc922a48211269061hkuang    bgt loop_horiz_v
17091037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Restore the saved registers; loading the saved lr into pc returns.
17191037db265ecdd914a26e056cf69207b4f50924ehkuang    pop             {r4-r10, pc}
17291037db265ecdd914a26e056cf69207b4f50924ehkuang
17391037db265ecdd914a26e056cf69207b4f50924ehkuang    ENDP
17491037db265ecdd914a26e056cf69207b4f50924ehkuang
; vp9_convolve8_vert_neon
; 8-tap vertical convolution. The image is processed in strips four pixels
; wide; within a strip each inner-loop pass produces a 4x4 block of output
; pixels. w and h are both consumed in steps of 4.
; When the word at sp+12 (y_step_q4) is not 16 the call is forwarded,
; arguments untouched, to vp9_convolve8_vert_c.
;
; r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride
; Stack arguments read here, relative to sp at entry:
;   sp+8 filter_y, sp+12 y_step_q4, sp+16 w, sp+20 h
; Saves and restores r4-r8 and lr; also writes r0-r3, r12, q0-q2 and q8-q15.
;
17591037db265ecdd914a26e056cf69207b4f50924ehkuang|vp9_convolve8_vert_neon| PROC
    ; y_step_q4; nothing has been pushed yet, so the fallback branch sees
    ; the caller's registers and stack arguments intact.
176f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    ldr             r12, [sp, #12]
177f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    cmp             r12, #16
178f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    bne             vp9_convolve8_vert_c
179f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang
    ; Six registers = 24 bytes, so every stack argument is now 24 bytes
    ; further from sp than at entry.
    ; NOTE(review): if sp is 8-byte aligned on entry it is only 4-byte
    ; aligned after this push. No call is made before the matching pop, but
    ; confirm this is acceptable alongside the PRESERVE8 directive.
1803df0563f1b24dac6c0bd122fc922a48211269061hkuang    push            {r4-r8, lr}
18191037db265ecdd914a26e056cf69207b4f50924ehkuang
18291037db265ecdd914a26e056cf69207b4f50924ehkuang    ; adjust for taps: src -= 3 * src_stride
18391037db265ecdd914a26e056cf69207b4f50924ehkuang    sub             r0, r0, r1
18491037db265ecdd914a26e056cf69207b4f50924ehkuang    sub             r0, r0, r1, lsl #1
18591037db265ecdd914a26e056cf69207b4f50924ehkuang
1863df0563f1b24dac6c0bd122fc922a48211269061hkuang    ldr             r4, [sp, #32]           ; filter_y
1873df0563f1b24dac6c0bd122fc922a48211269061hkuang    ldr             r6, [sp, #40]           ; w
1883df0563f1b24dac6c0bd122fc922a48211269061hkuang    ldr             lr, [sp, #44]           ; h
18991037db265ecdd914a26e056cf69207b4f50924ehkuang
1903df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.s16        {q0}, [r4]              ; filter_y
19191037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Both strides are doubled because rows are accessed through two
    ; interleaved pointers: even rows via r4 (src) / r5 (dst), odd rows via
    ; r7 (src) / r8 (dst).
1923df0563f1b24dac6c0bd122fc922a48211269061hkuang    lsl             r1, r1, #1
1933df0563f1b24dac6c0bd122fc922a48211269061hkuang    lsl             r3, r3, #1
19491037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Outer loop: one 4-pixel-wide strip per pass (r6 = columns remaining).
1953df0563f1b24dac6c0bd122fc922a48211269061hkuangloop_vert_h
    ; "asr #1" undoes the doubling, so r7 / r8 start one original stride
    ; below r4 / r5.
1963df0563f1b24dac6c0bd122fc922a48211269061hkuang    mov             r4, r0
1973df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r7, r0, r1, asr #1
1983df0563f1b24dac6c0bd122fc922a48211269061hkuang    mov             r5, r2
1993df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r8, r2, r3, asr #1
2003df0563f1b24dac6c0bd122fc922a48211269061hkuang    mov             r12, lr                 ; h loop counter
20191037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Preload window rows 0..6, four bytes each, alternating between the
    ; even-row and odd-row pointers.
2023df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d16[0]}, [r4], r1
2033df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d16[1]}, [r7], r1
2043df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d18[0]}, [r4], r1
2053df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d18[1]}, [r7], r1
2063df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d20[0]}, [r4], r1
2073df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d20[1]}, [r7], r1
2083df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d22[0]}, [r4], r1
20991037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Widen to 16 bits: d16..d22 = window rows 0..6. d23 is produced from
    ; the unloaded half of d22 and is not read by the multiplies below.
2103df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q8, d16
2113df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q9, d18
2123df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q10, d20
2133df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmovl.u8        q11, d22
21491037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Inner loop: one 4x4 output block per pass (r12 = rows remaining).
21591037db265ecdd914a26e056cf69207b4f50924ehkuangloop_vert
21691037db265ecdd914a26e056cf69207b4f50924ehkuang    ; always process a 4x4 block at a time
    ; Window rows 7, 8, 9, 10 go to d24[0], d26[0], d26[1], d24[1].
2173df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d24[0]}, [r7], r1
2183df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d26[0]}, [r4], r1
2193df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d26[1]}, [r7], r1
2203df0563f1b24dac6c0bd122fc922a48211269061hkuang    vld1.u32        {d24[1]}, [r4], r1
22191037db265ecdd914a26e056cf69207b4f50924ehkuang
22291037db265ecdd914a26e056cf69207b4f50924ehkuang    ; extract to s16: d24 = row 7, d26 = row 8, d27 = row 9, d25 = row 10
22391037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q12, d24
22491037db265ecdd914a26e056cf69207b4f50924ehkuang    vmovl.u8        q13, d26
22591037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Prefetches of the dst rows (r5 / r8) and upcoming src rows (r7 / r4)
    ; are interleaved with the multiplies.
2263df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5]
2273df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r8]
2283df0563f1b24dac6c0bd122fc922a48211269061hkuang
22991037db265ecdd914a26e056cf69207b4f50924ehkuang    ; src[] * filter_y
    ; q1, q2, q14, q15 = output rows 0..3, one 32-bit sum per pixel. Each
    ; call shifts the 8-row window one row down.
2303df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
2313df0563f1b24dac6c0bd122fc922a48211269061hkuang
2323df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r5, r3]
2333df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r8, r3]
2343df0563f1b24dac6c0bd122fc922a48211269061hkuang
2353df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
2363df0563f1b24dac6c0bd122fc922a48211269061hkuang
2373df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r7]
2383df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r4]
2393df0563f1b24dac6c0bd122fc922a48211269061hkuang
2403df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
2413df0563f1b24dac6c0bd122fc922a48211269061hkuang
2423df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r7, r1]
2433df0563f1b24dac6c0bd122fc922a48211269061hkuang    pld             [r4, r1]
2443df0563f1b24dac6c0bd122fc922a48211269061hkuang
2453df0563f1b24dac6c0bd122fc922a48211269061hkuang    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
24691037db265ecdd914a26e056cf69207b4f50924ehkuang
24791037db265ecdd914a26e056cf69207b4f50924ehkuang    ; (sum + 64) >> 7, saturated to unsigned 16 bits
24891037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d2, q1, #7
24991037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d3, q2, #7
25091037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d4, q14, #7
25191037db265ecdd914a26e056cf69207b4f50924ehkuang    vqrshrun.s32    d5, q15, #7
25291037db265ecdd914a26e056cf69207b4f50924ehkuang
25391037db265ecdd914a26e056cf69207b4f50924ehkuang    ; saturating narrow to 8 bits: d2 = output rows 0,1; d3 = rows 2,3
254f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vqmovn.u16      d2, q1
255f3bed9137f66ef693bd406e43b17e9a1114f1e14hkuang    vqmovn.u16      d3, q2
25691037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Even output rows go through r5, odd rows through r8; each pointer
    ; advances by the doubled dst stride. [rN@32] declares the store
    ; address 32-bit aligned.
2573df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d2[0]}, [r5@32], r3
2583df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d2[1]}, [r8@32], r3
2593df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d3[0]}, [r5@32], r3
2603df0563f1b24dac6c0bd122fc922a48211269061hkuang    vst1.u32        {d3[1]}, [r8@32], r3
2613df0563f1b24dac6c0bd122fc922a48211269061hkuang
    ; Slide the window four rows: old rows 4..10 become the new rows 0..6
    ; in d16..d22.
2623df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q8, q10
2633df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d18, d22
2643df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d19, d24
2653df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            q10, q13
2663df0563f1b24dac6c0bd122fc922a48211269061hkuang    vmov            d22, d25
26791037db265ecdd914a26e056cf69207b4f50924ehkuang
2683df0563f1b24dac6c0bd122fc922a48211269061hkuang    subs            r12, r12, #4            ; h -= 4
26991037db265ecdd914a26e056cf69207b4f50924ehkuang    bgt             loop_vert
27091037db265ecdd914a26e056cf69207b4f50924ehkuang
27191037db265ecdd914a26e056cf69207b4f50924ehkuang    ; outer loop: move src and dst to the next strip, four pixels right
2723df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r0, r0, #4
2733df0563f1b24dac6c0bd122fc922a48211269061hkuang    add             r2, r2, #4
2743df0563f1b24dac6c0bd122fc922a48211269061hkuang    subs            r6, r6, #4              ; w -= 4
2753df0563f1b24dac6c0bd122fc922a48211269061hkuang    bgt             loop_vert_h
27691037db265ecdd914a26e056cf69207b4f50924ehkuang
    ; Restore the saved registers; loading the saved lr into pc returns.
2773df0563f1b24dac6c0bd122fc922a48211269061hkuang    pop             {r4-r8, pc}
27891037db265ecdd914a26e056cf69207b4f50924ehkuang
27991037db265ecdd914a26e056cf69207b4f50924ehkuang    ENDP
28091037db265ecdd914a26e056cf69207b4f50924ehkuang    END
281