190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    EXPORT  |vp8_sixtap_predict16x16_neon|
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ARM
1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    REQUIRE8
1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    PRESERVE8
1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    AREA ||.text||, CODE, READONLY, ALIGN=2
1890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r0    unsigned char  *src_ptr,
1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r1    int  src_pixels_per_line,
2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r2    int  xoffset,
2190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r3    int  yoffset,
2290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack(r4) unsigned char *dst_ptr,
2390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack(r5) int  dst_pitch
2490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
2590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply
2690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; abs() to the filter coeffs to make them u8, then use vmlsl for the negative coeffs.
2790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; After multiplication the result can be negative, so it is treated as s16. However, the
2890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; result can also be a large positive number (> 2^15-1), which could be mistaken for a
2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; negative number. To avoid that error, apply the filter coeffs in the order 0, 1, 4, 5, 2,
3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; which ensures that the result stays in s16 range. Finally, add the vp8_filter[3] product
3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; with a saturating add (vqadd). The same applies to the other filter functions.
3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber|vp8_sixtap_predict16x16_neon| PROC
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push            {r4-r5, lr}
3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ldr             r12, _filter16_coeff_
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ldr             r4, [sp, #12]           ;load parameters from stack
3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ldr             r5, [sp, #16]           ;load parameters from stack
3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq             secondpass_filter16x16_only
4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r2, r12, r2, lsl #5     ;calculate filter location
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq             firstpass_filter16x16_only
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             sp, sp, #336            ;reserve space on stack for temporary storage
5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             lr, sp
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q12, q14
5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q13, q15
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r2, #7                  ;loop counter
5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #1
6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d24[4]
6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d25[0]
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d25[4]
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d26[0]
6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d26[4]
6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;First Pass: output_height lines x output_width columns (21x16)
6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_fp16x16_loop_neon
7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d9, d10, d11}, [r0], r1
7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d12, d13, d14}, [r0], r1
7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0]
7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0, r1]
7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0, r1, lsl #1]
7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d7, d0
8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d9, d0
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q11, d10, d0
8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q12, d12, d0
8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q13, d13, d0
8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #1
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #1
8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q10, d29, d1
9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q12, d30, d1
9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #1
9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #1
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d13, d14, #1
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q11, d29, d1
9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q13, d30, d1
10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #4
10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #4
10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q10, d29, d4
10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q12, d30, d4
10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #4
11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #4
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d13, d14, #4
11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q11, d29, d4
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q13, d30, d4
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #5
11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #5
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp8_filter[5])
12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q10, d29, d5
12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q12, d30, d5
12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #5
12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #5
12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d13, d14, #5
12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp8_filter[5])
13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q11, d29, d5
13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q13, d30, d5
13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #2
13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #2
13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp8_filter[2])
13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q10, d29, d2
13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q12, d30, d2
14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #2
14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #2
14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d13, d14, #2
14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp8_filter[2])
14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q11, d29, d2
14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q13, d30, d2
14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #3
15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #3
15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d15, d7, d8, #3
15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d31, d10, d11, #3
15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d6, d13, d14, #3
15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp8_filter[3])
15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q5, d29, d3
15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d30, d3
16090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q10, q5
16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q12, q6
16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp8_filter[3])
16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d31, d3
16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q3, d6, d3
16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r2, r2, #1
17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q9, q6
17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q11, q7
17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q13, q3
17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d7, q9, #7
17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d8, q10, #7
17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d9, q11, #7
17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d10, q12, #7
18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d11, q13, #7
18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d6, d7, d8}, [lr]!     ;store result
18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d9, d10, d11}, [lr]!
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne             filt_blk2d_fp16x16_loop_neon
18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Second pass: 16x16
18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;secondpass_filter - do first 8-columns and then second 8-columns
18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r3, r12, r3, lsl #5
19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             lr, lr, #336
19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r3, #2                  ;loop counter
19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q7, q5
19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q8, q6
19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r2, #16
19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d14[4]
20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d15[0]
20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d15[4]
20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d16[0]
20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d16[4]
20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_sp16x16_outloop_neon
20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d18}, [lr], r2         ;load src data
20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d19}, [lr], r2
21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d20}, [lr], r2
21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d21}, [lr], r2
21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r12, #4                 ;loop counter
21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d22}, [lr], r2
21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_inner_loop_neon
21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d23}, [lr], r2         ;load src data
21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d24}, [lr], r2
21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d25}, [lr], r2
21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d26}, [lr], r2
22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q4, d19, d0
22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q5, d20, d0
22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d21, d0
22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d20, d1
22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d21, d1
22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d22, d1
23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d23, d4
23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d24, d4
23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d25, d4
23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d21, d2
23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d22, d2
23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d23, d2
24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d24, d5
24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d25, d5
24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d26, d5
24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d22, d3
24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d23, d3
24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d24, d3
25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r12, r12, #1
25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q4
25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q9, q5
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q10, q6
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d7, q8, #7
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d8, q9, #7
26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d9, q10, #7
26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d6}, [r4], r5          ;store result
26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q9, q11
26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d7}, [r4], r5
26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q10, q12
26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d8}, [r4], r5
26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            d22, d26
26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d9}, [r4], r5
27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne             secondpass_inner_loop_neon
27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r3, r3, #1
27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             lr, lr, #336
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             lr, lr, #8
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r4, r4, r5, lsl #4
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r4, r4, #8
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne filt_blk2d_sp16x16_outloop_neon
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             sp, sp, #336
28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop             {r4-r5,pc}
28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;--------------------
28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfirstpass_filter16x16_only
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q12, q14
28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q13, q15
28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r2, #8                  ;loop counter
29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, #2              ;move srcptr back to (column-2)
29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d24[4]
29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d25[0]
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d25[4]
29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d26[0]
29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d26[4]
29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;First Pass: output_height lines x output_width columns (16x16)
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_fpo16x16_loop_neon
30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d9, d10, d11}, [r0], r1
30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0]
30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0, r1]
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d7, d0
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d9, d0
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d10, d0
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d21, d9, d10, #1
31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d22, d7, d8, #1
31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d23, d10, d11, #1
31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d25, d9, d10, #4
31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d26, d7, d8, #4
32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d27, d10, d11, #4
32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #5
32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp8_filter[1])
32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d21, d1
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp8_filter[1])
32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q9, d23, d1
32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp8_filter[4])
32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d25, d4
33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp8_filter[4])
33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q9, d27, d4
33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp8_filter[5])
33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d29, d5
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d20, d7, d8, #5
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d21, d10, d11, #5
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d23, d9, d10, #2
33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d24, d7, d8, #2
34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d25, d10, d11, #2
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d27, d9, d10, #3
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #3
34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #3
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp8_filter[5])
34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q9, d21, d5
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp8_filter[2])
35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d23, d2
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp8_filter[2])
35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q9, d25, d2
35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp8_filter[3])
35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q11, d27, d3
35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp8_filter[3])
35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q15, d29, d3
35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q11
36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q12
36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q9, q15
36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r2, r2, #1
36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d7, q7, #7
36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d8, q8, #7
36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d9, q9, #7
37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {q3}, [r4], r5              ;store result
37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {q4}, [r4], r5
37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne             filt_blk2d_fpo16x16_loop_neon
37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop             {r4-r5,pc}
37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;--------------------
;Vertical (second-pass) six-tap filter applied directly to the source block.
;Presumably reached when only the vertical filter is needed; the dispatch
;that branches here is outside this chunk - confirm.
;Register use visible in this section:
;  r0  = source pointer, advanced by r1 after every row load
;  r1  = source stride (src_pixels_per_line per the file header)
;  r3  = yoffset on entry (per the file header), then outer loop counter
;  r4  = destination pointer, advanced by r5 after every row store
;  r5  = destination stride (dst_pitch per the file header)
;  r12 = on entry presumably the address of filter16_coeff (set up outside
;        this chunk - confirm); reused below as the inner loop counter
;  d0-d5 = magnitudes of taps 0..5 of the selected filter row
;The 16x16 block is produced as two 8-column halves (outer loop), each half
;as four groups of four rows (inner loop).
37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_filter16x16_only
38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Second pass: 16x16
;r3 = r12 + yoffset*32: every filter16_coeff row is 8 words (32 bytes)
38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r3, r12, r3, lsl #5
;step back two rows: output row n reads source rows n-2 .. n+3
38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #1
38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter (8 words)
38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r3, #2                  ;outer loop counter (two 8-column halves)
38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;take tap magnitudes; the negative taps (1 and 4 in filter16_coeff) are
;applied with vmlsl below, the others with vmull/vmlal
38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q7, q5
38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q8, q6
38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;broadcast the low byte of each 32-bit magnitude across one D register
;(byte lanes 0 and 4 are the low bytes of the two words in d14/d15/d16)
39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d14[4]
39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d15[0]
39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d15[4]
39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d16[0]
39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d16[4]
39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;Outer loop: one pass per 8-column half. Prime the row window with the
;first five rows (8 bytes each) of this half.
39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_spo16x16_outloop_neon
39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d18}, [r0], r1         ;load src data
39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d19}, [r0], r1
40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d20}, [r0], r1
40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d21}, [r0], r1
40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r12, #4                 ;inner loop counter (4 output rows per pass)
40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d22}, [r0], r1
40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;Inner loop: load four more rows so d18-d26 hold nine consecutive source
;rows, then filter four output rows from them.
40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_only_inner_loop_neon
40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d23}, [r0], r1         ;load src data
40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d24}, [r0], r1
40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d25}, [r0], r1
40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d26}, [r0], r1
41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;q3-q6 accumulate taps 0, 1, 4, 2 and 5 for the four output rows
41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q4, d19, d0
41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q5, d20, d0
41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d21, d0
41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d20, d1
41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d21, d1
41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d22, d1
42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d23, d4
42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d24, d4
42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d25, d4
42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d21, d2
42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d22, d2
42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d23, d2
43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d24, d5
43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d25, d5
43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d26, d5
43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;the tap-3 product is computed separately and merged below with a
;saturating add. q9/q10 alias d18-d21, which have all been read by the time
;they are overwritten here.
43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d22, d3
43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d23, d3
43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d24, d3
44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;flags set here are consumed by the bne at the bottom of the inner loop
44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r12, r12, #1
44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q4
44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q9, q5
44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q10, q6
44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;narrow into d6-d9 (q3/q4), whose partial sums are already folded in above
44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d7, q8, #7
45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d8, q9, #7
45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d9, q10, #7
45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;store four 8-byte output rows; the interleaved moves slide the row window
;down by four rows (d18-d22 <- d22-d26) for the next iteration
45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d6}, [r4], r5          ;store result
45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q9, q11
45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d7}, [r4], r5
45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q10, q12
45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d8}, [r4], r5
45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            d22, d26
45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d9}, [r4], r5
46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne             secondpass_only_inner_loop_neon
46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;Advance to the next 8-column half. The subs sets the flags for the final
;bne; the sub/add instructions that follow do not update flags.
;r0 has advanced 21 rows (5 priming + 4*4), so rewind 16+4+1 rows and move
;8 bytes to the right.
46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r3, r3, #1
46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #4
46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #2
46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1
46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r0, r0, #8
46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;r4 has advanced 16 rows; rewind them and move 8 bytes to the right
46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r4, r4, r5, lsl #4
47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r4, r4, #8
47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne filt_blk2d_spo16x16_outloop_neon
47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;restore r4-r5 and return by loading pc (the matching push is outside this
;chunk)
47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop             {r4-r5,pc}
47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ENDP
47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;-----------------
47979f15823c34ae1e423108295e416213200bb280fAndreas Huber
;_filter16_coeff_ is a data word holding the address of filter16_coeff
;(presumably loaded by code earlier in the file to locate the table -
;confirm against the function prologue).
;filter16_coeff is the six-tap filter table: eight rows of 32-bit values,
;rows in order of filter index 0..7. Each row is six taps followed by two
;zero words, so a row is 32 bytes; the second-pass code selects a row with
;(index << 5). The taps of every row sum to 128, matching the #7
;rounding shift applied to the filtered sums. Only taps 1 and 4 are ever
;negative.
48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber_filter16_coeff_
48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     filter16_coeff
48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilter16_coeff
48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     0,  0,  128,    0,   0,  0,   0,  0
48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     0, -6,  123,   12,  -1,  0,   0,  0
48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     2, -11, 108,   36,  -8,  1,   0,  0
48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     0, -9,   93,   50,  -6,  0,   0,  0
48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     3, -16,  77,   77, -16,  3,   0,  0
48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     0, -6,   50,   93,  -9,  0,   0,  0
48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     1, -8,   36,  108, -11,  2,   0,  0
49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    DCD     0, -1,   12,  123,  -6,   0,  0,  0
49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    END
493