190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    EXPORT  |vp8_sixtap_predict16x16_neon|
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ARM
1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    REQUIRE8
1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    PRESERVE8
1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    AREA ||.text||, CODE, READONLY, ALIGN=2
18d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel
19d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvelfilter16_coeff                                       ; six-tap filter table: 8 words (32 bytes) per row; row n is at filter16_coeff + (n << 5), n = xoffset (first pass) or yoffset (second pass)
20d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0,  0,  128,    0,   0,  0,   0,  0     ; row 0 - words 0..5 are taps 0..5; words 6..7 are padding (not read by the tap extraction in the visible code)
21d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0, -6,  123,   12,  -1,  0,   0,  0     ; row 1 - taps 1 and 4 are stored as values <= 0; the code takes vabs of the row and applies those two taps with vmlsl
22d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     2, -11, 108,   36,  -8,  1,   0,  0     ; row 2
23d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0, -9,   93,   50,  -6,  0,   0,  0     ; row 3
24d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     3, -16,  77,   77, -16,  3,   0,  0     ; row 4
25d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0, -6,   50,   93,  -9,  0,   0,  0     ; row 5
26d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     1, -8,   36,  108, -11,  2,   0,  0     ; row 6
27d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0, -1,   12,  123,  -6,   0,  0,  0     ; row 7
28d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel
29d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel;-----------------
3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r0    unsigned char  *src_ptr,
3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r1    int  src_pixels_per_line,
3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r2    int  xoffset,
3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r3    int  yoffset,
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack(r4) unsigned char *dst_ptr,
3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack(r5) int  dst_pitch
3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply abs() to
3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; the filter coeffs to make them u8. Then, use vmlsl for the negative coeffs. After multiplication,
3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; the result can be negative, so the result is treated as s16. However, it is also possible
4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; for the result to be a large positive number (> 2^15-1), which could be mistaken for a
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; negative number. To avoid that error, apply the filter coeffs in the order 0, 1, 4, 5, 2,
4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; which ensures that the result stays in s16 range. Finally, do a saturating add of the
4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; product for filter coeff 3 (vp8_filter[3]). The same applies to the other filter functions.
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber|vp8_sixtap_predict16x16_neon| PROC
4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push            {r4-r5, lr}
4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    adr             r12, filter16_coeff
4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ldr             r4, [sp, #12]           ;load parameters from stack
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ldr             r5, [sp, #16]           ;load parameters from stack
5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq             secondpass_filter16x16_only
5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r2, r12, r2, lsl #5     ;calculate filter location
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq             firstpass_filter16x16_only
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             sp, sp, #336            ;reserve space on stack for temporary storage
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             lr, sp
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q12, q14
6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q13, q15
6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r2, #7                  ;loop counter
7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #1
7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d24[4]
7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d25[0]
7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d25[4]
7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d26[0]
7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d26[4]
7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;First Pass: output_height lines x output_width columns (21x16)
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_fp16x16_loop_neon
8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d9, d10, d11}, [r0], r1
8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d12, d13, d14}, [r0], r1
8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0]
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0, r1]
8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0, r1, lsl #1]
8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d7, d0
9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d9, d0
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q11, d10, d0
9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q12, d12, d0
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q13, d13, d0
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #1
9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #1
10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q10, d29, d1
10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q12, d30, d1
10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #1
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #1
10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d13, d14, #1
10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q11, d29, d1
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q13, d30, d1
11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #4
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #4
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q10, d29, d4
11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q12, d30, d4
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #4
12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #4
12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d13, d14, #4
12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q11, d29, d4
12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q13, d30, d4
12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #5
13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #5
13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp8_filter[5])
13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q10, d29, d5
13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q12, d30, d5
13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #5
13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #5
13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d13, d14, #5
14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp8_filter[5])
14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q11, d29, d5
14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q13, d30, d5
14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #2
14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #2
14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp8_filter[2])
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q10, d29, d2
15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q12, d30, d2
15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #2
15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #2
15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d13, d14, #2
15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp8_filter[2])
15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q11, d29, d2
15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q13, d30, d2
16090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #3
16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d30, d12, d13, #3
16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d15, d7, d8, #3
16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d31, d10, d11, #3
16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d6, d13, d14, #3
16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp8_filter[3])
17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q5, d29, d3
17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d30, d3
17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q10, q5
17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q12, q6
17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp8_filter[3])
17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d31, d3
17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q3, d6, d3
18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r2, r2, #1
18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q9, q6
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q11, q7
18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q13, q3
18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d7, q9, #7
18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d8, q10, #7
19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d9, q11, #7
19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d10, q12, #7
19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d11, q13, #7
19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d6, d7, d8}, [lr]!     ;store result
19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d9, d10, d11}, [lr]!
19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne             filt_blk2d_fp16x16_loop_neon
19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Second pass: 16x16
20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;secondpass_filter - do first 8-columns and then second 8-columns
20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r3, r12, r3, lsl #5
20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             lr, lr, #336
20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r3, #2                  ;loop counter
20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q7, q5
20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q8, q6
20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r2, #16
21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d14[4]
21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d15[0]
21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d15[4]
21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d16[0]
21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d16[4]
21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_sp16x16_outloop_neon
22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d18}, [lr], r2         ;load src data
22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d19}, [lr], r2
22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d20}, [lr], r2
22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d21}, [lr], r2
22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r12, #4                 ;loop counter
22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d22}, [lr], r2
22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_inner_loop_neon
22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d23}, [lr], r2         ;load src data
22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d24}, [lr], r2
23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d25}, [lr], r2
23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d26}, [lr], r2
23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q4, d19, d0
23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q5, d20, d0
23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d21, d0
23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d20, d1
24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d21, d1
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d22, d1
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d23, d4
24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d24, d4
24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d25, d4
24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d21, d2
25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d22, d2
25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d23, d2
25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d24, d5
25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d25, d5
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d26, d5
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d22, d3
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d23, d3
26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d24, d3
26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r12, r12, #1
26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q4
26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q9, q5
26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q10, q6
26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d7, q8, #7
27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d8, q9, #7
27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d9, q10, #7
27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d6}, [r4], r5          ;store result
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q9, q11
27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d7}, [r4], r5
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q10, q12
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d8}, [r4], r5
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            d22, d26
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d9}, [r4], r5
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne             secondpass_inner_loop_neon
28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r3, r3, #1
28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             lr, lr, #336
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             lr, lr, #8
28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r4, r4, r5, lsl #4
29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r4, r4, #8
29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne filt_blk2d_sp16x16_outloop_neon
29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             sp, sp, #336
29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop             {r4-r5,pc}
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;--------------------
29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfirstpass_filter16x16_only
29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q12, q14
30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q13, q15
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r2, #8                  ;loop counter
30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, #2              ;move srcptr back to (column-2)
30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d24[4]
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d25[0]
30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d25[4]
30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d26[0]
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d26[4]
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;First Pass: output_height lines x output_width columns (16x16)
31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_fpo16x16_loop_neon
31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d9, d10, d11}, [r0], r1
31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0]
31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0, r1]
31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d7, d0
32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d9, d0
32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d10, d0
32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d21, d9, d10, #1
32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d22, d7, d8, #1
32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d23, d10, d11, #1
32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d25, d9, d10, #4
33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d26, d7, d8, #4
33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d27, d10, d11, #4
33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d9, d10, #5
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp8_filter[1])
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d21, d1
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp8_filter[1])
33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q9, d23, d1
34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp8_filter[4])
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d25, d4
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp8_filter[4])
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q9, d27, d4
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp8_filter[5])
34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d29, d5
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d20, d7, d8, #5
34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d21, d10, d11, #5
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d23, d9, d10, #2
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d24, d7, d8, #2
35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d25, d10, d11, #2
35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d27, d9, d10, #3
35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d28, d7, d8, #3
35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d29, d10, d11, #3
35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp8_filter[5])
36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q9, d21, d5
36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp8_filter[2])
36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d23, d2
36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp8_filter[2])
36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q9, d25, d2
36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp8_filter[3])
36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q11, d27, d3
36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp8_filter[3])
36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q15, d29, d3
37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q11
37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q12
37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q9, q15
37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r2, r2, #1
37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d7, q7, #7
38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d8, q8, #7
38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d9, q9, #7
38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {q3}, [r4], r5              ;store result
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {q4}, [r4], r5
38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne             filt_blk2d_fpo16x16_loop_neon
38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop             {r4-r5,pc}
38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;--------------------
39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_filter16x16_only
39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Second pass: 16x16
;
; Second-pass-only path: the 6-tap filter is applied down the columns. Every
; tap multiplies a different source row (rows are loaded through r0 with
; post-increment r1) and the 8-bit results are stored through r4 with
; post-increment r5. The 16-byte-wide block is handled as two 8-byte-wide
; strips (outer loop, counter r3); each strip yields 16 rows as 4 iterations
; of 4 rows (inner loop, counter r12).
;
; Register use as seen in this section:
;   r0      source read pointer, advanced by r1 after every row load
;   r1      source row stride
;   r3      on entry: index of the filter row (scaled by 32 bytes, i.e. one
;           row of 8 DCD words); afterwards reused as the strip counter
;   r12     on entry: base the scaled index is added to - presumably the
;           address of filter16_coeff, set up before this section (TODO
;           confirm); afterwards reused as the inner-loop counter
;   r4, r5  destination write pointer and its per-row post-increment
;   d0-d5   magnitude of taps 0..5, each replicated across 8 byte lanes
;   d18-d26 nine consecutive source rows of the current strip
;
; NOTE(review): q4-q7 (d8-d15) are overwritten below and this exit path pops
; only r4, r5 and pc. AAPCS lists d8-d15 as callee-saved - confirm that the
; prologue (outside this section) or the callers account for that.
; NOTE(review): the "<number><hex hash>Andreas Huber" text that starts every
; line looks like blame-view residue rather than assembler input - confirm and
; strip it file-wide.
;
39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r3, r12, r3, lsl #5     ;r3 = r12 + r3*32: address of the selected filter row
39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #1      ;back up two rows: first row loaded is src_ptr[-2]
39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter (8 x 32-bit words)
39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r3, #2                  ;loop counter: two 8-byte-wide strips
39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Only tap magnitudes are kept; taps 1 and 4 are applied with vmlsl
; (multiply-subtract) below, the others with vmull/vmlal.
39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q7, q5                  ;|words 0..3| -> d14, d15
40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q8, q6                  ;|words 4..7| -> d16, d17
40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Byte lanes 0 and 4 of a d register are the low bytes of its two 32-bit
; words, so each vdup.8 replicates one tap magnitude into 8 byte lanes.
40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d14[4]
40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d15[0]
40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d15[4]
40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d16[0]
40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d16[4]
40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Outer loop, once per strip: preload the first five rows of the window
; (src_ptr[-2] .. src_ptr[2] relative to the first output row).
40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_spo16x16_outloop_neon
41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d18}, [r0], r1         ;load src data
41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d19}, [r0], r1
41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d20}, [r0], r1
41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d21}, [r0], r1
41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov             r12, #4                 ;loop counter: 4 iterations x 4 output rows
41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d22}, [r0], r1
41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Inner loop: load four more rows so that d18..d26 hold window rows 0..8,
; then compute four output rows. Output row k (k = 0..3) accumulates in
; q3+k and reads window rows k .. k+5.
41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_only_inner_loop_neon
41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d23}, [r0], r1         ;load src data
41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d24}, [r0], r1
42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d25}, [r0], r1
42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {d26}, [r0], r1
42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q4, d19, d0
42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q5, d20, d0
42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d21, d0
42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d20, d1
43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d21, d1
43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d22, d1
43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d23, d4
43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d24, d4
43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d25, d4
43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d21, d2
44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d22, d2
44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d23, d2
44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d24, d5
44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d25, d5
44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d26, d5
44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; The tap-3 products go to their own accumulators (q7-q10) and are merged
; with a saturating 16-bit add below - presumably so the combined sum
; saturates instead of wrapping (TODO confirm intent).
44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d22, d3
45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d23, d3
45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d24, d3
45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r12, r12, #1            ;flags are consumed by the bne that ends the loop body
45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q4
45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q9, q5
45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q10, q6
45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d7, q8, #7
46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d8, q9, #7
46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d9, q10, #7
46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Store the four 8-byte output rows. The interleaved vmovs slide the row
; window down by four: window rows 4..8 (d22-d26) become rows 0..4 (d18-d22)
; for the next iteration.
46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d6}, [r4], r5          ;store result
46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q9, q11                 ;d18,d19 <- d22,d23
46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d7}, [r4], r5
46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q10, q12                ;d20,d21 <- d24,d25
46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d8}, [r4], r5
47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            d22, d26                ;d22 <- d26
47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.u8         {d9}, [r4], r5
47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne             secondpass_only_inner_loop_neon
47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Strip finished: rewind both pointers to the strip's first row and move
; 8 bytes to the right. The sub/add below carry no S suffix, so the flags
; from this subs are still valid at the final bne.
47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subs            r3, r3, #1              ;strip counter
47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #4      ;source back 16 + 4 + 1 = 21 rows
47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #2      ;(5 preloaded + 4 x 4 loaded in the inner loop)
47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1
47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r0, r0, #8              ;next 8-byte-wide strip
48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r4, r4, r5, lsl #4      ;destination back the 16 rows just written
48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r4, r4, #8              ;next 8-byte-wide strip
48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    bne filt_blk2d_spo16x16_outloop_neon
48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop             {r4-r5,pc}              ;restore r4-r5 and return by loading pc (matching push is outside this section)
48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ENDP
48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    END
491