190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Assembler setup for this file (armasm syntax):
;   EXPORT     makes the routine's symbol visible to the linker.
;   ARM        selects the ARM (A32) instruction set for the code below.
;   REQUIRE8 / PRESERVE8  declare that the code requires and preserves
;              8-byte stack alignment (the only stack adjustment visible
;              in the routine is an 8-byte push {r4, lr}).
;   AREA       opens a read-only code section; ALIGN=2 asks for
;              2^2 = 4-byte section alignment.
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    EXPORT  |vp8_sixtap_predict_neon|
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ARM
1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    REQUIRE8
1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    PRESERVE8
1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    AREA ||.text||, CODE, READONLY, ALIGN=2
18d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel
; Six-tap filter coefficient table, one row per xoffset/yoffset index.
; Each row is 8 words (32 bytes): taps 0..5 followed by two zero words of
; padding, so the code can index a row with "offset lsl #5" and fetch it
; with a single two-q-register vld1.
; Taps 1 and 4 are stored as non-positive values; the code takes the
; absolute value of every tap and applies those two with vmlsl (subtract).
; Every row sums to 128, matching the rounding shift by 7 used on results.
19d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvelfilter4_coeff
20d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0,  0,  128,    0,   0,  0,   0,  0
21d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0, -6,  123,   12,  -1,  0,   0,  0
22d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     2, -11, 108,   36,  -8,  1,   0,  0
23d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0, -9,   93,   50,  -6,  0,   0,  0
24d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     3, -16,  77,   77, -16,  3,   0,  0
25d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0, -6,   50,   93,  -9,  0,   0,  0
26d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     1, -8,   36,  108, -11,  2,   0,  0
27d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    DCD     0, -1,   12,  123,  -6,   0,  0,  0
28d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel
;-----------------------------------------------------------------------
; vp8_sixtap_predict_neon
; Six-tap sub-pixel prediction of one 4x4 block: a horizontal pass chosen
; by xoffset followed by a vertical pass chosen by yoffset. Each pass uses
; one row of filter4_coeff; sums are rounded, shifted right by 7 and
; saturated to unsigned 8-bit.
; Dispatch: xoffset == 0 -> secondpass_filter4x4_only
;           yoffset == 0 -> firstpass_filter4x4_only
;           otherwise    -> the two-pass code directly after the prologue.
; Two-pass path: the first pass filters 9 source rows (starting 2 rows
; above and 2 columns left of src_ptr) into 9 rows of 4 pixels; the second
; pass combines them vertically into the 4 output rows.
; Clobbers on this path: r0-r3, r12, flags, q0-q15.
; NOTE(review): q4-q7 (d8-d15) are overwritten without being saved; the
; AAPCS lists d8-d15 as callee-saved - confirm callers tolerate this.
;-----------------------------------------------------------------------
29d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel;-----------------
3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r0    unsigned char  *src_ptr,
3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r1    int  src_pixels_per_line,
3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r2    int  xoffset,
3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r3    int  yoffset,
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack(r4) unsigned char *dst_ptr,
3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack(lr) int  dst_pitch
3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber|vp8_sixtap_predict_neon| PROC
3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push            {r4, lr}
3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40d35fe0269d77984b383b6bdc051f26b72da15277Ard Biesheuvel    adr             r12, filter4_coeff
    ;the push above moved sp down by 8, so the two stack arguments are now
    ;at sp+8 (dst_ptr -> r4) and sp+12 (dst_pitch -> lr)
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ldr             r4, [sp, #8]            ;load parameters from stack
4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ldr             lr, [sp, #12]           ;load parameters from stack
4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq             secondpass_filter4x4_only
4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;each table row is 32 bytes, hence xoffset << 5
4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r2, r12, r2, lsl #5     ;calculate filter location
4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;the flags set by this cmp are consumed by the beq after the vld1;
    ;the NEON load in between does not modify them
4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter
5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq             firstpass_filter4x4_only
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q12, q14                ;get abs(filter_parameters)
5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q13, q15
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, #2              ;go back 2 columns of src data
5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, r1, lsl #1      ;go back 2 lines of src data
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;First pass: output_height lines x output_width columns (9x4)
    ;each vld1 reads 16 bytes of one source row and steps r0 by the stride;
    ;each vdup.8 broadcasts the low byte of one |tap| word to all 8 lanes.
    ;Signs are restored by the accumulate op: taps 1 and 4 use vmlsl,
    ;taps 0, 2, 3 and 5 use vmull/vmlal.
6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q4}, [r0], r1
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d24[4]
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q5}, [r0], r1
6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d25[0]
6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q6}, [r0], r1
6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d25[4]
6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d26[0]
7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d26[4]
7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;prefetch hints for the next three source rows
7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0]
7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0, r1]
7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pld             [r0, r1, lsl #1]
7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;r0 started at src_ptr-2, so byte 5 of a row is src_ptr[3]; that window
    ;crosses the 8-byte boundary and therefore needs vext on both halves
7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d19, d8, d9, #5
7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d20, d10, d11, #5
7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d21, d12, d13, #5
8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;after the swaps q3 = low 8 bytes of rows 0 and 1,
    ;q5 = low 8 bytes of rows 2 and 3
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vswp            d11, d12
8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;vzip.32 leaves 4 pixels of each of two rows in the first operand, so
    ;one 8-lane multiply handles two rows of the 4-wide block
8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d20, d5
8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q4, q3                  ;keep original src data in q4 q6
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q6, q5
9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;vshr.u64 by 8*k drops the first k bytes of each 8-byte row half, which
    ;yields the source window for the tap k columns further right
9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q10, q6, #8
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d10, d0
9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q5, q6, #32
10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d20, d1
10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q10, q6, #16
11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d10, d4
11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q5, q6, #24
11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d20, d2
11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;the tap-3 product goes to separate registers (q9/q10) and is merged
    ;into the running sum with a saturating add further down
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d10, d3
12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;q3-q6 are free again, so the next four source rows are loaded here,
    ;interleaved with finishing rows 0-3
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q3}, [r0], r1          ;load rest 5-line src data
12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q4}, [r0], r1
12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q10
13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q5}, [r0], r1
13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q6}, [r0], r1
13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;d27 = first-pass rows 0-1, d28 = first-pass rows 2-3 (4 bytes per row)
13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d28, q8, #7
13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;First Pass on rest 5-line data
    ;q11 (d22/d23) holds the ninth row; it is filtered on its own through
    ;d31/q12 alongside the paired rows 4-7 below
13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q11}, [r0], r1
13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d19, d8, d9, #5
14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d20, d10, d11, #5
14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d21, d12, d13, #5
14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vswp            d11, d12
14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d31, d22, d23, #5       ;construct src_ptr[3]
15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d20, d5
15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q12, d31, d5            ;(src_ptr[3] * vp8_filter[5])
15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q4, q3                  ;keep original src data in q4 q6
15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q6, q5
15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
16090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q10, q6, #8
16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d10, d0
16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q12, d22, d0            ;(src_ptr[-2] * vp8_filter[0])
16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q5, q6, #32
17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d31, d22, d23, #1       ;construct src_ptr[-1]
17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d20, d1
17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q12, d31, d1            ;-(src_ptr[-1] * vp8_filter[1])
17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q10, q6, #16
18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d31, d22, d23, #4       ;construct src_ptr[2]
18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d10, d4
18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q12, d31, d4            ;-(src_ptr[2] * vp8_filter[4])
18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q5, q6, #24
19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d31, d22, d23, #2       ;construct src_ptr[0]
19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d20, d2
19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q12, d31, d2            ;(src_ptr[0] * vp8_filter[2])
19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;the last vext reads d22/d23 before the vmull below overwrites q11
    ;(= d22/d23) with the ninth row's tap-3 product
19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d31, d22, d23, #3       ;construct src_ptr[1]
20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d10, d3
20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q11, d31, d3            ;(src_ptr[1] * vp8_filter[3])
20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;r3 = address of the yoffset row in filter4_coeff (32 bytes per row)
20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r3, r12, r3, lsl #5
20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q10
20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q12, q11
20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;d23 = first-pass rows 1-2 (upper half of d27, lower half of d28)
21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d23, d27, d28, #4
21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;d29 = rows 4-5, d30 = rows 6-7, d31 = row 8 (only its low 4 bytes are
    ;consumed, by the vext into d26 below)
21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d29, q7, #7             ;shift/round/saturate to u8
21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d30, q8, #7
21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d31, q12, #7
21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Second pass: 4x4
21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q7, q5
21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q8, q6
22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;row pairs offset by one: d24 = rows 3-4, d25 = rows 5-6, d26 = rows 7-8
22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d24, d28, d29, #4
22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d25, d29, d30, #4
22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d26, d30, d31, #4
22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d14[4]
22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d15[0]
22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d15[4]
22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d16[0]
23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d16[4]
23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;q3 and q5 each hold part of the sum for output rows 0-1, q4 and q6 for
    ;output rows 2-3; the halves are merged by the vqadd pair below
23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp8_filter[0])
23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q4, d28, d0
23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp8_filter[5])
23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q6, d26, d5
23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp8_filter[4])
23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q4, d30, d4
24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp8_filter[1])
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q6, d24, d1
24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp8_filter[2])
24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q4, d29, d2
24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp8_filter[3])
24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q6, d25, d3
24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;source reads are finished: r0, r1, r2 become dst rows 1, 2, 3
    ;(row 0 is r4 itself)
25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r0, r4, lr
25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r1, r0, lr
25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r2, r1, lr
25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q6, q4
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d4, q6, #7
25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;each 32-bit lane is one 4-pixel output row
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.32         {d3[0]}, [r4]           ;store result
26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.32         {d3[1]}, [r0]
26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.32         {d4[0]}, [r1]
26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.32         {d4[1]}, [r2]
26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;restore r4 and return by popping the saved lr into pc
26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop             {r4, pc}
26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;---------------------
;firstpass_filter4x4_only
;Reached from the function entry when yoffset == 0: only the horizontal
;six-tap filter is applied. Four source rows are read (starting 2 columns
;left of src_ptr), filtered, rounded/shifted by 7, saturated to u8 and
;stored as four 4-pixel rows at dst_ptr.
;On entry: r0 = src_ptr, r1 = source stride, r4 = dst_ptr, lr = dst_pitch,
;q14/q15 = the xoffset row of filter4_coeff, and {r4, lr} saved on the
;stack by the function prologue.
;NOTE(review): q4-q7 (d8-d15) are overwritten without being saved; the
;AAPCS lists d8-d15 as callee-saved - confirm callers tolerate this.
26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfirstpass_filter4x4_only
27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q12, q14                ;get abs(filter_parameters)
27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vabs.s32        q13, q15
27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub             r0, r0, #2              ;go back 2 columns of src data
27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;First pass: output_height lines x output_width columns (4x4)
    ;each vld1 reads 16 bytes of one source row and steps r0 by the stride;
    ;each vdup.8 broadcasts the low byte of one |tap| word to all 8 lanes.
    ;Signs are restored by the accumulate op: taps 1 and 4 use vmlsl,
    ;taps 0, 2, 3 and 5 use vmull/vmlal.
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q4}, [r0], r1
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d1, d24[4]
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q5}, [r0], r1
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d2, d25[0]
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vld1.u8         {q6}, [r0], r1
28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d3, d25[4]
28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d4, d26[0]
28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vdup.8          d5, d26[4]
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;r0 started at src_ptr-2, so byte 5 of a row is src_ptr[3]; that window
    ;crosses the 8-byte boundary and therefore needs vext on both halves
28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d19, d8, d9, #5
29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d20, d10, d11, #5
29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vext.8          d21, d12, d13, #5
29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;after the swaps q3 = low 8 bytes of rows 0 and 1,
    ;q5 = low 8 bytes of rows 2 and 3
29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vswp            d11, d12
29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;vzip.32 leaves 4 pixels of each of two rows in the first operand, so
    ;one 8-lane multiply handles two rows of the 4-wide block
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q8, d20, d5
30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q4, q3                  ;keep original src data in q4 q6
30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmov            q6, q5
30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;vshr.u64 by 8*k drops the first k bytes of each 8-byte row half, which
    ;yields the source window for the tap k columns further right
30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q10, q6, #8
30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d10, d0
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q5, q6, #32
31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d20, d1
31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q10, q6, #16
32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlsl.u8        q8, d10, d4
32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d20, d21
32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vshr.u64        q5, q6, #24
32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmlal.u8        q8, d20, d2
33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;the tap-3 product goes to separate registers (q9/q10) and is merged
    ;into the running sum with a saturating add further down
33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vzip.32         d10, d11
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmull.u8        q10, d10, d3
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;source reads are finished: r0, r1, r2 become dst rows 1, 2, 3
    ;(row 0 is r4 itself)
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r0, r4, lr
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r1, r0, lr
33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add             r2, r1, lr
34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqadd.s16       q8, q10
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;d27 = output rows 0-1, d28 = output rows 2-3 (4 bytes per row)
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vqrshrun.s16    d28, q8, #7
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;each 32-bit lane is one 4-pixel output row
34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.32         {d27[0]}, [r4]          ;store result
34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.32         {d27[1]}, [r0]
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.32         {d28[0]}, [r1]
35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vst1.32         {d28[1]}, [r2]
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;restore r4 and return by popping the saved lr into pc
35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop             {r4, pc}
35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;---------------------
;secondpass_filter4x4_only
;  Vertical 6-tap filter: reads nine 4-byte source rows and writes a 4x4
;  block of bytes.
;
;  Registers read here but set up before this label (outside this excerpt;
;  roles inferred from how they are used -- confirm against the entry code):
;    r0  - source pointer (backed up by two rows before the first load)
;    r1  - source row stride in bytes (post-increment of every row load)
;    r3  - filter index, scaled by 32 = size of one 8-word filter table row
;    r12 - base address of the filter table (presumably filter4_coeff)
;    r4  - destination pointer for output row 0
;    lr  - destination row stride in bytes
;
;  R0..R8 below name the nine rows loaded starting at r0 - 2*r1; output
;  row n is computed from Rn..Rn+5.
;
;  Overwrites r0-r3, q0-q3, q8-q10 and d23-d31. Only NEON registers that
;  the AAPCS lets a callee overwrite are used: d8-d15 (q4-q7) must be
;  preserved across a call and are deliberately left untouched, because
;  the return at the end restores nothing but r4.
secondpass_filter4x4_only
    sub             r0, r0, r1, lsl #1      ;r0 -= 2 * r1 : start two rows up (R0)
    add             r3, r12, r3, lsl #5     ;r3 = r12 + r3 * 32 : selected filter row

    vld1.32         {d27[0]}, [r0], r1      ;load src data: d27 = R0:R1
    vld1.s32        {q9, q10}, [r3]         ;load second_pass filter (8 words)
    vld1.32         {d27[1]}, [r0], r1
    vabs.s32        q9, q9                  ;keep tap magnitudes only; the sign is
    vld1.32         {d28[0]}, [r0], r1      ;d28 = R2:R3
    vabs.s32        q10, q10                ;applied by the vmlal/vmlsl choice below
    vld1.32         {d28[1]}, [r0], r1
    vdup.8          d0, d18[0]              ;second_pass filter parameters (d0-d5):
    vld1.32         {d29[0]}, [r0], r1      ;d29 = R4:R5
    vdup.8          d1, d18[4]              ;low byte of each 32-bit |tap| is
    vld1.32         {d29[1]}, [r0], r1
    vdup.8          d2, d19[0]              ;broadcast to all 8 byte lanes
    vld1.32         {d30[0]}, [r0], r1      ;d30 = R6:R7
    vdup.8          d3, d19[4]
    vld1.32         {d30[1]}, [r0], r1
    vdup.8          d4, d20[0]
    vld1.32         {d31[0]}, [r0], r1      ;d31 low half = R8 (high half unused)
    vdup.8          d5, d20[4]

    vext.8          d23, d27, d28, #4       ;d23 = R1:R2
    vext.8          d24, d28, d29, #4       ;d24 = R3:R4
    vext.8          d25, d29, d30, #4       ;d25 = R5:R6
    vext.8          d26, d30, d31, #4       ;d26 = R7:R8

    ;q3 and q9 hold partial sums for output rows 0-1, q8 and q10 for rows 2-3.
    ;Taps 1 and 4 are subtracted: they are the non-positive entries in the
    ;filter table rows shown at the top of the file.
    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q8, d28, d0

    vmull.u8        q9, d25, d5             ;(src_ptr[3] * vp8_filter[5])
    vmull.u8        q10, d26, d5

    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q8, d30, d4

    vmlsl.u8        q9, d23, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q10, d24, d1

    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q8, d29, d2

    vmlal.u8        q9, d24, d3             ;(src_ptr[1] * vp8_filter[3])
    vmlal.u8        q10, d25, d3

    add             r0, r4, lr              ;r0 = address of output row 1
    add             r1, r0, lr              ;r1 = address of output row 2
    add             r2, r1, lr              ;r2 = address of output row 3

    vqadd.s16       q9, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q10, q8                 ;(saturating add of the two partial sums)

    vqrshrun.s16    d3, q9, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d4, q10, #7             ;(>> 7: the visible table rows sum to 128)

    vst1.32         {d3[0]}, [r4]           ;store result: output row 0
    vst1.32         {d3[1]}, [r0]           ;output row 1
    vst1.32         {d4[0]}, [r1]           ;output row 2
    vst1.32         {d4[1]}, [r2]           ;output row 3

    pop             {r4, pc}                ;restore r4 and return (the matching
                                            ;push is before this excerpt)
41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ENDP
42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    END
422