190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber    EXPORT  |vp8_variance_halfpixvar16x16_h_neon|
13538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber    EXPORT  |vp8_variance_halfpixvar16x16_v_neon|
14538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber    EXPORT  |vp8_variance_halfpixvar16x16_hv_neon|
1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    EXPORT  |vp8_sub_pixel_variance16x16s_neon|
1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ARM
1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    REQUIRE8
1890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    PRESERVE8
1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    AREA ||.text||, CODE, READONLY, ALIGN=2
2190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;================================================
;unsigned int vp8_variance_halfpixvar16x16_h_neon
;(
;    unsigned char  *src_ptr, r0
;    int  src_pixels_per_line,  r1
;    unsigned char *dst_ptr,  r2
;    int dst_pixels_per_line,   r3
;    unsigned int *sse          stack
;);
;
; Horizontal half-pixel case. For each of 16 rows, every source pixel is
; averaged (with rounding) with its right-hand neighbour and the result is
; compared against the matching dst row.
;   sum = sum of the 256 differences, sse = sum of the 256 squared differences
;   *sse = sse,  return value (r0) = sse - ((sum * sum) >> 8)
; 32 bytes are loaded from every source row although only the first 17
; are used.
;
; Clobbers: r0, r2, r12, flags, q0-q3, q8-q14.
; q4-q7 (d8-d15) are used as scratch but are saved and restored, because
; AAPCS requires a callee to preserve them.
;================================================
|vp8_variance_halfpixvar16x16_h_neon| PROC
    push            {lr}
    vpush           {d8-d15}                    ;callee-saved NEON registers (q4-q7)

    mov             r12, #4                     ;loop counter: 4 iterations x 4 rows
    ldr             lr, [sp, #68]               ;load *sse from stack (4 bytes lr + 64 bytes d8-d15)
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

;Each iteration handles 4 rows of 16 pixels
vp8_filt_fpo16x16s_4_0_loop_neon
    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data, 32 bytes per row
    vld1.8          {q11}, [r2], r3                 ;load dst data, 16 bytes per row
    vld1.u8         {d4, d5, d6, d7}, [r0], r1
    vld1.8          {q12}, [r2], r3
    vld1.u8         {d8, d9, d10, d11}, [r0], r1
    vld1.8          {q13}, [r2], r3
    vld1.u8         {d12, d13, d14, d15}, [r0], r1

    ;pld                [r0]
    ;pld                [r0, r1]
    ;pld                [r0, r1, lsl #1]

    vext.8          q1, q0, q1, #1          ;construct src_ptr[1] (row shifted by one byte)
    vext.8          q3, q2, q3, #1
    vext.8          q5, q4, q5, #1
    vext.8          q7, q6, q7, #1

    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
    vld1.8          {q14}, [r2], r3
    vrhadd.u8       q1, q2, q3
    vrhadd.u8       q2, q4, q5
    vrhadd.u8       q3, q6, q7

    vsubl.u8        q4, d0, d22                 ;diff, widened to 16 bits
    vsubl.u8        q5, d1, d23
    vsubl.u8        q6, d2, d24
    vsubl.u8        q7, d3, d25
    vsubl.u8        q0, d4, d26                 ;q0-q3 are reused once their bytes are consumed
    vsubl.u8        q1, d5, d27
    vsubl.u8        q2, d6, d28
    vsubl.u8        q3, d7, d29

    vpadal.s16      q8, q4                      ;sum
    vmlal.s16       q9, d8, d8                  ;sse
    vmlal.s16       q10, d9, d9

    subs            r12, r12, #1                ;flags are consumed by the bne at the loop end

    vpadal.s16      q8, q5
    vmlal.s16       q9, d10, d10
    vmlal.s16       q10, d11, d11
    vpadal.s16      q8, q6
    vmlal.s16       q9, d12, d12
    vmlal.s16       q10, d13, d13
    vpadal.s16      q8, q7
    vmlal.s16       q9, d14, d14
    vmlal.s16       q10, d15, d15

    vpadal.s16      q8, q0                      ;sum
    vmlal.s16       q9, d0, d0                  ;sse
    vmlal.s16       q10, d1, d1
    vpadal.s16      q8, q1
    vmlal.s16       q9, d2, d2
    vmlal.s16       q10, d3, d3
    vpadal.s16      q8, q2
    vmlal.s16       q9, d4, d4
    vmlal.s16       q10, d5, d5
    vpadal.s16      q8, q3
    vmlal.s16       q9, d6, d6
    vmlal.s16       q10, d7, d7

    bne             vp8_filt_fpo16x16s_4_0_loop_neon

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    vmull.s32       q5, d0, d0                  ;d10 = sum * sum
    vst1.32         {d1[0]}, [lr]               ;store sse
    vshr.u32        d10, d10, #8                ;unsigned: sum*sum can exceed 2^31 (|sum| <= 255*256)
    vsub.u32        d0, d1, d10                 ;sse - ((sum * sum) >> 8)

    vmov.32         r0, d0[0]                   ;return
    vpop            {d8-d15}
    pop             {pc}
    ENDP
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;================================================
;unsigned int vp8_variance_halfpixvar16x16_v_neon
;(
;    unsigned char  *src_ptr, r0
;    int  src_pixels_per_line,  r1
;    unsigned char *dst_ptr,  r2
;    int dst_pixels_per_line,   r3
;    unsigned int *sse          stack
;);
;
; Vertical half-pixel case. Each of the 16 output rows is the rounded
; average of two consecutive source rows (17 source rows are read) and is
; compared against the matching dst row.
;   sum = sum of the 256 differences, sse = sum of the 256 squared differences
;   *sse = sse,  return value (r0) = sse - ((sum * sum) >> 8)
;
; Clobbers: r0, r2, r12, flags, q0-q3, q8-q15.
; q4-q7 (d8-d15) are used as scratch but are saved and restored, because
; AAPCS requires a callee to preserve them.
;================================================
|vp8_variance_halfpixvar16x16_v_neon| PROC
    push            {lr}
    vpush           {d8-d15}                    ;callee-saved NEON registers (q4-q7)

    mov             r12, #4                     ;loop counter: 4 iterations x 4 rows

    vld1.u8         {q0}, [r0], r1              ;load src data (first row)
    ldr             lr, [sp, #68]               ;load *sse from stack (4 bytes lr + 64 bytes d8-d15)

    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

;On entry to each iteration q0 holds the last source row already loaded
vp8_filt_spo16x16s_0_4_loop_neon
    vld1.u8         {q2}, [r0], r1              ;next four src rows: q2, q4, q6, q15
    vld1.8          {q1}, [r2], r3              ;four dst rows: q1, q3, q5, q7
    vld1.u8         {q4}, [r0], r1
    vld1.8          {q3}, [r2], r3
    vld1.u8         {q6}, [r0], r1
    vld1.8          {q5}, [r2], r3
    vld1.u8         {q15}, [r0], r1

    vrhadd.u8       q0, q0, q2                  ;rounded average of vertically adjacent rows
    vld1.8          {q7}, [r2], r3
    vrhadd.u8       q2, q2, q4
    vrhadd.u8       q4, q4, q6
    vrhadd.u8       q6, q6, q15

    vsubl.u8        q11, d0, d2                 ;diff, widened to 16 bits
    vsubl.u8        q12, d1, d3
    vsubl.u8        q13, d4, d6
    vsubl.u8        q14, d5, d7
    vsubl.u8        q0, d8, d10                 ;q0-q3 are reused once their bytes are consumed
    vsubl.u8        q1, d9, d11
    vsubl.u8        q2, d12, d14
    vsubl.u8        q3, d13, d15

    vpadal.s16      q8, q11                     ;sum
    vmlal.s16       q9, d22, d22                ;sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;flags are consumed by the bne at the loop end

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    vpadal.s16      q8, q0                      ;sum
    vmlal.s16       q9, d0, d0                  ;sse
    vmlal.s16       q10, d1, d1
    vpadal.s16      q8, q1
    vmlal.s16       q9, d2, d2
    vmlal.s16       q10, d3, d3
    vpadal.s16      q8, q2
    vmlal.s16       q9, d4, d4
    vmlal.s16       q10, d5, d5

    vmov            q0, q15                     ;carry the last src row into the next iteration

    vpadal.s16      q8, q3
    vmlal.s16       q9, d6, d6
    vmlal.s16       q10, d7, d7

    bne             vp8_filt_spo16x16s_0_4_loop_neon

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    vmull.s32       q5, d0, d0                  ;d10 = sum * sum
    vst1.32         {d1[0]}, [lr]               ;store sse
    vshr.u32        d10, d10, #8                ;unsigned: sum*sum can exceed 2^31 (|sum| <= 255*256)
    vsub.u32        d0, d1, d10                 ;sse - ((sum * sum) >> 8)

    vmov.32         r0, d0[0]                   ;return
    vpop            {d8-d15}
    pop             {pc}
    ENDP
21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;================================================
;unsigned int vp8_variance_halfpixvar16x16_hv_neon
;(
;    unsigned char  *src_ptr, r0
;    int  src_pixels_per_line,  r1
;    unsigned char *dst_ptr,  r2
;    int dst_pixels_per_line,   r3
;    unsigned int *sse          stack
;);
;
; Horizontal + vertical half-pixel case. Every source row is first averaged
; (with rounding) with itself shifted by one pixel; each of the 16 output
; rows is then the rounded average of two consecutive filtered rows
; (17 source rows are read) and is compared against the matching dst row.
;   sum = sum of the 256 differences, sse = sum of the 256 squared differences
;   *sse = sse,  return value (r0) = sse - ((sum * sum) >> 8)
; 32 bytes are loaded from every source row although only the first 17
; are used.
;
; Clobbers: r0, r2, r12, flags, q0-q3, q8-q15.
; q4-q7 (d8-d15) are used as scratch but are saved and restored, because
; AAPCS requires a callee to preserve them.
;================================================
|vp8_variance_halfpixvar16x16_hv_neon| PROC
    push            {lr}
    vpush           {d8-d15}                    ;callee-saved NEON registers (q4-q7)

    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data (first row)

    ldr             lr, [sp, #68]               ;load *sse from stack (4 bytes lr + 64 bytes d8-d15)
    vmov.i8         q13, #0                     ;q13 - sum
    vext.8          q1, q0, q1, #1              ;construct src_ptr[1]

    vmov.i8         q14, #0                     ;q14, q15 - sse
    vmov.i8         q15, #0

    mov             r12, #4                     ;loop counter: 4 iterations x 4 rows
    vrhadd.u8       q0, q0, q1                  ;(src_ptr[0]+src_ptr[1])/round/shift right 1

;First Pass: output_height lines x output_width columns (17x16)
;On entry to each iteration q0 holds the last horizontally filtered row
vp8_filt16x16s_4_4_loop_neon
    vld1.u8         {d4, d5, d6, d7}, [r0], r1
    vld1.u8         {d8, d9, d10, d11}, [r0], r1
    vld1.u8         {d12, d13, d14, d15}, [r0], r1
    vld1.u8         {d16, d17, d18, d19}, [r0], r1

    ;pld                [r0]
    ;pld                [r0, r1]
    ;pld                [r0, r1, lsl #1]

    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
    vext.8          q5, q4, q5, #1
    vext.8          q7, q6, q7, #1
    vext.8          q9, q8, q9, #1

    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
    vrhadd.u8       q2, q4, q5
    vrhadd.u8       q3, q6, q7
    vrhadd.u8       q4, q8, q9

    vld1.8          {q5}, [r2], r3          ;four dst rows: q5, q6, q7, q8
    vrhadd.u8       q0, q0, q1              ;rounded average of vertically adjacent filtered rows
    vld1.8          {q6}, [r2], r3
    vrhadd.u8       q1, q1, q2
    vld1.8          {q7}, [r2], r3
    vrhadd.u8       q2, q2, q3
    vld1.8          {q8}, [r2], r3
    vrhadd.u8       q3, q3, q4

    vsubl.u8        q9, d0, d10                 ;diff, widened to 16 bits
    vsubl.u8        q10, d1, d11
    vsubl.u8        q11, d2, d12
    vsubl.u8        q12, d3, d13

    vsubl.u8        q0, d4, d14                 ;diff (registers reused once their bytes are consumed)
    vsubl.u8        q1, d5, d15
    vsubl.u8        q5, d6, d16
    vsubl.u8        q6, d7, d17

    vpadal.s16      q13, q9                     ;sum
    vmlal.s16       q14, d18, d18               ;sse
    vmlal.s16       q15, d19, d19

    vpadal.s16      q13, q10                    ;sum
    vmlal.s16       q14, d20, d20               ;sse
    vmlal.s16       q15, d21, d21

    vpadal.s16      q13, q11                    ;sum
    vmlal.s16       q14, d22, d22               ;sse
    vmlal.s16       q15, d23, d23

    vpadal.s16      q13, q12                    ;sum
    vmlal.s16       q14, d24, d24               ;sse
    vmlal.s16       q15, d25, d25

    subs            r12, r12, #1                ;flags are consumed by the bne at the loop end

    vpadal.s16      q13, q0                     ;sum
    vmlal.s16       q14, d0, d0                 ;sse
    vmlal.s16       q15, d1, d1

    vpadal.s16      q13, q1                     ;sum
    vmlal.s16       q14, d2, d2                 ;sse
    vmlal.s16       q15, d3, d3

    vpadal.s16      q13, q5                     ;sum
    vmlal.s16       q14, d10, d10               ;sse
    vmlal.s16       q15, d11, d11

    vmov            q0, q4                      ;carry the last filtered row into the next iteration

    vpadal.s16      q13, q6                     ;sum
    vmlal.s16       q14, d12, d12               ;sse
    vmlal.s16       q15, d13, d13

    bne             vp8_filt16x16s_4_4_loop_neon

    vadd.u32        q15, q14, q15               ;accumulate sse
    vpaddl.s32      q0, q13                     ;accumulate sum

    vpaddl.u32      q1, q15
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    vmull.s32       q5, d0, d0                  ;d10 = sum * sum
    vst1.32         {d1[0]}, [lr]               ;store sse
    vshr.u32        d10, d10, #8                ;unsigned: sum*sum can exceed 2^31 (|sum| <= 255*256)
    vsub.u32        d0, d1, d10                 ;sse - ((sum * sum) >> 8)

    vmov.32         r0, d0[0]                   ;return
    vpop            {d8-d15}
    pop             {pc}
    ENDP
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;==============================
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r0    unsigned char  *src_ptr,
33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r1    int  src_pixels_per_line,
34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r2    int  xoffset,
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r3    int  yoffset,
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack unsigned char *dst_ptr,
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack int dst_pixels_per_line,
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack unsigned int *sse
;note: in vp8_find_best_half_pixel_step() (called when 8<Speed<15) and in the first call of
;vp8_find_best_sub_pixel_step() (called when Speed<=8), xoffset/yoffset can only be 4 or 0, which means
;the filter is either bypassed or uses the coefficients {64, 64}. This simplified routine only works in that situation.
;note: both xoffset and yoffset may be zero. That case can be handled in C code later.
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;Half-pixel 16x16 sub-pixel variance.
;Register usage (ARM mode):
;   r0  = src_ptr               (advanced by r1 after every row load)
;   r1  = src_pixels_per_line
;   r2  = xoffset on entry; only tested against zero, then reused as a loop counter
;   r3  = yoffset on entry; only tested against zero, then reused as the pointer
;         into the 256-byte stack buffer that holds the 16x16 filtered block
;   r4  = dst_ptr, r12 = dst_pixels_per_line, lr = sse (all loaded from the stack)
;Result:
;   *sse = sum of squared differences between the filtered block and dst
;   r0   = *sse - ((sum * sum) >> 8), where sum is the sum of the differences
;Branching:
;   xoffset == 0                -> vertical (second pass) averaging only; yoffset is
;                                  not examined on this path
;   xoffset != 0, yoffset == 0  -> horizontal (first pass) averaging only
;   xoffset != 0, yoffset != 0  -> horizontal then vertical averaging
;Each source row is read with a 32-byte load in the horizontal paths so that
;src_ptr[1..16] can be built with vext.
|vp8_sub_pixel_variance16x16s_neon| PROC
    push            {r4, lr}
    vpush           {d8-d15}                ;q4-q7 are callee-saved and are overwritten below

    ;incoming stack arguments now sit above 8 bytes of core registers
    ;plus 64 bytes of NEON registers
    ldr             r4, [sp, #72]           ;load *dst_ptr from stack
    ldr             r12, [sp, #76]          ;load dst_pixels_per_line from stack
    ldr             lr, [sp, #80]           ;load *sse from stack

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             secondpass_bfilter16x16s_only

    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    beq             firstpass_bfilter16x16s_only

    ;prime the pipeline: q0 = horizontally averaged row 0
    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
    sub             sp, sp, #256            ;reserve space on stack for temporary storage
    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
    mov             r3, sp                  ;r3 = write pointer into the temporary block
    mov             r2, #4                  ;loop counter (4 rows per iteration)
    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1

;First Pass: output_height lines x output_width columns (17x16)
;Invariant at loop entry: q0 = horizontally averaged previous row.
vp8e_filt_blk2d_fp16x16s_loop_neon
    vld1.u8         {d4, d5, d6, d7}, [r0], r1
    vld1.u8         {d8, d9, d10, d11}, [r0], r1
    vld1.u8         {d12, d13, d14, d15}, [r0], r1
    vld1.u8         {d16, d17, d18, d19}, [r0], r1

    ;pld                [r0]
    ;pld                [r0, r1]
    ;pld                [r0, r1, lsl #1]

    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
    vext.8          q5, q4, q5, #1
    vext.8          q7, q6, q7, #1
    vext.8          q9, q8, q9, #1

    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
    vrhadd.u8       q2, q4, q5
    vrhadd.u8       q3, q6, q7
    vrhadd.u8       q4, q8, q9

    ;second pass: rounded average of each row with the row below it
    vrhadd.u8       q0, q0, q1
    vrhadd.u8       q1, q1, q2
    vrhadd.u8       q2, q2, q3
    vrhadd.u8       q3, q3, q4

    subs            r2, r2, #1
    vst1.u8         {d0, d1 ,d2, d3}, [r3]!         ;store result
    vmov            q0, q4                  ;carry the last filtered row into the next iteration
    vst1.u8         {d4, d5, d6, d7}, [r3]!

    bne             vp8e_filt_blk2d_fp16x16s_loop_neon

    b               sub_pixel_variance16x16s_neon

;--------------------
;Horizontal averaging only (xoffset != 0, yoffset == 0).
firstpass_bfilter16x16s_only
    mov             r2, #2                  ;loop counter (8 rows per iteration)
    sub             sp, sp, #256            ;reserve space on stack for temporary storage
    mov             r3, sp                  ;r3 = write pointer into the temporary block

;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16s_loop_neon
    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
    vld1.u8         {d4, d5, d6, d7}, [r0], r1
    vld1.u8         {d8, d9, d10, d11}, [r0], r1
    vld1.u8         {d12, d13, d14, d15}, [r0], r1

    ;pld                [r0]
    ;pld                [r0, r1]
    ;pld                [r0, r1, lsl #1]

    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
    vld1.u8         {d16, d17, d18, d19}, [r0], r1
    vext.8          q3, q2, q3, #1
    vld1.u8         {d20, d21, d22, d23}, [r0], r1
    vext.8          q5, q4, q5, #1
    vld1.u8         {d24, d25, d26, d27}, [r0], r1
    vext.8          q7, q6, q7, #1
    vld1.u8         {d28, d29, d30, d31}, [r0], r1
    vext.8          q9, q8, q9, #1
    vext.8          q11, q10, q11, #1
    vext.8          q13, q12, q13, #1
    vext.8          q15, q14, q15, #1

    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
    vrhadd.u8       q1, q2, q3
    vrhadd.u8       q2, q4, q5
    vrhadd.u8       q3, q6, q7
    vrhadd.u8       q4, q8, q9
    vrhadd.u8       q5, q10, q11
    vrhadd.u8       q6, q12, q13
    vrhadd.u8       q7, q14, q15

    subs            r2, r2, #1

    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
    vst1.u8         {d4, d5, d6, d7}, [r3]!
    vst1.u8         {d8, d9, d10, d11}, [r3]!
    vst1.u8         {d12, d13, d14, d15}, [r3]!

    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon

    b               sub_pixel_variance16x16s_neon

;---------------------
;Vertical averaging only (taken whenever xoffset == 0).
secondpass_bfilter16x16s_only
    sub             sp, sp, #256            ;reserve space on stack for temporary storage

    mov             r2, #2                  ;loop counter (8 rows per iteration)
    vld1.u8         {d0, d1}, [r0], r1      ;load src data (row 0)
    mov             r3, sp                  ;r3 = write pointer into the temporary block

;Invariant at loop entry: q0 = previous source row.
vp8e_filt_blk2d_spo16x16s_loop_neon
    vld1.u8         {d2, d3}, [r0], r1
    vld1.u8         {d4, d5}, [r0], r1
    vld1.u8         {d6, d7}, [r0], r1
    vld1.u8         {d8, d9}, [r0], r1

    vrhadd.u8       q0, q0, q1              ;rounded average of each row with the row below it
    vld1.u8         {d10, d11}, [r0], r1
    vrhadd.u8       q1, q1, q2
    vld1.u8         {d12, d13}, [r0], r1
    vrhadd.u8       q2, q2, q3
    vld1.u8         {d14, d15}, [r0], r1
    vrhadd.u8       q3, q3, q4
    vld1.u8         {d16, d17}, [r0], r1
    vrhadd.u8       q4, q4, q5
    vrhadd.u8       q5, q5, q6
    vrhadd.u8       q6, q6, q7
    vrhadd.u8       q7, q7, q8

    subs            r2, r2, #1

    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
    vmov            q0, q8                  ;carry the last source row into the next iteration
    vst1.u8         {d4, d5, d6, d7}, [r3]!
    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
    vst1.u8         {d12, d13, d14, d15}, [r3]!

    bne             vp8e_filt_blk2d_spo16x16s_loop_neon

    b               sub_pixel_variance16x16s_neon

;----------------------------
;variance16x16
;All three filter paths arrive here with r3 pointing 256 bytes past the start
;of the temporary block on the stack.
sub_pixel_variance16x16s_neon
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    sub             r3, r3, #256                ;rewind to the start of the filtered block
    mov             r2, #4                      ;loop counter (4 rows per iteration)

sub_pixel_variance16x16s_neon_loop
    vld1.8          {q0}, [r3]!                 ;Load up source and reference
    vld1.8          {q1}, [r4], r12
    vld1.8          {q2}, [r3]!
    vld1.8          {q3}, [r4], r12
    vld1.8          {q4}, [r3]!
    vld1.8          {q5}, [r4], r12
    vld1.8          {q6}, [r3]!
    vld1.8          {q7}, [r4], r12

    vsubl.u8        q11, d0, d2                 ;diff (widened to 16 bits)
    vsubl.u8        q12, d1, d3
    vsubl.u8        q13, d4, d6
    vsubl.u8        q14, d5, d7
    vsubl.u8        q0, d8, d10
    vsubl.u8        q1, d9, d11
    vsubl.u8        q2, d12, d14
    vsubl.u8        q3, d13, d15

    vpadal.s16      q8, q11                     ;sum
    vmlal.s16       q9, d22, d22                ;sse
    vmlal.s16       q10, d23, d23

    subs            r2, r2, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    vpadal.s16      q8, q0                      ;sum
    vmlal.s16       q9, d0, d0                  ;sse
    vmlal.s16       q10, d1, d1
    vpadal.s16      q8, q1
    vmlal.s16       q9, d2, d2
    vmlal.s16       q10, d3, d3
    vpadal.s16      q8, q2
    vmlal.s16       q9, d4, d4
    vmlal.s16       q10, d5, d5
    vpadal.s16      q8, q3
    vmlal.s16       q9, d6, d6
    vmlal.s16       q10, d7, d7

    bne             sub_pixel_variance16x16s_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = sum
    vadd.u64        d1, d2, d3                  ;d1 = sse

    vmull.s32       q5, d0, d0                  ;d10 = sum * sum
    vst1.32         {d1[0]}, [lr]               ;store sse
    ;|sum| can reach 255*256 = 65280, so sum*sum can reach 0xFE010000: it fits in
    ;32 bits only when treated as unsigned, hence the logical (not arithmetic) shift.
    vshr.u32        d10, d10, #8
    vsub.u32        d0, d1, d10                 ;sse - ((sum * sum) >> 8)

    add             sp, sp, #256                ;release the temporary block
    vmov.32         r0, d0[0]                   ;return

    vpop            {d8-d15}
    pop             {r4, pc}
    ENDP
57190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    END
573