190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; 2f71323e297a928af368937089d3ed71239786f86Andreas Huber; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; 4f71323e297a928af368937089d3ed71239786f86Andreas Huber; Use of this source code is governed by a BSD-style license 5f71323e297a928af368937089d3ed71239786f86Andreas Huber; that can be found in the LICENSE file in the root of the source 6f71323e297a928af368937089d3ed71239786f86Andreas Huber; tree. An additional intellectual property rights grant can be found 7f71323e297a928af368937089d3ed71239786f86Andreas Huber; in the file PATENTS. All contributing project authors may 8f71323e297a928af368937089d3ed71239786f86Andreas Huber; be found in the AUTHORS file in the root of the source tree. 990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; 1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber EXPORT |vp8_sixtap_predict16x16_neon| 1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ARM 1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber REQUIRE8 1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber PRESERVE8 1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber AREA ||.text||, CODE, READONLY, ALIGN=2 1890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r0 unsigned char *src_ptr, 1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r1 int src_pixels_per_line, 2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r2 int xoffset, 2190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r3 int yoffset, 2290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; r4 unsigned char *dst_ptr, 2390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; stack(r5) int dst_pitch 2490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
2590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Note: To take advantage of the 8-bit multiplication instructions in NEON, first apply abs() to 2690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; filter coeffs to make them u8. Then, use vmlsl for negative coeffs. After multiplication, 2790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; the result can be negative. So, I treat the result as s16. But, since it is also possible 2890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; that the result can be a large positive number (> 2^15-1), which could be confused as a 2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; negative number. To avoid that error, apply filter coeffs in the order of 0, 1, 4, 5, 2, 3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; which ensures that the result stays in s16 range. Finally, saturating-add the result of 3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; applying the 3rd filter coeff. The same applies to other filter functions. 3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber|vp8_sixtap_predict16x16_neon| PROC 3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push {r4-r5, lr} 3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ldr r12, _filter16_coeff_ 3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ldr r4, [sp, #12] ;load parameters from stack 3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ldr r5, [sp, #16] ;load parameters from stack 3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber cmp r2, #0 ;skip first_pass filter if xoffset=0 4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber beq secondpass_filter16x16_only 4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add r2, r12, r2, lsl #5 ;calculate filter location 
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber cmp r3, #0 ;skip second_pass filter if yoffset=0 4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.s32 {q14, q15}, [r2] ;load first_pass filter 4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber beq firstpass_filter16x16_only 5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub sp, sp, #336 ;reserve space on stack for temporary storage 5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov lr, sp 5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vabs.s32 q12, q14 5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vabs.s32 q13, q15 5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov r2, #7 ;loop counter 5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) 5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r0, r0, r1, lsl #1 6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d0, d24[0] ;first_pass filter (d0-d5) 6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d1, d24[4] 6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d2, d25[0] 6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d3, d25[4] 6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d4, d26[0] 6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d5, d26[4] 6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;First Pass: output_height lines x output_width columns (21x16) 6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huberfilt_blk2d_fp16x16_loop_neon 7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data 7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d9, d10, d11}, [r0], r1 7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d12, d13, d14}, [r0], r1 7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pld [r0] 7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pld [r0, r1] 7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pld [r0, r1, lsl #1] 7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) 7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q9, d7, d0 8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q10, d9, d0 8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q11, d10, d0 8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q12, d12, d0 8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q13, d13, d0 8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] 8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d9, d10, #1 8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d12, d13, #1 8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) 9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q10, d29, d1 9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q12, d30, d1 9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d7, d8, #1 9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d10, d11, #1 
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d13, d14, #1 9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) 9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q11, d29, d1 9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q13, d30, d1 10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d6, d7, #4 ;construct src_ptr[2] 10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d9, d10, #4 10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d12, d13, #4 10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) 10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q10, d29, d4 10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q12, d30, d4 10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d7, d8, #4 11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d10, d11, #4 11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d13, d14, #4 11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) 11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q11, d29, d4 11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q13, d30, d4 11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d6, d7, #5 ;construct src_ptr[3] 11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d9, d10, #5 11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d12, d13, #5 
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5]) 12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q10, d29, d5 12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q12, d30, d5 12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d7, d8, #5 12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d10, d11, #5 12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d13, d14, #5 12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5]) 13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q11, d29, d5 13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q13, d30, d5 13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d6, d7, #2 ;construct src_ptr[0] 13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d9, d10, #2 13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d12, d13, #2 13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2]) 13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q10, d29, d2 13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q12, d30, d2 14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d7, d8, #2 14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d10, d11, #2 14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d13, d14, #2 14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2]) 14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q11, d29, d2 14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q13, d30, d2 14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d6, d7, #3 ;construct src_ptr[1] 15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d9, d10, #3 15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d30, d12, d13, #3 15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d15, d7, d8, #3 15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d31, d10, d11, #3 15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d6, d13, d14, #3 15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3]) 15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q5, d29, d3 15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q6, d30, d3 16090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) 16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q10, q5 16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q12, q6 16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3]) 16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q7, d31, d3 16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q3, d6, d3 16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber subs r2, r2, #1 
17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q9, q6 17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q11, q7 17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q13, q3 17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d7, q9, #7 17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d8, q10, #7 17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d9, q11, #7 17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d10, q12, #7 18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d11, q13, #7 18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d6, d7, d8}, [lr]! ;store result 18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d9, d10, d11}, [lr]! 
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber bne filt_blk2d_fp16x16_loop_neon 18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Second pass: 16x16 18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;secondpass_filter - do first 8-columns and then second 8-columns 18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add r3, r12, r3, lsl #5 19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub lr, lr, #336 19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.s32 {q5, q6}, [r3] ;load second_pass filter 19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov r3, #2 ;loop counter 19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vabs.s32 q7, q5 19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vabs.s32 q8, q6 19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov r2, #16 19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) 20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d1, d14[4] 20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d2, d15[0] 20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d3, d15[4] 20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d4, d16[0] 20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d5, d16[4] 20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_sp16x16_outloop_neon 20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d18}, [lr], r2 ;load src data 20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d19}, [lr], r2 
21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d20}, [lr], r2 21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d21}, [lr], r2 21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov r12, #4 ;loop counter 21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d22}, [lr], r2 21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_inner_loop_neon 21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d23}, [lr], r2 ;load src data 21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d24}, [lr], r2 21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d25}, [lr], r2 21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d26}, [lr], r2 22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) 22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q4, d19, d0 22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q5, d20, d0 22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q6, d21, d0 22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) 22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q4, d20, d1 22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q5, d21, d1 22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q6, d22, d1 23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) 23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q4, d23, d4 23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q5, d24, d4 23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q6, 
d25, d4 23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) 23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q4, d21, d2 23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q5, d22, d2 23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q6, d23, d2 24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) 24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q4, d24, d5 24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q5, d25, d5 24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q6, d26, d5 24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) 24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q8, d22, d3 24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q9, d23, d3 24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q10, d24, d3 25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber subs r12, r12, #1 25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) 25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q8, q4 25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q9, q5 25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q10, q6 25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d7, q8, #7 
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d8, q9, #7 26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d9, q10, #7 26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d6}, [r4], r5 ;store result 26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmov q9, q11 26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d7}, [r4], r5 26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmov q10, q12 26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d8}, [r4], r5 26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmov d22, d26 26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d9}, [r4], r5 27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber bne secondpass_inner_loop_neon 27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber subs r3, r3, #1 27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub lr, lr, #336 27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add lr, lr, #8 27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r4, r4, r5, lsl #4 27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add r4, r4, #8 27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber bne filt_blk2d_sp16x16_outloop_neon 28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add sp, sp, #336 28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop {r4-r5,pc} 28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;-------------------- 28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfirstpass_filter16x16_only 
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vabs.s32 q12, q14 28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vabs.s32 q13, q15 28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov r2, #8 ;loop counter 29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r0, r0, #2 ;move srcptr back to (column-2) 29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d0, d24[0] ;first_pass filter (d0-d5) 29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d1, d24[4] 29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d2, d25[0] 29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d3, d25[4] 29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d4, d26[0] 29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d5, d26[4] 29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;First Pass: output_height lines x output_width columns (16x16) 30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_fpo16x16_loop_neon 30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data 30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d9, d10, d11}, [r0], r1 30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pld [r0] 30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pld [r0, r1] 30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) 30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q7, d7, d0 31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q8, d9, d0 31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q9, d10, d0 
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] 31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d21, d9, d10, #1 31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d22, d7, d8, #1 31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d23, d10, d11, #1 31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d24, d6, d7, #4 ;construct src_ptr[2] 31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d25, d9, d10, #4 31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d26, d7, d8, #4 32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d27, d10, d11, #4 32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d6, d7, #5 ;construct src_ptr[3] 32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d9, d10, #5 32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1]) 32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q8, d21, d1 32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1]) 32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q9, d23, d1 32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4]) 32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q8, d25, d4 33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) 33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q9, d27, d4 33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5]) 33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q8, d29, d5 33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d20, d7, d8, #5 33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d21, d10, d11, #5 33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d22, d6, d7, #2 ;construct src_ptr[0] 33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d23, d9, d10, #2 33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d24, d7, d8, #2 34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d25, d10, d11, #2 34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d26, d6, d7, #3 ;construct src_ptr[1] 34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d27, d9, d10, #3 34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d28, d7, d8, #3 34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vext.8 d29, d10, d11, #3 34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5]) 34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q9, d21, d5 34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2]) 35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q8, d23, d2 35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2]) 35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q9, d25, d2 35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3]) 35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q11, d27, d3 35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3]) 35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q15, d29, d3 
35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) 36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q8, q11 36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q7, q12 36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q9, q15 36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber subs r2, r2, #1 36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d7, q7, #7 36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d8, q8, #7 36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d9, q9, #7 37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {q3}, [r4], r5 ;store result 37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {q4}, [r4], r5 37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber bne filt_blk2d_fpo16x16_loop_neon 37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop {r4-r5,pc} 37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;-------------------- 37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_filter16x16_only 38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;Second pass: 16x16 38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add r3, r12, r3, lsl #5 38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r0, r0, r1, lsl #1 38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.s32 {q5, q6}, [r3] ;load second_pass filter 38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov r3, #2 ;loop counter 38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vabs.s32 q7, q5 38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vabs.s32 q8, q6 38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) 39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d1, d14[4] 39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d2, d15[0] 39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d3, d15[4] 39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d4, d16[0] 39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vdup.8 d5, d16[4] 39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilt_blk2d_spo16x16_outloop_neon 39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d18}, [r0], r1 ;load src data 39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d19}, [r0], r1 40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d20}, [r0], r1 40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d21}, [r0], r1 40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov r12, #4 ;loop counter 40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d22}, [r0], r1 40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecondpass_only_inner_loop_neon 40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d23}, [r0], r1 ;load src data 40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d24}, [r0], r1 40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d25}, [r0], r1 
40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vld1.u8 {d26}, [r0], r1 41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) 41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q4, d19, d0 41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q5, d20, d0 41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q6, d21, d0 41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) 41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q4, d20, d1 41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q5, d21, d1 41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q6, d22, d1 42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) 42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q4, d23, d4 42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q5, d24, d4 42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlsl.u8 q6, d25, d4 42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) 42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q4, d21, d2 42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q5, d22, d2 42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q6, d23, d2 43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) 43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q4, d24, d5 43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q5, d25, 
d5 43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmlal.u8 q6, d26, d5 43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) 43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q8, d22, d3 43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q9, d23, d3 43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmull.u8 q10, d24, d3 44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber subs r12, r12, #1 44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) 44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q8, q4 44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q9, q5 44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqadd.s16 q10, q6 44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d7, q8, #7 45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d8, q9, #7 45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vqrshrun.s16 d9, q10, #7 45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d6}, [r4], r5 ;store result 45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmov q9, q11 45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d7}, [r4], r5 45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmov q10, q12 45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vst1.u8 {d8}, [r4], r5 45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber vmov d22, d26 45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
vst1.u8 {d9}, [r4], r5
46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber bne secondpass_only_inner_loop_neon
46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; The flags set by this subs are the ones tested by the bne to
; filt_blk2d_spo16x16_outloop_neon below; the sub/add instructions in between
; have no 's' suffix and therefore leave the flags untouched.
46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber subs r3, r3, #1
; Rewind the source pointer by 16 + 4 + 1 = 21 rows (5 rows loaded before the
; inner loop plus 4 rows in each of its 4 iterations), then step 8 bytes right
; to the next 8-pixel-wide column.
46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r0, r0, r1, lsl #4
46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r0, r0, r1, lsl #2
46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r0, r0, r1
46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add r0, r0, #8
46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; Rewind the destination pointer by the 16 rows just stored (4 stores in each
; of the 4 inner iterations) and step 8 bytes right as well.
46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub r4, r4, r5, lsl #4
47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add r4, r4, #8
47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber bne filt_blk2d_spo16x16_outloop_neon
47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop {r4-r5,pc}
47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ENDP
47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;-----------------
47979f15823c34ae1e423108295e416213200bb280fAndreas Huber
; _filter16_coeff_ is a single word holding the address of filter16_coeff.
; The function prologue reads it with "ldr r12, _filter16_coeff_" to obtain
; the base address of the table.
48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber_filter16_coeff_
48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD filter16_coeff
; filter16_coeff: six-tap filter coefficients, one row per offset value.
;   - Each row is eight 32-bit words (32 bytes): taps 0..5 followed by two
;     zero padding words. The code selects a row with
;     "add rX, r12, <offset>, lsl #5" and loads all eight words at once with
;     "vld1.s32 {q5, q6}", so the 32-byte row stride must not change.
;   - Values are stored signed. The code takes vabs of the row, broadcasts the
;     low byte of each word with vdup.8, and applies taps 1 and 4 with vmlsl
;     (subtract) and the other taps with vmull/vmlal (add). That is only
;     correct while taps 1 and 4 are <= 0, every other tap is >= 0, and each
;     magnitude fits in 8 bits -- all of which hold for the rows below.
;   - The taps of every row sum to 128, matching the rounding right shift by 7
;     (vqrshrun.s16 ..., #7) applied to the accumulated result.
;   - Row 0 has only the centre tap (128), i.e. it passes the source through.
48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilter16_coeff
48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD 0, 0, 128, 0, 0, 0, 0, 0
48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD 0, -6, 123, 12, -1, 0, 0, 0
48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD 2, -11, 108, 36, -8, 1, 0, 0
48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD 0, -9, 93, 50, -6, 0, 0, 0
48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD 3, -16, 77, 77, -16, 3, 0, 0
48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD 0, -6, 50, 93, -9, 0, 0, 0
48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD 1, -8, 36, 108, -11, 2, 0, 0
49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber DCD 0, -1, 12, 123, -6, 0, 0, 0
49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber END
493