;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sixtap_predict16x16_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; Six-tap filter coefficient table, one row per sub-pixel offset.
; Every row is six signed 32-bit taps followed by two zero words, i.e.
; 32 bytes, so a row is addressed as (table + (offset << 5)) and fetched
; with a single two-q-register load. The taps of each row sum to 128;
; the filtered sums are brought back to pixel range with a rounding
; right shift by 7.
filter16_coeff
    DCD     0,   0, 128,   0,   0,   0,     0, 0    ; row 0 (centre tap only)
    DCD     0,  -6, 123,  12,  -1,   0,     0, 0    ; row 1
    DCD     2, -11, 108,  36,  -8,   1,     0, 0    ; row 2
    DCD     0,  -9,  93,  50,  -6,   0,     0, 0    ; row 3
    DCD     3, -16,  77,  77, -16,   3,     0, 0    ; row 4
    DCD     0,  -6,  50,  93,  -9,   0,     0, 0    ; row 5
    DCD     1,  -8,  36, 108, -11,   2,     0, 0    ; row 6
    DCD     0,  -1,  12, 123,  -6,   0,     0, 0    ; row 7

; Arguments of vp8_sixtap_predict16x16_neon:
; r0        unsigned char  *src_ptr,
; r1        int  src_pixels_per_line,
; r2        int  xoffset,
; r3        int  yoffset,
; stack     unsigned char *dst_ptr,     (loaded into r4 by the routine)
; stack     int  dst_pitch              (loaded into r5 by the routine)

;Note: To take advantage of the 8-bit multiplication instructions in NEON, abs() is
; first applied to the filter coeffs to make them u8, and vmlsl is then used for the
; negative coeffs. After multiplication the result can be negative, so it is treated
; as s16. However, it is also possible for the result to be a large positive number
; (> 2^15-1), which could be mistaken for a negative number. To avoid that error, the
; filter coeffs are applied in the order 0, 1, 4, 5, 2, which ensures that the result
; stays in s16 range. Finally, the product with the 3rd filter coeff is added with a
; saturating add. The same applies to the other filter functions.
;-----------------------------------------------------------------------
; vp8_sixtap_predict16x16_neon
;   Six-tap sub-pixel prediction of a 16x16 block (ARM state, NEON).
;
; In:    r0      = src_ptr              (unsigned char *)
;        r1      = src_pixels_per_line  (source stride in bytes)
;        r2      = xoffset  (row index into filter16_coeff, horizontal)
;        r3      = yoffset  (row index into filter16_coeff, vertical)
;        [sp]    = dst_ptr              (unsigned char *), copied to r4
;        [sp,#4] = dst_pitch            (destination stride), copied to r5
;        xoffset/yoffset are presumably 0..7: filter16_coeff has eight
;        32-byte rows and no range check is made here.
;
; Paths: xoffset == 0               -> vertical filter only
;        xoffset != 0, yoffset == 0 -> horizontal filter only
;        otherwise                  -> horizontal pass into a 21x16 byte
;                                      scratch buffer on the stack, then
;                                      a vertical pass into dst
;
; Saved: r4, r5, lr and d8-d15. The routine uses q4-q7 (d8-d15) as
;        scratch on every path; those registers are callee-saved under
;        the AAPCS, so they are pushed on entry and popped on each exit.
; Clobb: r0-r3, r12, flags, q0-q3, q8-q15
; Stack: 12 bytes (core regs) + 64 bytes (d8-d15), plus 336 bytes of
;        scratch on the two-pass path.
;-----------------------------------------------------------------------
|vp8_sixtap_predict16x16_neon| PROC
    push            {r4-r5, lr}
    vpush           {d8-d15}                ;q4-q7 are callee-saved (AAPCS)

    adr             r12, filter16_coeff
    ldr             r4, [sp, #76]           ;dst_ptr:   12 (push) + 64 (vpush) bytes above sp
    ldr             r5, [sp, #80]           ;dst_pitch: next stack argument

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             secondpass_filter16x16_only

    add             r2, r12, r2, lsl #5     ;calculate filter location (32 bytes per row)

    cmp             r3, #0                  ;skip second_pass filter if yoffset=0

    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter (NEON load leaves flags intact)

    beq             firstpass_filter16x16_only

    sub             sp, sp, #336            ;reserve 21 rows x 16 bytes of temporary storage
    mov             lr, sp                  ;lr = write pointer into the temporary buffer

    vabs.s32        q12, q14                ;|coeff| so the taps fit in u8
    vabs.s32        q13, q15

    mov             r2, #7                  ;loop counter: 7 iterations x 3 rows = 21 rows
    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
    sub             r0, r0, r1, lsl #1

    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
    vdup.8          d1, d24[4]
    vdup.8          d2, d25[0]
    vdup.8          d3, d25[4]
    vdup.8          d4, d26[0]
    vdup.8          d5, d26[4]

;First Pass: output_height lines x output_width columns (21x16)
;Each iteration filters three source rows; q8/q9, q10/q11 and q12/q13
;hold the left/right 8-column halves of rows 0, 1 and 2 respectively.
filt_blk2d_fp16x16_loop_neon
    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data (24 bytes per row)
    vld1.u8         {d9, d10, d11}, [r0], r1
    vld1.u8         {d12, d13, d14}, [r0], r1

    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q9, d7, d0
    vmull.u8        q10, d9, d0
    vmull.u8        q11, d10, d0
    vmull.u8        q12, d12, d0
    vmull.u8        q13, d13, d0

    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
    vext.8          d29, d9, d10, #1
    vext.8          d30, d12, d13, #1

    vmlsl.u8        q8, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q10, d29, d1
    vmlsl.u8        q12, d30, d1

    vext.8          d28, d7, d8, #1
    vext.8          d29, d10, d11, #1
    vext.8          d30, d13, d14, #1

    vmlsl.u8        q9, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q11, d29, d1
    vmlsl.u8        q13, d30, d1

    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
    vext.8          d29, d9, d10, #4
    vext.8          d30, d12, d13, #4

    vmlsl.u8        q8, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q10, d29, d4
    vmlsl.u8        q12, d30, d4

    vext.8          d28, d7, d8, #4
    vext.8          d29, d10, d11, #4
    vext.8          d30, d13, d14, #4

    vmlsl.u8        q9, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q11, d29, d4
    vmlsl.u8        q13, d30, d4

    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d29, d9, d10, #5
    vext.8          d30, d12, d13, #5

    vmlal.u8        q8, d28, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q10, d29, d5
    vmlal.u8        q12, d30, d5

    vext.8          d28, d7, d8, #5
    vext.8          d29, d10, d11, #5
    vext.8          d30, d13, d14, #5

    vmlal.u8        q9, d28, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q11, d29, d5
    vmlal.u8        q13, d30, d5

    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
    vext.8          d29, d9, d10, #2
    vext.8          d30, d12, d13, #2

    vmlal.u8        q8, d28, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q10, d29, d2
    vmlal.u8        q12, d30, d2

    vext.8          d28, d7, d8, #2
    vext.8          d29, d10, d11, #2
    vext.8          d30, d13, d14, #2

    vmlal.u8        q9, d28, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q11, d29, d2
    vmlal.u8        q13, d30, d2

    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
    vext.8          d29, d9, d10, #3
    vext.8          d30, d12, d13, #3

    ;All remaining reads of the raw source rows happen here, because the
    ;products below are written over d8-d15 and d6/d7.
    vext.8          d15, d7, d8, #3
    vext.8          d31, d10, d11, #3
    vext.8          d6, d13, d14, #3

    vmull.u8        q4, d28, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q5, d29, d3
    vmull.u8        q6, d30, d3

    vqadd.s16       q8, q4                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q10, q5
    vqadd.s16       q12, q6

    vmull.u8        q6, d15, d3             ;(src_ptr[1] * vp8_filter[3]); d15 is read before q7 is overwritten
    vmull.u8        q7, d31, d3
    vmull.u8        q3, d6, d3

    subs            r2, r2, #1              ;flags survive the NEON ops up to the bne

    vqadd.s16       q9, q6
    vqadd.s16       q11, q7
    vqadd.s16       q13, q3

    vqrshrun.s16    d6, q8, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d7, q9, #7
    vqrshrun.s16    d8, q10, #7
    vqrshrun.s16    d9, q11, #7
    vqrshrun.s16    d10, q12, #7
    vqrshrun.s16    d11, q13, #7

    vst1.u8         {d6, d7, d8}, [lr]!     ;store result (3 rows x 16 bytes = 48 bytes)
    vst1.u8         {d9, d10, d11}, [lr]!

    bne             filt_blk2d_fp16x16_loop_neon

;Second pass: 16x16
;secondpass_filter - do first 8-columns and then second 8-columns
    add             r3, r12, r3, lsl #5     ;second_pass filter location
    sub             lr, lr, #336            ;rewind to the start of the temporary buffer

    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
    mov             r3, #2                  ;loop counter: two 8-column halves

    vabs.s32        q7, q5
    vabs.s32        q8, q6

    mov             r2, #16                 ;row stride of the temporary buffer

    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
    vdup.8          d1, d14[4]
    vdup.8          d2, d15[0]
    vdup.8          d3, d15[4]
    vdup.8          d4, d16[0]
    vdup.8          d5, d16[4]

filt_blk2d_sp16x16_outloop_neon
    vld1.u8         {d18}, [lr], r2         ;load src data (first 5 of the 9-row window)
    vld1.u8         {d19}, [lr], r2
    vld1.u8         {d20}, [lr], r2
    vld1.u8         {d21}, [lr], r2
    mov             r12, #4                 ;loop counter: 4 iterations x 4 output rows
    vld1.u8         {d22}, [lr], r2

secondpass_inner_loop_neon
    vld1.u8         {d23}, [lr], r2         ;load src data (4 new rows)
    vld1.u8         {d24}, [lr], r2
    vld1.u8         {d25}, [lr], r2
    vld1.u8         {d26}, [lr], r2

    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q4, d19, d0
    vmull.u8        q5, d20, d0
    vmull.u8        q6, d21, d0

    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q4, d20, d1
    vmlsl.u8        q5, d21, d1
    vmlsl.u8        q6, d22, d1

    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q4, d23, d4
    vmlsl.u8        q5, d24, d4
    vmlsl.u8        q6, d25, d4

    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q4, d21, d2
    vmlal.u8        q5, d22, d2
    vmlal.u8        q6, d23, d2

    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q4, d24, d5
    vmlal.u8        q5, d25, d5
    vmlal.u8        q6, d26, d5

    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q8, d22, d3
    vmull.u8        q9, d23, d3
    vmull.u8        q10, d24, d3

    subs            r12, r12, #1

    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q4
    vqadd.s16       q9, q5
    vqadd.s16       q10, q6

    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d7, q8, #7
    vqrshrun.s16    d8, q9, #7
    vqrshrun.s16    d9, q10, #7

    vst1.u8         {d6}, [r4], r5          ;store result
    vmov            q9, q11                 ;slide the row window down by four rows
    vst1.u8         {d7}, [r4], r5
    vmov            q10, q12
    vst1.u8         {d8}, [r4], r5
    vmov            d22, d26
    vst1.u8         {d9}, [r4], r5

    bne             secondpass_inner_loop_neon

    subs            r3, r3, #1
    sub             lr, lr, #336            ;back to the top of the buffer (21 rows consumed) ...
    add             lr, lr, #8              ;... then over to the second 8 columns

    sub             r4, r4, r5, lsl #4      ;dst back up 16 rows ...
    add             r4, r4, #8              ;... and over to the second 8 columns

    bne             filt_blk2d_sp16x16_outloop_neon

    add             sp, sp, #336            ;release the temporary buffer
    vpop            {d8-d15}
    pop             {r4-r5,pc}

;--------------------
;Horizontal filter only (yoffset == 0): results go straight to dst.
firstpass_filter16x16_only
    vabs.s32        q12, q14
    vabs.s32        q13, q15

    mov             r2, #8                  ;loop counter: 8 iterations x 2 rows
    sub             r0, r0, #2              ;move srcptr back to (column-2)

    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
    vdup.8          d1, d24[4]
    vdup.8          d2, d25[0]
    vdup.8          d3, d25[4]
    vdup.8          d4, d26[0]
    vdup.8          d5, d26[4]

;First Pass: output_height lines x output_width columns (16x16)
;q6/q7 hold the left/right halves of the first row, q8/q9 of the second.
filt_blk2d_fpo16x16_loop_neon
    vld1.u8         {d6, d7, d8}, [r0], r1      ;load src data
    vld1.u8         {d9, d10, d11}, [r0], r1

    pld             [r0]
    pld             [r0, r1]

    vmull.u8        q6, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q7, d7, d0
    vmull.u8        q8, d9, d0
    vmull.u8        q9, d10, d0

    vext.8          d20, d6, d7, #1         ;construct src_ptr[-1]
    vext.8          d21, d9, d10, #1
    vext.8          d22, d7, d8, #1
    vext.8          d23, d10, d11, #1
    vext.8          d24, d6, d7, #4         ;construct src_ptr[2]
    vext.8          d25, d9, d10, #4
    vext.8          d26, d7, d8, #4
    vext.8          d27, d10, d11, #4
    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d29, d9, d10, #5

    vmlsl.u8        q6, d20, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q8, d21, d1
    vmlsl.u8        q7, d22, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q9, d23, d1
    vmlsl.u8        q6, d24, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q8, d25, d4
    vmlsl.u8        q7, d26, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q9, d27, d4
    vmlal.u8        q6, d28, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q8, d29, d5

    vext.8          d20, d7, d8, #5
    vext.8          d21, d10, d11, #5
    vext.8          d22, d6, d7, #2         ;construct src_ptr[0]
    vext.8          d23, d9, d10, #2
    vext.8          d24, d7, d8, #2
    vext.8          d25, d10, d11, #2

    vext.8          d26, d6, d7, #3         ;construct src_ptr[1]
    vext.8          d27, d9, d10, #3
    vext.8          d28, d7, d8, #3
    vext.8          d29, d10, d11, #3

    vmlal.u8        q7, d20, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q9, d21, d5
    vmlal.u8        q6, d22, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q8, d23, d2
    vmlal.u8        q7, d24, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q9, d25, d2

    vmull.u8        q10, d26, d3            ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q11, d27, d3
    vmull.u8        q12, d28, d3            ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q15, d29, d3

    vqadd.s16       q6, q10                 ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q11
    vqadd.s16       q7, q12
    vqadd.s16       q9, q15

    subs            r2, r2, #1

    vqrshrun.s16    d6, q6, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d7, q7, #7
    vqrshrun.s16    d8, q8, #7
    vqrshrun.s16    d9, q9, #7

    vst1.u8         {q3}, [r4], r5          ;store result (one 16-byte row each)
    vst1.u8         {q4}, [r4], r5

    bne             filt_blk2d_fpo16x16_loop_neon

    vpop            {d8-d15}
    pop             {r4-r5,pc}

;--------------------
;Vertical filter only (xoffset == 0): rows are read straight from src.
secondpass_filter16x16_only
;Second pass: 16x16
    add             r3, r12, r3, lsl #5     ;second_pass filter location
    sub             r0, r0, r1, lsl #1      ;move srcptr back to (line-2)

    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
    mov             r3, #2                  ;loop counter: two 8-column halves

    vabs.s32        q7, q5
    vabs.s32        q8, q6

    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
    vdup.8          d1, d14[4]
    vdup.8          d2, d15[0]
    vdup.8          d3, d15[4]
    vdup.8          d4, d16[0]
    vdup.8          d5, d16[4]

filt_blk2d_spo16x16_outloop_neon
    vld1.u8         {d18}, [r0], r1         ;load src data (first 5 of the 9-row window)
    vld1.u8         {d19}, [r0], r1
    vld1.u8         {d20}, [r0], r1
    vld1.u8         {d21}, [r0], r1
    mov             r12, #4                 ;loop counter: 4 iterations x 4 output rows
    vld1.u8         {d22}, [r0], r1

secondpass_only_inner_loop_neon
    vld1.u8         {d23}, [r0], r1         ;load src data (4 new rows)
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1

    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q4, d19, d0
    vmull.u8        q5, d20, d0
    vmull.u8        q6, d21, d0

    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q4, d20, d1
    vmlsl.u8        q5, d21, d1
    vmlsl.u8        q6, d22, d1

    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q4, d23, d4
    vmlsl.u8        q5, d24, d4
    vmlsl.u8        q6, d25, d4

    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q4, d21, d2
    vmlal.u8        q5, d22, d2
    vmlal.u8        q6, d23, d2

    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q4, d24, d5
    vmlal.u8        q5, d25, d5
    vmlal.u8        q6, d26, d5

    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q8, d22, d3
    vmull.u8        q9, d23, d3
    vmull.u8        q10, d24, d3

    subs            r12, r12, #1

    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q4
    vqadd.s16       q9, q5
    vqadd.s16       q10, q6

    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d7, q8, #7
    vqrshrun.s16    d8, q9, #7
    vqrshrun.s16    d9, q10, #7

    vst1.u8         {d6}, [r4], r5          ;store result
    vmov            q9, q11                 ;slide the row window down by four rows
    vst1.u8         {d7}, [r4], r5
    vmov            q10, q12
    vst1.u8         {d8}, [r4], r5
    vmov            d22, d26
    vst1.u8         {d9}, [r4], r5

    bne             secondpass_only_inner_loop_neon

    subs            r3, r3, #1
    sub             r0, r0, r1, lsl #4      ;src back up 21 rows (16 + 4 + 1) ...
    sub             r0, r0, r1, lsl #2
    sub             r0, r0, r1
    add             r0, r0, #8              ;... and over to the second 8 columns

    sub             r4, r4, r5, lsl #4      ;dst back up 16 rows ...
    add             r4, r4, #8              ;... and over to the second 8 columns

    bne             filt_blk2d_spo16x16_outloop_neon

    vpop            {d8-d15}
    pop             {r4-r5,pc}

    ENDP

;-----------------
    END