;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sixtap_predict4x4_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; Six-tap filter coefficient table.
; One row per offset value (8 rows). Each row is 8 words = 32 bytes, which
; is why the code indexes it with "offset, lsl #5". Only the first six
; words of a row are used as taps; the last two are zero padding.
; The taps are stored signed. The code takes their absolute value and
; applies the sign by choosing vmlal (add) or vmlsl (subtract) per tap.
;-----------------------------------------------------------------------
filter4_coeff
    DCD     0,   0, 128,   0,   0,  0,  0,  0
    DCD     0,  -6, 123,  12,  -1,  0,  0,  0
    DCD     2, -11, 108,  36,  -8,  1,  0,  0
    DCD     0,  -9,  93,  50,  -6,  0,  0,  0
    DCD     3, -16,  77,  77, -16,  3,  0,  0
    DCD     0,  -6,  50,  93,  -9,  0,  0,  0
    DCD     1,  -8,  36, 108, -11,  2,  0,  0
    DCD     0,  -1,  12, 123,  -6,  0,  0,  0

;-----------------------------------------------------------------------
; vp8_sixtap_predict4x4_neon
;
; Produces a 4x4 block of filtered pixels at dst_ptr.
;
; In:
;   r0          unsigned char  *src_ptr
;   r1          int  src_pixels_per_line
;   r2          int  xoffset   (row index into filter4_coeff, first pass)
;   r3          int  yoffset   (row index into filter4_coeff, second pass)
;   stack(r4)   unsigned char *dst_ptr
;   stack(lr)   int  dst_pitch
;
; Paths:
;   xoffset == 0            -> secondpass_filter4x4_only (vertical only)
;   yoffset == 0            -> firstpass_filter4x4_only  (horizontal only)
;   otherwise               -> horizontal pass over 9 rows, then vertical
;
; Saved/restored: r4, lr, d8-d15.
;   The routine writes q4-q7 (= d8-d15), which are callee-saved under the
;   AAPCS, so they are pushed on entry and popped on every exit path.
; Clobbers: r0-r3, r12, q0-q3, q8-q15, flags.
;
; Stack layout after the prologue (push 8 bytes + vpush 64 bytes = 72):
;   [sp, #72] = dst_ptr, [sp, #76] = dst_pitch
;-----------------------------------------------------------------------
|vp8_sixtap_predict4x4_neon| PROC
    push            {r4, lr}
    vpush           {d8-d15}                ;q4-q7 are used below and are callee-saved

    adr             r12, filter4_coeff
    ldr             r4, [sp, #72]           ;load dst_ptr from stack (8 + 64 bytes pushed)
    ldr             lr, [sp, #76]           ;load dst_pitch from stack

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             secondpass_filter4x4_only

    add             r2, r12, r2, lsl #5     ;calculate filter location (32 bytes per row)

    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter (does not touch flags)

    beq             firstpass_filter4x4_only

    vabs.s32        q12, q14                ;get abs(filter_parameters)
    vabs.s32        q13, q15

    sub             r0, r0, #2              ;go back 2 columns of src data
    sub             r0, r0, r1, lsl #1      ;go back 2 lines of src data

;First pass: output_height lines x output_width columns (9x4)
    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
    vld1.u8         {q4}, [r0], r1
    vdup.8          d1, d24[4]
    vld1.u8         {q5}, [r0], r1
    vdup.8          d2, d25[0]
    vld1.u8         {q6}, [r0], r1
    vdup.8          d3, d25[4]
    vdup.8          d4, d26[0]
    vdup.8          d5, d26[4]

    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d19, d8, d9, #5
    vext.8          d20, d10, d11, #5
    vext.8          d21, d12, d13, #5

    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
    vswp            d11, d12

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
    vzip.32         d20, d21
    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
    vmull.u8        q8, d20, d5

    vmov            q4, q3                  ;keep original src data in q4 q6
    vmov            q6, q5

    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
    vzip.32         d10, d11
    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
    vshr.u64        q10, q6, #8
    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
    vmlal.u8        q8, d10, d0

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
    vzip.32         d20, d21
    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
    vshr.u64        q5, q6, #32
    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q8, d20, d1

    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
    vzip.32         d10, d11
    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
    vshr.u64        q10, q6, #16
    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q8, d10, d4

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
    vzip.32         d20, d21
    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
    vshr.u64        q5, q6, #24
    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q8, d20, d2

    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
    vzip.32         d10, d11
    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q10, d10, d3

    vld1.u8         {q3}, [r0], r1          ;load rest 5-line src data
    vld1.u8         {q4}, [r0], r1

    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q10

    vld1.u8         {q5}, [r0], r1
    vld1.u8         {q6}, [r0], r1

    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
    vqrshrun.s16    d28, q8, #7

    ;First Pass on rest 5-line data
    vld1.u8         {q11}, [r0], r1

    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d19, d8, d9, #5
    vext.8          d20, d10, d11, #5
    vext.8          d21, d12, d13, #5

    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
    vswp            d11, d12

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
    vzip.32         d20, d21
    vext.8          d31, d22, d23, #5       ;construct src_ptr[3]
    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
    vmull.u8        q8, d20, d5
    vmull.u8        q12, d31, d5            ;(src_ptr[3] * vp8_filter[5])

    vmov            q4, q3                  ;keep original src data in q4 q6
    vmov            q6, q5

    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
    vzip.32         d10, d11
    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
    vshr.u64        q10, q6, #8

    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
    vmlal.u8        q8, d10, d0
    vmlal.u8        q12, d22, d0            ;(src_ptr[-2] * vp8_filter[0])

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
    vzip.32         d20, d21
    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
    vshr.u64        q5, q6, #32
    vext.8          d31, d22, d23, #1       ;construct src_ptr[-1]

    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q8, d20, d1
    vmlsl.u8        q12, d31, d1            ;-(src_ptr[-1] * vp8_filter[1])

    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
    vzip.32         d10, d11
    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
    vshr.u64        q10, q6, #16
    vext.8          d31, d22, d23, #4       ;construct src_ptr[2]

    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q8, d10, d4
    vmlsl.u8        q12, d31, d4            ;-(src_ptr[2] * vp8_filter[4])

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
    vzip.32         d20, d21
    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
    vshr.u64        q5, q6, #24
    vext.8          d31, d22, d23, #2       ;construct src_ptr[0]

    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q8, d20, d2
    vmlal.u8        q12, d31, d2            ;(src_ptr[0] * vp8_filter[2])

    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
    vzip.32         d10, d11
    vext.8          d31, d22, d23, #3       ;construct src_ptr[1]
    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q10, d10, d3
    vmull.u8        q11, d31, d3            ;(src_ptr[1] * vp8_filter[3])

    add             r3, r12, r3, lsl #5     ;second_pass filter location (32 bytes per row)

    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q10
    vqadd.s16       q12, q11

    vext.8          d23, d27, d28, #4
    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter

    vqrshrun.s16    d29, q7, #7             ;shift/round/saturate to u8
    vqrshrun.s16    d30, q8, #7
    vqrshrun.s16    d31, q12, #7

;Second pass: 4x4
    vabs.s32        q7, q5
    vabs.s32        q8, q6

    vext.8          d24, d28, d29, #4
    vext.8          d25, d29, d30, #4
    vext.8          d26, d30, d31, #4

    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
    vdup.8          d1, d14[4]
    vdup.8          d2, d15[0]
    vdup.8          d3, d15[4]
    vdup.8          d4, d16[0]
    vdup.8          d5, d16[4]

    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q4, d28, d0

    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp8_filter[5])
    vmull.u8        q6, d26, d5

    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q4, d30, d4

    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q6, d24, d1

    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q4, d29, d2

    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp8_filter[3])
    vmlal.u8        q6, d25, d3

    add             r0, r4, lr              ;r0/r1/r2 = dst rows 1/2/3 (row 0 is r4)
    add             r1, r0, lr
    add             r2, r1, lr

    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q6, q4

    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d4, q6, #7

    vst1.32         {d3[0]}, [r4]           ;store result
    vst1.32         {d3[1]}, [r0]
    vst1.32         {d4[0]}, [r1]
    vst1.32         {d4[1]}, [r2]

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}


;---------------------
firstpass_filter4x4_only
    vabs.s32        q12, q14                ;get abs(filter_parameters)
    vabs.s32        q13, q15

    sub             r0, r0, #2              ;go back 2 columns of src data

;First pass: output_height lines x output_width columns (4x4)
    vld1.u8         {q3}, [r0], r1          ;load first 4-line src data
    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
    vld1.u8         {q4}, [r0], r1
    vdup.8          d1, d24[4]
    vld1.u8         {q5}, [r0], r1
    vdup.8          d2, d25[0]
    vld1.u8         {q6}, [r0], r1

    vdup.8          d3, d25[4]
    vdup.8          d4, d26[0]
    vdup.8          d5, d26[4]

    vext.8          d18, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d19, d8, d9, #5
    vext.8          d20, d10, d11, #5
    vext.8          d21, d12, d13, #5

    vswp            d7, d8                  ;discard 2nd half data after src_ptr[3] is done
    vswp            d11, d12

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[3])
    vzip.32         d20, d21
    vmull.u8        q7, d18, d5             ;(src_ptr[3] * vp8_filter[5])
    vmull.u8        q8, d20, d5

    vmov            q4, q3                  ;keep original src data in q4 q6
    vmov            q6, q5

    vzip.32         d6, d7                  ;construct src_ptr[-2], and put 2-line data together
    vzip.32         d10, d11
    vshr.u64        q9, q4, #8              ;construct src_ptr[-1]
    vshr.u64        q10, q6, #8
    vmlal.u8        q7, d6, d0              ;+(src_ptr[-2] * vp8_filter[0])
    vmlal.u8        q8, d10, d0

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[-1])
    vzip.32         d20, d21
    vshr.u64        q3, q4, #32             ;construct src_ptr[2]
    vshr.u64        q5, q6, #32
    vmlsl.u8        q7, d18, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q8, d20, d1

    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[2])
    vzip.32         d10, d11
    vshr.u64        q9, q4, #16             ;construct src_ptr[0]
    vshr.u64        q10, q6, #16
    vmlsl.u8        q7, d6, d4              ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q8, d10, d4

    vzip.32         d18, d19                ;put 2-line data in 1 register (src_ptr[0])
    vzip.32         d20, d21
    vshr.u64        q3, q4, #24             ;construct src_ptr[1]
    vshr.u64        q5, q6, #24
    vmlal.u8        q7, d18, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q8, d20, d2

    vzip.32         d6, d7                  ;put 2-line data in 1 register (src_ptr[1])
    vzip.32         d10, d11
    vmull.u8        q9, d6, d3              ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q10, d10, d3

    add             r0, r4, lr              ;r0/r1/r2 = dst rows 1/2/3 (row 0 is r4)
    add             r1, r0, lr
    add             r2, r1, lr

    vqadd.s16       q7, q9                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q10

    vqrshrun.s16    d27, q7, #7             ;shift/round/saturate to u8
    vqrshrun.s16    d28, q8, #7

    vst1.32         {d27[0]}, [r4]          ;store result
    vst1.32         {d27[1]}, [r0]
    vst1.32         {d28[0]}, [r1]
    vst1.32         {d28[1]}, [r2]

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}


;---------------------
secondpass_filter4x4_only
    sub             r0, r0, r1, lsl #1      ;go back 2 lines of src data
    add             r3, r12, r3, lsl #5     ;second_pass filter location (32 bytes per row)

    vld1.32         {d27[0]}, [r0], r1      ;load src data
    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
    vld1.32         {d27[1]}, [r0], r1
    vabs.s32        q7, q5
    vld1.32         {d28[0]}, [r0], r1
    vabs.s32        q8, q6
    vld1.32         {d28[1]}, [r0], r1
    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
    vld1.32         {d29[0]}, [r0], r1
    vdup.8          d1, d14[4]
    vld1.32         {d29[1]}, [r0], r1
    vdup.8          d2, d15[0]
    vld1.32         {d30[0]}, [r0], r1
    vdup.8          d3, d15[4]
    vld1.32         {d30[1]}, [r0], r1
    vdup.8          d4, d16[0]
    vld1.32         {d31[0]}, [r0], r1
    vdup.8          d5, d16[4]

    vext.8          d23, d27, d28, #4
    vext.8          d24, d28, d29, #4
    vext.8          d25, d29, d30, #4
    vext.8          d26, d30, d31, #4

    vmull.u8        q3, d27, d0             ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q4, d28, d0

    vmull.u8        q5, d25, d5             ;(src_ptr[3] * vp8_filter[5])
    vmull.u8        q6, d26, d5

    vmlsl.u8        q3, d29, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q4, d30, d4

    vmlsl.u8        q5, d23, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q6, d24, d1

    vmlal.u8        q3, d28, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q4, d29, d2

    vmlal.u8        q5, d24, d3             ;(src_ptr[1] * vp8_filter[3])
    vmlal.u8        q6, d25, d3

    add             r0, r4, lr              ;r0/r1/r2 = dst rows 1/2/3 (row 0 is r4)
    add             r1, r0, lr
    add             r2, r1, lr

    vqadd.s16       q5, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q6, q4

    vqrshrun.s16    d3, q5, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d4, q6, #7

    vst1.32         {d3[0]}, [r4]           ;store result
    vst1.32         {d3[1]}, [r0]
    vst1.32         {d4[0]}, [r1]
    vst1.32         {d4[1]}, [r2]

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}

    ENDP

;-----------------

    END