;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


;-----------------

    EXPORT  |vp8_sub_pixel_variance16x16_neon_func|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;------------------------------------------------------------------------------
; vp8_sub_pixel_variance16x16_neon_func
;
; Bilinear-filters a 16x16 block read from src_ptr into a temporary buffer on
; the stack, then accumulates the sum and the sum of squares of the
; differences between that filtered block and the 16x16 block at dst_ptr.
;
; Arguments:
;   r0          unsigned char  *src_ptr
;   r1          int  src_pixels_per_line
;   r2          int  xoffset   (index into bilinear_taps_coeff, horizontal)
;   r3          int  yoffset   (index into bilinear_taps_coeff, vertical)
;   stack(r4)   unsigned char *dst_ptr
;   stack(r5)   int dst_pixels_per_line
;   stack(r6)   unsigned int *sse
;
; Results:
;   *sse = sum of squared differences
;   r0   = sse - ((sum * sum) >> 8)
;
; Code paths:
;   xoffset == 0                 vertical filter only
;                                (secondpass_bfilter16x16_only)
;   xoffset != 0, yoffset == 0   horizontal filter only
;                                (firstpass_bfilter16x16_only)
;   both non-zero                horizontal filter producing 17 rows, then
;                                vertical filter producing 16 rows
;
; Notes:
;   - No range check is done on xoffset/yoffset; the table below only has
;     8 entries, so callers are presumably expected to pass 0..7.
;   - The horizontal filter loads 24 bytes from each source row although
;     only the first 17 contribute to the result.
;   - The vertical-only and two-pass paths consume 17 source rows.
;   - Every path arrives at sub_pixel_variance16x16_neon with sp exactly
;     528 bytes below its value after the register saves, and with r3
;     pointing 256 bytes past the start of the filtered 16x16 block.
;   - d8-d15 are used as scratch, so they are saved and restored here as
;     required by the AAPCS (they are callee-saved).
;
; Most of the code is copied from bilinear_predict16x16_neon and
; vp8_variance16x16_neon.
;------------------------------------------------------------------------------

; Filter taps as (tap0, tap1) pairs of 32-bit words, 8 bytes per pair.
; Each pair sums to 128, which is why results are rounded and shifted by 7.
bilinear_taps_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

|vp8_sub_pixel_variance16x16_neon_func| PROC
    push            {r4-r6, lr}
    vpush           {d8-d15}                ;callee-saved NEON registers (AAPCS)

    ;Stack arguments sit above the 16 bytes of core registers and the
    ;64 bytes of NEON registers saved above, hence the #80 base offset.
    adr             r12, bilinear_taps_coeff
    ldr             r4, [sp, #80]           ;load *dst_ptr from stack
    ldr             r5, [sp, #84]           ;load dst_pixels_per_line from stack
    ldr             r6, [sp, #88]           ;load *sse from stack

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             secondpass_bfilter16x16_only

    add             r2, r12, r2, lsl #3     ;calculate filter location

    cmp             r3, #0                  ;skip second_pass filter if yoffset=0

    vld1.s32        {d31}, [r2]             ;load first_pass filter

    beq             firstpass_bfilter16x16_only

    sub             sp, sp, #272            ;reserve space on stack for temporary storage
    vld1.u8         {d2, d3, d4}, [r0], r1  ;load src data
    mov             lr, sp                  ;lr = write pointer into the 17x16 buffer
    vld1.u8         {d5, d6, d7}, [r0], r1

    mov             r2, #3                  ;loop counter (3 iterations x 4 rows)
    vld1.u8         {d8, d9, d10}, [r0], r1

    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
    vld1.u8         {d11, d12, d13}, [r0], r1

    vdup.8          d1, d31[4]

;First Pass: output_height lines x output_width columns (17x16)
;Each iteration filters the 4 rows already in d2-d13 and loads the next 4.
vp8e_filt_blk2d_fp16x16_loop_neon
    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q8, d3, d0
    vmull.u8        q9, d5, d0
    vmull.u8        q10, d6, d0
    vmull.u8        q11, d8, d0
    vmull.u8        q12, d9, d0
    vmull.u8        q13, d11, d0
    vmull.u8        q14, d12, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1
    vext.8          d11, d11, d12, #1

    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q9, d5, d1
    vmlal.u8        q11, d8, d1
    vmlal.u8        q13, d11, d1

    vext.8          d3, d3, d4, #1
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1
    vext.8          d12, d12, d13, #1

    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q10, d6, d1
    vmlal.u8        q12, d9, d1
    vmlal.u8        q14, d12, d1

    subs            r2, r2, #1              ;flags are consumed by the bne below

    vqrshrn.u16     d14, q7, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d15, q8, #7
    vqrshrn.u16     d16, q9, #7
    vqrshrn.u16     d17, q10, #7
    vqrshrn.u16     d18, q11, #7
    vqrshrn.u16     d19, q12, #7
    vqrshrn.u16     d20, q13, #7

    vld1.u8         {d2, d3, d4}, [r0], r1  ;load src data
    vqrshrn.u16     d21, q14, #7
    vld1.u8         {d5, d6, d7}, [r0], r1

    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
    vld1.u8         {d8, d9, d10}, [r0], r1
    vst1.u8         {d18, d19, d20, d21}, [lr]!
    vld1.u8         {d11, d12, d13}, [r0], r1

    bne             vp8e_filt_blk2d_fp16x16_loop_neon

;First-pass filtering for rest 5 lines
;(4 rows are already in d2-d13; the 17th row is loaded here.)
    vld1.u8         {d14, d15, d16}, [r0], r1

    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q10, d3, d0
    vmull.u8        q11, d5, d0
    vmull.u8        q12, d6, d0
    vmull.u8        q13, d8, d0
    vmull.u8        q14, d9, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1

    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q11, d5, d1
    vmlal.u8        q13, d8, d1

    vext.8          d3, d3, d4, #1
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1

    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * Filter[1])
    vmlal.u8        q12, d6, d1
    vmlal.u8        q14, d9, d1

    vmull.u8        q1, d11, d0
    vmull.u8        q2, d12, d0
    vmull.u8        q3, d14, d0
    vmull.u8        q4, d15, d0

    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
    vext.8          d14, d14, d15, #1

    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * Filter[1])
    vmlal.u8        q3, d14, d1

    vext.8          d12, d12, d13, #1
    vext.8          d15, d15, d16, #1

    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * Filter[1])
    vmlal.u8        q4, d15, d1

    vqrshrn.u16     d10, q9, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d11, q10, #7
    vqrshrn.u16     d12, q11, #7
    vqrshrn.u16     d13, q12, #7
    vqrshrn.u16     d14, q13, #7
    vqrshrn.u16     d15, q14, #7
    vqrshrn.u16     d16, q1, #7
    vqrshrn.u16     d17, q2, #7
    vqrshrn.u16     d18, q3, #7
    vqrshrn.u16     d19, q4, #7

    vst1.u8         {d10, d11, d12, d13}, [lr]!     ;store result
    vst1.u8         {d14, d15, d16, d17}, [lr]!
    vst1.u8         {d18, d19}, [lr]!

;Second pass: 16x16
;secondpass_filter
    add             r3, r12, r3, lsl #3     ;calculate filter location
    sub             lr, lr, #272            ;rewind to the start of the 17x16 buffer

    vld1.u32        {d31}, [r3]             ;load second_pass filter

    sub             sp, sp, #256            ;16x16 output buffer (total reserved: 528)
    mov             r3, sp

    vld1.u8         {d22, d23}, [lr]!       ;load src data

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]
    mov             r12, #4                 ;loop counter (4 iterations x 4 rows)

;Each iteration blends 5 consecutive rows (q11..q15) into 4 output rows.
vp8e_filt_blk2d_sp16x16_loop_neon
    vld1.u8         {d24, d25}, [lr]!
    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
    vld1.u8         {d26, d27}, [lr]!
    vmull.u8        q2, d23, d0
    vld1.u8         {d28, d29}, [lr]!
    vmull.u8        q3, d24, d0
    vld1.u8         {d30, d31}, [lr]!

    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8        q2, d25, d1
    vmlal.u8        q3, d26, d1
    vmlal.u8        q4, d27, d1
    vmlal.u8        q5, d28, d1
    vmlal.u8        q6, d29, d1
    vmlal.u8        q7, d30, d1
    vmlal.u8        q8, d31, d1

    subs            r12, r12, #1            ;flags are consumed by the bne below

    vqrshrn.u16     d2, q1, #7              ;shift/round/saturate to u8
    vqrshrn.u16     d3, q2, #7
    vqrshrn.u16     d4, q3, #7
    vqrshrn.u16     d5, q4, #7
    vqrshrn.u16     d6, q5, #7
    vqrshrn.u16     d7, q6, #7
    vqrshrn.u16     d8, q7, #7
    vqrshrn.u16     d9, q8, #7

    vst1.u8         {d2, d3}, [r3]!         ;store result
    vst1.u8         {d4, d5}, [r3]!
    vst1.u8         {d6, d7}, [r3]!
    vmov            q11, q15                ;last row becomes first row of next iteration
    vst1.u8         {d8, d9}, [r3]!

    bne             vp8e_filt_blk2d_sp16x16_loop_neon

    b               sub_pixel_variance16x16_neon

;--------------------
;Horizontal filter only (yoffset == 0); d31 already holds the first_pass filter.
firstpass_bfilter16x16_only
    mov             r2, #4                  ;loop counter (4 iterations x 4 rows)
    sub             sp, sp, #528            ;reserve space on stack for temporary storage
    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
    vdup.8          d1, d31[4]
    mov             r3, sp

;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16_loop_neon
    vld1.u8         {d2, d3, d4}, [r0], r1  ;load src data
    vld1.u8         {d5, d6, d7}, [r0], r1
    vld1.u8         {d8, d9, d10}, [r0], r1
    vld1.u8         {d11, d12, d13}, [r0], r1

    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q8, d3, d0
    vmull.u8        q9, d5, d0
    vmull.u8        q10, d6, d0
    vmull.u8        q11, d8, d0
    vmull.u8        q12, d9, d0
    vmull.u8        q13, d11, d0
    vmull.u8        q14, d12, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1
    vext.8          d11, d11, d12, #1

    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q9, d5, d1
    vmlal.u8        q11, d8, d1
    vmlal.u8        q13, d11, d1

    vext.8          d3, d3, d4, #1
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1
    vext.8          d12, d12, d13, #1

    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q10, d6, d1
    vmlal.u8        q12, d9, d1
    vmlal.u8        q14, d12, d1

    subs            r2, r2, #1              ;flags are consumed by the bne below

    vqrshrn.u16     d14, q7, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d15, q8, #7
    vqrshrn.u16     d16, q9, #7
    vqrshrn.u16     d17, q10, #7
    vqrshrn.u16     d18, q11, #7
    vqrshrn.u16     d19, q12, #7
    vqrshrn.u16     d20, q13, #7
    vst1.u8         {d14, d15}, [r3]!       ;store result
    vqrshrn.u16     d21, q14, #7

    vst1.u8         {d16, d17}, [r3]!
    vst1.u8         {d18, d19}, [r3]!
    vst1.u8         {d20, d21}, [r3]!

    bne             vp8e_filt_blk2d_fpo16x16_loop_neon

    b               sub_pixel_variance16x16_neon

;---------------------
;Vertical filter only (xoffset == 0); rows are read straight from src_ptr.
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
    sub             sp, sp, #528            ;reserve space on stack for temporary storage
    add             r3, r12, r3, lsl #3     ;calculate filter location
    mov             r12, #4                 ;loop counter (4 iterations x 4 rows)
    vld1.u32        {d31}, [r3]             ;load second_pass filter
    vld1.u8         {d22, d23}, [r0], r1    ;load src data
    mov             r3, sp

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

vp8e_filt_blk2d_spo16x16_loop_neon
    vld1.u8         {d24, d25}, [r0], r1
    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
    vld1.u8         {d26, d27}, [r0], r1
    vmull.u8        q2, d23, d0
    vld1.u8         {d28, d29}, [r0], r1
    vmull.u8        q3, d24, d0
    vld1.u8         {d30, d31}, [r0], r1

    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8        q2, d25, d1
    vmlal.u8        q3, d26, d1
    vmlal.u8        q4, d27, d1
    vmlal.u8        q5, d28, d1
    vmlal.u8        q6, d29, d1
    vmlal.u8        q7, d30, d1
    vmlal.u8        q8, d31, d1

    vqrshrn.u16     d2, q1, #7              ;shift/round/saturate to u8
    vqrshrn.u16     d3, q2, #7
    vqrshrn.u16     d4, q3, #7
    vqrshrn.u16     d5, q4, #7
    vqrshrn.u16     d6, q5, #7
    vqrshrn.u16     d7, q6, #7
    vqrshrn.u16     d8, q7, #7
    vqrshrn.u16     d9, q8, #7

    vst1.u8         {d2, d3}, [r3]!         ;store result
    subs            r12, r12, #1            ;flags are consumed by the bne below
    vst1.u8         {d4, d5}, [r3]!
    vmov            q11, q15                ;last row becomes first row of next iteration
    vst1.u8         {d6, d7}, [r3]!
    vst1.u8         {d8, d9}, [r3]!

    bne             vp8e_filt_blk2d_spo16x16_loop_neon

    b               sub_pixel_variance16x16_neon

;----------------------------
;variance16x16
;On entry r3 points just past the 256-byte filtered block; r4/r5/r6 hold
;dst_ptr, dst_pixels_per_line and sse.
sub_pixel_variance16x16_neon
    vmov.i8         q8, #0                  ;q8 - sum
    vmov.i8         q9, #0                  ;q9, q10 - sse
    vmov.i8         q10, #0

    sub             r3, r3, #256            ;rewind to the start of the filtered block
    mov             r12, #8                 ;loop counter (8 iterations x 2 rows)

sub_pixel_variance16x16_neon_loop
    vld1.8          {q0}, [r3]!             ;Load up source and reference
    vld1.8          {q2}, [r4], r5
    vld1.8          {q1}, [r3]!
    vld1.8          {q3}, [r4], r5

    vsubl.u8        q11, d0, d4             ;diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                 ;sum
    vmlal.s16       q9, d22, d22            ;sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1            ;flags are consumed by the bne below

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             sub_pixel_variance16x16_neon_loop

    vadd.u32        q10, q9, q10            ;accumulate sse
    vpaddl.s32      q0, q8                  ;accumulate sum

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1              ;d0 = sum
    vadd.u64        d1, d2, d3              ;d1 = sse

    vmull.s32       q5, d0, d0              ;sum * sum
    vst1.32         {d1[0]}, [r6]           ;store sse
    vshr.u32        d10, d10, #8            ;(sum * sum) >> 8
    vsub.u32        d0, d1, d10             ;sse - ((sum * sum) >> 8)

    add             sp, sp, #528            ;release the temporary buffers
    vmov.32         r0, d0[0]               ;return

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4-r6,pc}

    ENDP

    END