;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_bilinear_predict8x8_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr,
;                                   int            src_pixels_per_line,
;                                   int            xoffset,
;                                   int            yoffset,
;                                   unsigned char *dst_ptr,
;                                   int            dst_pitch)
;
; Two-pass separable bilinear prediction of an 8x8 block:
;   pass 1 filters 9 source rows horizontally (results kept in d22-d30),
;   pass 2 filters those 9 rows vertically into the 8 output rows.
; Either pass degenerates to a straight copy when its offset is 0.
; Filter taps come from bifilter8_coeff[offset] = {128-16*o, 16*o},
; applied as u8 multiply-accumulate, then >>7 with rounding/saturation.
;
; In:       r0 = src_ptr, r1 = src_pixels_per_line,
;           r2 = xoffset (0..7), r3 = yoffset (0..7)
; Stack:    dst_ptr, dst_pitch
; Clobbers: r0-r3, r12, q0-q3, q8-q15, flags
;           (q4-q7 = d8-d15 are callee-saved per AAPCS and are
;            saved/restored with vpush/vpop below)
;-----------------------------------------------------------------------

|vp8_bilinear_predict8x8_neon| PROC
    push            {r4, lr}
    vpush           {d8-d15}                ;AAPCS: d8-d15 are callee-saved
                                            ;and are clobbered by both passes

    ldr             r12, _bifilter8_coeff_  ;r12 = &bifilter8_coeff[0]
    ldr             r4, [sp, #72]           ;dst_ptr   (8 gp + 64 vfp bytes pushed)
    ldr             lr, [sp, #76]           ;dst_pitch

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add             r2, r12, r2, lsl #3     ;calculate filter location

    vld1.u8         {q1}, [r0], r1          ;load src data (rows 0-3)
    vld1.u32        {d31}, [r2]             ;load first_pass filter
    vld1.u8         {q2}, [r0], r1
    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
    vld1.u8         {q3}, [r0], r1
    vdup.8          d1, d31[4]
    vld1.u8         {q4}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1

    vld1.u8         {q1}, [r0], r1          ;load src data (rows 4-7)
    vqrshrn.u16     d22, q6, #7             ;shift/round/saturate to u8
    vld1.u8         {q2}, [r0], r1
    vqrshrn.u16     d23, q7, #7
    vld1.u8         {q3}, [r0], r1
    vqrshrn.u16     d24, q8, #7
    vld1.u8         {q4}, [r0], r1
    vqrshrn.u16     d25, q9, #7

    ;first_pass filtering on the rest 5-line data
    vld1.u8         {q5}, [r0], r1          ;row 8 (9th line for second pass)

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16     d26, q6, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d27, q7, #7
    vqrshrn.u16     d28, q8, #7
    vqrshrn.u16     d29, q9, #7
    vqrshrn.u16     d30, q10, #7            ;d22-d30 now hold 9 filtered rows

;Second pass: 8x8
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    beq             skip_secondpass_filter

    add             r3, r12, r3, lsl #3     ;calculate filter location
    add             r0, r4, lr              ;r0 = dst row 1

    vld1.u32        {d31}, [r3]             ;load second_pass filter
    add             r1, r0, lr              ;r1 = dst row 2

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1
    vmlal.u8        q5, d27, d1
    vmlal.u8        q6, d28, d1
    vmlal.u8        q7, d29, d1
    vmlal.u8        q8, d30, d1

    vqrshrn.u16     d2, q1, #7              ;shift/round/saturate to u8
    vqrshrn.u16     d3, q2, #7
    vqrshrn.u16     d4, q3, #7
    vqrshrn.u16     d5, q4, #7
    vqrshrn.u16     d6, q5, #7
    vqrshrn.u16     d7, q6, #7
    vqrshrn.u16     d8, q7, #7
    vqrshrn.u16     d9, q8, #7

    vst1.u8         {d2}, [r4]              ;store result (row 0)
    vst1.u8         {d3}, [r0]              ;row 1
    vst1.u8         {d4}, [r1], lr          ;rows 2-7
    vst1.u8         {d5}, [r1], lr
    vst1.u8         {d6}, [r1], lr
    vst1.u8         {d7}, [r1], lr
    vst1.u8         {d8}, [r1], lr
    vst1.u8         {d9}, [r1], lr

    vpop            {d8-d15}                ;restore callee-saved NEON regs
    pop             {r4, pc}

;--------------------
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load 9 unfiltered src rows
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1
    vld1.u8         {d27}, [r0], r1
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

    b               secondpass_filter

;---------------------
skip_secondpass_filter
    vst1.u8         {d22}, [r4], lr         ;store result (8 rows, pass-through)
    vst1.u8         {d23}, [r4], lr
    vst1.u8         {d24}, [r4], lr
    vst1.u8         {d25}, [r4], lr
    vst1.u8         {d26}, [r4], lr
    vst1.u8         {d27}, [r4], lr
    vst1.u8         {d28}, [r4], lr
    vst1.u8         {d29}, [r4], lr

    vpop            {d8-d15}                ;restore callee-saved NEON regs
    pop             {r4, pc}

    ENDP

;-----------------

_bifilter8_coeff_
    DCD     bifilter8_coeff
bifilter8_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

    END