loopfilter_4_neon.asm revision 7ce0a1d1337c01056ba24006efab21f00e179e04
;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vpx_lpf_horizontal_4_neon|
    EXPORT  |vpx_lpf_vertical_4_neon|
    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; void vpx_lpf_horizontal_4_neon(uint8_t *s,
;                                int p /* pitch */,
;                                const uint8_t *blimit,
;                                const uint8_t *limit,
;                                const uint8_t *thresh,
;                                int count)
;
; Filters across a horizontal edge: for each of `count` groups of 8 adjacent
; columns, the four rows above s (p3..p0) and the four rows starting at s
; (q0..q3) are loaded, filtered, and rows p1, p0, q0, q1 are written back.
;
; Arguments (offsets of the stack arguments are given at function entry;
; after the "push {lr}" below they are found 4 bytes higher):
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
|vpx_lpf_horizontal_4_neon| PROC
    push        {lr}                       ; lr is needed for the bl below

    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
    ldr         r12, [sp, #8]              ; load count
    ldr         r2, [sp, #4]               ; load thresh
    add         r1, r1, r1                 ; double pitch

    cmp         r12, #0
    beq         end_vpx_lf_h_edge          ; nothing to do when count == 0

    vld1.8      {d1[]}, [r3]               ; duplicate *limit
    vld1.8      {d2[]}, [r2]               ; duplicate *thresh

count_lf_h_loop
    ; r1 holds 2 * pitch, so r2 and r3 walk alternate rows.
    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 * pitch (row p3)
    add         r3, r2, r1, lsr #1         ; r3 = r2 + pitch    (row p2)

    vld1.u8     {d3}, [r2@64], r1          ; p3
    vld1.u8     {d4}, [r3@64], r1          ; p2
    vld1.u8     {d5}, [r2@64], r1          ; p1
    vld1.u8     {d6}, [r3@64], r1          ; p0
    vld1.u8     {d7}, [r2@64], r1          ; q0
    vld1.u8     {d16}, [r3@64], r1         ; q1
    vld1.u8     {d17}, [r2@64]             ; q2
    vld1.u8     {d18}, [r3@64]             ; q3

    ; Rewind both pointers by 4 rows: r2 -> row p1, r3 -> row p0.
    sub         r2, r2, r1, lsl #1
    sub         r3, r3, r1, lsl #1

    bl          vpx_loop_filter_neon

    vst1.u8     {d4}, [r2@64], r1          ; store op1
    vst1.u8     {d5}, [r3@64], r1          ; store op0
    vst1.u8     {d6}, [r2@64], r1          ; store oq0
    vst1.u8     {d7}, [r3@64], r1          ; store oq1

    add         r0, r0, #8                 ; advance s by 8 columns
    subs        r12, r12, #1
    bne         count_lf_h_loop

end_vpx_lf_h_edge
    pop         {pc}
    ENDP        ; |vpx_lpf_horizontal_4_neon|

; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; void vpx_lpf_vertical_4_neon(uint8_t *s,
;                              int p /* pitch */,
;                              const uint8_t *blimit,
;                              const uint8_t *limit,
;                              const uint8_t *thresh,
;                              int count)
;
; Filters across a vertical edge: for each of `count` groups of 8 rows, the
; 8 bytes starting at s - 4 (p3 p2 p1 p0 q0 q1 q2 q3) are loaded from every
; row, transposed so that each d-register holds one pixel position for all
; 8 rows, filtered, and p1, p0, q0, q1 are written back starting at s - 2.
;
; Arguments (offsets of the stack arguments are given at function entry;
; after the "push {lr}" below they are found 4 bytes higher):
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
|vpx_lpf_vertical_4_neon| PROC
    push        {lr}                       ; lr is needed for the bl below

    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
    ldr         r12, [sp, #8]              ; load count
    vld1.8      {d1[]}, [r3]               ; duplicate *limit

    ldr         r3, [sp, #4]               ; load thresh
    sub         r2, r0, #4                 ; r2 = s - 4 (column of p3)
    cmp         r12, #0
    beq         end_vpx_lf_v_edge          ; nothing to do when count == 0

    vld1.8      {d2[]}, [r3]               ; duplicate *thresh

count_lf_v_loop
    vld1.u8     {d3}, [r2], r1             ; load s data, one row per register
    vld1.u8     {d4}, [r2], r1
    vld1.u8     {d5}, [r2], r1
    vld1.u8     {d6}, [r2], r1
    vld1.u8     {d7}, [r2], r1
    vld1.u8     {d16}, [r2], r1
    vld1.u8     {d17}, [r2], r1
    vld1.u8     {d18}, [r2]

    ; transpose the 8x8 byte matrix: afterwards d3..d7, d16..d18 hold
    ; p3, p2, p1, p0, q0, q1, q2, q3 for the 8 rows
    vtrn.32     d3, d7
    vtrn.32     d4, d16
    vtrn.32     d5, d17
    vtrn.32     d6, d18

    vtrn.16     d3, d5
    vtrn.16     d4, d6
    vtrn.16     d7, d17
    vtrn.16     d16, d18

    vtrn.8      d3, d4
    vtrn.8      d5, d6
    vtrn.8      d7, d16
    vtrn.8      d17, d18

    bl          vpx_loop_filter_neon

    ; Store through r3 (free once thresh has been loaded) so that r0 keeps
    ; pointing at s. Walking r0 itself would leave it at s - 2 + 7 * pitch
    ; and make the "s += pitch * 8" below land on the wrong address whenever
    ; count > 1.
    sub         r3, r0, #2                 ; r3 = s - 2 (column of p1)

    ;store op1, op0, oq0, oq1
    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r3], r1
    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r3], r1
    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r3], r1
    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r3], r1
    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r3], r1
    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r3], r1
    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r3], r1
    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r3]

    add         r0, r0, r1, lsl #3         ; s += pitch * 8
    subs        r12, r12, #1
    subne       r2, r0, #4                 ; r2 = s - 4 for the next 8 rows
    bne         count_lf_v_loop

end_vpx_lf_v_edge
    pop         {pc}
    ENDP        ; |vpx_lpf_vertical_4_neon|

; void vpx_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do
; the necessary load, transpose (if necessary) and store. The function does
; not use registers d8-d15.
;
; Each d-register holds one pixel position for 8 independent filter lanes.
;
; Inputs:
; r0-r3, r12 PRESERVE (only NEON registers are written here)
; d0    blimit
; d1    limit
; d2    thresh
; d3    p3
; d4    p2
; d5    p1
; d6    p0
; d7    q0
; d16   q1
; d17   q2
; d18   q3
;
; Outputs:
; d4    op1
; d5    op0
; d6    oq0
; d7    oq1
;
; Clobbers: d3, d16-d23, d26-d28 and q12 (d24/d25).
|vpx_loop_filter_neon| PROC
    ; filter_mask
    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)

    ; only compare the largest value to limit
    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)

    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)

    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)

    vmov.u8     d18, #0x80                  ; sign-bit toggle (u8 <-> s8)

    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)

    ; hevmask
    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)

    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2

    veor        d7, d7, d18                 ; qs0

    vcge.u8     d23, d1, d23                ; (limit >= m1)*-1

    ; filter() function
    ; convert to signed

    vshr.u8     d28, d28, #1                ; a = a / 2
    veor        d6, d6, d18                 ; ps0

    veor        d5, d5, d18                 ; ps1
    vqadd.u8    d17, d17, d28               ; a = b + a

    veor        d16, d16, d18               ; qs1

    vmov.u8     d19, #3

    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)

    vcge.u8     d17, d0, d17                ; (blimit >= a)*-1

    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
    vorr        d22, d21, d22               ; hevmask

    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)

    vand        d27, d27, d22               ; filter &= hev
    vand        d23, d23, d17               ; filter_mask

    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)

    vmov.u8     d17, #4

    ; filter = clamp(filter + 3 * ( qs0 - ps0))
    vqmovn.s16  d27, q12

    vand        d27, d27, d23               ; filter &= mask

    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
    vshr.s8     d28, d28, #3                ; filter2 >>= 3
    vshr.s8     d27, d27, #3                ; filter1 >>= 3

    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)

    ; outer tap adjustments
    vrshr.s8    d27, d27, #1                ; filter = (filter1 + 1) >> 1

    veor        d6, d26, d18                ; *oq0 = u^0x80

    vbic        d27, d27, d22               ; filter &= ~hev

    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)

    veor        d5, d19, d18                ; *op0 = u^0x80
    veor        d4, d21, d18                ; *op1 = u^0x80
    veor        d7, d20, d18                ; *oq1 = u^0x80

    bx          lr
    ENDP        ; |vpx_loop_filter_neon|

    END