1233d2500723e5594f3e7c70896ffeeef32b9c950ywan; 2233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3233d2500723e5594f3e7c70896ffeeef32b9c950ywan; 4233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Use of this source code is governed by a BSD-style license 5233d2500723e5594f3e7c70896ffeeef32b9c950ywan; that can be found in the LICENSE file in the root of the source 6233d2500723e5594f3e7c70896ffeeef32b9c950ywan; tree. An additional intellectual property rights grant can be found 7233d2500723e5594f3e7c70896ffeeef32b9c950ywan; in the file PATENTS. All contributing project authors may 8233d2500723e5594f3e7c70896ffeeef32b9c950ywan; be found in the AUTHORS file in the root of the source tree. 9233d2500723e5594f3e7c70896ffeeef32b9c950ywan; 10233d2500723e5594f3e7c70896ffeeef32b9c950ywan 11233d2500723e5594f3e7c70896ffeeef32b9c950ywan EXPORT |vp9_lpf_horizontal_4_neon| 12233d2500723e5594f3e7c70896ffeeef32b9c950ywan EXPORT |vp9_lpf_vertical_4_neon| 13233d2500723e5594f3e7c70896ffeeef32b9c950ywan EXPORT |vp9_lpf_horizontal_8_neon| 14233d2500723e5594f3e7c70896ffeeef32b9c950ywan EXPORT |vp9_lpf_vertical_8_neon| 15233d2500723e5594f3e7c70896ffeeef32b9c950ywan ARM 16233d2500723e5594f3e7c70896ffeeef32b9c950ywan 17233d2500723e5594f3e7c70896ffeeef32b9c950ywan AREA ||.text||, CODE, READONLY, ALIGN=2 18233d2500723e5594f3e7c70896ffeeef32b9c950ywan 19233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter 20233d2500723e5594f3e7c70896ffeeef32b9c950ywan; works on 16 iterations at a time. 21233d2500723e5594f3e7c70896ffeeef32b9c950ywan; TODO(fgalligan): See about removing the count code as this function is only 22233d2500723e5594f3e7c70896ffeeef32b9c950ywan; called with a count of 1. 
;
; void vp9_lpf_horizontal_4_neon(uint8_t *s,
;                                int p /* pitch */,
;                                const uint8_t *blimit,
;                                const uint8_t *limit,
;                                const uint8_t *thresh,
;                                int count)
;
; Each pass of the loop filters 8 adjacent columns across the horizontal edge
; directly above row s: rows s - 4*pitch .. s + 3*pitch are read and rows
; s - 2*pitch .. s + pitch are written back.
;
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
|vp9_lpf_horizontal_4_neon| PROC
    push        {lr}                       ; stack args are now at sp+4 / sp+8

    vld1.8      {d0[]}, [r2]               ; d0 = *blimit in every lane
    add         r1, r1, r1                 ; r1 = 2 * pitch
    ldr         r12, [sp, #8]              ; r12 = count
    ldr         r2, [sp, #4]               ; r2 = thresh

    cmp         r12, #0
    beq         end_vp9_lf_h_edge          ; count == 0: nothing to filter

    vld1.8      {d1[]}, [r3]               ; d1 = *limit in every lane
    vld1.8      {d2[]}, [r2]               ; d2 = *thresh in every lane

count_lf_h_loop
    ; Two row pointers leapfrog each other, each stepping by 2 * pitch:
    ; r2 visits p3, p1, q0, q2 and r3 visits p2, p0, q1, q3.
    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 * pitch
    add         r3, r2, r1, lsr #1         ; r3 = s - 3 * pitch

    vld1.u8     {d3}, [r2@64], r1          ; p3
    vld1.u8     {d4}, [r3@64], r1          ; p2
    vld1.u8     {d5}, [r2@64], r1          ; p1
    vld1.u8     {d6}, [r3@64], r1          ; p0
    vld1.u8     {d7}, [r2@64], r1          ; q0
    vld1.u8     {d16}, [r3@64], r1         ; q1
    vld1.u8     {d17}, [r2@64]             ; q2 (r2 stays on this row)
    vld1.u8     {d18}, [r3@64]             ; q3 (r3 stays on this row)

    ; Rewind both pointers by 4 rows to reach the first rows to be written.
    sub         r2, r2, r1, lsl #1         ; r2 = s - 2 * pitch (p1 row)
    sub         r3, r3, r1, lsl #1         ; r3 = s - pitch     (p0 row)

    bl          vp9_loop_filter_neon       ; results come back in d4-d7

    vst1.u8     {d4}, [r2@64], r1          ; store op1
    vst1.u8     {d5}, [r3@64], r1          ; store op0
    vst1.u8     {d6}, [r2@64], r1          ; store oq0
    vst1.u8     {d7}, [r3@64], r1          ; store oq1

    add         r0, r0, #8                 ; next group of 8 columns
    subs        r12, r12, #1
    bne         count_lf_h_loop

end_vp9_lf_h_edge
    pop         {pc}
    ENDP        ; |vp9_lpf_horizontal_4_neon|
; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; void vp9_lpf_vertical_4_neon(uint8_t *s,
;                              int p /* pitch */,
;                              const uint8_t *blimit,
;                              const uint8_t *limit,
;                              const uint8_t *thresh,
;                              int count)
;
; Each pass of the loop filters 8 rows across the vertical edge immediately
; left of column s: bytes s - 4 .. s + 3 of each row are read and bytes
; s - 2 .. s + 1 are written back. r0 is left untouched by the stores so that
; the "s += pitch * 8" step is correct for every pass, not only the first.
;
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
|vp9_lpf_vertical_4_neon| PROC
    push        {lr}                       ; stack args are now at sp+4 / sp+8

    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
    ldr         r12, [sp, #8]              ; load count
    vld1.8      {d1[]}, [r3]               ; duplicate *limit

    ldr         r3, [sp, #4]               ; load thresh
    sub         r2, r0, #4                 ; r2 = s - 4 (leftmost column, p3)
    cmp         r12, #0
    beq         end_vp9_lf_v_edge

    vld1.8      {d2[]}, [r3]               ; duplicate *thresh (r3 free after this)

count_lf_v_loop
    vld1.u8     {d3}, [r2], r1             ; load 8 rows of 8 pixels
    vld1.u8     {d4}, [r2], r1
    vld1.u8     {d5}, [r2], r1
    vld1.u8     {d6}, [r2], r1
    vld1.u8     {d7}, [r2], r1
    vld1.u8     {d16}, [r2], r1
    vld1.u8     {d17}, [r2], r1
    vld1.u8     {d18}, [r2]

    ; transpose the 8x8 block so that each register holds one column:
    ; d3..d7 = p3..q0, d16..d18 = q1..q3
    vtrn.32     d3, d7
    vtrn.32     d4, d16
    vtrn.32     d5, d17
    vtrn.32     d6, d18

    vtrn.16     d3, d5
    vtrn.16     d4, d6
    vtrn.16     d7, d17
    vtrn.16     d16, d18

    vtrn.8      d3, d4
    vtrn.8      d5, d6
    vtrn.8      d7, d16
    vtrn.8      d17, d18

    ; Dedicated store pointer. Storing through r0 would leave it at
    ; s - 2 + 7 * pitch and make the row advance below overshoot.
    sub         r3, r0, #2                 ; r3 = s - 2 (column of p1)

    bl          vp9_loop_filter_neon       ; only touches NEON registers

    ;store op1, op0, oq0, oq1 (4 bytes per row, one lane per row)
    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r3], r1
    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r3], r1
    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r3], r1
    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r3], r1
    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r3], r1
    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r3], r1
    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r3], r1
    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r3]

    add         r0, r0, r1, lsl #3         ; s += pitch * 8
    subs        r12, r12, #1
    subne       r2, r0, #4                 ; r2 = s - 4 for the next 8 rows
    bne         count_lf_v_loop

end_vp9_lf_v_edge
    pop         {pc}
    ENDP        ; |vp9_lpf_vertical_4_neon|

; void vp9_loop_filter_neon();
; This is a helper function for the loopfilters.
; The individual functions do the necessary load, transpose (if necessary)
; and store. The function does not use registers d8-d15, and it executes no
; core-register or memory instructions, so r0-r3 and r12 survive the call.
;
; Inputs:
; r0-r3, r12 PRESERVE
; d0    blimit
; d1    limit
; d2    thresh
; d3    p3
; d4    p2
; d5    p1
; d6    p0
; d7    q0
; d16   q1
; d17   q2
; d18   q3
;
; Outputs:
; d4    op1
; d5    op0
; d6    oq0
; d7    oq1
;
; Clobbers d3, d16-d28 (q12 = d24/d25 is used as a 16-bit accumulator).
; d0-d2 are only read. The instruction order below is interleaved by hand;
; keep it as is.
|vp9_loop_filter_neon| PROC
    ; --- neighbour differences feeding the filter mask ---
    vabd.u8     d19, d3, d4                 ; |p3 - p2|
    vabd.u8     d20, d4, d5                 ; |p2 - p1|
    vabd.u8     d21, d5, d6                 ; |p1 - p0|  (also used for hev)
    vabd.u8     d22, d16, d7                ; |q1 - q0|  (also used for hev)
    vabd.u8     d3, d17, d16                ; |q2 - q1|  (p3 no longer needed)
    vabd.u8     d4, d18, d17                ; |q3 - q2|  (p2 no longer needed)

    ; reduce the six differences to one maximum before comparing to limit
    vmax.u8     d19, d19, d20               ; max(|p3-p2|, |p2-p1|)
    vmax.u8     d20, d21, d22               ; max(|p1-p0|, |q1-q0|)

    vabd.u8     d17, d6, d7                 ; |p0 - q0|

    vmax.u8     d3, d3, d4                  ; max(|q2-q1|, |q3-q2|)

    vmov.u8     d18, #0x80                  ; sign-flip constant

    vmax.u8     d23, d19, d20               ; max of the four p-side/inner diffs

    ; --- high edge variance: all-ones lanes where a diff exceeds thresh ---
    vcgt.u8     d21, d21, d2                ; |p1 - p0| > thresh
    vcgt.u8     d22, d22, d2                ; |q1 - q0| > thresh
    vmax.u8     d23, d23, d3                ; max of all six differences

    vabd.u8     d28, d5, d16                ; |p1 - q1|
    vqadd.u8    d17, d17, d17               ; |p0 - q0| * 2 (saturating)

    veor        d7, d7, d18                 ; qs0 = q0 ^ 0x80 (to signed)

    vcge.u8     d23, d1, d23                ; limit >= max diff

    ; --- filter(): work on sign-flipped pixels ---

    vshr.u8     d28, d28, #1                ; |p1 - q1| / 2
    veor        d6, d6, d18                 ; ps0 = p0 ^ 0x80

    veor        d5, d5, d18                 ; ps1 = p1 ^ 0x80
    vqadd.u8    d17, d17, d28               ; |p0-q0|*2 + |p1-q1|/2 (saturating)

    veor        d16, d16, d18               ; qs1 = q1 ^ 0x80

    vmov.u8     d19, #3

    ; Non-saturating subtract: a lane can only wrap when |p0 - q0| >= 128,
    ; and such a lane passes the blimit test below only if blimit is 255.
    vsub.s8     d28, d7, d6                 ; qs0 - ps0

    vcge.u8     d17, d0, d17                ; blimit >= |p0-q0|*2 + |p1-q1|/2

    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1 - qs1)
    vorr        d22, d21, d22               ; hev = either threshold exceeded

    vmull.s8    q12, d28, d19               ; 3 * (qs0 - ps0), widened to 16 bit

    vand        d27, d27, d22               ; filter &= hev
    vand        d23, d23, d17               ; mask = limit test & blimit test

    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)

    vmov.u8     d17, #4

    ; narrow with saturation: filter = clamp(filter + 3 * (qs0 - ps0))
    vqmovn.s16  d27, q12

    vand        d27, d27, d23               ; filter &= mask

    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter + 3)
    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter + 4)
    vshr.s8     d28, d28, #3                ; filter2 >>= 3
    vshr.s8     d27, d27, #3                ; filter1 >>= 3

    vqadd.s8    d19, d6, d28                ; clamp(ps0 + filter2)
    vqsub.s8    d26, d7, d27                ; clamp(qs0 - filter1)

    ; --- outer taps: half of filter1, rounded, only where hev is clear ---
    vrshr.s8    d27, d27, #1                ; filter = (filter1 + 1) >> 1

    veor        d6, d26, d18                ; oq0 back to unsigned

    vbic        d27, d27, d22               ; filter &= ~hev

    vqadd.s8    d21, d5, d27                ; clamp(ps1 + filter)
    vqsub.s8    d20, d16, d27               ; clamp(qs1 - filter)

    veor        d5, d19, d18                ; op0 back to unsigned
    veor        d4, d21, d18                ; op1 back to unsigned
    veor        d7, d20, d18                ; oq1 back to unsigned

    bx          lr
    ENDP        ; |vp9_loop_filter_neon|

; void
; vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
;                           const uint8_t *blimit,
;                           const uint8_t *limit,
;                           const uint8_t *thresh,
;                           int count)
;
; Each pass of the loop filters 8 adjacent columns across the horizontal edge
; directly above row s: rows s - 4*pitch .. s + 3*pitch are read and rows
; s - 3*pitch .. s + 2*pitch are written back.
;
; vp9_mbloop_filter_neon returns its results in d0-d5, which overwrites the
; blimit/limit/thresh vectors in d0-d2. They are therefore re-broadcast at the
; top of every pass; the blimit and limit pointers are kept in the stack frame
; for that purpose.
;
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
|vp9_lpf_horizontal_8_neon| PROC
    ; Frame: [sp] blimit, [sp, #4] limit, then r4, r5, lr. The stack
    ; arguments end up at [sp, #20] (thresh) and [sp, #24] (count).
    ; r4/r5 are saved because vp9_mbloop_filter_neon writes them.
    push        {r2-r5, lr}

    ldr         r12, [sp, #24]             ; load count
    add         r1, r1, r1                 ; double pitch

    cmp         r12, #0
    beq         end_vp9_mblf_h_edge

count_mblf_h_loop
    ldr         r2, [sp]                   ; blimit
    ldr         r3, [sp, #4]               ; limit
    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
    ldr         r2, [sp, #20]              ; thresh
    vld1.8      {d1[]}, [r3]               ; duplicate *limit
    vld1.8      {d2[]}, [r2]               ; duplicate *thresh

    ; r3 visits p3, p1, q0, q2 and r2 visits p2, p0, q1, q3, each stepping
    ; by 2 * pitch.
    sub         r3, r0, r1, lsl #1         ; r3 = s - 4 * pitch
    add         r2, r3, r1, lsr #1         ; r2 = s - 3 * pitch

    vld1.u8     {d3}, [r3@64], r1          ; p3
    vld1.u8     {d4}, [r2@64], r1          ; p2
    vld1.u8     {d5}, [r3@64], r1          ; p1
    vld1.u8     {d6}, [r2@64], r1          ; p0
    vld1.u8     {d7}, [r3@64], r1          ; q0
    vld1.u8     {d16}, [r2@64], r1         ; q1
    vld1.u8     {d17}, [r3@64]             ; q2 (r3 stays on this row)
    vld1.u8     {d18}, [r2@64], r1         ; q3 (r2 moves on to s + 5 * pitch)

    sub         r3, r3, r1, lsl #1         ; r3 = s - 2 * pitch (p1 row)
    sub         r2, r2, r1, lsl #2         ; r2 = s - 3 * pitch (p2 row)

    bl          vp9_mbloop_filter_neon     ; results come back in d0-d5

    vst1.u8     {d0}, [r2@64], r1          ; store op2
    vst1.u8     {d1}, [r3@64], r1          ; store op1
    vst1.u8     {d2}, [r2@64], r1          ; store op0
    vst1.u8     {d3}, [r3@64], r1          ; store oq0
    vst1.u8     {d4}, [r2@64], r1          ; store oq1
    vst1.u8     {d5}, [r3@64], r1          ; store oq2

    add         r0, r0, #8                 ; next group of 8 columns
    subs        r12, r12, #1
    bne         count_mblf_h_loop

end_vp9_mblf_h_edge
    pop         {r2-r5, pc}                ; r2/r3 slots are discarded into scratch

    ENDP        ; |vp9_lpf_horizontal_8_neon|

; void vp9_lpf_vertical_8_neon(uint8_t *s,
;                              int pitch,
;                              const uint8_t *blimit,
;                              const uint8_t *limit,
;                              const uint8_t *thresh,
;                              int count)
;
; Each pass of the loop filters 8 rows across the vertical edge immediately
; left of column s: bytes s - 4 .. s + 3 of each row are read and bytes
; s - 3 .. s + 2 are written back.
;
; As in the horizontal variant, blimit/limit/thresh are re-broadcast on every
; pass because vp9_mbloop_filter_neon returns its results in d0-d5.
;
; r0    uint8_t *s,
; r1    int pitch,
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
|vp9_lpf_vertical_8_neon| PROC
    ; Frame: [sp] blimit, [sp, #4] limit, then r4, r5, lr. The stack
    ; arguments end up at [sp, #20] (thresh) and [sp, #24] (count).
    push        {r2-r5, lr}

    ldr         r12, [sp, #24]             ; load count

    cmp         r12, #0
    beq         end_vp9_mblf_v_edge

count_mblf_v_loop
    ldr         r2, [sp]                   ; blimit
    ldr         r3, [sp, #4]               ; limit
    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
    ldr         r2, [sp, #20]              ; thresh
    vld1.8      {d1[]}, [r3]               ; duplicate *limit
    vld1.8      {d2[]}, [r2]               ; duplicate *thresh

    sub         r2, r0, #4                 ; r2 = s - 4 (leftmost column, p3)

    vld1.u8     {d3}, [r2], r1             ; load 8 rows of 8 pixels
    vld1.u8     {d4}, [r2], r1
    vld1.u8     {d5}, [r2], r1
    vld1.u8     {d6}, [r2], r1
    vld1.u8     {d7}, [r2], r1
    vld1.u8     {d16}, [r2], r1
    vld1.u8     {d17}, [r2], r1
    vld1.u8     {d18}, [r2]

    ; transpose the 8x8 block so that each register holds one column:
    ; d3..d7 = p3..q0, d16..d18 = q1..q3
    vtrn.32     d3, d7
    vtrn.32     d4, d16
    vtrn.32     d5, d17
    vtrn.32     d6, d18

    vtrn.16     d3, d5
    vtrn.16     d4, d6
    vtrn.16     d7, d17
    vtrn.16     d16, d18

    vtrn.8      d3, d4
    vtrn.8      d5, d6
    vtrn.8      d7, d16
    vtrn.8      d17, d18

    sub         r2, r0, #3                 ; r2 = s - 3 (column of p2)
    add         r3, r0, #1                 ; r3 = s + 1 (column of q1)

    bl          vp9_mbloop_filter_neon     ; results come back in d0-d5

    ;store op2, op1, op0, oq0 (4 bytes per row, one lane per row)
    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]

    ;store oq1, oq2 (2 bytes per row)
    vst2.8      {d4[0], d5[0]}, [r3], r1
    vst2.8      {d4[1], d5[1]}, [r3], r1
    vst2.8      {d4[2], d5[2]}, [r3], r1
    vst2.8      {d4[3], d5[3]}, [r3], r1
    vst2.8      {d4[4], d5[4]}, [r3], r1
    vst2.8      {d4[5], d5[5]}, [r3], r1
    vst2.8      {d4[6], d5[6]}, [r3], r1
    vst2.8      {d4[7], d5[7]}, [r3]

    add         r0, r0, r1, lsl #3         ; s += pitch * 8
    subs        r12, r12, #1
    bne         count_mblf_v_loop

end_vp9_mblf_v_edge
    pop         {r2-r5, pc}                ; r2/r3 slots are discarded into scratch
    ENDP        ; |vp9_lpf_vertical_8_neon|

; void vp9_mbloop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store. The function does not use
; registers d8-d15.
;
; Inputs:
; r0-r3, r12 PRESERVE
; d0    blimit
; d1    limit
; d2    thresh
; d3    p3
; d4    p2
; d5    p1
; d6    p0
; d7    q0
; d16   q1
; d17   q2
; d18   q3
;
; Outputs:
; d0    op2
; d1    op1
; d2    op0
; d3    oq0
; d4    oq1
; d5    oq2
|vp9_mbloop_filter_neon| PROC
    ; Shared 8-lane filter kernel. Pixel rows/columns arrive in d3-d7 and
    ; d16-d18 (p3..q3) with blimit/limit/thresh splatted in d0-d2; the six
    ; filtered outputs leave in d0-d5 (op2, op1, op0, oq0, oq1, oq2).
    ;
    ; Scratch: r4 and r5 are overwritten. d19-d31 are temporaries. Depending
    ; on the exit path, d6/d7 (mixed path) or d17 (filter_branch_only) are
    ; overwritten as well.
    ;
    ; Three exit paths, selected from the per-lane "flat & mask" value:
    ;   all lanes set   -> power_branch_only  (8-tap averaging only)
    ;   all lanes clear -> filter_branch_only (4-tap filter only)
    ;   mixed           -> both are computed and merged with vbif/vbit

    ; filter_mask
    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)

    ; only compare the largest value to limit
    vmax.u8     d19, d19, d20              ; d19 = max(m1, m2)
    vmax.u8     d20, d21, d22              ; d20 = max(m3, m4)

    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)

    vmax.u8     d23, d23, d24              ; d23 = max(m5, m6)

    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)

    vmax.u8     d19, d19, d20              ; d19 = max(m1..m4)

    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)

    vmax.u8     d19, d19, d23              ; d19 = max(m1..m6)

    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2 (saturating)

    ; limit >= max(m1..m6): lane is all ones when no neighbour difference
    ; exceeds limit
    vcge.u8     d19, d1, d19

    ; only compare the largest value to 1 for the flat test
    vmax.u8     d25, d25, d26              ; d25 = max(m7, m8)
    vmax.u8     d26, d27, d28              ; d26 = max(m10, m11)

    vshr.u8     d23, d23, #1               ; a = a / 2

    vmax.u8     d25, d25, d26              ; d25 = max(m7, m8, m10, m11)

    vqadd.u8    d24, d24, d23              ; a = b + a (saturating)

    vmax.u8     d20, d20, d25              ; d20 = max(m3, m4, m7, m8, m10, m11)

    vmov.u8     d23, #1
    vcge.u8     d24, d0, d24               ; blimit >= a

    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1

    vcge.u8     d20, d23, d20              ; flat: 1 >= every distance from p0/q0

    vand        d19, d19, d24              ; mask

    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1

    vand        d20, d20, d19              ; flat & mask

    vmov.u8     d22, #0x80                 ; sign-bias constant for u8 <-> s8

    vorr        d23, d21, d23              ; hev

    ; This instruction will truncate the "flat & mask" masks down to 4 bits
    ; each to fit into one 32 bit arm register. The values come from
    ; q10.64[0] (d20) and land in the low 32 bits of d30; the upper half of
    ; d30 is derived from d21 and is not read.
    vshrn.u16   d30, q10, #4
    vmov.u32    r4, d30[0]                 ; flat & mask 4bits

    adds        r5, r4, #1                 ; Z set iff r4 == 0xffffffff (all 1's)

    ; If mask and flat are 1's for all vectors, then we only need to execute
    ; the power branch for all vectors.
    beq         power_branch_only

    ; The Z flag produced here is consumed by "beq filter_branch_only" further
    ; down. Every instruction in between is a NEON data-processing op, which
    ; does not write the condition flags, so do not insert flag-setting core
    ; instructions in that window.
    cmp         r4, #0                     ; Check for 0, set flag for later

    ; mbfilter() function
    ; filter() function
    ; convert to signed
    veor        d21, d7, d22               ; qs0
    veor        d24, d6, d22               ; ps0
    veor        d25, d5, d22               ; ps1
    veor        d26, d16, d22              ; qs1

    vmov.u8     d27, #3

    ; NOTE(review): this is a plain (wrapping) 8-bit subtract. Lanes where
    ; |qs0 - ps0| > 127 would wrap; presumably those lanes always fail the
    ; blimit test for the blimit values callers pass and are zeroed by
    ; "filter &= mask" below -- confirm against the C reference.
    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)

    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)

    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)

    vand        d29, d29, d23              ; filter &= hev

    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)

    vmov.u8     d29, #4

    ; filter = clamp(filter + 3 * ( qs0 - ps0))
    vqmovn.s16  d28, q15

    vand        d28, d28, d19              ; filter &= mask

    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
    vshr.s8     d30, d30, #3               ; filter2 >>= 3
    vshr.s8     d29, d29, #3               ; filter1 >>= 3

    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)

    ; outer tap adjustments: (filter1 + 1) >> 1 via rounding shift
    vrshr.s8    d29, d29, #1
    vbic        d29, d29, d23              ; filter &= ~hev

    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)

    ; If mask and flat are 0's for all vectors, then we only need to execute
    ; the filter branch for all vectors. (Uses Z from "cmp r4, #0" above.)
    beq         filter_branch_only

    ; If mask and flat are mixed then we must perform both branches and
    ; combine the data.
    veor        d24, d24, d22              ; *f_op0 = u^0x80
    veor        d21, d21, d22              ; *f_oq0 = u^0x80
    veor        d25, d25, d22              ; *f_op1 = u^0x80
    veor        d26, d26, d22              ; *f_oq1 = u^0x80

    ; At this point we have already executed the filter branch. The filter
    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
    ; branch and combine the data.
    ;
    ; q14 holds a running 16-bit sum that is slid across the taps; each
    ; output is that sum rounded and shifted right by 3. Input registers are
    ; only overwritten (vbif/vqrshrn into d3-d7) after their last read by
    ; the running sum, so the interleaving below must be kept as is.
    ; d27 still holds the constant 3 loaded for the filter branch.
    vmov.u8     d23, #2
    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2

    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)

    vaddw.u8    q14, d5                    ; r_op2 += p1

    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)

    vqrshrn.u16 d30, q14, #3               ; r_op2

    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
    vsubw.u8    q14, d4                    ; r_op1 -= p2
    vaddw.u8    q14, d5                    ; r_op1 += p1
    vaddw.u8    q14, d16                   ; r_op1 += q1

    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)

    vqrshrn.u16 d31, q14, #3               ; r_op1

    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
    vsubw.u8    q14, d5                    ; r_op0 -= p1
    vaddw.u8    q14, d6                    ; r_op0 += p0
    vaddw.u8    q14, d17                   ; r_op0 += q2

    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)

    vqrshrn.u16 d23, q14, #3               ; r_op0

    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
    vsubw.u8    q14, d6                    ; r_oq0 -= p0
    vaddw.u8    q14, d7                    ; r_oq0 += q0

    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)

    vaddw.u8    q14, d18                   ; r_oq0 += q3

    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)

    vqrshrn.u16 d22, q14, #3               ; r_oq0 (0x80 constant no longer needed)

    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
    vsubw.u8    q14, d7                    ; r_oq1 -= q0
    vaddw.u8    q14, d16                   ; r_oq1 += q1

    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)

    vaddw.u8    q14, d18                   ; r_oq1 += q3

    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)

    vqrshrn.u16 d6, q14, #3                ; r_oq1

    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
    vsubw.u8    q14, d16                   ; r_oq2 -= q1
    vaddw.u8    q14, d17                   ; r_oq2 += q2
    vaddw.u8    q14, d18                   ; r_oq2 += q3

    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)

    vqrshrn.u16 d7, q14, #3                ; r_oq2

    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)

    bx          lr

power_branch_only
    ; Every lane is flat & mask: write the averaged taps straight into the
    ; output registers. Each of d3-d5 is overwritten only after the running
    ; sum in q14 has read it for the last time.
    vmov.u8     d27, #3
    vmov.u8     d21, #2
    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
    vaddw.u8    q14, d5                    ; op2 += p1
    vqrshrn.u16 d0, q14, #3                ; op2

    vsubw.u8    q14, d3                    ; op1 = op2 - p3
    vsubw.u8    q14, d4                    ; op1 -= p2
    vaddw.u8    q14, d5                    ; op1 += p1
    vaddw.u8    q14, d16                   ; op1 += q1
    vqrshrn.u16 d1, q14, #3                ; op1

    vsubw.u8    q14, d3                    ; op0 = op1 - p3
    vsubw.u8    q14, d5                    ; op0 -= p1
    vaddw.u8    q14, d6                    ; op0 += p0
    vaddw.u8    q14, d17                   ; op0 += q2
    vqrshrn.u16 d2, q14, #3                ; op0

    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
    vsubw.u8    q14, d6                    ; oq0 -= p0
    vaddw.u8    q14, d7                    ; oq0 += q0
    vaddw.u8    q14, d18                   ; oq0 += q3
    vqrshrn.u16 d3, q14, #3                ; oq0

    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
    vsubw.u8    q14, d7                    ; oq1 -= q0
    vaddw.u8    q14, d16                   ; oq1 += q1
    vaddw.u8    q14, d18                   ; oq1 += q3
    vqrshrn.u16 d4, q14, #3                ; oq1

    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
    vsubw.u8    q14, d16                   ; oq2 -= q1
    vaddw.u8    q14, d17                   ; oq2 += q2
    vaddw.u8    q14, d18                   ; oq2 += q3
    vqrshrn.u16 d5, q14, #3                ; oq2

    bx          lr

filter_branch_only
    ; No lane is flat & mask: op2/oq2 are the unmodified p2/q2, the other
    ; four outputs are the 4-tap results converted back to unsigned.
    ; TODO(fgalligan): See if we can rearrange registers so we do not need to
    ; do the 2 vswp.
    vswp        d0, d4                     ; op2 = p2 (d4 is rewritten below)
    vswp        d5, d17                    ; oq2 = q2 (d17 now holds p1)
    veor        d2, d24, d22               ; *op0 = u^0x80
    veor        d3, d21, d22               ; *oq0 = u^0x80
    veor        d1, d25, d22               ; *op1 = u^0x80
    veor        d4, d26, d22               ; *oq1 = u^0x80

    bx          lr

    ENDP        ; |vp9_mbloop_filter_neon|

    END