;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_subtract_b_neon|
    EXPORT  |vp8_subtract_mby_neon|
    EXPORT  |vp8_subtract_mbuv_neon|

    INCLUDE vp8_asm_enc_offsets.asm

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
;
; Subtracts a 4x4 predictor from the source block and stores the
; 16-bit differences into be->src_diff.  Each vld1.8 {dN} fetches
; 8 source/pred bytes per row, but only the low 4 diffs (one d reg
; of the widened q result) are stored per row, i.e. a 4x4 block.
;
; In:    r0 = be (BLOCK *), r1 = bd (BLOCKD *), r2 = pitch (bytes)
; Uses:  r3-r7 (r4-r7 saved), d0-d7, q10-q13 -- all caller-saved,
;        so no NEON save/restore is required here.
;-----------------------------------------------------------------------
|vp8_subtract_b_neon| PROC

    stmfd   sp!, {r4-r7}

    ldr     r3, [r0, #vp8_block_base_src]
    ldr     r4, [r0, #vp8_block_src]
    ldr     r5, [r0, #vp8_block_src_diff]
    ldr     r3, [r3]
    ldr     r6, [r0, #vp8_block_src_stride]
    add     r3, r3, r4                  ; src = *base_src + src
    ldr     r7, [r1, #vp8_blockd_predictor]

    vld1.8  {d0}, [r3], r6              ; load src
    vld1.8  {d1}, [r7], r2              ; load pred
    vld1.8  {d2}, [r3], r6
    vld1.8  {d3}, [r7], r2
    vld1.8  {d4}, [r3], r6
    vld1.8  {d5}, [r7], r2
    vld1.8  {d6}, [r3], r6
    vld1.8  {d7}, [r7], r2

    vsubl.u8 q10, d0, d1                ; widening subtract: u8 -> s16
    vsubl.u8 q11, d2, d3
    vsubl.u8 q12, d4, d5
    vsubl.u8 q13, d6, d7

    mov     r2, r2, lsl #1              ; diff stride = pitch * 2 (shorts)

    vst1.16 {d20}, [r5], r2             ; store diff (4 shorts per row)
    vst1.16 {d22}, [r5], r2
    vst1.16 {d24}, [r5], r2
    vst1.16 {d26}, [r5], r2

    ldmfd   sp!, {r4-r7}
    bx      lr

    ENDP


;-----------------------------------------------------------------------
; void vp8_subtract_mby_neon(short *diff, unsigned char *src,
;                            int src_stride, unsigned char *pred,
;                            int pred_stride)
;
; Subtracts a 16x16 predictor from the 16x16 luma source, producing
; 256 16-bit differences (4 loop iterations x 4 rows of 16).
;
; In:    r0 = diff, r1 = src, r2 = src_stride, r3 = pred,
;        [sp] (5th arg) = pred_stride
; Note:  q4-q7 map onto d8-d15, which are callee-saved under the
;        AAPCS, so they must be preserved across this call; the
;        original code clobbered them, which corrupts VFP/NEON state
;        in callers.  vpush/vpop added, and the stack-argument offset
;        rebased by the extra 64 bytes (16 GP + 64 NEON saved).
;-----------------------------------------------------------------------
|vp8_subtract_mby_neon| PROC
    push    {r4-r7}
    vpush   {d8-d15}                    ; q4-q7 are callee-saved (AAPCS)

    mov     r12, #4                     ; 4 iterations, 4 rows each
    ldr     r4, [sp, #80]               ; pred_stride (16 + 64 bytes saved)
    mov     r6, #32                     ; "diff" stride x2 (16 shorts)
    add     r5, r0, #16                 ; second diff pointer (odd rows)

subtract_mby_loop
    vld1.8  {q0}, [r1], r2              ; load src
    vld1.8  {q1}, [r3], r4              ; load pred
    vld1.8  {q2}, [r1], r2
    vld1.8  {q3}, [r3], r4
    vld1.8  {q4}, [r1], r2
    vld1.8  {q5}, [r3], r4
    vld1.8  {q6}, [r1], r2
    vld1.8  {q7}, [r3], r4

    vsubl.u8 q8,  d0,  d2               ; widening subtract: u8 -> s16
    vsubl.u8 q9,  d1,  d3
    vsubl.u8 q10, d4,  d6
    vsubl.u8 q11, d5,  d7
    vsubl.u8 q12, d8,  d10
    vsubl.u8 q13, d9,  d11
    vsubl.u8 q14, d12, d14
    vsubl.u8 q15, d13, d15

    vst1.16 {q8},  [r0], r6             ; store diff (interleaved ptrs)
    vst1.16 {q9},  [r5], r6
    vst1.16 {q10}, [r0], r6
    vst1.16 {q11}, [r5], r6
    vst1.16 {q12}, [r0], r6
    vst1.16 {q13}, [r5], r6
    vst1.16 {q14}, [r0], r6
    vst1.16 {q15}, [r5], r6

    subs    r12, r12, #1
    bne     subtract_mby_loop

    vpop    {d8-d15}
    pop     {r4-r7}
    bx      lr
    ENDP


;-----------------------------------------------------------------------
; void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc,
;                             unsigned char *vsrc, int src_stride,
;                             unsigned char *upred,
;                             unsigned char *vpred, int pred_stride)
;
; Subtracts the 8x8 U and 8x8 V predictors from their sources.
; Chroma diffs start at diff + 256 (hence the #512 byte offset into
; the short array).
;
; In:    r0 = diff, r1 = usrc, r2 = vsrc, r3 = src_stride,
;        stack: upred, vpred, pred_stride
; Note:  d8-d15 are callee-saved under the AAPCS and were clobbered
;        by the original code; vpush/vpop added and stack-argument
;        offsets rebased from #16/#20/#24 to #80/#84/#88.
;-----------------------------------------------------------------------
|vp8_subtract_mbuv_neon| PROC
    push    {r4-r7}
    vpush   {d8-d15}                    ; d8-d15 are callee-saved (AAPCS)

    ldr     r4, [sp, #80]               ; upred
    ldr     r5, [sp, #84]               ; vpred
    ldr     r6, [sp, #88]               ; pred_stride
    add     r0, r0, #512                ; short *udiff = diff + 256
    mov     r12, #32                    ; "diff" stride x2 (16 shorts)
    add     r7, r0, #16                 ; second diff pointer (odd rows)

;u
    vld1.8  {d0},  [r1], r3             ; load usrc
    vld1.8  {d1},  [r4], r6             ; load upred
    vld1.8  {d2},  [r1], r3
    vld1.8  {d3},  [r4], r6
    vld1.8  {d4},  [r1], r3
    vld1.8  {d5},  [r4], r6
    vld1.8  {d6},  [r1], r3
    vld1.8  {d7},  [r4], r6
    vld1.8  {d8},  [r1], r3
    vld1.8  {d9},  [r4], r6
    vld1.8  {d10}, [r1], r3
    vld1.8  {d11}, [r4], r6
    vld1.8  {d12}, [r1], r3
    vld1.8  {d13}, [r4], r6
    vld1.8  {d14}, [r1], r3
    vld1.8  {d15}, [r4], r6

    vsubl.u8 q8,  d0,  d1               ; widening subtract: u8 -> s16
    vsubl.u8 q9,  d2,  d3
    vsubl.u8 q10, d4,  d5
    vsubl.u8 q11, d6,  d7
    vsubl.u8 q12, d8,  d9
    vsubl.u8 q13, d10, d11
    vsubl.u8 q14, d12, d13
    vsubl.u8 q15, d14, d15

    vst1.16 {q8},  [r0], r12            ; store diff
    vst1.16 {q9},  [r7], r12
    vst1.16 {q10}, [r0], r12
    vst1.16 {q11}, [r7], r12
    vst1.16 {q12}, [r0], r12
    vst1.16 {q13}, [r7], r12
    vst1.16 {q14}, [r0], r12
    vst1.16 {q15}, [r7], r12

;v
    vld1.8  {d0},  [r2], r3             ; load vsrc
    vld1.8  {d1},  [r5], r6             ; load vpred
    vld1.8  {d2},  [r2], r3
    vld1.8  {d3},  [r5], r6
    vld1.8  {d4},  [r2], r3
    vld1.8  {d5},  [r5], r6
    vld1.8  {d6},  [r2], r3
    vld1.8  {d7},  [r5], r6
    vld1.8  {d8},  [r2], r3
    vld1.8  {d9},  [r5], r6
    vld1.8  {d10}, [r2], r3
    vld1.8  {d11}, [r5], r6
    vld1.8  {d12}, [r2], r3
    vld1.8  {d13}, [r5], r6
    vld1.8  {d14}, [r2], r3
    vld1.8  {d15}, [r5], r6

    vsubl.u8 q8,  d0,  d1
    vsubl.u8 q9,  d2,  d3
    vsubl.u8 q10, d4,  d5
    vsubl.u8 q11, d6,  d7
    vsubl.u8 q12, d8,  d9
    vsubl.u8 q13, d10, d11
    vsubl.u8 q14, d12, d13
    vsubl.u8 q15, d14, d15

    vst1.16 {q8},  [r0], r12            ; store diff
    vst1.16 {q9},  [r7], r12
    vst1.16 {q10}, [r0], r12
    vst1.16 {q11}, [r7], r12
    vst1.16 {q12}, [r0], r12
    vst1.16 {q13}, [r7], r12
    vst1.16 {q14}, [r0], r12
    vst1.16 {q15}, [r7], r12

    vpop    {d8-d15}
    pop     {r4-r7}
    bx      lr

    ENDP

    END