;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_fast_fdct8x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;NOTE:
;The input is *src_diff. src_diff is calculated as:
;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions)
;where *src_ptr and *pred_ptr are both unsigned char.
;Therefore, *src_diff is in the range [-255, 255].
;CAUTION:
;The input values of the 25th block are set in vp8_build_dcblock and can fall outside [-255, 255].
;However, the VP8 encoder only uses vp8_short_fdct4x4_c for the 25th block, never vp8_fast_fdct4x4_c.
;So assuming *input is in [-255, 255] is valid for vp8_fast_fdct4x4_c, but not for vp8_short_fdct4x4_c.

|vp8_fast_fdct8x4_neon| PROC
    vld1.16         {q1}, [r0], r2              ;load input
    ldr             r12, _ffdct8_coeff_
    vld1.16         {q2}, [r0], r2
    vld1.16         {q3}, [r0], r2
    vld1.16         {d0}, [r12]
    vld1.16         {q4}, [r0], r2

    ;First pass (first for-loop in the C code)
    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
    vtrn.32         d2, d6
    vtrn.32         d3, d7
    vtrn.32         d4, d8
    vtrn.32         d5, d9
    vtrn.16         d2, d4
    vtrn.16         d3, d5
    vtrn.16         d6, d8
    vtrn.16         d7, d9

    vadd.s16        d10, d2, d8                 ;ip[0]+ip[3]
    vadd.s16        d11, d4, d6                 ;ip[1]+ip[2]
    vsub.s16        d12, d4, d6                 ;ip[1]-ip[2]
    vsub.s16        d13, d2, d8                 ;ip[0]-ip[3]
    vadd.s16        d22, d3, d9
    vadd.s16        d23, d5, d7
    vsub.s16        d24, d5, d7
    vsub.s16        d25, d3, d9

    vshl.i16        q5, q5, #1                  ;a1, b1
    vshl.i16        q6, q6, #1                  ;c1, d1
    vshl.i16        q1, q11, #1
    vshl.i16        q2, q12, #1

    vadd.s16        d14, d10, d11               ;temp1 = a1 + b1
    vsub.s16        d15, d10, d11               ;temp2 = a1 - b1
    vadd.s16        d24, d2, d3
    vsub.s16        d25, d2, d3

    vqdmulh.s16     q8, q7, d0[1]
    vqdmulh.s16     q13, q12, d0[1]
    vqdmulh.s16     q10, q6, d0[0]
    vqdmulh.s16     q15, q2, d0[0]
    vqdmulh.s16     q9, q6, d0[2]
    vqdmulh.s16     q14, q2, d0[2]

    vshr.s16        q8, q8, #1
    vshr.s16        q13, q13, #1
    vshr.s16        q10, q10, #1
    vshr.s16        q15, q15, #1
    vshr.s16        q9, q9, #1                  ;d18:temp1 = (c1 * x_c3)>>16;  d19:temp1 = (d1 * x_c3)>>16
    vshr.s16        q14, q14, #1                ;d28:temp1 = (c1 * x_c3)>>16;  d29:temp1 = (d1 * x_c3)>>16
    vadd.s16        q10, q6, q10                ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
    vadd.s16        q15, q2, q15                ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d14, d16                ;op[0] = ((temp1 * x_c2)>>16) + temp1
    vadd.s16        d3, d24, d26                ;op[0] = ((temp1 * x_c2)>>16) + temp1
    vadd.s16        d6, d15, d17                ;op[2] = ((temp2 * x_c2)>>16) + temp2
    vadd.s16        d7, d25, d27                ;op[2] = ((temp2 * x_c2)>>16) + temp2
    vadd.s16        d4, d18, d21                ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
    vadd.s16        d5, d28, d31                ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
    vsub.s16        d8, d19, d20                ;op[3] = temp1 - temp2
    vsub.s16        d9, d29, d30                ;op[3] = temp1 - temp2

    ;Second pass (second for-loop in the C code)
    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
    vtrn.32         d2, d6
    vtrn.32         d3, d7
    vtrn.32         d4, d8
    vtrn.32         d5, d9
    vtrn.16         d2, d4
    vtrn.16         d3, d5
    vtrn.16         d6, d8
    vtrn.16         d7, d9

    vadd.s16        d10, d2, d8                 ;a1 = ip[0]+ip[12]
    vadd.s16        d11, d4, d6                 ;b1 = ip[4]+ip[8]
    vsub.s16        d12, d4, d6                 ;c1 = ip[4]-ip[8]
    vsub.s16        d13, d2, d8                 ;d1 = ip[0]-ip[12]
    vadd.s16        d2, d3, d9
    vadd.s16        d4, d5, d7
    vsub.s16        d24, d5, d7
    vsub.s16        d25, d3, d9

    vadd.s16        d14, d10, d11               ;temp1 = a1 + b1
    vsub.s16        d15, d10, d11               ;temp2 = a1 - b1
    vadd.s16        d22, d2, d4
    vsub.s16        d23, d2, d4

    vqdmulh.s16     q8, q7, d0[1]
    vqdmulh.s16     q13, q11, d0[1]
    vqdmulh.s16     q10, q6, d0[0]
    vqdmulh.s16     q15, q12, d0[0]
    vqdmulh.s16     q9, q6, d0[2]
    vqdmulh.s16     q14, q12, d0[2]

    vshr.s16        q8, q8, #1
    vshr.s16        q13, q13, #1
    vshr.s16        q10, q10, #1
    vshr.s16        q15, q15, #1
    vshr.s16        q9, q9, #1                  ;d18:temp1 = (c1 * x_c3)>>16;  d19:temp1 = (d1 * x_c3)>>16
    vshr.s16        q14, q14, #1                ;d28:temp1 = (c1 * x_c3)>>16;  d29:temp1 = (d1 * x_c3)>>16
    vadd.s16        q10, q6, q10                ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
    vadd.s16        q15, q12, q15               ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d14, d16                ;a2 = ((temp1 * x_c2)>>16) + temp1
    vadd.s16        d6, d22, d26                ;a2 = ((temp1 * x_c2)>>16) + temp1
    vadd.s16        d4, d15, d17                ;c2 = ((temp2 * x_c2)>>16) + temp2
    vadd.s16        d8, d23, d27                ;c2 = ((temp2 * x_c2)>>16) + temp2
    vadd.s16        d3, d18, d21                ;b2 = temp1 + temp2 -- q is not necessary, just for protection
    vadd.s16        d7, d28, d31                ;b2 = temp1 + temp2 -- q is not necessary, just for protection
    vsub.s16        d5, d19, d20                ;d2 = temp1 - temp2
    vsub.s16        d9, d29, d30                ;d2 = temp1 - temp2

    ;Halve the results, rounding toward zero
    vclt.s16        q5, q1, #0                  ;mask = 0xffff where the lane is negative
    vclt.s16        q6, q2, #0
    vclt.s16        q7, q3, #0
    vclt.s16        q8, q4, #0

    vsub.s16        q1, q1, q5                  ;subtracting the mask adds 1 to negative lanes
    vsub.s16        q2, q2, q6
    vsub.s16        q3, q3, q7
    vsub.s16        q4, q4, q8

    vshr.s16        q1, q1, #1                  ;arithmetic >>1 now divides by 2, rounding toward zero
    vshr.s16        q2, q2, #1
    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1

    vst1.16         {q1, q2}, [r1]!             ;store output
    vst1.16         {q3, q4}, [r1]

    bx              lr

    ENDP

;-----------------
    AREA    fastfdct8x4_dat, DATA, READONLY
;Read-only data section fastfdct8x4_dat. DCD reserves one word of storage per value.
;The label ffdct8_coeff gives the address of the packed 16-bit multiplier constants
;that are loaded into d0 at the top of the function.
_ffdct8_coeff_
    DCD     ffdct8_coeff
ffdct8_coeff
; 60547 = 0xEC83  (x_c1, d0[0])
; 46341 = 0xB505  (x_c2, d0[1])
; 25080 = 0x61F8  (x_c3, d0[2])
    DCD     0xB505EC83, 0x000061F8

    END
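;-----------------
;Reference sketch (not assembled; the assembler stops at the END directive above).
;A rough C-level model of one horizontal 4-point pass, reconstructed from the comments
;in this file. x_c1/x_c2/x_c3 refer to the three 16-bit constants in ffdct8_coeff as
;annotated above; the helper name and exact types are illustrative only and are not
;taken from the actual vp8_fast_fdct4x4_c source.
;
;    /* ip: one row of 4 input diffs, op: the 4 output coefficients for that row */
;    static void fdct4_row_sketch(const short *ip, short *op,
;                                 int x_c1, int x_c2, int x_c3)
;    {
;        int a1 = (ip[0] + ip[3]) << 1;
;        int b1 = (ip[1] + ip[2]) << 1;
;        int c1 = (ip[1] - ip[2]) << 1;
;        int d1 = (ip[0] - ip[3]) << 1;
;
;        int temp1 = a1 + b1;
;        int temp2 = a1 - b1;
;        op[0] = (short)(((temp1 * x_c2) >> 16) + temp1);
;        op[2] = (short)(((temp2 * x_c2) >> 16) + temp2);
;
;        temp1 = (c1 * x_c3) >> 16;
;        temp2 = ((d1 * x_c1) >> 16) + d1;
;        op[1] = (short)(temp1 + temp2);
;
;        temp1 = (d1 * x_c3) >> 16;
;        temp2 = ((c1 * x_c1) >> 16) + c1;
;        op[3] = (short)(temp1 - temp2);
;    }
;
;In the NEON code each (x * coeff)>>16 product is formed with vqdmulh.s16, which returns
;the high 16 bits of 2*x*coeff (with saturation), followed by vshr #1. The second
;(vertical) pass follows the same pattern without the <<1, and the final results are
;halved with the round-toward-zero step before being stored.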