@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Reference code in filters.c. Output is bit-exact.

#include "webrtc/system_wrappers/interface/asm_defines.h"

GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
.align  2

@ int WebRtcIsacfix_AutocorrNeon(
@     int32_t* __restrict r,
@     const int16_t* __restrict x,
@     int16_t N,
@     int16_t order,
@     int16_t* __restrict scale);
@
@ Computes the autocorrelation coefficients r[0] .. r[order] of x[0 .. N-1],
@ right-shifts every coefficient by a common amount so that r[0] fits in
@ 31 bits, stores that shift amount in *scale and returns (order + 1).
@
@ Arguments on entry:
@   r0 = r, r1 = x, r2 = N, r3 = order, [sp] = scale (fifth argument).
@
@ Preconditions (not checked here):
@   - N is a positive multiple of 4: the r[0] loop always consumes 4 samples
@     per pass and tests its counter at the bottom.
@   - N > order: (N - i) is treated as an unsigned count.
@
@ Register usage:
@   r0        output pointer, advanced by 4 bytes after each r[i] store
@   r1        &x[0] (kept for the whole function)
@   r2        N; reused for &scale just before returning
@   r3        order
@   r4        constant 33 during setup; later high word of the scalar sum
@   r5        read pointer in LOOP_R0; later the lag index i
@   r6        remaining samples in LOOP_R0; later &x[i + j]
@   r7        &x[j]
@   r8        8-sample loop counter; later low word of the scalar sum
@   r9, r10   samples loaded by the scalar tail loop
@   r11       product of r9 and r10
@   r12       number of products still to be accumulated for the lag
@   d0        33 as a 64-bit integer
@   d25       32 in both 32-bit lanes (dead once LOOP_R starts using q12)
@   d29       -scale, used as a signed NEON shift count
@   d30       all ones (-1)
@   q8, q9    64-bit accumulators
@ Only d0-d5 and d16-d31 are used, so the callee-saved NEON registers
@ d8-d15 are never touched.

DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
  push {r3 - r12}             @ 10 registers = 40 bytes; see the load of
                              @ &scale from [sp, #40] near the end.

  @ Constant initializations
  mov r4, #33
  vmov.i32 d0, #0
  vmov.i32 q8, #0
  vmov.i32 d29, #0            @ Initialize (-scale).
  vmov.u8 d30, #255           @ Initialize d30 as -1.
  vmov.i32 d0[0], r4          @ d0: 33 (decimal) in the low word, 0 in the high
  vmov.i32 d25, #32

  mov r5, r1                  @ x
  mov r6, r2                  @ N

@ Generate the first coefficient r0.
@ Each pass squares 4 samples and adds the pairwise sums into the two 64-bit
@ lanes of q8.
LOOP_R0:
  vld1.16 {d18}, [r5]!        @ x[]
  subs r6, r6, #4
  vmull.s16 q9, d18, d18
  vpadal.s32 q8, q9
  bgt LOOP_R0

  vadd.i64 d16, d16, d17      @ d16 = full 64-bit sum of squares.

  @ Calculate scaling (the value of shifting).
  vmov d17, d16               @ Default result: no shift (scale = 0).

  @ Check overflow and determine the value for 'scale'.
  @ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and
  @ lower 32-bit words. Note that we don't care about the value of the upper
  @ word in d17.

  @ Check the case of 1 bit overflow. If it occurs store the results for
  @ scale and r[0] in d17 and d29.
  @ The low lane of the mask d1 is all ones when bit 31 of the low word is set.

  vshr.u64 d3, d16, #1
  vclt.s32 d1, d16, #0        @ < 0 ?
  vbit d17, d3, d1            @ For r[0]
  vbit d29, d30, d1           @ -scale = -1

  @ For the case of more than 1 bit overflow. If it occurs overwrite the
  @ results for scale and r[0] in d17 and d29.
  @ This case applies when the upper word is nonzero, i.e. it has fewer than
  @ 32 leading zeros; then -scale = (leading zeros of the upper word) - 33.
  vclz.s32 d5, d16            @ Leading zeros of the two 32 bit words.
  vshr.s64 d26, d5, #32       @ Keep only the upper 32 bits.
  vsub.i64 d31, d26, d0       @ zeros - 33
  vshl.i64 d27, d26, #32
  vorr d27, d26               @ Duplicate the high word with its low one.
  vshl.u64 d2, d16, d31       @ Shift by (-scale).
  vclt.s32 d1, d27, d25       @ < 32 ?
  vbit d17, d2, d1            @ For r[0]
  vbit d29, d31, d1           @ -scale

  vst1.32 d17[0], [r0]!       @ r[0]

  @ With order < 1 only r[0] is requested. The lag loop below tests its
  @ counter at the bottom, so it must not be entered at all in that case,
  @ otherwise r[1] would be written.
  cmp r3, #1
  blt STORE_SCALE

  mov r5, #1                  @ outer loop counter

@ Generate rest of the coefficients
@ For lag i (in r5) accumulate x[j] * x[i + j] for j = 0 .. N - i - 1:
@ groups of 8 with NEON, then one optional group of 4, then a scalar tail.
LOOP_R:
  vmov.i32 q8, #0             @ Initialize the accumulation result.
  vmov.i32 q9, #0             @ Initialize the accumulation result.
  mov r7, r1                  @ &x[0]
  add r6, r7, r5, lsl #1      @ x[i]
  sub r12, r2, r5             @ N - i
  lsr r8, r12, #3             @ inner loop counter
  sub r12, r8, lsl #3         @ Leftover samples to be processed

  @ The 8-sample loop tests its counter at the bottom, so skip it when fewer
  @ than 8 products remain. Entering it with a zero count would accumulate
  @ 8 products that do not belong to this lag and read past the end of x.
  cmp r8, #0
  beq CHECK_4X_SAMPLES

LOOP_8X_SAMPLES:              @ Multiple of 8 samples
  vld1.16 {d20, d21}, [r7]!   @ x[0, ...]
  vld1.16 {d22, d23}, [r6]!   @ x[i, ...]
  vmull.s16 q12, d20, d22
  vmull.s16 q13, d21, d23
  subs r8, #1
  vpadal.s32 q8, q12
  vpadal.s32 q9, q13
  bgt LOOP_8X_SAMPLES

CHECK_4X_SAMPLES:
  cmp r12, #4
  blt REST_SAMPLES

Four_SAMPLES:
  vld1.16 d20, [r7]!
  vld1.16 d22, [r6]!
  vmull.s16 q12, d20, d22
  vpadal.s32 q8, q12
  sub r12, #4

REST_SAMPLES:
  mov r8, #0                  @ Initialize lower word of the accumulation.
  mov r4, #0                  @ Initialize upper word of the accumulation.
  cmp r12, #0
  ble SUMUP

@ Scalar tail: at most 3 products, accumulated as a 64-bit value in r4:r8.
LOOP_REST_SAMPLES:
  ldrh r9, [r7], #2           @ x[0, ...]
  ldrh r10, [r6], #2          @ x[i, ...]
  smulbb r11, r9, r10         @ Signed 16x16 product of the low halfwords.
  adds r8, r8, r11            @ lower word of the accumulation.
  adc r4, r4, r11, asr #31    @ upper word of the accumulation.
  subs r12, #1
  bgt LOOP_REST_SAMPLES

@ Added the multiplication results together and do a shift.
SUMUP:
  vadd.i64 d16, d17
  vadd.i64 d18, d19
  vadd.i64 d18, d16
  vmov d17, r8, r4            @ Scalar tail sum as a 64-bit value.
  vadd.i64 d18, d17
  vshl.s64 d18, d29           @ Shift left by (-scale).
  vst1.32 d18[0], [r0]!       @ r[i]

  add r5, #1
  cmp r5, r3
  ble LOOP_R

STORE_SCALE:
  vneg.s32 d29, d29           @ Get value for 'scale'.
  ldr r2, [sp, #40]           @ &scale: fifth argument, above the 40 bytes
                              @ pushed on entry.
  add r0, r3, #1              @ return (order + 1)
  vst1.s16 d29[0], [r2]       @ Store 'scale'

  pop {r3 - r12}
  bx  lr