1@
2@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3@
4@ Use of this source code is governed by a BSD-style license
5@ that can be found in the LICENSE file in the root of the source
6@ tree. An additional intellectual property rights grant can be found
7@ in the file PATENTS.  All contributing project authors may
8@ be found in the AUTHORS file in the root of the source tree.
9@
10@ Reference code in filters.c. Output is bit-exact.
11
12#include "webrtc/system_wrappers/interface/asm_defines.h"
13
GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
.align  2

@ int WebRtcIsacfix_AutocorrNeon(
@     int32_t* __restrict r,
@     const int16_t* __restrict x,
@     int16_t N,
@     int16_t order,
@     int16_t* __restrict scale);
@
@ Computes r[i] = (sum over j of x[j] * x[j + i]) >> *scale for i = 0..order,
@ where each sum is accumulated in 64 bits. The shift amount is chosen from
@ the r[0] sum only (0 when the sum fits in 31 bits) and is then applied to
@ every other coefficient. Returns order + 1.
@
@ Arguments on entry:
@   r0 = r, r1 = x, r2 = N, r3 = order, [sp] = scale (fifth argument).
@
@ Register usage:
@   r5      outer loop index i
@   r6, r7  read pointers into x (&x[i + j] and &x[j])
@   r8      8-sample block counter, then low word of the scalar accumulator
@   r4      high word of the scalar accumulator
@   r12     samples left over after the 8-sample blocks
@   d29     -scale (negative values make vshl shift right)
@
@ Expectations visible from the code:
@   - The r[0] loop consumes four samples per iteration and always runs at
@     least once, so N is expected to be a positive multiple of 4.
@   - N - order is expected to be positive.
@   - Only d0-d5 and d16-d31 are used, so d8-d15 are left untouched.

DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
  push       {r3 - r12}            @ 10 words: 5th argument is now at [sp, #40].

  @ Constant initializations
  mov        r4, #33
  vmov.i32   d0, #0
  vmov.i32   q8, #0                @ Two 64-bit accumulators for r[0].
  vmov.i32   d29, #0               @ Initialize (-scale).
  vmov.u8    d30, #255             @ Initialize d30 as -1.
  vmov.i32   d0[0], r4             @ d0 = 33 (decimal) as a 64-bit value.
  vmov.i32   d25, #32              @ Both 32-bit lanes = 32.

  mov        r5, r1                @ x
  mov        r6, r2                @ N

@ Generate the first coefficient r0.
LOOP_R0:
  vld1.16    {d18}, [r5]!          @ x[], four samples per iteration.
  subs       r6, r6, #4
  vmull.s16  q9, d18, d18          @ Four 32-bit squares.
  vpadal.s32 q8, q9                @ Pairwise add and accumulate in 64 bits.
  bgt        LOOP_R0

  vadd.i64   d16, d16, d17         @ d16 = full 64-bit sum of squares.

  @ Calculate scaling (the value of shifting).
  vmov       d17, d16              @ Default r[0]: unshifted sum (scale = 0).

  @ Check overflow and determine the value for 'scale'.
  @ vclz cannot operate on a 64-bit value, so we have to do vclz on both the
  @ upper and lower 32-bit words. Note that we don't care about the value of
  @ the upper word in d17.

  @ Check the case of 1 bit overflow. If it occurs store the results for
  @ scale and r[0] in d17 and d29.

  vshr.u64   d3, d16, #1           @ sum >> 1
  vclt.s32   d1, d16, #0           @ Per-word mask: bit 31 set ?
  vbit       d17, d3, d1           @ For r[0]
  vbit       d29, d30, d1          @ -scale = -1

  @ For the case of more than 1 bit overflow. If it occurs overwrite the
  @ results for scale and r[0] in d17 and d29.
  vclz.s32   d5, d16               @ Leading zeros of the two 32 bit words.
  vshr.s64   d26, d5, #32          @ Keep only the upper 32 bits.
  vsub.i64   d31, d26, d0          @ zeros - 33
  vshl.i64   d27, d26, #32
  vorr       d27, d26              @ Duplicate the high word with its low one.
  vshl.u64   d2, d16, d31          @ Shift by (-scale).
  vclt.s32   d1, d27, d25          @ < 32 ? (upper word of the sum is non-zero)
  vbit       d17, d2, d1           @ For r[0]
  vbit       d29, d31, d1          @ -scale

  vst1.32    d17[0], [r0]!         @ r[0]
  mov        r5, #1                @ outer loop counter

  @ Nothing more to compute when order < 1. Without this check the loop
  @ below would still run once and store r[1].
  cmp        r5, r3
  bgt        STORE_SCALE

@ Generate rest of the coefficients
LOOP_R:
  vmov.i32   q8, #0                @ Initialize the accumulation result.
  vmov.i32   q9, #0                @ Initialize the accumulation result.
  mov        r7, r1                @ &x[0]
  add        r6, r7, r5, lsl #1    @ x[i]
  sub        r12, r2, r5           @ N - i
  lsr        r8, r12, #3           @ inner loop counter
  sub        r12, r8, lsl #3       @ Leftover samples to be processed

  @ The 8-sample loop tests its counter at the bottom, so it must not be
  @ entered when there are fewer than 8 products to accumulate.
  cmp        r8, #0
  beq        CHECK_FOUR_SAMPLES

LOOP_8X_SAMPLES:                   @ Multiple of 8 samples
  vld1.16    {d20, d21}, [r7]!     @ x[0, ...]
  vld1.16    {d22, d23}, [r6]!     @ x[i, ...]
  vmull.s16  q12, d20, d22
  vmull.s16  q13, d21, d23
  subs       r8, #1
  vpadal.s32 q8, q12
  vpadal.s32 q9, q13
  bgt        LOOP_8X_SAMPLES

CHECK_FOUR_SAMPLES:
  cmp r12, #4
  blt REST_SAMPLES

Four_SAMPLES:                      @ One block of 4 leftover samples
  vld1.16    d20, [r7]!
  vld1.16    d22, [r6]!
  vmull.s16  q12, d20, d22
  vpadal.s32 q8, q12
  sub r12, #4

REST_SAMPLES:                      @ Up to 3 samples, done on the integer side
  mov        r8, #0                @ Initialize lower word of the accumulation.
  mov        r4, #0                @ Initialize upper word of the accumulation.
  cmp r12, #0
  ble SUMUP

LOOP_REST_SAMPLES:
  ldrh       r9, [r7], #2          @ x[0, ...]
  ldrh       r10, [r6], #2         @ x[i, ...]
  smulbb     r11, r9, r10          @ Signed 16 x 16 -> 32 of the low halfwords.
  adds       r8, r8, r11           @ lower word of the accumulation.
  adc        r4, r4, r11, asr #31  @ upper word: add sign extension plus carry.
  subs       r12, #1
  bgt        LOOP_REST_SAMPLES

@ Add the multiplication results together and do a shift.
SUMUP:
  vadd.i64   d16, d17
  vadd.i64   d18, d19
  vadd.i64   d18, d16
  vmov       d17, r8, r4           @ Scalar accumulator as a 64-bit value.
  vadd.i64   d18, d17
  vshl.s64   d18, d29              @ Shift left by (-scale), i.e. right by scale.
  vst1.32    d18[0], [r0]!         @ r[i]

  add        r5, #1
  cmp        r5, r3
  ble        LOOP_R

STORE_SCALE:
  vneg.s32   d29, d29              @ Get value for 'scale'.
  ldr        r2, [sp, #40]         @ &scale
  add        r0, r3, #1            @ return (order + 1)
  vst1.s16   d29[0], [r2]          @ Store 'scale'

  pop        {r3 - r12}
  bx         lr
146