Norm_Corr_neon.s revision 956c553ab0ce72f8074ad0fda2ffd66a0305700c
@/*
@ ** Copyright 2003-2010, VisualOn, Inc.
@ **
@ ** Licensed under the Apache License, Version 2.0 (the "License");
@ ** you may not use this file except in compliance with the License.
@ ** You may obtain a copy of the License at
@ **
@ **     http://www.apache.org/licenses/LICENSE-2.0
@ **
@ ** Unless required by applicable law or agreed to in writing, software
@ ** distributed under the License is distributed on an "AS IS" BASIS,
@ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ ** See the License for the specific language governing permissions and
@ ** limitations under the License.
@ */
@
@static void Norm_Corr (Word16 exc[],                    /* (i)     : excitation buffer          */
@                       Word16 xn[],                     /* (i)     : target vector              */
@                       Word16 h[],                      /* (i) Q15 : impulse response of synth/wgt filters */
@                       Word16 L_subfr,                  /* (i)     : sub-frame length */
@                       Word16 t_min,                    /* (i)     : minimum value of pitch lag.   */
@                       Word16 t_max,                    /* (i)     : maximum value of pitch lag.   */
@                       Word16 corr_norm[])              /* (o) Q15 : normalized correlation    */
@
@ r0 --- exc[]
@ r1 --- xn[]
@ r2 --- h[]
@ r3 --- L_subfr
@ r4 --- t_min
@ r5 --- t_max
@ r6 --- corr_norm[]
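@ Overview (a rough C-level sketch of what this routine computes, not the
@ exact reference code, with helper names taken from the comments used
@ further down in this file):
@
@   Convolve_asm(&exc[-t_min], h, excf, L_subfr)    /* filtered past excitation */
@   scale = scaling exponent derived from the energy of xn[]
@   for each lag t from t_min to t_max:
@       corr   = sum of xn[i]   * excf[i]           /* correlation              */
@       energy = sum of excf[i] * excf[i]           /* energy of excf[]         */
@       corr_norm[t] = rounded, rescaled corr / sqrt(energy)
@       update excf[] in place for lag t + 1        /* no second convolution    */
@
@ Note: the NEON code below processes exactly 64 samples per lag, i.e. it
@ assumes L_subfr == 64.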
        .section  .text
        .global    Norm_corr_asm
        .extern    Convolve_asm
        .extern    Isqrt_n
@******************************
@ constant
@******************************
.equ    EXC               , 0
.equ    XN                , 4
.equ    H                 , 8
.equ    L_SUBFR           , 12
.equ    voSTACK           , 172
.equ    T_MIN             , 212
.equ    T_MAX             , 216
.equ    CORR_NORM         , 220
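@ Offset notes: the prologue pushes 10 registers (40 bytes) and then reserves
@ voSTACK (172) bytes of locals, so the caller's stacked arguments land at
@ [r13, #212] / [r13, #216] / [r13, #220] = T_MIN / T_MAX / CORR_NORM, and
@ excf[64] lives in the local area at r13 + 20.  EXC/XN/H/L_SUBFR presumably
@ describe the r0-r3 spill made around the Convolve_asm call and are not
@ referenced below.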
Norm_corr_asm:

        STMFD          r13!, {r4 - r12, r14}
        SUB            r13, r13, #voSTACK

        ADD            r8, r13, #20                 @get the excf[L_SUBFR]
        LDR            r4, [r13, #T_MIN]            @get t_min
        RSB            r11, r4, #0                  @k = -t_min
        ADD            r5, r0, r11, LSL #1          @get the &exc[k]

        @call the Convolve function: excf[] = exc[k] convolved with h[]
        STMFD          sp!, {r0 - r3}
        MOV            r0, r5
        MOV            r1, r2
        MOV            r2, r8                       @r2 --- excf[]
        BL             Convolve_asm
        LDMFD          sp!, {r0 - r3}

        @ r8 --- excf[]
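@ At this point excf[] holds the past excitation at the smallest lag,
@ exc[-t_min ...], convolved with h[].  It is computed once here and then
@ updated in place at the bottom of LOOPFOR when moving to the next lag.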
        MOV            r14, r1                      @copy xn[] address
        MOV            r7, #1
        VLD1.S16       {Q0, Q1}, [r14]!
        VLD1.S16       {Q2, Q3}, [r14]!
        VLD1.S16       {Q4, Q5}, [r14]!
        VLD1.S16       {Q6, Q7}, [r14]!

        VMULL.S16      Q10, D0, D0
        VMLAL.S16      Q10, D1, D1
        VMLAL.S16      Q10, D2, D2
        VMLAL.S16      Q10, D3, D3
        VMLAL.S16      Q10, D4, D4
        VMLAL.S16      Q10, D5, D5
        VMLAL.S16      Q10, D6, D6
        VMLAL.S16      Q10, D7, D7
        VMLAL.S16      Q10, D8, D8
        VMLAL.S16      Q10, D9, D9
        VMLAL.S16      Q10, D10, D10
        VMLAL.S16      Q10, D11, D11
        VMLAL.S16      Q10, D12, D12
        VMLAL.S16      Q10, D13, D13
        VMLAL.S16      Q10, D14, D14
        VMLAL.S16      Q10, D15, D15
        VQADD.S32      D20, D20, D21
        VMOV.S32       r9,  D20[0]
        VMOV.S32       r10, D20[1]
        QADD           r6, r9, r10
        QADD           r6, r6, r6
        QADD           r9, r6, r7                   @L_tmp = (L_tmp << 1) + 1
        CLZ            r7, r9
        SUB            r6, r7, #1                   @exp = norm_l(L_tmp)
        RSB            r7, r6, #32                  @exp = 32 - exp
        MOV            r6, r7, ASR #1
        RSB            r7, r6, #0                   @scale = -(exp >> 1)
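@ The block above is, in C terms (matching its inline comments):
@   L_tmp = sum over i = 0..63 of xn[i] * xn[i]
@   L_tmp = (L_tmp << 1) + 1
@   exp   = 32 - norm_l(L_tmp)
@   scale = -(exp >> 1)                  /* kept in r7 for the whole lag loop */
@ The QADDs above saturate where plain C additions could overflow.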

        @ loop over every candidate pitch lag t, from t_min to t_max
        @ r7 --- scale    r4 --- t (starts at t_min)    r8 --- excf[]

LOOPFOR:
        ADD            r14, r13, #20                @r14 --- &excf[0]
        MOV            r12, r1                      @r12 --- &xn[0]
        MOV            r8, #0x8000                  @rounding constant for vo_round()
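@ The two VMULL/VMLAL blocks below accumulate, over all 64 samples of the
@ sub-frame (32 samples per block):
@   Q10: L_tmp1 = sum of excf[i] * excf[i]   (energy of the filtered excitation)
@   Q11: L_tmp  = sum of xn[i]   * excf[i]   (correlation with the target)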
        VLD1.S16       {Q0, Q1}, [r14]!             @ load 16 excf[]
        VLD1.S16       {Q2, Q3}, [r14]!             @ load 16 excf[]
        VLD1.S16       {Q4, Q5}, [r12]!             @ load 16 x[]
        VLD1.S16       {Q6, Q7}, [r12]!             @ load 16 x[]
        VMULL.S16      Q10, D0, D0                  @L_tmp1 += excf[] * excf[]
        VMULL.S16      Q11, D0, D8                  @L_tmp  += x[] * excf[]
        VMLAL.S16      Q10, D1, D1
        VMLAL.S16      Q11, D1, D9
        VMLAL.S16      Q10, D2, D2
        VMLAL.S16      Q11, D2, D10
        VMLAL.S16      Q10, D3, D3
        VMLAL.S16      Q11, D3, D11
        VMLAL.S16      Q10, D4, D4
        VMLAL.S16      Q11, D4, D12
        VMLAL.S16      Q10, D5, D5
        VMLAL.S16      Q11, D5, D13
        VMLAL.S16      Q10, D6, D6
        VMLAL.S16      Q11, D6, D14
        VMLAL.S16      Q10, D7, D7
        VMLAL.S16      Q11, D7, D15

        VLD1.S16       {Q0, Q1}, [r14]!             @ load 16 excf[]
        VLD1.S16       {Q2, Q3}, [r14]!             @ load 16 excf[]
        VLD1.S16       {Q4, Q5}, [r12]!             @ load 16 x[]
        VLD1.S16       {Q6, Q7}, [r12]!             @ load 16 x[]
        VMLAL.S16      Q10, D0, D0
        VMLAL.S16      Q11, D0, D8
        VMLAL.S16      Q10, D1, D1
        VMLAL.S16      Q11, D1, D9
        VMLAL.S16      Q10, D2, D2
        VMLAL.S16      Q11, D2, D10
        VMLAL.S16      Q10, D3, D3
        VMLAL.S16      Q11, D3, D11
        VMLAL.S16      Q10, D4, D4
        VMLAL.S16      Q11, D4, D12
        VMLAL.S16      Q10, D5, D5
        VMLAL.S16      Q11, D5, D13
        VMLAL.S16      Q10, D6, D6
        VMLAL.S16      Q11, D6, D14
        VMLAL.S16      Q10, D7, D7
        VMLAL.S16      Q11, D7, D15
        VQADD.S32      D20, D20, D21
        VQADD.S32      D22, D22, D23

        VPADD.S32      D20, D20, D20                @D20[0] --- L_tmp1
        VPADD.S32      D22, D22, D22                @D22[0] --- L_tmp

        VMOV.S32       r6, D20[0]
        VMOV.S32       r5, D22[0]
        @r5 --- L_tmp, r6 --- L_tmp1
        MOV            r10, #1
        ADD            r5, r10, r5, LSL #1          @L_tmp = (L_tmp << 1) + 1
        ADD            r6, r10, r6, LSL #1          @L_tmp1 = (L_tmp1 << 1) + 1

        CLZ            r10, r5
        CMP            r5, #0
        RSBLT          r11, r5, #0
        CLZLT          r10, r11
        SUB            r10, r10, #1                 @exp = norm_l(L_tmp)

        MOV            r5, r5, LSL r10              @L_tmp = (L_tmp << exp)
        RSB            r10, r10, #30                @exp_corr = 30 - exp
        MOV            r11, r5, ASR #16             @corr = extract_h(L_tmp)

        CLZ            r5, r6
        SUB            r5, r5, #1
        MOV            r6, r6, LSL r5               @L_tmp = (L_tmp1 << exp)
        RSB            r5, r5, #30                  @exp_norm = 30 - exp

        @r10 --- exp_corr, r11 --- corr
        @r6  --- L_tmp, r5 --- exp_norm
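@ Per-lag normalization, in C terms (a sketch matching the inline comments,
@ with Isqrt_n the external helper declared above and the other names the
@ fixed-point primitives already referred to in this file's comments):
@   exp_corr = 30 - norm_l(L_tmp)
@   corr     = extract_h(L_tmp << norm_l(L_tmp))
@   exp_norm = 30 - norm_l(L_tmp1)
@   L_tmp    = L_tmp1 << norm_l(L_tmp1)
@   Isqrt_n(&L_tmp, &exp_norm)
@   norm     = extract_h(L_tmp)
@   corr_norm[t] = vo_round(L_shl(vo_L_mult(corr, norm), exp_corr + exp_norm + scale))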

        @Isqrt_n(&L_tmp, &exp_norm)

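@ Isqrt_n takes both values by reference.  The two in/out arguments are built
@ on top of the just-pushed copies of r0/r1 (which is why the live exc/xn
@ pointers are parked in r14/r12 first and restored by hand afterwards):
@   [sp]      Word32 L_tmp      passed via r0
@   [sp + 4]  Word16 exp_norm   passed via r1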
        MOV            r14, r0
        MOV            r12, r1

        STMFD          sp!, {r0 - r4, r7 - r12, r14}
        ADD            r1, sp, #4
        ADD            r0, sp, #0
        STR            r6, [sp]
        STRH           r5, [sp, #4]
        BL             Isqrt_n
        LDR            r6, [sp]
        LDRSH          r5, [sp, #4]
        LDMFD          sp!, {r0 - r4, r7 - r12, r14}
        MOV            r0, r14
        MOV            r1, r12

        MOV            r6, r6, ASR #16              @norm = extract_h(L_tmp)
        MUL            r12, r6, r11
        ADD            r12, r12, r12                @L_tmp = vo_L_mult(corr, norm)

        ADD            r6, r10, r5
        ADD            r6, r6, r7                   @exp_corr + exp_norm + scale

        CMP            r6, #0
        RSBLT          r6, r6, #0
        MOVLT          r12, r12, ASR r6
        MOVGT          r12, r12, LSL r6             @L_tmp = L_shl(L_tmp, exp_corr + exp_norm + scale)

        ADD            r12, r12, r8
        MOV            r12, r12, ASR #16            @vo_round(L_tmp)
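@ vo_round(): add the 0x8000 kept in r8 and take the high 16 bits,
@ i.e. corr_norm[t] = (Word16)((L_tmp + 0x8000) >> 16)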

        LDR            r5, [r13, #CORR_NORM]        @ get corr_norm address
        LDR            r6, [r13, #T_MAX]            @ get t_max
        ADD            r10, r5, r4, LSL #1          @ get corr_norm[t] address
        STRH           r12, [r10]                   @ corr_norm[t] = vo_round(L_tmp)

        CMP            r4, r6
        BEQ            Norm_corr_asm_end

        ADD            r4, r4, #1                   @ t++
        RSB            r5, r4, #0                   @ k = -t

        MOV            r6, #63                      @ i = 63
        MOV            r8, r0                       @ exc[]
        MOV            r9, r2                       @ h[]
        ADD            r10, r13, #20                @ excf[]

        ADD            r8, r8, r5, LSL #1           @ exc[k] address
        ADD            r9, r9, r6, LSL #1           @ h[i] address
        ADD            r10, r10, r6, LSL #1         @ excf[i] address
        LDRSH          r11, [r8]                    @ tmp = exc[k]

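@ LOOPK updates the filtered excitation in place for the next lag instead of
@ running the convolution again.  With tmp = exc[k] = exc[-t] (r4 already
@ holds the next t):
@   for i = 63 down to 1:  excf[i] = ((tmp * h[i]) >> 15) + excf[i - 1]
@   excf[0] = (tmp * h[0]) >> 15                 /* handled after the loop */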
LOOPK:
        LDRSH          r8, [r9], #-2                @ load h[i]
        LDRSH          r12, [r10, #-2]              @ load excf[i - 1]
        MUL            r14, r11, r8                 @ tmp * h[i]
        MOV            r8, r14, ASR #15             @ (tmp * h[i]) >> 15
        ADD            r14, r8, r12                 @ + excf[i - 1]
        STRH           r14, [r10], #-2              @ store excf[i], step i down
        SUBS           r6, r6, #1
        BGT            LOOPK

        LDRSH          r8, [r9]                     @ load h[0]
        MUL            r14, r11, r8
        LDR            r6, [r13, #T_MAX]            @ get t_max
        MOV            r8, r14, ASR #15
        STRH           r8, [r10]                    @ excf[0] = (tmp * h[0]) >> 15

        CMP            r4, r6
        BLE            LOOPFOR

Norm_corr_asm_end:

        ADD            r13, r13, #voSTACK
        LDMFD          r13!, {r4 - r12, r15}

        .END