@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains the function WebRtcSpl_FilterARFastQ12(), optimized for
@ the ARMv7 platform. The description header can be found in
@ signal_processing_library.h
@
@ Output is bit-exact with the generic C code as in filter_ar_fast_q12.c, and
@ the reference C code at the end of this file.

@ Assumptions:
@ (1) data_length > 0
@ (2) coefficients_length > 1

@ Register usage:
@
@ r0:  &data_in[i]
@ r1:  &data_out[i], for result output
@ r2:  &coefficients[0]
@ r3:  coefficients_length
@ r4:  Iteration counter for the outer loop.
@ r5:  data_out[j] as multiplication inputs
@ r6:  Calculated value for output data_out[]; iteration counter for inner loop
@ r7:  Partial sum of the filtering multiplication results (sum1)
@ r8:  Partial sum of the filtering multiplication results (sum2)
@ r9:  &data_out[], for filtering input; data_in[i]
@ r10: coefficients[j]
@ r11: Scratch
@ r12: &coefficients[j]

#include "webrtc/system_wrappers/interface/asm_defines.h"

@-----------------------------------------------------------------------
@ void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
@                                int16_t* data_out,
@                                int16_t* __restrict coefficients,
@                                int coefficients_length,
@                                int data_length);
@
@ In:     r0 = data_in, r1 = data_out, r2 = coefficients,
@         r3 = coefficients_length, [sp] = data_length (fifth argument).
@ Out:    data_out[0 .. data_length - 1] is written; nothing is returned.
@ Reads:  data_out[-(coefficients_length - 1) .. -1] as filter history.
@ Saves:  r4-r11 are pushed on entry and popped on exit.
@ Clobb:  r0, r1 (post-incremented), r12, flags.
@
@ Two output samples are produced per outer-loop iteration, accumulated in
@ r7 (sum1) and r8 (sum2); the last sample of an odd data_length is handled
@ at ODD_LENGTH. Requires coefficients_length > 1. A data_length <= 0
@ returns immediately without touching memory, as the reference C code does.
@-----------------------------------------------------------------------
GLOBAL_FUNCTION WebRtcSpl_FilterARFastQ12
.align  2
DEFINE_FUNCTION WebRtcSpl_FilterARFastQ12
  push {r4-r11}                @ 8 registers: stack arguments now at sp + 32

  @ data_length is a C int, so read the whole 32-bit stack slot; a signed
  @ halfword load would misread any length above 32767.
  ldr r12, [sp, #32]           @ data_length
  subs r4, r12, #1             @ r4 = data_length - 1
  blt END                      @ data_length <= 0: nothing to compute
  beq ODD_LENGTH               @ jump if data_length == 1

@ Outer loop: each pass computes data_out[i] and data_out[i + 1].
LOOP_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  ldr r5, [r9], #4             @ data_out[i - coefficients_length + {1,2}]

  mov r7, #0                   @ sum1
  mov r8, #0                   @ sum2
  subs r6, r3, #3              @ Iteration counter for inner loop.
  beq ODD_A_LENGTH             @ branch if coefficients_length == 3
  blt POST_LOOP_A_LENGTH       @ branch if coefficients_length == 2

@ Inner loop: consumes two coefficients per pass, walking coefficients[]
@ downwards and data_out[] upwards. The flags set by "subs r6" are consumed
@ by the branches at the bottom; the instructions in between leave them alone.
LOOP_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  subs r6, #2
  smlatt r8, r10, r5, r8       @ sum2 += coefficients[j] * data_out[i - j + 1];
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r7, r10, r5, r7       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  ldr r5, [r9], #4             @ data_out[i - j + 2],  data_out[i - j + 3]
  smlabb r8, r10, r5, r8       @ sum2 += coefficients[j - 1] * data_out[i - j + 2];
  bgt LOOP_A_LENGTH
  blt POST_LOOP_A_LENGTH       @ even coefficients_length: no coefficients[2] step

@ Odd coefficients_length: coefficients[2] is left over and handled alone.
ODD_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[2]
  sub r12, #2                  @ &coefficients[0]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[2] * data_out[i - 2];
  smlabt r8, r10, r5, r8       @ sum2 += coefficients[2] * data_out[i - 1];
  ldr r5, [r9, #-2]            @ data_out[i - 1],  data_out[i]

POST_LOOP_A_LENGTH:
  ldr r10, [r12]               @ coefficients[0], coefficients[1]
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

  ldr r9, [r0], #4             @ data_in[i], data_in[i + 1]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1;

  @ Round and saturate to 16 bits: the rounding offset 2048 (0.5 in Q12) is
  @ added only when saturating (output1 >> 12) leaves it equal to the plain
  @ 16-bit field at bits [27:12]; the second ssat clamps the rounded value.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i]

  smlatb r8, r10, r6, r8       @ sum2 += coefficients[1] * data_out[i];
  smulbt r6, r10, r9           @ output2 = coefficients[0] * data_in[i + 1];
  sub r6, r8                   @ output2 -= sum2;

  @ Same rounding and saturation sequence for the second sample.
  sbfx r11, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r11
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1], #2            @ Store data_out[i + 1]

  subs r4, #2
  bgt LOOP_LENGTH
  blt END                      @ For even data_length, it's done. Jump to END.

@ Process i = data_length - 1, for the case of an odd length.
@ A single sum is needed here; it is split over r7 and r8 and both halves
@ are subtracted from the output below.
ODD_LENGTH:
  add r12, r2, r3, lsl #1
  sub r12, #4                  @ &coefficients[coefficients_length - 2]
  sub r9, r1, r3, lsl #1
  add r9, #2                   @ &data_out[i - coefficients_length + 1]
  mov r7, #0                   @ sum1, first accumulator
  mov r8, #0                   @ sum1, second accumulator
  subs r6, r3, #2              @ inner loop counter
  beq EVEN_A_LENGTH            @ branch if coefficients_length == 2

LOOP2_A_LENGTH:
  ldr r10, [r12], #-4          @ coefficients[j - 1], coefficients[j]
  ldr r5, [r9], #4             @ data_out[i - j],  data_out[i - j + 1]
  subs r6, #2
  smlatb r7, r10, r5, r7       @ sum1 += coefficients[j] * data_out[i - j];
  smlabt r8, r10, r5, r8       @ sum1 += coefficients[j - 1] * data_out[i - j + 1];
  bgt LOOP2_A_LENGTH
  addlt r12, #2                @ odd coefficients_length: step back to &coefficients[0]
  blt POST_LOOP2_A_LENGTH

@ Even coefficients_length: coefficients[1] is left over and handled alone.
EVEN_A_LENGTH:
  ldrsh r10, [r12, #2]         @ Filter coefficients coefficients[1]
  ldrsh r5, [r9]               @ data_out[i - 1]
  smlabb r7, r10, r5, r7       @ sum1 += coefficients[1] * data_out[i - 1];

POST_LOOP2_A_LENGTH:
  ldrsh r10, [r12]             @ Filter coefficients coefficients[0]
  ldrsh r9, [r0]               @ data_in[i]
  smulbb r6, r10, r9           @ output1 = coefficients[0] * data_in[i];
  sub r6, r7                   @ output1 -= sum1 (first accumulator);
  sub r6, r8                   @ output1 -= sum1 (second accumulator);
  @ Same rounding and saturation sequence as in the main loop.
  sbfx r8, r6, #12, #16
  ssat r7, #16, r6, asr #12
  cmp r7, r8
  addeq r6, r6, #2048
  ssat r6, #16, r6, asr #12
  strh r6, [r1]                @ Store the data_out[i]

END:
  pop {r4-r11}
  bx  lr

@Reference C code:
@
@void WebRtcSpl_FilterARFastQ12(int16_t* data_in,
@                               int16_t* data_out,
@                               int16_t* __restrict coefficients,
@                               int coefficients_length,
@                               int data_length) {
@  int i = 0;
@  int j = 0;
@
@  for (i = 0; i < data_length - 1; i += 2) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@    int32_t output2 = 0;
@    int32_t sum2 = 0;
@
@    for (j = coefficients_length - 1; j > 2; j -= 2) {
@      sum1 += coefficients[j]      * data_out[i - j];
@      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
@      sum2 += coefficients[j]     * data_out[i - j + 1];
@      sum2 += coefficients[j - 1] * data_out[i - j + 2];
@    }
@
@    if (j == 2) {
@      sum1 += coefficients[2] * data_out[i - 2];
@      sum2 += coefficients[2] * data_out[i - 1];
@    }
@
@    sum1 += coefficients[1] * data_out[i - 1];
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@
@    sum2 += coefficients[1] * data_out[i];
@    output2 = coefficients[0] * data_in[i + 1];
@    output2 -= sum2;
@    // Saturate and store the output.
@    output2 = WEBRTC_SPL_SAT(134215679, output2, -134217728);
@    data_out[i + 1] = (int16_t)((output2 + 2048) >> 12);
@  }
@
@  if (i == data_length - 1) {
@    int32_t output1 = 0;
@    int32_t sum1 = 0;
@
@    for (j = coefficients_length - 1; j > 1; j -= 2) {
@      sum1 += coefficients[j]      * data_out[i - j];
@      sum1 += coefficients[j - 1]  * data_out[i - j + 1];
@    }
@
@    if (j == 1) {
@      sum1 += coefficients[1] * data_out[i - 1];
@    }
@
@    output1 = coefficients[0] * data_in[i];
@    output1 -= sum1;
@    // Saturate and store the output.
@    output1 = WEBRTC_SPL_SAT(134215679, output1, -134217728);
@    data_out[i] = (int16_t)((output1 + 2048) >> 12);
@  }
@}
