/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/modules/audio_processing/aecm/aecm_core.h"

#include <assert.h>
#include <string.h>

#include "webrtc/modules/audio_processing/aecm/include/echo_control_mobile.h"
#include "webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h"

18static const ALIGN8_BEG int16_t WebRtcAecm_kSqrtHanning[] ALIGN8_END = {
19  0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
20  3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224,
21  6591, 6954, 7313, 7668, 8019, 8364, 8705, 9040,
22  9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514,
23  11795, 12068, 12335, 12594, 12845, 13089, 13325, 13553,
24  13773, 13985, 14189, 14384, 14571, 14749, 14918, 15079,
25  15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034,
26  16111, 16179, 16237, 16286, 16325, 16354, 16373, 16384
27};
28
// Noise-estimation constants (a Q-domain and an increment count, judging by
// their names). NOTE(review): presumably consumed by ComfortNoise(); document
// the exact meaning next to their point of use.
static const int16_t kNoiseEstQDomain = 15;
static const int16_t kNoiseEstIncCount = 5;

// Store offsets used by WindowAndFFT() when it scatters windowed samples into
// the interleaved (real, imag) FFT buffer.
// The 128 entries are consumed in pairs: entry 2k is the byte offset for
// sample k and entry 2k+1 the byte offset for sample k+64. Every value equals
// 4 * (7-bit bit-reversal of the sample index), i.e. it addresses the real
// part of the complex element at the bit-reversed position.
// The visible code only reads this table.
static int16_t coefTable[] = {
   0,   4, 256, 260, 128, 132, 384, 388,
  64,  68, 320, 324, 192, 196, 448, 452,
  32,  36, 288, 292, 160, 164, 416, 420,
  96, 100, 352, 356, 224, 228, 480, 484,
  16,  20, 272, 276, 144, 148, 400, 404,
  80,  84, 336, 340, 208, 212, 464, 468,
  48,  52, 304, 308, 176, 180, 432, 436,
 112, 116, 368, 372, 240, 244, 496, 500,
   8,  12, 264, 268, 136, 140, 392, 396,
  72,  76, 328, 332, 200, 204, 456, 460,
  40,  44, 296, 300, 168, 172, 424, 428,
 104, 108, 360, 364, 232, 236, 488, 492,
  24,  28, 280, 284, 152, 156, 408, 412,
  88,  92, 344, 348, 216, 220, 472, 476,
  56,  60, 312, 316, 184, 188, 440, 444,
 120, 124, 376, 380, 248, 252, 504, 508
};

// Store offsets used by InverseFFTAndWindow() when it builds the
// conjugate-symmetric input of the inverse FFT.
// The 128 entries are consumed in pairs, one pair per input bin k (0..63):
//   entry 2k   : byte offset that receives bin k with its imaginary part
//                negated; equals 4 * (7-bit bit-reversal of k).
//   entry 2k+1 : byte offset that receives bin k unchanged (the mirrored bin
//                128 - k, also bit-reversed).
// For k == 0 the second offset is 512, which is int16_t index 256..257, one
// complex element past index 127. The FFT buffer handed to
// InverseFFTAndWindow() must therefore provide at least 258 int16_t values.
// The visible code only reads this table.
static int16_t coefTable_ifft[] = {
    0, 512, 256, 508, 128, 252, 384, 380,
   64, 124, 320, 444, 192, 188, 448, 316,
   32,  60, 288, 476, 160, 220, 416, 348,
   96,  92, 352, 412, 224, 156, 480, 284,
   16,  28, 272, 492, 144, 236, 400, 364,
   80, 108, 336, 428, 208, 172, 464, 300,
   48,  44, 304, 460, 176, 204, 432, 332,
  112,  76, 368, 396, 240, 140, 496, 268,
    8,  12, 264, 500, 136, 244, 392, 372,
   72, 116, 328, 436, 200, 180, 456, 308,
   40,  52, 296, 468, 168, 212, 424, 340,
  104,  84, 360, 404, 232, 148, 488, 276,
   24,  20, 280, 484, 152, 228, 408, 356,
   88, 100, 344, 420, 216, 164, 472, 292,
   56,  36, 312, 452, 184, 196, 440, 324,
  120,  68, 376, 388, 248, 132, 504, 260
};

70static void ComfortNoise(AecmCore_t* aecm,
71                         const uint16_t* dfa,
72                         complex16_t* out,
73                         const int16_t* lambda);
74
75static void WindowAndFFT(AecmCore_t* aecm,
76                         int16_t* fft,
77                         const int16_t* time_signal,
78                         complex16_t* freq_signal,
79                         int time_signal_scaling) {
80  int i, j;
81  int32_t tmp1, tmp2, tmp3, tmp4;
82  int16_t* pfrfi;
83  complex16_t* pfreq_signal;
84  int16_t  f_coef, s_coef;
85  int32_t load_ptr, store_ptr1, store_ptr2, shift, shift1;
86  int32_t hann, hann1, coefs;
87
88  memset(fft, 0, sizeof(int16_t) * PART_LEN4);
89
90  // FFT of signal
91  __asm __volatile (
92    ".set        push                                                    \n\t"
93    ".set        noreorder                                               \n\t"
94    "addiu       %[shift],          %[time_signal_scaling], -14          \n\t"
95    "addiu       %[i],              $zero,                  64           \n\t"
96    "addiu       %[load_ptr],       %[time_signal],         0            \n\t"
97    "addiu       %[hann],           %[hanning],             0            \n\t"
98    "addiu       %[hann1],          %[hanning],             128          \n\t"
99    "addiu       %[coefs],          %[coefTable],           0            \n\t"
100    "bltz        %[shift],          2f                                   \n\t"
101    " negu       %[shift1],         %[shift]                             \n\t"
102   "1:                                                                   \n\t"
103    "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
104    "lh          %[tmp2],           0(%[hann])                           \n\t"
105    "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
106    "lh          %[tmp4],           0(%[hann1])                          \n\t"
107    "addiu       %[i],              %[i],                   -1           \n\t"
108    "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
109    "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
110    "lh          %[f_coef],         0(%[coefs])                          \n\t"
111    "lh          %[s_coef],         2(%[coefs])                          \n\t"
112    "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
113    "addiu       %[hann],           %[hann],                2            \n\t"
114    "addiu       %[hann1],          %[hann1],               -2           \n\t"
115    "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
116    "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
117    "sllv        %[tmp1],           %[tmp1],                %[shift]     \n\t"
118    "sllv        %[tmp3],           %[tmp3],                %[shift]     \n\t"
119    "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
120    "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
121    "bgtz        %[i],              1b                                   \n\t"
122    " addiu      %[coefs],          %[coefs],               4            \n\t"
123    "b           3f                                                      \n\t"
124    " nop                                                                \n\t"
125   "2:                                                                   \n\t"
126    "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
127    "lh          %[tmp2],           0(%[hann])                           \n\t"
128    "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
129    "lh          %[tmp4],           0(%[hann1])                          \n\t"
130    "addiu       %[i],              %[i],                   -1           \n\t"
131    "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
132    "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
133    "lh          %[f_coef],         0(%[coefs])                          \n\t"
134    "lh          %[s_coef],         2(%[coefs])                          \n\t"
135    "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
136    "addiu       %[hann],           %[hann],                2            \n\t"
137    "addiu       %[hann1],          %[hann1],               -2           \n\t"
138    "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
139    "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
140    "srav        %[tmp1],           %[tmp1],                %[shift1]    \n\t"
141    "srav        %[tmp3],           %[tmp3],                %[shift1]    \n\t"
142    "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
143    "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
144    "bgtz        %[i],              2b                                   \n\t"
145    " addiu      %[coefs],          %[coefs],               4            \n\t"
146   "3:                                                                   \n\t"
147    ".set        pop                                                     \n\t"
148    : [load_ptr] "=&r" (load_ptr), [shift] "=&r" (shift), [hann] "=&r" (hann),
149      [hann1] "=&r" (hann1), [shift1] "=&r" (shift1), [coefs] "=&r" (coefs),
150      [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
151      [tmp4] "=&r" (tmp4), [i] "=&r" (i), [f_coef] "=&r" (f_coef),
152      [s_coef] "=&r" (s_coef), [store_ptr1] "=&r" (store_ptr1),
153      [store_ptr2] "=&r" (store_ptr2)
154    : [time_signal] "r" (time_signal), [coefTable] "r" (coefTable),
155      [time_signal_scaling] "r" (time_signal_scaling),
156      [hanning] "r" (WebRtcAecm_kSqrtHanning), [fft] "r" (fft)
157    : "memory", "hi", "lo"
158  );
159
160  WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
161  pfrfi = fft;
162  pfreq_signal = freq_signal;
163
164  __asm __volatile (
165    ".set        push                                                     \n\t"
166    ".set        noreorder                                                \n\t"
167    "addiu       %[j],              $zero,                 128            \n\t"
168   "1:                                                                    \n\t"
169    "lh          %[tmp1],           0(%[pfrfi])                           \n\t"
170    "lh          %[tmp2],           2(%[pfrfi])                           \n\t"
171    "lh          %[tmp3],           4(%[pfrfi])                           \n\t"
172    "lh          %[tmp4],           6(%[pfrfi])                           \n\t"
173    "subu        %[tmp2],           $zero,                 %[tmp2]        \n\t"
174    "sh          %[tmp1],           0(%[pfreq_signal])                    \n\t"
175    "sh          %[tmp2],           2(%[pfreq_signal])                    \n\t"
176    "subu        %[tmp4],           $zero,                 %[tmp4]        \n\t"
177    "sh          %[tmp3],           4(%[pfreq_signal])                    \n\t"
178    "sh          %[tmp4],           6(%[pfreq_signal])                    \n\t"
179    "lh          %[tmp1],           8(%[pfrfi])                           \n\t"
180    "lh          %[tmp2],           10(%[pfrfi])                          \n\t"
181    "lh          %[tmp3],           12(%[pfrfi])                          \n\t"
182    "lh          %[tmp4],           14(%[pfrfi])                          \n\t"
183    "addiu       %[j],              %[j],                  -8             \n\t"
184    "subu        %[tmp2],           $zero,                 %[tmp2]        \n\t"
185    "sh          %[tmp1],           8(%[pfreq_signal])                    \n\t"
186    "sh          %[tmp2],           10(%[pfreq_signal])                   \n\t"
187    "subu        %[tmp4],           $zero,                 %[tmp4]        \n\t"
188    "sh          %[tmp3],           12(%[pfreq_signal])                   \n\t"
189    "sh          %[tmp4],           14(%[pfreq_signal])                   \n\t"
190    "addiu       %[pfreq_signal],   %[pfreq_signal],       16             \n\t"
191    "bgtz        %[j],              1b                                    \n\t"
192    " addiu      %[pfrfi],          %[pfrfi],              16             \n\t"
193    ".set        pop                                                      \n\t"
194    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
195      [j] "=&r" (j), [pfrfi] "+r" (pfrfi), [pfreq_signal] "+r" (pfreq_signal),
196      [tmp4] "=&r" (tmp4)
197    :
198    : "memory"
199  );
200}
201
202static void InverseFFTAndWindow(AecmCore_t* aecm,
203                                int16_t* fft,
204                                complex16_t* efw,
205                                int16_t* output,
206                                const int16_t* nearendClean) {
207  int i, outCFFT;
208  int32_t tmp1, tmp2, tmp3, tmp4, tmp_re, tmp_im;
209  int16_t* pcoefTable_ifft = coefTable_ifft;
210  int16_t* pfft = fft;
211  int16_t* ppfft = fft;
212  complex16_t* pefw = efw;
213  int32_t out_aecm;
214  int16_t* paecm_buf = aecm->outBuf;
215  const int16_t* p_kSqrtHanning = WebRtcAecm_kSqrtHanning;
216  const int16_t* pp_kSqrtHanning = &WebRtcAecm_kSqrtHanning[PART_LEN];
217  int16_t* output1 = output;
218
219  __asm __volatile (
220    ".set      push                                                        \n\t"
221    ".set      noreorder                                                   \n\t"
222    "addiu     %[i],                $zero,                   64            \n\t"
223   "1:                                                                     \n\t"
224    "lh        %[tmp1],             0(%[pcoefTable_ifft])                  \n\t"
225    "lh        %[tmp2],             2(%[pcoefTable_ifft])                  \n\t"
226    "lh        %[tmp_re],           0(%[pefw])                             \n\t"
227    "lh        %[tmp_im],           2(%[pefw])                             \n\t"
228    "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
229    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
230    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
231    "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
232    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
233    "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
234    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
235    "lh        %[tmp1],             4(%[pcoefTable_ifft])                  \n\t"
236    "lh        %[tmp2],             6(%[pcoefTable_ifft])                  \n\t"
237    "lh        %[tmp_re],           4(%[pefw])                             \n\t"
238    "lh        %[tmp_im],           6(%[pefw])                             \n\t"
239    "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
240    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
241    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
242    "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
243    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
244    "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
245    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
246    "lh        %[tmp1],             8(%[pcoefTable_ifft])                  \n\t"
247    "lh        %[tmp2],             10(%[pcoefTable_ifft])                 \n\t"
248    "lh        %[tmp_re],           8(%[pefw])                             \n\t"
249    "lh        %[tmp_im],           10(%[pefw])                            \n\t"
250    "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
251    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
252    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
253    "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
254    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
255    "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
256    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
257    "lh        %[tmp1],             12(%[pcoefTable_ifft])                 \n\t"
258    "lh        %[tmp2],             14(%[pcoefTable_ifft])                 \n\t"
259    "lh        %[tmp_re],           12(%[pefw])                            \n\t"
260    "lh        %[tmp_im],           14(%[pefw])                            \n\t"
261    "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
262    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
263    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
264    "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
265    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
266    "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
267    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
268    "addiu     %[pcoefTable_ifft],  %[pcoefTable_ifft],      16            \n\t"
269    "addiu     %[i],                %[i],                    -4            \n\t"
270    "bgtz      %[i],                1b                                     \n\t"
271    " addiu    %[pefw],             %[pefw],                 16            \n\t"
272    ".set      pop                                                         \n\t"
273    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
274      [i] "=&r" (i), [tmp_re] "=&r" (tmp_re), [tmp_im] "=&r" (tmp_im),
275      [pefw] "+r" (pefw), [pcoefTable_ifft] "+r" (pcoefTable_ifft),
276      [fft] "+r" (fft)
277    :
278    : "memory"
279  );
280
281  fft[2] = efw[PART_LEN].real;
282  fft[3] = -efw[PART_LEN].imag;
283
284  outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
285  pfft = fft;
286
287  __asm __volatile (
288    ".set       push                                               \n\t"
289    ".set       noreorder                                          \n\t"
290    "addiu      %[i],            $zero,               128          \n\t"
291   "1:                                                             \n\t"
292    "lh         %[tmp1],         0(%[ppfft])                       \n\t"
293    "lh         %[tmp2],         4(%[ppfft])                       \n\t"
294    "lh         %[tmp3],         8(%[ppfft])                       \n\t"
295    "lh         %[tmp4],         12(%[ppfft])                      \n\t"
296    "addiu      %[i],            %[i],                -4           \n\t"
297    "sh         %[tmp1],         0(%[pfft])                        \n\t"
298    "sh         %[tmp2],         2(%[pfft])                        \n\t"
299    "sh         %[tmp3],         4(%[pfft])                        \n\t"
300    "sh         %[tmp4],         6(%[pfft])                        \n\t"
301    "addiu      %[ppfft],        %[ppfft],            16           \n\t"
302    "bgtz       %[i],            1b                                \n\t"
303    " addiu     %[pfft],         %[pfft],             8            \n\t"
304    ".set       pop                                                \n\t"
305    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
306      [i] "=&r" (i), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
307      [ppfft] "+r" (ppfft)
308    :
309    : "memory"
310  );
311
312  pfft = fft;
313  out_aecm = (int32_t)(outCFFT - aecm->dfaCleanQDomain);
314
315  __asm __volatile (
316    ".set       push                                                       \n\t"
317    ".set       noreorder                                                  \n\t"
318    "addiu      %[i],                $zero,                  64            \n\t"
319   "11:                                                                    \n\t"
320    "lh         %[tmp1],             0(%[pfft])                            \n\t"
321    "lh         %[tmp2],             0(%[p_kSqrtHanning])                  \n\t"
322    "addiu      %[i],                %[i],                   -2            \n\t"
323    "mul        %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
324    "lh         %[tmp3],             2(%[pfft])                            \n\t"
325    "lh         %[tmp4],             2(%[p_kSqrtHanning])                  \n\t"
326    "mul        %[tmp3],             %[tmp3],                %[tmp4]       \n\t"
327    "addiu      %[tmp1],             %[tmp1],                8192          \n\t"
328    "sra        %[tmp1],             %[tmp1],                14            \n\t"
329    "addiu      %[tmp3],             %[tmp3],                8192          \n\t"
330    "sra        %[tmp3],             %[tmp3],                14            \n\t"
331    "bgez       %[out_aecm],         1f                                    \n\t"
332    " negu      %[tmp2],             %[out_aecm]                           \n\t"
333    "srav       %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
334    "b          2f                                                         \n\t"
335    " srav      %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
336   "1:                                                                     \n\t"
337    "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   \n\t"
338    "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   \n\t"
339   "2:                                                                     \n\t"
340    "lh         %[tmp4],             0(%[paecm_buf])                       \n\t"
341    "lh         %[tmp2],             2(%[paecm_buf])                       \n\t"
342    "addu       %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
343    "addu       %[tmp1],             %[tmp1],                %[tmp4]       \n\t"
344#if defined(MIPS_DSP_R1_LE)
345    "shll_s.w   %[tmp1],             %[tmp1],                16            \n\t"
346    "sra        %[tmp1],             %[tmp1],                16            \n\t"
347    "shll_s.w   %[tmp3],             %[tmp3],                16            \n\t"
348    "sra        %[tmp3],             %[tmp3],                16            \n\t"
349#else  // #if defined(MIPS_DSP_R1_LE)
350    "sra        %[tmp4],             %[tmp1],                31            \n\t"
351    "sra        %[tmp2],             %[tmp1],                15            \n\t"
352    "beq        %[tmp4],             %[tmp2],                3f            \n\t"
353    " ori       %[tmp2],             $zero,                  0x7fff        \n\t"
354    "xor        %[tmp1],             %[tmp2],                %[tmp4]       \n\t"
355   "3:                                                                     \n\t"
356    "sra        %[tmp2],             %[tmp3],                31            \n\t"
357    "sra        %[tmp4],             %[tmp3],                15            \n\t"
358    "beq        %[tmp2],             %[tmp4],                4f            \n\t"
359    " ori       %[tmp4],             $zero,                  0x7fff        \n\t"
360    "xor        %[tmp3],             %[tmp4],                %[tmp2]       \n\t"
361   "4:                                                                     \n\t"
362#endif  // #if defined(MIPS_DSP_R1_LE)
363    "sh         %[tmp1],             0(%[pfft])                            \n\t"
364    "sh         %[tmp1],             0(%[output1])                         \n\t"
365    "sh         %[tmp3],             2(%[pfft])                            \n\t"
366    "sh         %[tmp3],             2(%[output1])                         \n\t"
367    "lh         %[tmp1],             128(%[pfft])                          \n\t"
368    "lh         %[tmp2],             0(%[pp_kSqrtHanning])                 \n\t"
369    "mul        %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
370    "lh         %[tmp3],             130(%[pfft])                          \n\t"
371    "lh         %[tmp4],             -2(%[pp_kSqrtHanning])                \n\t"
372    "mul        %[tmp3],             %[tmp3],                %[tmp4]       \n\t"
373    "sra        %[tmp1],             %[tmp1],                14            \n\t"
374    "sra        %[tmp3],             %[tmp3],                14            \n\t"
375    "bgez       %[out_aecm],         5f                                    \n\t"
376    " negu      %[tmp2],             %[out_aecm]                           \n\t"
377    "srav       %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
378    "b          6f                                                         \n\t"
379    " srav      %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
380   "5:                                                                     \n\t"
381    "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   \n\t"
382    "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   \n\t"
383   "6:                                                                     \n\t"
384#if defined(MIPS_DSP_R1_LE)
385    "shll_s.w   %[tmp1],             %[tmp1],                16            \n\t"
386    "sra        %[tmp1],             %[tmp1],                16            \n\t"
387    "shll_s.w   %[tmp3],             %[tmp3],                16            \n\t"
388    "sra        %[tmp3],             %[tmp3],                16            \n\t"
389#else  // #if defined(MIPS_DSP_R1_LE)
390    "sra        %[tmp4],             %[tmp1],                31            \n\t"
391    "sra        %[tmp2],             %[tmp1],                15            \n\t"
392    "beq        %[tmp4],             %[tmp2],                7f            \n\t"
393    " ori       %[tmp2],             $zero,                  0x7fff        \n\t"
394    "xor        %[tmp1],             %[tmp2],                %[tmp4]       \n\t"
395   "7:                                                                     \n\t"
396    "sra        %[tmp2],             %[tmp3],                31            \n\t"
397    "sra        %[tmp4],             %[tmp3],                15            \n\t"
398    "beq        %[tmp2],             %[tmp4],                8f            \n\t"
399    " ori       %[tmp4],             $zero,                  0x7fff        \n\t"
400    "xor        %[tmp3],             %[tmp4],                %[tmp2]       \n\t"
401   "8:                                                                     \n\t"
402#endif  // #if defined(MIPS_DSP_R1_LE)
403    "sh         %[tmp1],             0(%[paecm_buf])                       \n\t"
404    "sh         %[tmp3],             2(%[paecm_buf])                       \n\t"
405    "addiu      %[output1],          %[output1],             4             \n\t"
406    "addiu      %[paecm_buf],        %[paecm_buf],           4             \n\t"
407    "addiu      %[pfft],             %[pfft],                4             \n\t"
408    "addiu      %[p_kSqrtHanning],   %[p_kSqrtHanning],      4             \n\t"
409    "bgtz       %[i],                11b                                   \n\t"
410    " addiu     %[pp_kSqrtHanning],  %[pp_kSqrtHanning],     -4            \n\t"
411    ".set       pop                                                        \n\t"
412    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
413      [output1] "+r" (output1), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
414      [paecm_buf] "+r" (paecm_buf), [i] "=&r" (i),
415      [pp_kSqrtHanning] "+r" (pp_kSqrtHanning),
416      [p_kSqrtHanning] "+r" (p_kSqrtHanning)
417    : [out_aecm] "r" (out_aecm),
418      [WebRtcAecm_kSqrtHanning] "r" (WebRtcAecm_kSqrtHanning)
419    : "hi", "lo","memory"
420  );
421
422  // Copy the current block to the old position
423  // (aecm->outBuf is shifted elsewhere)
424  memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(int16_t) * PART_LEN);
425  memcpy(aecm->dBufNoisy,
426         aecm->dBufNoisy + PART_LEN,
427         sizeof(int16_t) * PART_LEN);
428  if (nearendClean != NULL) {
429    memcpy(aecm->dBufClean,
430           aecm->dBufClean + PART_LEN,
431           sizeof(int16_t) * PART_LEN);
432  }
433}
434
435void WebRtcAecm_CalcLinearEnergies_mips(AecmCore_t* aecm,
436                                        const uint16_t* far_spectrum,
437                                        int32_t* echo_est,
438                                        uint32_t* far_energy,
439                                        uint32_t* echo_energy_adapt,
440                                        uint32_t* echo_energy_stored) {
441  int i;
442  uint32_t par1 = (*far_energy);
443  uint32_t par2 = (*echo_energy_adapt);
444  uint32_t par3 = (*echo_energy_stored);
445  int16_t* ch_stored_p = &(aecm->channelStored[0]);
446  int16_t* ch_adapt_p = &(aecm->channelAdapt16[0]);
447  uint16_t* spectrum_p = (uint16_t*)(&(far_spectrum[0]));
448  int32_t* echo_p = &(echo_est[0]);
449  int32_t temp0, stored0, echo0, adept0, spectrum0;
450  int32_t stored1, adept1, spectrum1, echo1, temp1;
451
452  // Get energy for the delayed far end signal and estimated
453  // echo using both stored and adapted channels.
454  for (i = 0; i < PART_LEN; i+= 4) {
455    __asm __volatile (
456      ".set           push                                            \n\t"
457      ".set           noreorder                                       \n\t"
458      "lh             %[stored0],     0(%[ch_stored_p])               \n\t"
459      "lhu            %[adept0],      0(%[ch_adapt_p])                \n\t"
460      "lhu            %[spectrum0],   0(%[spectrum_p])                \n\t"
461      "lh             %[stored1],     2(%[ch_stored_p])               \n\t"
462      "lhu            %[adept1],      2(%[ch_adapt_p])                \n\t"
463      "lhu            %[spectrum1],   2(%[spectrum_p])                \n\t"
464      "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
465      "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
466      "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
467      "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
468      "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
469      "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
470      "addiu          %[echo_p],      %[echo_p],      16              \n\t"
471      "addu           %[par3],        %[par3],        %[echo0]        \n\t"
472      "addu           %[par2],        %[par2],        %[temp0]        \n\t"
473      "addu           %[par3],        %[par3],        %[echo1]        \n\t"
474      "addu           %[par2],        %[par2],        %[temp1]        \n\t"
475      "usw            %[echo0],       -16(%[echo_p])                  \n\t"
476      "usw            %[echo1],       -12(%[echo_p])                  \n\t"
477      "lh             %[stored0],     4(%[ch_stored_p])               \n\t"
478      "lhu            %[adept0],      4(%[ch_adapt_p])                \n\t"
479      "lhu            %[spectrum0],   4(%[spectrum_p])                \n\t"
480      "lh             %[stored1],     6(%[ch_stored_p])               \n\t"
481      "lhu            %[adept1],      6(%[ch_adapt_p])                \n\t"
482      "lhu            %[spectrum1],   6(%[spectrum_p])                \n\t"
483      "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
484      "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
485      "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
486      "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
487      "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
488      "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
489      "addiu          %[ch_stored_p], %[ch_stored_p], 8               \n\t"
490      "addiu          %[ch_adapt_p],  %[ch_adapt_p],  8               \n\t"
491      "addiu          %[spectrum_p],  %[spectrum_p],  8               \n\t"
492      "addu           %[par3],        %[par3],        %[echo0]        \n\t"
493      "addu           %[par2],        %[par2],        %[temp0]        \n\t"
494      "addu           %[par3],        %[par3],        %[echo1]        \n\t"
495      "addu           %[par2],        %[par2],        %[temp1]        \n\t"
496      "usw            %[echo0],       -8(%[echo_p])                   \n\t"
497      "usw            %[echo1],       -4(%[echo_p])                   \n\t"
498      ".set           pop                                             \n\t"
499      : [temp0] "=&r" (temp0), [stored0] "=&r" (stored0),
500        [adept0] "=&r" (adept0), [spectrum0] "=&r" (spectrum0),
501        [echo0] "=&r" (echo0), [echo_p] "+r" (echo_p), [par3] "+r" (par3),
502        [par1] "+r" (par1), [par2] "+r" (par2), [stored1] "=&r" (stored1),
503        [adept1] "=&r" (adept1), [echo1] "=&r" (echo1),
504        [spectrum1] "=&r" (spectrum1), [temp1] "=&r" (temp1),
505        [ch_stored_p] "+r" (ch_stored_p), [ch_adapt_p] "+r" (ch_adapt_p),
506        [spectrum_p] "+r" (spectrum_p)
507      :
508      : "hi", "lo", "memory"
509    );
510  }
511
512  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
513                                             far_spectrum[PART_LEN]);
514  par1 += (uint32_t)(far_spectrum[PART_LEN]);
515  par2 += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
516  par3 += (uint32_t)echo_est[PART_LEN];
517
518  (*far_energy) = par1;
519  (*echo_energy_adapt) = par2;
520  (*echo_energy_stored) = par3;
521}
522
523#if defined(MIPS_DSP_R1_LE)
524void WebRtcAecm_StoreAdaptiveChannel_mips(AecmCore_t* aecm,
525                                          const uint16_t* far_spectrum,
526                                          int32_t* echo_est) {
527  int i;
528  int16_t* temp1;
529  uint16_t* temp8;
530  int32_t temp0, temp2, temp3, temp4, temp5, temp6;
531  int32_t* temp7 = &(echo_est[0]);
532  temp1 = &(aecm->channelStored[0]);
533  temp8 = (uint16_t*)(&far_spectrum[0]);
534
535  // During startup we store the channel every block.
536  memcpy(aecm->channelStored, aecm->channelAdapt16,
537         sizeof(int16_t) * PART_LEN1);
538  // Recalculate echo estimate
539  for (i = 0; i < PART_LEN; i += 4) {
540    __asm __volatile (
541      "ulw            %[temp0],   0(%[temp8])               \n\t"
542      "ulw            %[temp2],   0(%[temp1])               \n\t"
543      "ulw            %[temp4],   4(%[temp8])               \n\t"
544      "ulw            %[temp5],   4(%[temp1])               \n\t"
545      "muleq_s.w.phl  %[temp3],   %[temp2],     %[temp0]    \n\t"
546      "muleq_s.w.phr  %[temp0],   %[temp2],     %[temp0]    \n\t"
547      "muleq_s.w.phl  %[temp6],   %[temp5],     %[temp4]    \n\t"
548      "muleq_s.w.phr  %[temp4],   %[temp5],     %[temp4]    \n\t"
549      "addiu          %[temp7],   %[temp7],     16          \n\t"
550      "addiu          %[temp1],   %[temp1],     8           \n\t"
551      "addiu          %[temp8],   %[temp8],     8           \n\t"
552      "sra            %[temp3],   %[temp3],     1           \n\t"
553      "sra            %[temp0],   %[temp0],     1           \n\t"
554      "sra            %[temp6],   %[temp6],     1           \n\t"
555      "sra            %[temp4],   %[temp4],     1           \n\t"
556      "usw            %[temp3],   -12(%[temp7])             \n\t"
557      "usw            %[temp0],   -16(%[temp7])             \n\t"
558      "usw            %[temp6],   -4(%[temp7])              \n\t"
559      "usw            %[temp4],   -8(%[temp7])              \n\t"
560      : [temp0] "=&r" (temp0), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
561        [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [temp6] "=&r" (temp6),
562        [temp1] "+r" (temp1), [temp8] "+r" (temp8), [temp7] "+r" (temp7)
563      :
564      : "hi", "lo", "memory"
565    );
566  }
567  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
568                                      far_spectrum[i]);
569}
570
571void WebRtcAecm_ResetAdaptiveChannel_mips(AecmCore_t* aecm) {
572  int i;
573  int32_t* temp3;
574  int16_t* temp0;
575  int32_t temp1, temp2, temp4, temp5;
576
577  temp0 = &(aecm->channelStored[0]);
578  temp3 = &(aecm->channelAdapt32[0]);
579
580  // The stored channel has a significantly lower MSE than the adaptive one for
581  // two consecutive calculations. Reset the adaptive channel.
582  memcpy(aecm->channelAdapt16,
583         aecm->channelStored,
584         sizeof(int16_t) * PART_LEN1);
585
586  // Restore the W32 channel
587  for (i = 0; i < PART_LEN; i += 4) {
588    __asm __volatile (
589      "ulw            %[temp1], 0(%[temp0])           \n\t"
590      "ulw            %[temp4], 4(%[temp0])           \n\t"
591      "preceq.w.phl   %[temp2], %[temp1]              \n\t"
592      "preceq.w.phr   %[temp1], %[temp1]              \n\t"
593      "preceq.w.phl   %[temp5], %[temp4]              \n\t"
594      "preceq.w.phr   %[temp4], %[temp4]              \n\t"
595      "addiu          %[temp0], %[temp0], 8           \n\t"
596      "usw            %[temp2], 4(%[temp3])           \n\t"
597      "usw            %[temp1], 0(%[temp3])           \n\t"
598      "usw            %[temp5], 12(%[temp3])          \n\t"
599      "usw            %[temp4], 8(%[temp3])           \n\t"
600      "addiu          %[temp3], %[temp3], 16          \n\t"
601      : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
602        [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
603        [temp3] "+r" (temp3), [temp0] "+r" (temp0)
604      :
605      : "memory"
606    );
607  }
608
609  aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
610                              (int32_t)aecm->channelStored[i], 16);
611}
612#endif  // #if defined(MIPS_DSP_R1_LE)
613
614// Transforms a time domain signal into the frequency domain, outputting the
615// complex valued signal, absolute value and sum of absolute values.
616//
617// time_signal          [in]    Pointer to time domain signal
618// freq_signal_real     [out]   Pointer to real part of frequency domain array
619// freq_signal_imag     [out]   Pointer to imaginary part of frequency domain
620//                              array
621// freq_signal_abs      [out]   Pointer to absolute value of frequency domain
622//                              array
623// freq_signal_sum_abs  [out]   Pointer to the sum of all absolute values in
624//                              the frequency domain array
625// return value                 The Q-domain of current frequency values
626//
627static int TimeToFrequencyDomain(AecmCore_t* aecm,
628                                 const int16_t* time_signal,
629                                 complex16_t* freq_signal,
630                                 uint16_t* freq_signal_abs,
631                                 uint32_t* freq_signal_sum_abs)
632{
633  int i = 0;
634  int time_signal_scaling = 0;
635
636  // In fft_buf, +16 for 32-byte alignment.
637  int16_t fft_buf[PART_LEN4 + 16];
638  int16_t *fft = (int16_t *) (((uintptr_t) fft_buf + 31) & ~31);
639
640  int16_t tmp16no1;
641#if !defined(MIPS_DSP_R2_LE)
642  int32_t tmp32no1;
643  int32_t tmp32no2;
644  int16_t tmp16no2;
645#else
646  int32_t tmp32no10, tmp32no11, tmp32no12, tmp32no13;
647  int32_t tmp32no20, tmp32no21, tmp32no22, tmp32no23;
648  int16_t* freqp;
649  uint16_t* freqabsp;
650  uint32_t freqt0, freqt1, freqt2, freqt3;
651  uint32_t freqs;
652#endif
653
654#ifdef AECM_DYNAMIC_Q
655  tmp16no1 = WebRtcSpl_MaxAbsValueW16(time_signal, PART_LEN2);
656  time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
657#endif
658
659  WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling);
660
661  // Extract imaginary and real part,
662  // calculate the magnitude for all frequency bins
663  freq_signal[0].imag = 0;
664  freq_signal[PART_LEN].imag = 0;
665  freq_signal[PART_LEN].real = fft[PART_LEN2];
666  freq_signal_abs[0] = (uint16_t)WEBRTC_SPL_ABS_W16(freq_signal[0].real);
667  freq_signal_abs[PART_LEN] = (uint16_t)WEBRTC_SPL_ABS_W16(
668    freq_signal[PART_LEN].real);
669  (*freq_signal_sum_abs) = (uint32_t)(freq_signal_abs[0]) +
670    (uint32_t)(freq_signal_abs[PART_LEN]);
671
672#if !defined(MIPS_DSP_R2_LE)
673  for (i = 1; i < PART_LEN; i++) {
674    if (freq_signal[i].real == 0)
675    {
676      freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(
677        freq_signal[i].imag);
678    }
679    else if (freq_signal[i].imag == 0)
680    {
681      freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(
682        freq_signal[i].real);
683    }
684    else
685    {
686      // Approximation for magnitude of complex fft output
687      // magn = sqrt(real^2 + imag^2)
688      // magn ~= alpha * max(|imag|,|real|) + beta * min(|imag|,|real|)
689      //
690      // The parameters alpha and beta are stored in Q15
691      tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
692      tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
693      tmp32no1 = WEBRTC_SPL_MUL_16_16(tmp16no1, tmp16no1);
694      tmp32no2 = WEBRTC_SPL_MUL_16_16(tmp16no2, tmp16no2);
695      tmp32no2 = WebRtcSpl_AddSatW32(tmp32no1, tmp32no2);
696      tmp32no1 = WebRtcSpl_SqrtFloor(tmp32no2);
697
698      freq_signal_abs[i] = (uint16_t)tmp32no1;
699    }
700    (*freq_signal_sum_abs) += (uint32_t)freq_signal_abs[i];
701  }
702#else // #if !defined(MIPS_DSP_R2_LE)
703  freqs = (uint32_t)(freq_signal_abs[0]) +
704          (uint32_t)(freq_signal_abs[PART_LEN]);
705  freqp = &(freq_signal[1].real);
706
707  __asm __volatile (
708    "lw             %[freqt0],      0(%[freqp])             \n\t"
709    "lw             %[freqt1],      4(%[freqp])             \n\t"
710    "lw             %[freqt2],      8(%[freqp])             \n\t"
711    "mult           $ac0,           $zero,      $zero       \n\t"
712    "mult           $ac1,           $zero,      $zero       \n\t"
713    "mult           $ac2,           $zero,      $zero       \n\t"
714    "dpaq_s.w.ph    $ac0,           %[freqt0],  %[freqt0]   \n\t"
715    "dpaq_s.w.ph    $ac1,           %[freqt1],  %[freqt1]   \n\t"
716    "dpaq_s.w.ph    $ac2,           %[freqt2],  %[freqt2]   \n\t"
717    "addiu          %[freqp],       %[freqp],   12          \n\t"
718    "extr.w         %[tmp32no20],   $ac0,       1           \n\t"
719    "extr.w         %[tmp32no21],   $ac1,       1           \n\t"
720    "extr.w         %[tmp32no22],   $ac2,       1           \n\t"
721    : [freqt0] "=&r" (freqt0), [freqt1] "=&r" (freqt1),
722      [freqt2] "=&r" (freqt2), [freqp] "+r" (freqp),
723      [tmp32no20] "=r" (tmp32no20), [tmp32no21] "=r" (tmp32no21),
724      [tmp32no22] "=r" (tmp32no22)
725    :
726    : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo"
727  );
728
729  tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
730  tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
731  tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
732  freq_signal_abs[1] = (uint16_t)tmp32no10;
733  freq_signal_abs[2] = (uint16_t)tmp32no11;
734  freq_signal_abs[3] = (uint16_t)tmp32no12;
735  freqs += (uint32_t)tmp32no10;
736  freqs += (uint32_t)tmp32no11;
737  freqs += (uint32_t)tmp32no12;
738  freqabsp = &(freq_signal_abs[4]);
739  for (i = 4; i < PART_LEN; i+=4)
740  {
741    __asm __volatile (
742      "ulw            %[freqt0],      0(%[freqp])                 \n\t"
743      "ulw            %[freqt1],      4(%[freqp])                 \n\t"
744      "ulw            %[freqt2],      8(%[freqp])                 \n\t"
745      "ulw            %[freqt3],      12(%[freqp])                \n\t"
746      "mult           $ac0,           $zero,          $zero       \n\t"
747      "mult           $ac1,           $zero,          $zero       \n\t"
748      "mult           $ac2,           $zero,          $zero       \n\t"
749      "mult           $ac3,           $zero,          $zero       \n\t"
750      "dpaq_s.w.ph    $ac0,           %[freqt0],      %[freqt0]   \n\t"
751      "dpaq_s.w.ph    $ac1,           %[freqt1],      %[freqt1]   \n\t"
752      "dpaq_s.w.ph    $ac2,           %[freqt2],      %[freqt2]   \n\t"
753      "dpaq_s.w.ph    $ac3,           %[freqt3],      %[freqt3]   \n\t"
754      "addiu          %[freqp],       %[freqp],       16          \n\t"
755      "addiu          %[freqabsp],    %[freqabsp],    8           \n\t"
756      "extr.w         %[tmp32no20],   $ac0,           1           \n\t"
757      "extr.w         %[tmp32no21],   $ac1,           1           \n\t"
758      "extr.w         %[tmp32no22],   $ac2,           1           \n\t"
759      "extr.w         %[tmp32no23],   $ac3,           1           \n\t"
760      : [freqt0] "=&r" (freqt0), [freqt1] "=&r" (freqt1),
761        [freqt2] "=&r" (freqt2), [freqt3] "=&r" (freqt3),
762        [tmp32no20] "=r" (tmp32no20), [tmp32no21] "=r" (tmp32no21),
763        [tmp32no22] "=r" (tmp32no22), [tmp32no23] "=r" (tmp32no23),
764        [freqabsp] "+r" (freqabsp), [freqp] "+r" (freqp)
765      :
766      : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
767        "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
768    );
769
770    tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
771    tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
772    tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
773    tmp32no13 = WebRtcSpl_SqrtFloor(tmp32no23);
774
775    __asm __volatile (
776      "sh             %[tmp32no10],   -8(%[freqabsp])                 \n\t"
777      "sh             %[tmp32no11],   -6(%[freqabsp])                 \n\t"
778      "sh             %[tmp32no12],   -4(%[freqabsp])                 \n\t"
779      "sh             %[tmp32no13],   -2(%[freqabsp])                 \n\t"
780      "addu           %[freqs],       %[freqs],       %[tmp32no10]    \n\t"
781      "addu           %[freqs],       %[freqs],       %[tmp32no11]    \n\t"
782      "addu           %[freqs],       %[freqs],       %[tmp32no12]    \n\t"
783      "addu           %[freqs],       %[freqs],       %[tmp32no13]    \n\t"
784      : [freqs] "+r" (freqs)
785      : [tmp32no10] "r" (tmp32no10), [tmp32no11] "r" (tmp32no11),
786        [tmp32no12] "r" (tmp32no12), [tmp32no13] "r" (tmp32no13),
787        [freqabsp] "r" (freqabsp)
788      : "memory"
789    );
790  }
791
792  (*freq_signal_sum_abs) = freqs;
793#endif
794
795  return time_signal_scaling;
796}
797
798int WebRtcAecm_ProcessBlock(AecmCore_t* aecm,
799                            const int16_t* farend,
800                            const int16_t* nearendNoisy,
801                            const int16_t* nearendClean,
802                            int16_t* output) {
803  int i;
804  uint32_t xfaSum;
805  uint32_t dfaNoisySum;
806  uint32_t dfaCleanSum;
807  uint32_t echoEst32Gained;
808  uint32_t tmpU32;
809  int32_t tmp32no1;
810
811  uint16_t xfa[PART_LEN1];
812  uint16_t dfaNoisy[PART_LEN1];
813  uint16_t dfaClean[PART_LEN1];
814  uint16_t* ptrDfaClean = dfaClean;
815  const uint16_t* far_spectrum_ptr = NULL;
816
817  // 32 byte aligned buffers (with +8 or +16).
818  int16_t fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe.
819  int32_t echoEst32_buf[PART_LEN1 + 8];
820  int32_t dfw_buf[PART_LEN2 + 8];
821  int32_t efw_buf[PART_LEN2 + 8];
822
823  int16_t* fft = (int16_t*)(((uint32_t)fft_buf + 31) & ~ 31);
824  int32_t* echoEst32 = (int32_t*)(((uint32_t)echoEst32_buf + 31) & ~ 31);
825  complex16_t* dfw = (complex16_t*)(((uint32_t)dfw_buf + 31) & ~ 31);
826  complex16_t* efw = (complex16_t*)(((uint32_t)efw_buf + 31) & ~ 31);
827
828  int16_t hnl[PART_LEN1];
829  int16_t numPosCoef = 0;
830  int delay;
831  int16_t tmp16no1;
832  int16_t tmp16no2;
833  int16_t mu;
834  int16_t supGain;
835  int16_t zeros32, zeros16;
836  int16_t zerosDBufNoisy, zerosDBufClean, zerosXBuf;
837  int far_q;
838  int16_t resolutionDiff, qDomainDiff, dfa_clean_q_domain_diff;
839
840  const int kMinPrefBand = 4;
841  const int kMaxPrefBand = 24;
842  int32_t avgHnl32 = 0;
843
844  int32_t temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
845  int16_t* ptr;
846  int16_t* ptr1;
847  int16_t* er_ptr;
848  int16_t* dr_ptr;
849
850  ptr = &hnl[0];
851  ptr1 = &hnl[0];
852  er_ptr = &efw[0].real;
853  dr_ptr = &dfw[0].real;
854
855  // Determine startup state. There are three states:
856  // (0) the first CONV_LEN blocks
857  // (1) another CONV_LEN blocks
858  // (2) the rest
859
860  if (aecm->startupState < 2) {
861    aecm->startupState = (aecm->totCount >= CONV_LEN) +
862                         (aecm->totCount >= CONV_LEN2);
863  }
864  // END: Determine startup state
865
866  // Buffer near and far end signals
867  memcpy(aecm->xBuf + PART_LEN, farend, sizeof(int16_t) * PART_LEN);
868  memcpy(aecm->dBufNoisy + PART_LEN,
869         nearendNoisy,
870         sizeof(int16_t) * PART_LEN);
871  if (nearendClean != NULL) {
872    memcpy(aecm->dBufClean + PART_LEN,
873           nearendClean,
874           sizeof(int16_t) * PART_LEN);
875  }
876
877  // Transform far end signal from time domain to frequency domain.
878  far_q = TimeToFrequencyDomain(aecm,
879                                aecm->xBuf,
880                                dfw,
881                                xfa,
882                                &xfaSum);
883
884  // Transform noisy near end signal from time domain to frequency domain.
885  zerosDBufNoisy = TimeToFrequencyDomain(aecm,
886                                         aecm->dBufNoisy,
887                                         dfw,
888                                         dfaNoisy,
889                                         &dfaNoisySum);
890  aecm->dfaNoisyQDomainOld = aecm->dfaNoisyQDomain;
891  aecm->dfaNoisyQDomain = (int16_t)zerosDBufNoisy;
892
893  if (nearendClean == NULL) {
894    ptrDfaClean = dfaNoisy;
895    aecm->dfaCleanQDomainOld = aecm->dfaNoisyQDomainOld;
896    aecm->dfaCleanQDomain = aecm->dfaNoisyQDomain;
897    dfaCleanSum = dfaNoisySum;
898  } else {
899    // Transform clean near end signal from time domain to frequency domain.
900    zerosDBufClean = TimeToFrequencyDomain(aecm,
901                                           aecm->dBufClean,
902                                           dfw,
903                                           dfaClean,
904                                           &dfaCleanSum);
905    aecm->dfaCleanQDomainOld = aecm->dfaCleanQDomain;
906    aecm->dfaCleanQDomain = (int16_t)zerosDBufClean;
907  }
908
909  // Get the delay
910  // Save far-end history and estimate delay
911  WebRtcAecm_UpdateFarHistory(aecm, xfa, far_q);
912
913  if (WebRtc_AddFarSpectrumFix(aecm->delay_estimator_farend, xfa, PART_LEN1,
914                               far_q) == -1) {
915    return -1;
916  }
917  delay = WebRtc_DelayEstimatorProcessFix(aecm->delay_estimator,
918                                          dfaNoisy,
919                                          PART_LEN1,
920                                          zerosDBufNoisy);
921  if (delay == -1) {
922    return -1;
923  }
924  else if (delay == -2) {
925    // If the delay is unknown, we assume zero.
926    // NOTE: this will have to be adjusted if we ever add lookahead.
927    delay = 0;
928  }
929
930  if (aecm->fixedDelay >= 0) {
931    // Use fixed delay
932    delay = aecm->fixedDelay;
933  }
934
935  // Get aligned far end spectrum
936  far_spectrum_ptr = WebRtcAecm_AlignedFarend(aecm, &far_q, delay);
937  zerosXBuf = (int16_t) far_q;
938
939  if (far_spectrum_ptr == NULL) {
940    return -1;
941  }
942
943  // Calculate log(energy) and update energy threshold levels
944  WebRtcAecm_CalcEnergies(aecm,
945                          far_spectrum_ptr,
946                          zerosXBuf,
947                          dfaNoisySum,
948                          echoEst32);
949  // Calculate stepsize
950  mu = WebRtcAecm_CalcStepSize(aecm);
951
952  // Update counters
953  aecm->totCount++;
954
955  // This is the channel estimation algorithm.
956  // It is base on NLMS but has a variable step length,
957  // which was calculated above.
958  WebRtcAecm_UpdateChannel(aecm,
959                           far_spectrum_ptr,
960                           zerosXBuf,
961                           dfaNoisy,
962                           mu,
963                           echoEst32);
964
965  supGain = WebRtcAecm_CalcSuppressionGain(aecm);
966
967  // Calculate Wiener filter hnl[]
968  for (i = 0; i < PART_LEN1; i++) {
969    // Far end signal through channel estimate in Q8
970    // How much can we shift right to preserve resolution
971    tmp32no1 = echoEst32[i] - aecm->echoFilt[i];
972    aecm->echoFilt[i] += (tmp32no1 * 50) >> 8;
973
974    zeros32 = WebRtcSpl_NormW32(aecm->echoFilt[i]) + 1;
975    zeros16 = WebRtcSpl_NormW16(supGain) + 1;
976    if (zeros32 + zeros16 > 16) {
977      // Multiplication is safe
978      // Result in
979      // Q(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN+aecm->xfaQDomainBuf[diff])
980      echoEst32Gained = WEBRTC_SPL_UMUL_32_16((uint32_t)aecm->echoFilt[i],
981                                              (uint16_t)supGain);
982      resolutionDiff = 14 - RESOLUTION_CHANNEL16 - RESOLUTION_SUPGAIN;
983      resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
984    } else {
985      tmp16no1 = 17 - zeros32 - zeros16;
986      resolutionDiff = 14 + tmp16no1 - RESOLUTION_CHANNEL16 -
987                       RESOLUTION_SUPGAIN;
988      resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
989      if (zeros32 > tmp16no1) {
990        echoEst32Gained = WEBRTC_SPL_UMUL_32_16(
991                            (uint32_t)aecm->echoFilt[i],
992                            (uint16_t)WEBRTC_SPL_RSHIFT_W16(supGain, tmp16no1));
993      } else {
994        // Result in Q-(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN-16)
995        echoEst32Gained = WEBRTC_SPL_UMUL_32_16(
996                            (uint32_t)WEBRTC_SPL_RSHIFT_W32(aecm->echoFilt[i],
997                                                            tmp16no1),
998                            (uint16_t)supGain);
999      }
1000    }
1001
1002    zeros16 = WebRtcSpl_NormW16(aecm->nearFilt[i]);
1003    assert(zeros16 >= 0);  // |zeros16| is a norm, hence non-negative.
1004    dfa_clean_q_domain_diff = aecm->dfaCleanQDomain - aecm->dfaCleanQDomainOld;
1005    if (zeros16 < dfa_clean_q_domain_diff && aecm->nearFilt[i]) {
1006      tmp16no1 = aecm->nearFilt[i] << zeros16;
1007      qDomainDiff = zeros16 - dfa_clean_q_domain_diff;
1008      tmp16no2 = ptrDfaClean[i] >> -qDomainDiff;
1009    } else {
1010      tmp16no1 = dfa_clean_q_domain_diff < 0
1011          ? aecm->nearFilt[i] >> -dfa_clean_q_domain_diff
1012          : aecm->nearFilt[i] << dfa_clean_q_domain_diff;
1013      qDomainDiff = 0;
1014      tmp16no2 = ptrDfaClean[i];
1015    }
1016
1017    tmp32no1 = (int32_t)(tmp16no2 - tmp16no1);
1018    tmp16no2 = (int16_t)WEBRTC_SPL_RSHIFT_W32(tmp32no1, 4);
1019    tmp16no2 += tmp16no1;
1020    zeros16 = WebRtcSpl_NormW16(tmp16no2);
1021    if ((tmp16no2) & (-qDomainDiff > zeros16)) {
1022      aecm->nearFilt[i] = WEBRTC_SPL_WORD16_MAX;
1023    } else {
1024      aecm->nearFilt[i] = qDomainDiff < 0 ? tmp16no2 << -qDomainDiff
1025                                          : tmp16no2 >> qDomainDiff;
1026    }
1027
1028    // Wiener filter coefficients, resulting hnl in Q14
1029    if (echoEst32Gained == 0) {
1030      hnl[i] = ONE_Q14;
1031      numPosCoef++;
1032    } else if (aecm->nearFilt[i] == 0) {
1033      hnl[i] = 0;
1034    } else {
1035      // Multiply the suppression gain
1036      // Rounding
1037      echoEst32Gained += (uint32_t)(aecm->nearFilt[i] >> 1);
1038      tmpU32 = WebRtcSpl_DivU32U16(echoEst32Gained,
1039                                   (uint16_t)aecm->nearFilt[i]);
1040
1041      // Current resolution is
1042      // Q-(RESOLUTION_CHANNEL + RESOLUTION_SUPGAIN
1043      //    - max(0, 17 - zeros16 - zeros32))
1044      // Make sure we are in Q14
1045      tmp32no1 = (int32_t)WEBRTC_SPL_SHIFT_W32(tmpU32, resolutionDiff);
1046      if (tmp32no1 > ONE_Q14) {
1047        hnl[i] = 0;
1048      } else if (tmp32no1 < 0) {
1049        hnl[i] = ONE_Q14;
1050        numPosCoef++;
1051      } else {
1052        // 1-echoEst/dfa
1053        hnl[i] = ONE_Q14 - (int16_t)tmp32no1;
1054        if (hnl[i] <= 0) {
1055          hnl[i] = 0;
1056        } else {
1057          numPosCoef++;
1058        }
1059      }
1060    }
1061  }
1062
1063  // Only in wideband. Prevent the gain in upper band from being larger than
1064  // in lower band.
1065  if (aecm->mult == 2) {
1066    // TODO(bjornv): Investigate if the scaling of hnl[i] below can cause
1067    //               speech distortion in double-talk.
1068    for (i = 0; i < (PART_LEN1 >> 3); i++) {
1069      __asm __volatile (
1070        "lh         %[temp1],       0(%[ptr1])                  \n\t"
1071        "lh         %[temp2],       2(%[ptr1])                  \n\t"
1072        "lh         %[temp3],       4(%[ptr1])                  \n\t"
1073        "lh         %[temp4],       6(%[ptr1])                  \n\t"
1074        "lh         %[temp5],       8(%[ptr1])                  \n\t"
1075        "lh         %[temp6],       10(%[ptr1])                 \n\t"
1076        "lh         %[temp7],       12(%[ptr1])                 \n\t"
1077        "lh         %[temp8],       14(%[ptr1])                 \n\t"
1078        "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
1079        "mul        %[temp2],       %[temp2],       %[temp2]    \n\t"
1080        "mul        %[temp3],       %[temp3],       %[temp3]    \n\t"
1081        "mul        %[temp4],       %[temp4],       %[temp4]    \n\t"
1082        "mul        %[temp5],       %[temp5],       %[temp5]    \n\t"
1083        "mul        %[temp6],       %[temp6],       %[temp6]    \n\t"
1084        "mul        %[temp7],       %[temp7],       %[temp7]    \n\t"
1085        "mul        %[temp8],       %[temp8],       %[temp8]    \n\t"
1086        "sra        %[temp1],       %[temp1],       14          \n\t"
1087        "sra        %[temp2],       %[temp2],       14          \n\t"
1088        "sra        %[temp3],       %[temp3],       14          \n\t"
1089        "sra        %[temp4],       %[temp4],       14          \n\t"
1090        "sra        %[temp5],       %[temp5],       14          \n\t"
1091        "sra        %[temp6],       %[temp6],       14          \n\t"
1092        "sra        %[temp7],       %[temp7],       14          \n\t"
1093        "sra        %[temp8],       %[temp8],       14          \n\t"
1094        "sh         %[temp1],       0(%[ptr1])                  \n\t"
1095        "sh         %[temp2],       2(%[ptr1])                  \n\t"
1096        "sh         %[temp3],       4(%[ptr1])                  \n\t"
1097        "sh         %[temp4],       6(%[ptr1])                  \n\t"
1098        "sh         %[temp5],       8(%[ptr1])                  \n\t"
1099        "sh         %[temp6],       10(%[ptr1])                 \n\t"
1100        "sh         %[temp7],       12(%[ptr1])                 \n\t"
1101        "sh         %[temp8],       14(%[ptr1])                 \n\t"
1102        "addiu      %[ptr1],        %[ptr1],        16          \n\t"
1103        : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1104          [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [temp6] "=&r" (temp6),
1105          [temp7] "=&r" (temp7), [temp8] "=&r" (temp8), [ptr1] "+r" (ptr1)
1106        :
1107        : "memory", "hi", "lo"
1108      );
1109    }
1110    for(i = 0; i < (PART_LEN1 & 7); i++) {
1111      __asm __volatile (
1112        "lh         %[temp1],       0(%[ptr1])                  \n\t"
1113        "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
1114        "sra        %[temp1],       %[temp1],       14          \n\t"
1115        "sh         %[temp1],       0(%[ptr1])                  \n\t"
1116        "addiu      %[ptr1],        %[ptr1],        2           \n\t"
1117        : [temp1] "=&r" (temp1), [ptr1] "+r" (ptr1)
1118        :
1119        : "memory", "hi", "lo"
1120      );
1121    }
1122
1123    for (i = kMinPrefBand; i <= kMaxPrefBand; i++) {
1124      avgHnl32 += (int32_t)hnl[i];
1125    }
1126
1127    assert(kMaxPrefBand - kMinPrefBand + 1 > 0);
1128    avgHnl32 /= (kMaxPrefBand - kMinPrefBand + 1);
1129
1130    for (i = kMaxPrefBand; i < PART_LEN1; i++) {
1131      if (hnl[i] > (int16_t)avgHnl32) {
1132        hnl[i] = (int16_t)avgHnl32;
1133      }
1134    }
1135  }
1136
1137  // Calculate NLP gain, result is in Q14
1138  if (aecm->nlpFlag) {
1139    if (numPosCoef < 3) {
1140      for (i = 0; i < PART_LEN1; i++) {
1141        efw[i].real = 0;
1142        efw[i].imag = 0;
1143        hnl[i] = 0;
1144      }
1145    } else {
1146      for (i = 0; i < PART_LEN1; i++) {
1147#if defined(MIPS_DSP_R1_LE)
1148        __asm __volatile (
1149          ".set       push                                        \n\t"
1150          ".set       noreorder                                   \n\t"
1151          "lh         %[temp1],       0(%[ptr])                   \n\t"
1152          "lh         %[temp2],       0(%[dr_ptr])                \n\t"
1153          "slti       %[temp4],       %[temp1],       0x4001      \n\t"
1154          "beqz       %[temp4],       3f                          \n\t"
1155          " lh        %[temp3],       2(%[dr_ptr])                \n\t"
1156          "slti       %[temp5],       %[temp1],       3277        \n\t"
1157          "bnez       %[temp5],       2f                          \n\t"
1158          " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
1159          "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
1160          "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
1161          "shra_r.w   %[temp2],       %[temp2],       14          \n\t"
1162          "shra_r.w   %[temp3],       %[temp3],       14          \n\t"
1163          "b          4f                                          \n\t"
1164          " nop                                                   \n\t"
1165         "2:                                                      \n\t"
1166          "addu       %[temp1],       $zero,          $zero       \n\t"
1167          "addu       %[temp2],       $zero,          $zero       \n\t"
1168          "addu       %[temp3],       $zero,          $zero       \n\t"
1169          "b          1f                                          \n\t"
1170          " nop                                                   \n\t"
1171         "3:                                                      \n\t"
1172          "addiu      %[temp1],       $0,             0x4000      \n\t"
1173         "1:                                                      \n\t"
1174          "sh         %[temp1],       0(%[ptr])                   \n\t"
1175         "4:                                                      \n\t"
1176          "sh         %[temp2],       0(%[er_ptr])                \n\t"
1177          "sh         %[temp3],       2(%[er_ptr])                \n\t"
1178          "addiu      %[ptr],         %[ptr],         2           \n\t"
1179          "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
1180          ".set       pop                                         \n\t"
1181          : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1182            [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [ptr] "+r" (ptr),
1183            [er_ptr] "+r" (er_ptr), [dr_ptr] "+r" (dr_ptr)
1184          :
1185          : "memory", "hi", "lo"
1186        );
1187#else
1188        __asm __volatile (
1189          ".set       push                                        \n\t"
1190          ".set       noreorder                                   \n\t"
1191          "lh         %[temp1],       0(%[ptr])                   \n\t"
1192          "lh         %[temp2],       0(%[dr_ptr])                \n\t"
1193          "slti       %[temp4],       %[temp1],       0x4001      \n\t"
1194          "beqz       %[temp4],       3f                          \n\t"
1195          " lh        %[temp3],       2(%[dr_ptr])                \n\t"
1196          "slti       %[temp5],       %[temp1],       3277        \n\t"
1197          "bnez       %[temp5],       2f                          \n\t"
1198          " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
1199          "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
1200          "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
1201          "addiu      %[temp2],       %[temp2],       0x2000      \n\t"
1202          "addiu      %[temp3],       %[temp3],       0x2000      \n\t"
1203          "sra        %[temp2],       %[temp2],       14          \n\t"
1204          "sra        %[temp3],       %[temp3],       14          \n\t"
1205          "b          4f                                          \n\t"
1206          " nop                                                   \n\t"
1207         "2:                                                      \n\t"
1208          "addu       %[temp1],       $zero,          $zero       \n\t"
1209          "addu       %[temp2],       $zero,          $zero       \n\t"
1210          "addu       %[temp3],       $zero,          $zero       \n\t"
1211          "b          1f                                          \n\t"
1212          " nop                                                   \n\t"
1213         "3:                                                      \n\t"
1214          "addiu      %[temp1],       $0,             0x4000      \n\t"
1215         "1:                                                      \n\t"
1216          "sh         %[temp1],       0(%[ptr])                   \n\t"
1217         "4:                                                      \n\t"
1218          "sh         %[temp2],       0(%[er_ptr])                \n\t"
1219          "sh         %[temp3],       2(%[er_ptr])                \n\t"
1220          "addiu      %[ptr],         %[ptr],         2           \n\t"
1221          "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
1222          ".set       pop                                         \n\t"
1223          : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1224            [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [ptr] "+r" (ptr),
1225            [er_ptr] "+r" (er_ptr), [dr_ptr] "+r" (dr_ptr)
1226          :
1227          : "memory", "hi", "lo"
1228        );
1229#endif
1230      }
1231    }
1232  }
1233  else {
1234    // multiply with Wiener coefficients
1235    for (i = 0; i < PART_LEN1; i++) {
1236      efw[i].real = (int16_t)
1237                      (WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real,
1238                                                            hnl[i],
1239                                                            14));
1240      efw[i].imag = (int16_t)
1241                      (WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag,
1242                                                            hnl[i],
1243                                                            14));
1244    }
1245  }
1246
1247  if (aecm->cngMode == AecmTrue) {
1248    ComfortNoise(aecm, ptrDfaClean, efw, hnl);
1249  }
1250
1251  InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
1252
1253  return 0;
1254}
1255
1256// Generate comfort noise and add to output signal.
1257static void ComfortNoise(AecmCore_t* aecm,
1258                         const uint16_t* dfa,
1259                         complex16_t* out,
1260                         const int16_t* lambda) {
1261  int16_t i;
1262  int16_t tmp16, tmp161, tmp162, tmp163, nrsh1, nrsh2;
1263  int32_t tmp32, tmp321, tnoise, tnoise1;
1264  int32_t tmp322, tmp323, *tmp1;
1265  int16_t* dfap;
1266  int16_t* lambdap;
1267  const int32_t c2049 = 2049;
1268  const int32_t c359 = 359;
1269  const int32_t c114 = ONE_Q14;
1270
1271  int16_t randW16[PART_LEN];
1272  int16_t uReal[PART_LEN1];
1273  int16_t uImag[PART_LEN1];
1274  int32_t outLShift32;
1275
1276  int16_t shiftFromNearToNoise = kNoiseEstQDomain - aecm->dfaCleanQDomain;
1277  int16_t minTrackShift = 9;
1278
1279  assert(shiftFromNearToNoise >= 0);
1280  assert(shiftFromNearToNoise < 16);
1281
1282  if (aecm->noiseEstCtr < 100) {
1283    // Track the minimum more quickly initially.
1284    aecm->noiseEstCtr++;
1285    minTrackShift = 6;
1286  }
1287
1288  // Generate a uniform random array on [0 2^15-1].
1289  WebRtcSpl_RandUArray(randW16, PART_LEN, &aecm->seed);
1290  int16_t* randW16p = (int16_t*)randW16;
1291#if defined (MIPS_DSP_R1_LE)
1292  int16_t* kCosTablep = (int16_t*)WebRtcAecm_kCosTable;
1293  int16_t* kSinTablep = (int16_t*)WebRtcAecm_kSinTable;
1294#endif   // #if defined(MIPS_DSP_R1_LE)
1295  tmp1 = (int32_t*)aecm->noiseEst + 1;
1296  dfap = (int16_t*)dfa + 1;
1297  lambdap = (int16_t*)lambda + 1;
1298  // Estimate noise power.
1299  for (i = 1; i < PART_LEN1; i+=2) {
1300  // Shift to the noise domain.
1301    __asm __volatile (
1302      "lh     %[tmp32],       0(%[dfap])                              \n\t"
1303      "lw     %[tnoise],      0(%[tmp1])                              \n\t"
1304      "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
1305      : [tmp32] "=&r" (tmp32), [outLShift32] "=r" (outLShift32),
1306        [tnoise] "=&r" (tnoise)
1307      : [tmp1] "r" (tmp1), [dfap] "r" (dfap),
1308        [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1309      : "memory"
1310    );
1311
1312    if (outLShift32 < tnoise) {
1313      // Reset "too low" counter
1314      aecm->noiseEstTooLowCtr[i] = 0;
1315      // Track the minimum.
1316      if (tnoise < (1 << minTrackShift)) {
1317        // For small values, decrease noiseEst[i] every
1318        // |kNoiseEstIncCount| block. The regular approach below can not
1319        // go further down due to truncation.
1320        aecm->noiseEstTooHighCtr[i]++;
1321        if (aecm->noiseEstTooHighCtr[i] >= kNoiseEstIncCount) {
1322          tnoise--;
1323          aecm->noiseEstTooHighCtr[i] = 0;  // Reset the counter
1324        }
1325      } else {
1326        __asm __volatile (
1327          "subu   %[tmp32],       %[tnoise],      %[outLShift32]      \n\t"
1328          "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
1329          "subu   %[tnoise],      %[tnoise],      %[tmp32]            \n\t"
1330          : [tmp32] "=&r" (tmp32), [tnoise] "+r" (tnoise)
1331          : [outLShift32] "r" (outLShift32), [minTrackShift] "r" (minTrackShift)
1332        );
1333      }
1334    } else {
1335      // Reset "too high" counter
1336      aecm->noiseEstTooHighCtr[i] = 0;
1337      // Ramp slowly upwards until we hit the minimum again.
1338      if ((tnoise >> 19) <= 0) {
1339        if ((tnoise >> 11) > 0) {
1340          // Large enough for relative increase
1341          __asm __volatile (
1342            "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
1343            "sra    %[tnoise],  %[tnoise],  11          \n\t"
1344            : [tnoise] "+r" (tnoise)
1345            : [c2049] "r" (c2049)
1346            : "hi", "lo"
1347          );
1348        } else {
1349          // Make incremental increases based on size every
1350          // |kNoiseEstIncCount| block
1351          aecm->noiseEstTooLowCtr[i]++;
1352          if (aecm->noiseEstTooLowCtr[i] >= kNoiseEstIncCount) {
1353            __asm __volatile (
1354              "sra    %[tmp32],   %[tnoise],  9           \n\t"
1355              "addi   %[tnoise],  %[tnoise],  1           \n\t"
1356              "addu   %[tnoise],  %[tnoise],  %[tmp32]    \n\t"
1357              : [tnoise] "+r" (tnoise), [tmp32] "=&r" (tmp32)
1358              :
1359            );
1360            aecm->noiseEstTooLowCtr[i] = 0; // Reset counter
1361          }
1362        }
1363      } else {
1364        // Avoid overflow.
1365        // Multiplication with 2049 will cause wrap around. Scale
1366        // down first and then multiply
1367        __asm __volatile (
1368          "sra    %[tnoise],  %[tnoise],  11          \n\t"
1369          "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
1370          : [tnoise] "+r" (tnoise)
1371          : [c2049] "r" (c2049)
1372          : "hi", "lo"
1373        );
1374      }
1375    }
1376
1377    // Shift to the noise domain.
1378    __asm __volatile (
1379      "lh     %[tmp32],       2(%[dfap])                              \n\t"
1380      "lw     %[tnoise1],     4(%[tmp1])                              \n\t"
1381      "addiu  %[dfap],        %[dfap],    4                           \n\t"
1382      "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
1383      : [tmp32] "=&r" (tmp32), [dfap] "+r" (dfap),
1384        [outLShift32] "=r" (outLShift32), [tnoise1] "=&r" (tnoise1)
1385      : [tmp1] "r" (tmp1), [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1386      : "memory"
1387    );
1388
1389    if (outLShift32 < tnoise1) {
1390      // Reset "too low" counter
1391      aecm->noiseEstTooLowCtr[i + 1] = 0;
1392      // Track the minimum.
1393      if (tnoise1 < (1 << minTrackShift)) {
1394        // For small values, decrease noiseEst[i] every
1395        // |kNoiseEstIncCount| block. The regular approach below can not
1396        // go further down due to truncation.
1397        aecm->noiseEstTooHighCtr[i + 1]++;
1398        if (aecm->noiseEstTooHighCtr[i + 1] >= kNoiseEstIncCount) {
1399          tnoise1--;
1400          aecm->noiseEstTooHighCtr[i + 1] = 0; // Reset the counter
1401        }
1402      } else {
1403        __asm __volatile (
1404          "subu   %[tmp32],       %[tnoise1],     %[outLShift32]      \n\t"
1405          "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
1406          "subu   %[tnoise1],     %[tnoise1],     %[tmp32]            \n\t"
1407          : [tmp32] "=&r" (tmp32), [tnoise1] "+r" (tnoise1)
1408          : [outLShift32] "r" (outLShift32), [minTrackShift] "r" (minTrackShift)
1409        );
1410      }
1411    } else {
1412      // Reset "too high" counter
1413      aecm->noiseEstTooHighCtr[i + 1] = 0;
1414      // Ramp slowly upwards until we hit the minimum again.
1415      if ((tnoise1 >> 19) <= 0) {
1416        if ((tnoise1 >> 11) > 0) {
1417          // Large enough for relative increase
1418          __asm __volatile (
1419            "mul    %[tnoise1], %[tnoise1], %[c2049]   \n\t"
1420            "sra    %[tnoise1], %[tnoise1], 11         \n\t"
1421            : [tnoise1] "+r" (tnoise1)
1422            : [c2049] "r" (c2049)
1423            : "hi", "lo"
1424          );
1425        } else {
1426          // Make incremental increases based on size every
1427          // |kNoiseEstIncCount| block
1428          aecm->noiseEstTooLowCtr[i + 1]++;
1429          if (aecm->noiseEstTooLowCtr[i + 1] >= kNoiseEstIncCount) {
1430            __asm __volatile (
1431              "sra    %[tmp32],   %[tnoise1], 9           \n\t"
1432              "addi   %[tnoise1], %[tnoise1], 1           \n\t"
1433              "addu   %[tnoise1], %[tnoise1], %[tmp32]    \n\t"
1434              : [tnoise1] "+r" (tnoise1), [tmp32] "=&r" (tmp32)
1435              :
1436            );
1437            aecm->noiseEstTooLowCtr[i + 1] = 0; // Reset counter
1438          }
1439        }
1440      } else {
1441        // Avoid overflow.
1442        // Multiplication with 2049 will cause wrap around. Scale
1443        // down first and then multiply
1444        __asm __volatile (
1445          "sra    %[tnoise1], %[tnoise1], 11          \n\t"
1446          "mul    %[tnoise1], %[tnoise1], %[c2049]    \n\t"
1447          : [tnoise1] "+r" (tnoise1)
1448          : [c2049] "r" (c2049)
1449          : "hi", "lo"
1450        );
1451      }
1452    }
1453
1454    __asm __volatile (
1455      "lh     %[tmp16],   0(%[lambdap])                           \n\t"
1456      "lh     %[tmp161],  2(%[lambdap])                           \n\t"
1457      "sw     %[tnoise],  0(%[tmp1])                              \n\t"
1458      "sw     %[tnoise1], 4(%[tmp1])                              \n\t"
1459      "subu   %[tmp16],   %[c114],        %[tmp16]                \n\t"
1460      "subu   %[tmp161],  %[c114],        %[tmp161]               \n\t"
1461      "srav   %[tmp32],   %[tnoise],      %[shiftFromNearToNoise] \n\t"
1462      "srav   %[tmp321],  %[tnoise1],     %[shiftFromNearToNoise] \n\t"
1463      "addiu  %[lambdap], %[lambdap],     4                       \n\t"
1464      "addiu  %[tmp1],    %[tmp1],        8                       \n\t"
1465      : [tmp16] "=&r" (tmp16), [tmp161] "=&r" (tmp161), [tmp1] "+r" (tmp1),
1466        [tmp32] "=&r" (tmp32), [tmp321] "=&r" (tmp321), [lambdap] "+r" (lambdap)
1467      : [tnoise] "r" (tnoise), [tnoise1] "r" (tnoise1), [c114] "r" (c114),
1468        [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1469      : "memory"
1470    );
1471
1472    if (tmp32 > 32767) {
1473      tmp32 = 32767;
1474      aecm->noiseEst[i] = WEBRTC_SPL_LSHIFT_W32(tmp32, shiftFromNearToNoise);
1475    }
1476    if (tmp321 > 32767) {
1477      tmp321 = 32767;
1478      aecm->noiseEst[i+1] = WEBRTC_SPL_LSHIFT_W32(tmp321, shiftFromNearToNoise);
1479    }
1480
1481    __asm __volatile (
1482      "mul    %[tmp32],   %[tmp32],       %[tmp16]                \n\t"
1483      "mul    %[tmp321],  %[tmp321],      %[tmp161]               \n\t"
1484      "sra    %[nrsh1],   %[tmp32],       14                      \n\t"
1485      "sra    %[nrsh2],   %[tmp321],      14                      \n\t"
1486      : [nrsh1] "=&r" (nrsh1), [nrsh2] "=r" (nrsh2)
1487      : [tmp16] "r" (tmp16), [tmp161] "r" (tmp161), [tmp32] "r" (tmp32),
1488        [tmp321] "r" (tmp321)
1489      : "memory", "hi", "lo"
1490    );
1491
1492    __asm __volatile (
1493      "lh     %[tmp32],       0(%[randW16p])              \n\t"
1494      "lh     %[tmp321],      2(%[randW16p])              \n\t"
1495      "addiu  %[randW16p],    %[randW16p],    4           \n\t"
1496      "mul    %[tmp32],       %[tmp32],       %[c359]     \n\t"
1497      "mul    %[tmp321],      %[tmp321],      %[c359]     \n\t"
1498      "sra    %[tmp16],       %[tmp32],       15          \n\t"
1499      "sra    %[tmp161],      %[tmp321],      15          \n\t"
1500      : [randW16p] "+r" (randW16p), [tmp32] "=&r" (tmp32),
1501        [tmp16] "=r" (tmp16), [tmp161] "=r" (tmp161), [tmp321] "=&r" (tmp321)
1502      : [c359] "r" (c359)
1503      : "memory", "hi", "lo"
1504    );
1505
1506#if !defined(MIPS_DSP_R1_LE)
1507    tmp32 = WebRtcAecm_kCosTable[tmp16];
1508    tmp321 = WebRtcAecm_kSinTable[tmp16];
1509    tmp322 = WebRtcAecm_kCosTable[tmp161];
1510    tmp323 = WebRtcAecm_kSinTable[tmp161];
1511#else
1512    __asm __volatile (
1513      "sll    %[tmp16],       %[tmp16],                   1           \n\t"
1514      "sll    %[tmp161],      %[tmp161],                  1           \n\t"
1515      "lhx    %[tmp32],       %[tmp16](%[kCosTablep])                 \n\t"
1516      "lhx    %[tmp321],      %[tmp16](%[kSinTablep])                 \n\t"
1517      "lhx    %[tmp322],      %[tmp161](%[kCosTablep])                \n\t"
1518      "lhx    %[tmp323],      %[tmp161](%[kSinTablep])                \n\t"
1519      : [tmp32] "=&r" (tmp32), [tmp321] "=&r" (tmp321),
1520        [tmp322] "=&r" (tmp322), [tmp323] "=&r" (tmp323)
1521      : [kCosTablep] "r" (kCosTablep), [tmp16] "r" (tmp16),
1522        [tmp161] "r" (tmp161), [kSinTablep] "r" (kSinTablep)
1523      : "memory"
1524    );
1525#endif
1526    __asm __volatile (
1527      "mul    %[tmp32],       %[tmp32],                   %[nrsh1]    \n\t"
1528      "negu   %[tmp162],      %[nrsh1]                                \n\t"
1529      "mul    %[tmp322],      %[tmp322],                  %[nrsh2]    \n\t"
1530      "negu   %[tmp163],      %[nrsh2]                                \n\t"
1531      "sra    %[tmp32],       %[tmp32],                   13          \n\t"
1532      "mul    %[tmp321],      %[tmp321],                  %[tmp162]   \n\t"
1533      "sra    %[tmp322],      %[tmp322],                  13          \n\t"
1534      "mul    %[tmp323],      %[tmp323],                  %[tmp163]   \n\t"
1535      "sra    %[tmp321],      %[tmp321],                  13          \n\t"
1536      "sra    %[tmp323],      %[tmp323],                  13          \n\t"
1537      : [tmp32] "+r" (tmp32), [tmp321] "+r" (tmp321), [tmp162] "=&r" (tmp162),
1538        [tmp322] "+r" (tmp322), [tmp323] "+r" (tmp323), [tmp163] "=&r" (tmp163)
1539      : [nrsh1] "r" (nrsh1), [nrsh2] "r" (nrsh2)
1540      : "hi", "lo"
1541    );
1542    // Tables are in Q13.
1543    uReal[i] = (int16_t)tmp32;
1544    uImag[i] = (int16_t)tmp321;
1545    uReal[i + 1] = (int16_t)tmp322;
1546    uImag[i + 1] = (int16_t)tmp323;
1547  }
1548
1549  int32_t tt, sgn;
1550  tt = out[0].real;
1551  sgn = ((int)tt) >> 31;
1552  out[0].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1553  tt = out[0].imag;
1554  sgn = ((int)tt) >> 31;
1555  out[0].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1556  for (i = 1; i < PART_LEN; i++) {
1557    tt = out[i].real + uReal[i];
1558    sgn = ((int)tt) >> 31;
1559    out[i].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1560    tt = out[i].imag + uImag[i];
1561    sgn = ((int)tt) >> 31;
1562    out[i].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1563  }
1564  tt = out[PART_LEN].real + uReal[PART_LEN];
1565  sgn = ((int)tt) >> 31;
1566  out[PART_LEN].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1567  tt = out[PART_LEN].imag;
1568  sgn = ((int)tt) >> 31;
1569  out[PART_LEN].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1570}
1571
1572