1/*
2 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "webrtc/modules/audio_processing/ns/nsx_core.h"
12
13#include <arm_neon.h>
14#include <assert.h>
15
16// Constants to compensate for shifting signal log(2^shifts).
17const int16_t WebRtcNsx_kLogTable[9] = {
18  0, 177, 355, 532, 710, 887, 1065, 1242, 1420
19};
20
21const int16_t WebRtcNsx_kCounterDiv[201] = {
22  32767, 16384, 10923, 8192, 6554, 5461, 4681, 4096, 3641, 3277, 2979, 2731,
23  2521, 2341, 2185, 2048, 1928, 1820, 1725, 1638, 1560, 1489, 1425, 1365, 1311,
24  1260, 1214, 1170, 1130, 1092, 1057, 1024, 993, 964, 936, 910, 886, 862, 840,
25  819, 799, 780, 762, 745, 728, 712, 697, 683, 669, 655, 643, 630, 618, 607,
26  596, 585, 575, 565, 555, 546, 537, 529, 520, 512, 504, 496, 489, 482, 475,
27  468, 462, 455, 449, 443, 437, 431, 426, 420, 415, 410, 405, 400, 395, 390,
28  386, 381, 377, 372, 368, 364, 360, 356, 352, 349, 345, 341, 338, 334, 331,
29  328, 324, 321, 318, 315, 312, 309, 306, 303, 301, 298, 295, 293, 290, 287,
30  285, 282, 280, 278, 275, 273, 271, 269, 266, 264, 262, 260, 258, 256, 254,
31  252, 250, 248, 246, 245, 243, 241, 239, 237, 236, 234, 232, 231, 229, 228,
32  226, 224, 223, 221, 220, 218, 217, 216, 214, 213, 211, 210, 209, 207, 206,
33  205, 204, 202, 201, 200, 199, 197, 196, 195, 194, 193, 192, 191, 189, 188,
34  187, 186, 185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174, 173,
35  172, 172, 171, 170, 169, 168, 167, 166, 165, 165, 164, 163
36};
37
38const int16_t WebRtcNsx_kLogTableFrac[256] = {
39  0, 1, 3, 4, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 21,
40  22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 36, 37, 38, 40, 41, 42,
41  44, 45, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 59, 60, 61, 62,
42  63, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81,
43  82, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99,
44  100, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 116,
45  117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
46  132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
47  147, 148, 149, 150, 151, 152, 153, 154, 155, 155, 156, 157, 158, 159, 160,
48  161, 162, 163, 164, 165, 166, 167, 168, 169, 169, 170, 171, 172, 173, 174,
49  175, 176, 177, 178, 178, 179, 180, 181, 182, 183, 184, 185, 185, 186, 187,
50  188, 189, 190, 191, 192, 192, 193, 194, 195, 196, 197, 198, 198, 199, 200,
51  201, 202, 203, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 212,
52  213, 214, 215, 216, 216, 217, 218, 219, 220, 220, 221, 222, 223, 224, 224,
53  225, 226, 227, 228, 228, 229, 230, 231, 231, 232, 233, 234, 234, 235, 236,
54  237, 238, 238, 239, 240, 241, 241, 242, 243, 244, 244, 245, 246, 247, 247,
55  248, 249, 249, 250, 251, 252, 252, 253, 254, 255, 255
56};
57
58// Update the noise estimation information.
59static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) {
60  const int16_t kExp2Const = 11819; // Q13
61  int16_t* ptr_noiseEstLogQuantile = NULL;
62  int16_t* ptr_noiseEstQuantile = NULL;
63  int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
64  int32x4_t twentyOne32x4 = vdupq_n_s32(21);
65  int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
66  int32x4_t constB32x4 = vdupq_n_s32(0x200000);
67
68  int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
69                                        inst->magnLen);
70
71  // Guarantee a Q-domain as high as possible and still fit in int16
72  inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
73                                                                 tmp16,
74                                                                 21);
75
76  int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);
77
78  for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
79       ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
80       ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
81       ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {
82
83    // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i];
84    int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
85    int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);
86
87    // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
88    int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
89    v32x4A = vorrq_s32(v32x4A, constB32x4);
90
91    // tmp16 = (int16_t)(tmp32no2 >> 21);
92    v32x4B = vshrq_n_s32(v32x4B, 21);
93
94    // tmp16 -= 21;// shift 21 to get result in Q0
95    v32x4B = vsubq_s32(v32x4B, twentyOne32x4);
96
97    // tmp16 += (int16_t) inst->qNoise;
98    // shift to get result in Q(qNoise)
99    v32x4B = vaddq_s32(v32x4B, qNoise32x4);
100
101    // if (tmp16 < 0) {
102    //   tmp32no1 >>= -tmp16;
103    // } else {
104    //   tmp32no1 <<= tmp16;
105    // }
106    v32x4B = vshlq_s32(v32x4A, v32x4B);
107
108    // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
109    v16x4 = vqmovn_s32(v32x4B);
110
111    //inst->noiseEstQuantile[i] = tmp16;
112    vst1_s16(ptr_noiseEstQuantile, v16x4);
113  }
114
115  // Last iteration:
116
117  // inst->quantile[i]=exp(inst->lquantile[offset+i]);
118  // in Q21
119  int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile;
120  int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
121
122  tmp16 = (int16_t)(tmp32no2 >> 21);
123  tmp16 -= 21;// shift 21 to get result in Q0
124  tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
125  if (tmp16 < 0) {
126    tmp32no1 >>= -tmp16;
127  } else {
128    tmp32no1 <<= tmp16;
129  }
130  *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1);
131}
132
133// Noise Estimation
134void WebRtcNsx_NoiseEstimationNeon(NoiseSuppressionFixedC* inst,
135                                   uint16_t* magn,
136                                   uint32_t* noise,
137                                   int16_t* q_noise) {
138  int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv;
139  int16_t countProd, delta, zeros, frac;
140  int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2;
141  const int16_t log2_const = 22713;
142  const int16_t width_factor = 21845;
143
144  size_t i, s, offset;
145
146  tabind = inst->stages - inst->normData;
147  assert(tabind < 9);
148  assert(tabind > -9);
149  if (tabind < 0) {
150    logval = -WebRtcNsx_kLogTable[-tabind];
151  } else {
152    logval = WebRtcNsx_kLogTable[tabind];
153  }
154
155  int16x8_t logval_16x8 = vdupq_n_s16(logval);
156
157  // lmagn(i)=log(magn(i))=log(2)*log2(magn(i))
158  // magn is in Q(-stages), and the real lmagn values are:
159  // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages)
160  // lmagn in Q8
161  for (i = 0; i < inst->magnLen; i++) {
162    if (magn[i]) {
163      zeros = WebRtcSpl_NormU32((uint32_t)magn[i]);
164      frac = (int16_t)((((uint32_t)magn[i] << zeros)
165                        & 0x7FFFFFFF) >> 23);
166      assert(frac < 256);
167      // log2(magn(i))
168      log2 = (int16_t)(((31 - zeros) << 8)
169                       + WebRtcNsx_kLogTableFrac[frac]);
170      // log2(magn(i))*log(2)
171      lmagn[i] = (int16_t)((log2 * log2_const) >> 15);
172      // + log(2^stages)
173      lmagn[i] += logval;
174    } else {
175      lmagn[i] = logval;
176    }
177  }
178
179  int16x4_t Q3_16x4  = vdup_n_s16(3);
180  int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
181  int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor);
182
183  int16_t factor = FACTOR_Q7;
184  if (inst->blockIndex < END_STARTUP_LONG)
185    factor = FACTOR_Q7_STARTUP;
186
187  // Loop over simultaneous estimates
188  for (s = 0; s < SIMULT; s++) {
189    offset = s * inst->magnLen;
190
191    // Get counter values from state
192    counter = inst->noiseEstCounter[s];
193    assert(counter < 201);
194    countDiv = WebRtcNsx_kCounterDiv[counter];
195    countProd = (int16_t)(counter * countDiv);
196
197    // quant_est(...)
198    int16_t deltaBuff[8];
199    int16x4_t tmp16x4_0;
200    int16x4_t tmp16x4_1;
201    int16x4_t countDiv_16x4 = vdup_n_s16(countDiv);
202    int16x8_t countProd_16x8 = vdupq_n_s16(countProd);
203    int16x8_t tmp16x8_0 = vdupq_n_s16(countDiv);
204    int16x8_t prod16x8 = vqrdmulhq_s16(WIDTHFACTOR_16x8, tmp16x8_0);
205    int16x8_t tmp16x8_1;
206    int16x8_t tmp16x8_2;
207    int16x8_t tmp16x8_3;
208    uint16x8_t tmp16x8_4;
209    int32x4_t tmp32x4;
210
211    for (i = 0; i + 7 < inst->magnLen; i += 8) {
212      // Compute delta.
213      // Smaller step size during startup. This prevents from using
214      // unrealistic values causing overflow.
215      tmp16x8_0 = vdupq_n_s16(factor);
216      vst1q_s16(deltaBuff, tmp16x8_0);
217
218      int j;
219      for (j = 0; j < 8; j++) {
220        if (inst->noiseEstDensity[offset + i + j] > 512) {
221          // Get values for deltaBuff by shifting intead of dividing.
222          int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i + j]);
223          deltaBuff[j] = (int16_t)(FACTOR_Q16 >> (14 - factor));
224        }
225      }
226
227      // Update log quantile estimate
228
229      // tmp16 = (int16_t)((delta * countDiv) >> 14);
230      tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[0]), countDiv_16x4);
231      tmp16x4_1 = vshrn_n_s32(tmp32x4, 14);
232      tmp32x4 = vmull_s16(vld1_s16(&deltaBuff[4]), countDiv_16x4);
233      tmp16x4_0 = vshrn_n_s32(tmp32x4, 14);
234      tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // Keep for several lines.
235
236      // prepare for the "if" branch
237      // tmp16 += 2;
238      // tmp16_1 = (Word16)(tmp16>>2);
239      tmp16x8_1 = vrshrq_n_s16(tmp16x8_0, 2);
240
241      // inst->noiseEstLogQuantile[offset+i] + tmp16_1;
242      tmp16x8_2 = vld1q_s16(&inst->noiseEstLogQuantile[offset + i]); // Keep
243      tmp16x8_1 = vaddq_s16(tmp16x8_2, tmp16x8_1); // Keep for several lines
244
245      // Prepare for the "else" branch
246      // tmp16 += 1;
247      // tmp16_1 = (Word16)(tmp16>>1);
248      tmp16x8_0 = vrshrq_n_s16(tmp16x8_0, 1);
249
250      // tmp16_2 = (int16_t)((tmp16_1 * 3) >> 1);
251      tmp32x4 = vmull_s16(vget_low_s16(tmp16x8_0), Q3_16x4);
252      tmp16x4_1 = vshrn_n_s32(tmp32x4, 1);
253
254      // tmp16_2 = (int16_t)((tmp16_1 * 3) >> 1);
255      tmp32x4 = vmull_s16(vget_high_s16(tmp16x8_0), Q3_16x4);
256      tmp16x4_0 = vshrn_n_s32(tmp32x4, 1);
257
258      // inst->noiseEstLogQuantile[offset + i] - tmp16_2;
259      tmp16x8_0 = vcombine_s16(tmp16x4_1, tmp16x4_0); // keep
260      tmp16x8_0 = vsubq_s16(tmp16x8_2, tmp16x8_0);
261
262      // logval is the smallest fixed point representation we can have. Values
263      // below that will correspond to values in the interval [0, 1], which
264      // can't possibly occur.
265      tmp16x8_0 = vmaxq_s16(tmp16x8_0, logval_16x8);
266
267      // Do the if-else branches:
268      tmp16x8_3 = vld1q_s16(&lmagn[i]); // keep for several lines
269      tmp16x8_4 = vcgtq_s16(tmp16x8_3, tmp16x8_2);
270      tmp16x8_2 = vbslq_s16(tmp16x8_4, tmp16x8_1, tmp16x8_0);
271      vst1q_s16(&inst->noiseEstLogQuantile[offset + i], tmp16x8_2);
272
273      // Update density estimate
274      // tmp16_1 + tmp16_2
275      tmp16x8_1 = vld1q_s16(&inst->noiseEstDensity[offset + i]);
276      tmp16x8_0 = vqrdmulhq_s16(tmp16x8_1, countProd_16x8);
277      tmp16x8_0 = vaddq_s16(tmp16x8_0, prod16x8);
278
279      // lmagn[i] - inst->noiseEstLogQuantile[offset + i]
280      tmp16x8_3 = vsubq_s16(tmp16x8_3, tmp16x8_2);
281      tmp16x8_3 = vabsq_s16(tmp16x8_3);
282      tmp16x8_4 = vcgtq_s16(WIDTHQ8_16x8, tmp16x8_3);
283      tmp16x8_1 = vbslq_s16(tmp16x8_4, tmp16x8_0, tmp16x8_1);
284      vst1q_s16(&inst->noiseEstDensity[offset + i], tmp16x8_1);
285    }  // End loop over magnitude spectrum
286
287    // Last iteration over magnitude spectrum:
288    // compute delta
289    if (inst->noiseEstDensity[offset + i] > 512) {
290      // Get values for deltaBuff by shifting intead of dividing.
291      int factor = WebRtcSpl_NormW16(inst->noiseEstDensity[offset + i]);
292      delta = (int16_t)(FACTOR_Q16 >> (14 - factor));
293    } else {
294      delta = FACTOR_Q7;
295      if (inst->blockIndex < END_STARTUP_LONG) {
296        // Smaller step size during startup. This prevents from using
297        // unrealistic values causing overflow.
298        delta = FACTOR_Q7_STARTUP;
299      }
300    }
301    // update log quantile estimate
302    tmp16 = (int16_t)((delta * countDiv) >> 14);
303    if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) {
304      // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2
305      // CounterDiv=1/(inst->counter[s]+1) in Q15
306      tmp16 += 2;
307      inst->noiseEstLogQuantile[offset + i] += tmp16 / 4;
308    } else {
309      tmp16 += 1;
310      // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2
311      // TODO(bjornv): investigate why we need to truncate twice.
312      tmp16no2 = (int16_t)((tmp16 / 2) * 3 / 2);
313      inst->noiseEstLogQuantile[offset + i] -= tmp16no2;
314      if (inst->noiseEstLogQuantile[offset + i] < logval) {
315        // logval is the smallest fixed point representation we can have.
316        // Values below that will correspond to values in the interval
317        // [0, 1], which can't possibly occur.
318        inst->noiseEstLogQuantile[offset + i] = logval;
319      }
320    }
321
322    // update density estimate
323    if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i])
324        < WIDTH_Q8) {
325      tmp16no1 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
326                   inst->noiseEstDensity[offset + i], countProd, 15);
327      tmp16no2 = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
328                   width_factor, countDiv, 15);
329      inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2;
330    }
331
332
333    if (counter >= END_STARTUP_LONG) {
334      inst->noiseEstCounter[s] = 0;
335      if (inst->blockIndex >= END_STARTUP_LONG) {
336        UpdateNoiseEstimateNeon(inst, offset);
337      }
338    }
339    inst->noiseEstCounter[s]++;
340
341  }  // end loop over simultaneous estimates
342
343  // Sequentially update the noise during startup
344  if (inst->blockIndex < END_STARTUP_LONG) {
345    UpdateNoiseEstimateNeon(inst, offset);
346  }
347
348  for (i = 0; i < inst->magnLen; i++) {
349    noise[i] = (uint32_t)(inst->noiseEstQuantile[i]); // Q(qNoise)
350  }
351  (*q_noise) = (int16_t)inst->qNoise;
352}
353
354// Filter the data in the frequency domain, and create spectrum.
355void WebRtcNsx_PrepareSpectrumNeon(NoiseSuppressionFixedC* inst,
356                                   int16_t* freq_buf) {
357  assert(inst->magnLen % 8 == 1);
358  assert(inst->anaLen2 % 16 == 0);
359
360  // (1) Filtering.
361
362  // Fixed point C code for the next block is as follows:
363  // for (i = 0; i < inst->magnLen; i++) {
364  //   inst->real[i] = (int16_t)((inst->real[i] *
365  //      (int16_t)(inst->noiseSupFilter[i])) >> 14);  // Q(normData-stages)
366  //   inst->imag[i] = (int16_t)((inst->imag[i] *
367  //      (int16_t)(inst->noiseSupFilter[i])) >> 14);  // Q(normData-stages)
368  // }
369
370  int16_t* preal = &inst->real[0];
371  int16_t* pimag = &inst->imag[0];
372  int16_t* pns_filter = (int16_t*)&inst->noiseSupFilter[0];
373  int16_t* pimag_end = pimag + inst->magnLen - 4;
374
375  while (pimag < pimag_end) {
376    int16x8_t real = vld1q_s16(preal);
377    int16x8_t imag = vld1q_s16(pimag);
378    int16x8_t ns_filter = vld1q_s16(pns_filter);
379
380    int32x4_t tmp_r_0 = vmull_s16(vget_low_s16(real), vget_low_s16(ns_filter));
381    int32x4_t tmp_i_0 = vmull_s16(vget_low_s16(imag), vget_low_s16(ns_filter));
382    int32x4_t tmp_r_1 = vmull_s16(vget_high_s16(real),
383                                  vget_high_s16(ns_filter));
384    int32x4_t tmp_i_1 = vmull_s16(vget_high_s16(imag),
385                                  vget_high_s16(ns_filter));
386
387    int16x4_t result_r_0 = vshrn_n_s32(tmp_r_0, 14);
388    int16x4_t result_i_0 = vshrn_n_s32(tmp_i_0, 14);
389    int16x4_t result_r_1 = vshrn_n_s32(tmp_r_1, 14);
390    int16x4_t result_i_1 = vshrn_n_s32(tmp_i_1, 14);
391
392    vst1q_s16(preal, vcombine_s16(result_r_0, result_r_1));
393    vst1q_s16(pimag, vcombine_s16(result_i_0, result_i_1));
394    preal += 8;
395    pimag += 8;
396    pns_filter += 8;
397  }
398
399  // Filter the last element
400  *preal = (int16_t)((*preal * *pns_filter) >> 14);
401  *pimag = (int16_t)((*pimag * *pns_filter) >> 14);
402
403  // (2) Create spectrum.
404
405  // Fixed point C code for the rest of the function is as follows:
406  // freq_buf[0] = inst->real[0];
407  // freq_buf[1] = -inst->imag[0];
408  // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
409  //   freq_buf[j] = inst->real[i];
410  //   freq_buf[j + 1] = -inst->imag[i];
411  // }
412  // freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
413  // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
414
415  preal = &inst->real[0];
416  pimag = &inst->imag[0];
417  pimag_end = pimag + inst->anaLen2;
418  int16_t * freq_buf_start = freq_buf;
419  while (pimag < pimag_end) {
420    // loop unroll
421    int16x8x2_t real_imag_0;
422    int16x8x2_t real_imag_1;
423    real_imag_0.val[1] = vld1q_s16(pimag);
424    real_imag_0.val[0] = vld1q_s16(preal);
425    preal += 8;
426    pimag += 8;
427    real_imag_1.val[1] = vld1q_s16(pimag);
428    real_imag_1.val[0] = vld1q_s16(preal);
429    preal += 8;
430    pimag += 8;
431
432    real_imag_0.val[1] = vnegq_s16(real_imag_0.val[1]);
433    real_imag_1.val[1] = vnegq_s16(real_imag_1.val[1]);
434    vst2q_s16(freq_buf_start, real_imag_0);
435    freq_buf_start += 16;
436    vst2q_s16(freq_buf_start, real_imag_1);
437    freq_buf_start += 16;
438  }
439  freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
440  freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
441}
442
443// For the noise supress process, synthesis, read out fully processed segment,
444// and update synthesis buffer.
445void WebRtcNsx_SynthesisUpdateNeon(NoiseSuppressionFixedC* inst,
446                                   int16_t* out_frame,
447                                   int16_t gain_factor) {
448  assert(inst->anaLen % 16 == 0);
449  assert(inst->blockLen10ms % 16 == 0);
450
451  int16_t* preal_start = inst->real;
452  const int16_t* pwindow = inst->window;
453  int16_t* preal_end = preal_start + inst->anaLen;
454  int16_t* psynthesis_buffer = inst->synthesisBuffer;
455
456  while (preal_start < preal_end) {
457    // Loop unroll.
458    int16x8_t window_0 = vld1q_s16(pwindow);
459    int16x8_t real_0 = vld1q_s16(preal_start);
460    int16x8_t synthesis_buffer_0 = vld1q_s16(psynthesis_buffer);
461
462    int16x8_t window_1 = vld1q_s16(pwindow + 8);
463    int16x8_t real_1 = vld1q_s16(preal_start + 8);
464    int16x8_t synthesis_buffer_1 = vld1q_s16(psynthesis_buffer + 8);
465
466    int32x4_t tmp32a_0_low = vmull_s16(vget_low_s16(real_0),
467                                       vget_low_s16(window_0));
468    int32x4_t tmp32a_0_high = vmull_s16(vget_high_s16(real_0),
469                                        vget_high_s16(window_0));
470
471    int32x4_t tmp32a_1_low = vmull_s16(vget_low_s16(real_1),
472                                       vget_low_s16(window_1));
473    int32x4_t tmp32a_1_high = vmull_s16(vget_high_s16(real_1),
474                                        vget_high_s16(window_1));
475
476    int16x4_t tmp16a_0_low = vqrshrn_n_s32(tmp32a_0_low, 14);
477    int16x4_t tmp16a_0_high = vqrshrn_n_s32(tmp32a_0_high, 14);
478
479    int16x4_t tmp16a_1_low = vqrshrn_n_s32(tmp32a_1_low, 14);
480    int16x4_t tmp16a_1_high = vqrshrn_n_s32(tmp32a_1_high, 14);
481
482    int32x4_t tmp32b_0_low = vmull_n_s16(tmp16a_0_low, gain_factor);
483    int32x4_t tmp32b_0_high = vmull_n_s16(tmp16a_0_high, gain_factor);
484
485    int32x4_t tmp32b_1_low = vmull_n_s16(tmp16a_1_low, gain_factor);
486    int32x4_t tmp32b_1_high = vmull_n_s16(tmp16a_1_high, gain_factor);
487
488    int16x4_t tmp16b_0_low = vqrshrn_n_s32(tmp32b_0_low, 13);
489    int16x4_t tmp16b_0_high = vqrshrn_n_s32(tmp32b_0_high, 13);
490
491    int16x4_t tmp16b_1_low = vqrshrn_n_s32(tmp32b_1_low, 13);
492    int16x4_t tmp16b_1_high = vqrshrn_n_s32(tmp32b_1_high, 13);
493
494    synthesis_buffer_0 = vqaddq_s16(vcombine_s16(tmp16b_0_low, tmp16b_0_high),
495                                    synthesis_buffer_0);
496    synthesis_buffer_1 = vqaddq_s16(vcombine_s16(tmp16b_1_low, tmp16b_1_high),
497                                    synthesis_buffer_1);
498    vst1q_s16(psynthesis_buffer, synthesis_buffer_0);
499    vst1q_s16(psynthesis_buffer + 8, synthesis_buffer_1);
500
501    pwindow += 16;
502    preal_start += 16;
503    psynthesis_buffer += 16;
504  }
505
506  // Read out fully processed segment.
507  int16_t * p_start = inst->synthesisBuffer;
508  int16_t * p_end = inst->synthesisBuffer + inst->blockLen10ms;
509  int16_t * p_frame = out_frame;
510  while (p_start < p_end) {
511    int16x8_t frame_0 = vld1q_s16(p_start);
512    vst1q_s16(p_frame, frame_0);
513    p_start += 8;
514    p_frame += 8;
515  }
516
517  // Update synthesis buffer.
518  int16_t* p_start_src = inst->synthesisBuffer + inst->blockLen10ms;
519  int16_t* p_end_src = inst->synthesisBuffer + inst->anaLen;
520  int16_t* p_start_dst = inst->synthesisBuffer;
521  while (p_start_src < p_end_src) {
522    int16x8_t frame = vld1q_s16(p_start_src);
523    vst1q_s16(p_start_dst, frame);
524    p_start_src += 8;
525    p_start_dst += 8;
526  }
527
528  p_start = inst->synthesisBuffer + inst->anaLen - inst->blockLen10ms;
529  p_end = p_start + inst->blockLen10ms;
530  int16x8_t zero = vdupq_n_s16(0);
531  for (;p_start < p_end; p_start += 8) {
532    vst1q_s16(p_start, zero);
533  }
534}
535
536// Update analysis buffer for lower band, and window data before FFT.
537void WebRtcNsx_AnalysisUpdateNeon(NoiseSuppressionFixedC* inst,
538                                  int16_t* out,
539                                  int16_t* new_speech) {
540  assert(inst->blockLen10ms % 16 == 0);
541  assert(inst->anaLen % 16 == 0);
542
543  // For lower band update analysis buffer.
544  // memcpy(inst->analysisBuffer, inst->analysisBuffer + inst->blockLen10ms,
545  //     (inst->anaLen - inst->blockLen10ms) * sizeof(*inst->analysisBuffer));
546  int16_t* p_start_src = inst->analysisBuffer + inst->blockLen10ms;
547  int16_t* p_end_src = inst->analysisBuffer + inst->anaLen;
548  int16_t* p_start_dst = inst->analysisBuffer;
549  while (p_start_src < p_end_src) {
550    int16x8_t frame = vld1q_s16(p_start_src);
551    vst1q_s16(p_start_dst, frame);
552
553    p_start_src += 8;
554    p_start_dst += 8;
555  }
556
557  // memcpy(inst->analysisBuffer + inst->anaLen - inst->blockLen10ms,
558  //     new_speech, inst->blockLen10ms * sizeof(*inst->analysisBuffer));
559  p_start_src = new_speech;
560  p_end_src = new_speech + inst->blockLen10ms;
561  p_start_dst = inst->analysisBuffer + inst->anaLen - inst->blockLen10ms;
562  while (p_start_src < p_end_src) {
563    int16x8_t frame = vld1q_s16(p_start_src);
564    vst1q_s16(p_start_dst, frame);
565
566    p_start_src += 8;
567    p_start_dst += 8;
568  }
569
570  // Window data before FFT.
571  int16_t* p_start_window = (int16_t*) inst->window;
572  int16_t* p_start_buffer = inst->analysisBuffer;
573  int16_t* p_start_out = out;
574  const int16_t* p_end_out = out + inst->anaLen;
575
576  // Load the first element to reduce pipeline bubble.
577  int16x8_t window = vld1q_s16(p_start_window);
578  int16x8_t buffer = vld1q_s16(p_start_buffer);
579  p_start_window += 8;
580  p_start_buffer += 8;
581
582  while (p_start_out < p_end_out) {
583    // Unroll loop.
584    int32x4_t tmp32_low = vmull_s16(vget_low_s16(window), vget_low_s16(buffer));
585    int32x4_t tmp32_high = vmull_s16(vget_high_s16(window),
586                                     vget_high_s16(buffer));
587    window = vld1q_s16(p_start_window);
588    buffer = vld1q_s16(p_start_buffer);
589
590    int16x4_t result_low = vrshrn_n_s32(tmp32_low, 14);
591    int16x4_t result_high = vrshrn_n_s32(tmp32_high, 14);
592    vst1q_s16(p_start_out, vcombine_s16(result_low, result_high));
593
594    p_start_buffer += 8;
595    p_start_window += 8;
596    p_start_out += 8;
597  }
598}
599