/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/*
 * This file includes the implementation of the core functionality in VAD.
 * For function descriptions, see vad_core.h.
 */

#include "vad_core.h"

#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_defines.h"
#include "vad_filterbank.h"
#include "vad_gmm.h"
#include "vad_sp.h"

// Spectrum Weighting
static const WebRtc_Word16 kSpectrumWeight[6] = { 6, 8, 10, 12, 14, 16 };
static const WebRtc_Word16 kNoiseUpdateConst = 655; // Q15
static const WebRtc_Word16 kSpeechUpdateConst = 6554; // Q15
static const WebRtc_Word16 kBackEta = 154; // Q8
// Minimum difference between the two models, Q5
static const WebRtc_Word16 kMinimumDifference[6] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const WebRtc_Word16 kMaximumSpeech[6] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const WebRtc_Word16 kMinimumMean[2] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const WebRtc_Word16 kMaximumNoise[6] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataWeights[12] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataWeights[12] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataMeans[12] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataMeans[12] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataStds[12] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataStds[12] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };

static const int kInitCheck = 42;
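
// Note on notation: Qn denotes a fixed-point format scaled by 2^n, i.e.
// real_value = stored_value / 2^n. For example, kNoiseUpdateConst = 655 in
// Q15 represents 655 / 32768 ~= 0.02, and a Q7 mean of 6738 represents
// 6738 / 128 ~= 52.6.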

// Initialize VAD
int WebRtcVad_InitCore(VadInstT *inst, short mode)
{
    int i;

    // Initialization of struct
    inst->vad = 1;
    inst->frame_counter = 0;
    inst->over_hang = 0;
    inst->num_of_speech = 0;

    // Initialization of downsampling filter state
    inst->downsampling_filter_states[0] = 0;
    inst->downsampling_filter_states[1] = 0;
    inst->downsampling_filter_states[2] = 0;
    inst->downsampling_filter_states[3] = 0;

    // Read initial PDF parameters
    for (i = 0; i < NUM_TABLE_VALUES; i++)
    {
        inst->noise_means[i] = kNoiseDataMeans[i];
        inst->speech_means[i] = kSpeechDataMeans[i];
        inst->noise_stds[i] = kNoiseDataStds[i];
        inst->speech_stds[i] = kSpeechDataStds[i];
    }

    // Index and Minimum value vectors are initialized
    for (i = 0; i < 16 * NUM_CHANNELS; i++)
    {
        inst->low_value_vector[i] = 10000;
        inst->index_vector[i] = 0;
    }

    for (i = 0; i < 5; i++)
    {
        inst->upper_state[i] = 0;
        inst->lower_state[i] = 0;
    }

    for (i = 0; i < 4; i++)
    {
        inst->hp_filter_state[i] = 0;
    }

    // Init mean value memory, for FindMin function
    inst->mean_value[0] = 1600;
    inst->mean_value[1] = 1600;
    inst->mean_value[2] = 1600;
    inst->mean_value[3] = 1600;
    inst->mean_value[4] = 1600;
    inst->mean_value[5] = 1600;

    if (mode == 0)
    {
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;

        inst->total[0] = TOTAL_10MS_Q;
        inst->total[1] = TOTAL_20MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
    } else if (mode == 1)
    {
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;

        inst->total[0] = TOTAL_10MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
    } else if (mode == 2)
    {
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;

        inst->total[0] = TOTAL_10MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
    } else
    {
        // Very aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;

        inst->total[0] = TOTAL_10MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
    }

    inst->init_flag = kInitCheck;

    return 0;
}

// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
{

    if (mode == 0)
    {
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;

        inst->total[0] = TOTAL_10MS_Q;
        inst->total[1] = TOTAL_20MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
    } else if (mode == 1)
    {
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;

        inst->total[0] = TOTAL_10MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
    } else if (mode == 2)
    {
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;

        inst->total[0] = TOTAL_10MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
    } else if (mode == 3)
    {
        // Very aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;

        inst->total[0] = TOTAL_10MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
    } else
    {
        return -1;
    }

    return 0;
}

// Calculate the VAD decision by first extracting feature values and then
// computing the probability for both speech and background noise.

WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 len, vad;
    WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 480 samples (30 ms at 16 kHz)
    WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 240 samples (30 ms at 8 kHz)

    // Downsample signal 32->16->8 kHz before doing VAD
    WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
                           frame_length);
    len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
    len = WEBRTC_SPL_RSHIFT_W16(len, 1);

    // Do VAD on an 8 kHz signal
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 len, vad;
    WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 240 samples (30 ms at 8 kHz)

    // Wideband: Downsample signal before doing VAD
    WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
                           frame_length);

    len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power;

    // Get power in the bands
    total_power = WebRtcVad_get_features(inst, speech_frame, frame_length, feature_vector);

    // Make a VAD decision
    inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power, frame_length);

    return inst->vad;
}
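
// A minimal usage sketch of the core API (illustrative only; `inst` and
// `frame16k` are hypothetical names, and production code normally goes
// through the public WebRtcVad_Create/WebRtcVad_Init wrappers instead of
// calling the core functions directly):
//
//     VadInstT inst;
//     WebRtc_Word16 frame16k[480];      // 30 ms frame at 16 kHz
//     WebRtcVad_InitCore(&inst, 0);     // mode 0 = quality mode
//     /* ...fill frame16k with samples... */
//     if (WebRtcVad_CalcVad16khz(&inst, frame16k, 480) > 0) {
//         /* Frame judged active (speech or hangover). */
//     }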

// Calculate probability for both speech and background noise, and perform a
// hypothesis test.
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
                                       WebRtc_Word16 total_power, int frame_length)
{
    int n, k;
    WebRtc_Word16 backval;
    WebRtc_Word16 h0, h1;
    WebRtc_Word16 ratvec, xval;
    WebRtc_Word16 vadflag;
    WebRtc_Word16 shifts0, shifts1;
    WebRtc_Word16 tmp16, tmp16_1, tmp16_2;
    WebRtc_Word16 diff, nr, pos;
    WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
    WebRtc_Word16 delt, ndelt;
    WebRtc_Word16 maxspe, maxmu;
    WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES];
    WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES];
    WebRtc_Word32 h0test, h1test;
    WebRtc_Word32 tmp32_1, tmp32_2;
    WebRtc_Word32 dotVal;
    WebRtc_Word32 nmid, smid;
    WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS];
    WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr,
            *sstd1ptr, *sstd2ptr;
    WebRtc_Word16 overhead1, overhead2, individualTest, totalTest;

    // Set the thresholds to different values based on frame length
    if (frame_length == 80)
    {
        // 80 input samples (10 ms frame at 8 kHz)
        overhead1 = inst->over_hang_max_1[0];
        overhead2 = inst->over_hang_max_2[0];
        individualTest = inst->individual[0];
        totalTest = inst->total[0];
    } else if (frame_length == 160)
    {
        // 160 input samples (20 ms frame at 8 kHz)
        overhead1 = inst->over_hang_max_1[1];
        overhead2 = inst->over_hang_max_2[1];
        individualTest = inst->individual[1];
        totalTest = inst->total[1];
    } else
    {
        // 240 input samples (30 ms frame at 8 kHz)
        overhead1 = inst->over_hang_max_1[2];
        overhead2 = inst->over_hang_max_2[2];
        individualTest = inst->individual[2];
        totalTest = inst->total[2];
    }

    if (total_power > MIN_ENERGY)
    { // If signal present at all

        // Set pointers to the gaussian parameters
        nmean1ptr = &inst->noise_means[0];
        nmean2ptr = &inst->noise_means[NUM_CHANNELS];
        smean1ptr = &inst->speech_means[0];
        smean2ptr = &inst->speech_means[NUM_CHANNELS];
        nstd1ptr = &inst->noise_stds[0];
        nstd2ptr = &inst->noise_stds[NUM_CHANNELS];
        sstd1ptr = &inst->speech_stds[0];
        sstd2ptr = &inst->speech_stds[NUM_CHANNELS];

        vadflag = 0;
        dotVal = 0;
        for (n = 0; n < NUM_CHANNELS; n++)
        { // For all channels

            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
            xval = feature_vector[n];

            // Probability for Noise, Q7 * Q20 = Q27
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++,
                                                    &deltaN[pos]);
            probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1);
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
                                                    &deltaN[pos + 1]);
            probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1);
            h0test = probn[0] + probn[1]; // Q27
            h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15

            // Probability for Speech
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++,
                                                    &deltaS[pos]);
            probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1);
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
                                                    &deltaS[pos + 1]);
            probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1);
            h1test = probs[0] + probs[1]; // Q27
            h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15

            // Get likelihood ratio. Approximate log2(H1/H0) with shifts0 - shifts1
            shifts0 = WebRtcSpl_NormW32(h0test);
            shifts1 = WebRtcSpl_NormW32(h1test);

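            // WebRtcSpl_NormW32(x) returns the number of left shifts needed to
            // normalize x, i.e. roughly 30 - floor(log2(x)) for x > 0. Hence
            // shifts0 - shifts1 ~= log2(h1test) - log2(h0test), a cheap integer
            // approximation of log2(H1/H0). E.g. if h1test is about eight times
            // h0test, the difference is about 3 (= log2(8)).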
            if ((h0test > 0) && (h1test > 0))
            {
                ratvec = shifts0 - shifts1;
            } else if (h1test > 0)
            {
                ratvec = 31 - shifts1;
            } else if (h0test > 0)
            {
                ratvec = shifts0 - 31;
            } else
            {
                ratvec = 0;
            }

            // VAD decision with spectrum weighting
            dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]);

            // Individual channel test
            if ((ratvec << 2) > individualTest)
            {
                vadflag = 1;
            }

            // Probabilities used when updating model
            if (h0 > 0)
            {
                tmp32_1 = probn[0] & 0xFFFFF000; // Q27
                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29
                ngprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h0);
                ngprvec[pos + 1] = 16384 - ngprvec[pos];
            } else
            {
                ngprvec[pos] = 16384;
                ngprvec[pos + 1] = 0;
            }

            // Probabilities used when updating model
            if (h1 > 0)
            {
                tmp32_1 = probs[0] & 0xFFFFF000;
                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2);
                sgprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h1);
                sgprvec[pos + 1] = 16384 - sgprvec[pos];
            } else
            {
                sgprvec[pos] = 0;
                sgprvec[pos + 1] = 0;
            }
        }

        // Overall test
        if (dotVal >= totalTest)
        {
            vadflag |= 1;
        }
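
        // The decision combines two tests: a per-channel test, where any single
        // channel's scaled log-likelihood ratio (ratvec << 2) exceeding
        // `individualTest` flags speech, and an overall test, where the
        // kSpectrumWeight-weighted sum of all channel ratios must reach
        // `totalTest`.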

        // Set pointers to the means and standard deviations.
        nmean1ptr = &inst->noise_means[0];
        smean1ptr = &inst->speech_means[0];
        nstd1ptr = &inst->noise_stds[0];
        sstd1ptr = &inst->speech_stds[0];

        maxspe = 12800;

        // Update the model's parameters
        for (n = 0; n < NUM_CHANNELS; n++)
        {

            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);

            // Get the minimum value of the recent past, used for long-term correction
            backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4

            // Compute the "global" mean, that is, the weighted sum of the two means
            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS],
                    *(nmean1ptr+NUM_CHANNELS));
            tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8

            for (k = 0; k < NUM_MODELS; k++)
            {

                nr = pos + k;

                nmean2ptr = nmean1ptr + k * NUM_CHANNELS;
                smean2ptr = smean1ptr + k * NUM_CHANNELS;
                nstd2ptr = nstd1ptr + k * NUM_CHANNELS;
                sstd2ptr = sstd1ptr + k * NUM_CHANNELS;
                nmk = *nmean2ptr;
                smk = *smean2ptr;
                nsk = *nstd2ptr;
                ssk = *sstd2ptr;

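                // Per the in-code formulas below, each Gaussian mean is adapted
                // with a probability-weighted, NLMS-like step:
                //     mean += K * P(gauss | x) * (x - mean) / sigma^2
                // where K is kNoiseUpdateConst (~0.02, Q15) for noise and
                // kSpeechUpdateConst (~0.2, Q15) for speech, and deltaN/deltaS
                // hold (x - mean) / sigma^2 from the probability step above.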
                // Update noise mean vector if the frame consists of noise only
                nmk2 = nmk;
                if (!vadflag)
                {
                    // deltaN = (x-mu)/sigma^2
                    // ngprvec[k] = probn[k]/(probn[0] + probn[1])

                    delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr],
                            deltaN[nr], 11); // Q14*Q11
                    nmk2 = nmk + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt,
                            kNoiseUpdateConst,
                            22); // Q7+(Q14*Q15>>22)
                }

                // Long term correction of the noise mean
                ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4);
                ndelt -= tmp16_1; // Q8 - Q8
                nmk3 = nmk2 + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ndelt,
                        kBackEta,
                        9); // Q7+(Q8*Q8)>>9

                // Control that the noise mean does not drift too much
                tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7);
                if (nmk3 < tmp16)
                    nmk3 = tmp16;
                tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7);
                if (nmk3 > tmp16)
                    nmk3 = tmp16;
                *nmean2ptr = nmk3;

                if (vadflag)
                {
                    // Update speech mean vector:
                    // deltaS = (x-mu)/sigma^2
                    // sgprvec[k] = probs[k]/(probs[0] + probs[1])

                    delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr],
                            deltaS[nr],
                            11); // (Q14*Q11)>>11=Q14
                    tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt,
                            kSpeechUpdateConst,
                            21) + 1;
                    smk2 = smk + (tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22)

                    // Control that the speech mean does not drift too much
                    maxmu = maxspe + 640;
                    if (smk2 < kMinimumMean[k])
                        smk2 = kMinimumMean[k];
                    if (smk2 > maxmu)
                        smk2 = maxmu;

                    *smean2ptr = smk2;

                    // (Q7>>3) = Q4
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3);

                    tmp16 = feature_vector[n] - tmp16; // Q4
                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3);
                    tmp32_2 = tmp32_1 - (WebRtc_Word32)4096; // Q12
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2);
                    tmp32_1 = (WebRtc_Word32)(tmp16 * tmp32_2);// (Q15>>3)*(Q14>>2)=Q12*Q12=Q24

                    tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20

                    // 0.1 * Q20 / Q7 = Q13
                    if (tmp32_2 > 0)
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10);
                    else
                    {
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10);
                        tmp16 = -tmp16;
                    }
                    // divide by 4 giving an update factor of 0.025
                    tmp16 += 128; // Rounding
                    ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
                    // Division with 8 plus Q7
                    if (ssk < MIN_STD)
                        ssk = MIN_STD;
                    *sstd2ptr = ssk;
                } else
                {
                    // Update GMM variance vectors
                    // deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4
                    tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3);

                    // (Q15>>3) * (Q14>>2) = Q12 * Q12 = Q24
                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096;
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2);
                    tmp32_2 = (WebRtc_Word32)(tmp16 * tmp32_1);
                    tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14);
                    // Q20 * approx 0.001 (2^-10=0.0009766)

                    // Q20 / Q7 = Q13
                    if (tmp32_1 > 0)
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk);
                    else
                    {
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_1, nsk);
                        tmp16 = -tmp16;
                    }
                    tmp16 += 32; // Rounding
                    nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);

                    if (nsk < MIN_STD)
                        nsk = MIN_STD;

                    *nstd2ptr = nsk;
                }
            }

            // Separate models if they are too close - nmid in Q14
            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr);

            // smid in Q14
            smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
            smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr);

            // diff = "global" speech mean - "global" noise mean
            diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9);
            tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 9);
            diff -= tmp16;

            if (diff < kMinimumDifference[n])
            {

                tmp16 = kMinimumDifference[n] - diff; // Q5

                // tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7
                // tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7
                tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2);
                tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2);

                // First Gauss, speech model
                tmp16 = tmp16_1 + *smean1ptr;
                *smean1ptr = tmp16;
                smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]);

                // Second Gauss, speech model
                tmp16 = tmp16_1 + *smean2ptr;
                *smean2ptr = tmp16;
                smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]);

                // First Gauss, noise model
                tmp16 = *nmean1ptr - tmp16_2;
                *nmean1ptr = tmp16;

                nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]);

                // Second Gauss, noise model
                tmp16 = *nmean2ptr - tmp16_2;
                *nmean2ptr = tmp16;
                nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]);
            }

            // Control that the speech & noise means do not drift too much
            maxspe = kMaximumSpeech[n];
            tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 7);
            if (tmp16_2 > maxspe)
            { // Upper limit of speech model
                tmp16_2 -= maxspe;

                *smean1ptr -= tmp16_2;
                *smean2ptr -= tmp16_2;
            }

            tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 7);
            if (tmp16_2 > kMaximumNoise[n])
            {
                tmp16_2 -= kMaximumNoise[n];

                *nmean1ptr -= tmp16_2;
                *nmean2ptr -= tmp16_2;
            }

            nmean1ptr++;
            smean1ptr++;
            nstd1ptr++;
            sstd1ptr++;
        }
        inst->frame_counter++;
    } else
    {
        vadflag = 0;
    }

    // Hangover smoothing
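    // If the current frame is classified as non-speech but recent frames were
    // speech, keep reporting an active flag for `over_hang` more frames so
    // that word endings are not clipped. The returned value is then
    // 2 + over_hang (> 1); callers treating any value > 0 as speech thus get
    // the smoothed decision. Longer speech runs (num_of_speech reaching
    // NSP_MAX) arm the longer hangover `overhead2`.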
    if (!vadflag)
    {
        if (inst->over_hang > 0)
        {
            vadflag = 2 + inst->over_hang;
            inst->over_hang = inst->over_hang - 1;
        }
        inst->num_of_speech = 0;
    } else
    {
        inst->num_of_speech = inst->num_of_speech + 1;
        if (inst->num_of_speech > NSP_MAX)
        {
            inst->num_of_speech = NSP_MAX;
            inst->over_hang = overhead2;
        } else
            inst->over_hang = overhead1;
    }
    return vadflag;
}