/*
 * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/*
 * This file includes the implementation of the core functionality in VAD.
 * For function description, see vad_core.h.
 */

#include "vad_core.h"

#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_defines.h"
#include "vad_filterbank.h"
#include "vad_gmm.h"
#include "vad_sp.h"

// Spectrum Weighting
static const WebRtc_Word16 kSpectrumWeight[6] = { 6, 8, 10, 12, 14, 16 };
static const WebRtc_Word16 kNoiseUpdateConst = 655; // Q15
static const WebRtc_Word16 kSpeechUpdateConst = 6554; // Q15
static const WebRtc_Word16 kBackEta = 154; // Q8
// Minimum difference between the two models, Q5
static const WebRtc_Word16 kMinimumDifference[6] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const WebRtc_Word16 kMaximumSpeech[6] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const WebRtc_Word16 kMinimumMean[2] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const WebRtc_Word16 kMaximumNoise[6] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataWeights[12] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataWeights[12] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataMeans[12] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataMeans[12] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataStds[12] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataStds[12] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };

static const int kInitCheck = 42;
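
// A note on the fixed-point notation used throughout this file: a value
// tagged Qx represents value / 2^x. For example, kNoiseUpdateConst = 655
// in Q15 is 655 / 32768, approximately 0.02; kSpeechUpdateConst = 6554 in
// Q15 is approximately 0.2; and kBackEta = 154 in Q8 is approximately 0.6.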

// Initialize VAD
int WebRtcVad_InitCore(VadInstT *inst, short mode)
{
    int i;

    // Initialization of struct
    inst->vad = 1;
    inst->frame_counter = 0;
    inst->over_hang = 0;
    inst->num_of_speech = 0;

    // Initialization of downsampling filter state
    inst->downsampling_filter_states[0] = 0;
    inst->downsampling_filter_states[1] = 0;
    inst->downsampling_filter_states[2] = 0;
    inst->downsampling_filter_states[3] = 0;

    // Read initial PDF parameters
    for (i = 0; i < NUM_TABLE_VALUES; i++)
    {
        inst->noise_means[i] = kNoiseDataMeans[i];
        inst->speech_means[i] = kSpeechDataMeans[i];
        inst->noise_stds[i] = kNoiseDataStds[i];
        inst->speech_stds[i] = kSpeechDataStds[i];
    }

    // Initialize the index and minimum value vectors
    for (i = 0; i < 16 * NUM_CHANNELS; i++)
    {
        inst->low_value_vector[i] = 10000;
        inst->index_vector[i] = 0;
    }

    for (i = 0; i < 5; i++)
    {
        inst->upper_state[i] = 0;
        inst->lower_state[i] = 0;
    }

    for (i = 0; i < 4; i++)
    {
        inst->hp_filter_state[i] = 0;
    }

    // Init mean value memory, for the WebRtcVad_FindMinimum() function
    inst->mean_value[0] = 1600;
    inst->mean_value[1] = 1600;
    inst->mean_value[2] = 1600;
    inst->mean_value[3] = 1600;
    inst->mean_value[4] = 1600;
    inst->mean_value[5] = 1600;

    if (mode == 0)
    {
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;

        inst->total[0] = TOTAL_10MS_Q;
        inst->total[1] = TOTAL_20MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
    } else if (mode == 1)
    {
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;

        inst->total[0] = TOTAL_10MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
    } else if (mode == 2)
    {
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;

        inst->total[0] = TOTAL_10MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
    } else
    {
        // Very aggressive mode (also used for any other value of mode)
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;

        inst->total[0] = TOTAL_10MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
    }

    inst->init_flag = kInitCheck;

    return 0;
}
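
// The mode argument of WebRtcVad_InitCore() above and
// WebRtcVad_set_mode_core() below selects the aggressiveness of the
// detector: 0 = quality, 1 = low bitrate, 2 = aggressive,
// 3 = very aggressive. Note the asymmetry: InitCore falls back to very
// aggressive for any other value, whereas set_mode_core rejects it and
// returns -1.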

// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
{
    if (mode == 0)
    {
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;

        inst->total[0] = TOTAL_10MS_Q;
        inst->total[1] = TOTAL_20MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
    } else if (mode == 1)
    {
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;

        inst->total[0] = TOTAL_10MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
    } else if (mode == 2)
    {
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;

        inst->total[0] = TOTAL_10MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
    } else if (mode == 3)
    {
        // Very aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;

        inst->total[0] = TOTAL_10MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
    } else
    {
        return -1;
    }

    return 0;
}
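
// A minimal usage sketch of the core entry points. This is illustrative
// only: the public VAD wrapper normally allocates the instance and
// validates frame lengths, and the stack allocation and frame buffer here
// are assumptions made for the example, not code from this file.
//
//     VadInstT inst;
//     WebRtc_Word16 frame[240];              // 30 ms of 8 kHz audio
//     WebRtcVad_InitCore(&inst, 0);          // start in quality mode
//     WebRtcVad_set_mode_core(&inst, 3);     // switch to very aggressive
//     if (WebRtcVad_CalcVad8khz(&inst, frame, 240) > 0)
//     {
//         // Frame classified as speech (a return value > 1 indicates a
//         // hangover frame appended after a speech burst).
//     }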

// Calculate the VAD decision by first extracting feature values and then
// calculating the probabilities of both speech and background noise.

WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 len, vad;
    WebRtc_Word16 speechWB[480]; // Downsampled (16 kHz) frame; at most 480 samples (30 ms)
    WebRtc_Word16 speechNB[240]; // Downsampled (8 kHz) frame; at most 240 samples (30 ms)

    // Downsample signal 32->16->8 kHz before doing VAD
    WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
                           frame_length);
    len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
    len = WEBRTC_SPL_RSHIFT_W16(len, 1);

    // Do VAD on an 8 kHz signal
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 len, vad;
    WebRtc_Word16 speechNB[240]; // Downsampled (8 kHz) frame; at most 240 samples (30 ms)

    // Wideband: Downsample signal before doing VAD
    WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
                           frame_length);

    len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power;

    // Get power in the frequency bands
    total_power = WebRtcVad_get_features(inst, speech_frame, frame_length, feature_vector);

    // Make a VAD decision
    inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power, frame_length);

    return inst->vad;
}
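
// Worked frame-length arithmetic for the downsampling cascade above: a
// 30 ms frame at 32 kHz is 960 samples; one half-rate stage leaves 480
// samples at 16 kHz and a second leaves 240 samples at 8 kHz, which is
// why speechWB[] and speechNB[] are sized 480 and 240. The 8 kHz core
// therefore sees 80, 160 or 240 samples for 10, 20 or 30 ms frames,
// matching the thresholds selected in WebRtcVad_GmmProbability().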

// Calculate the probabilities of both speech and background noise, and
// perform a hypothesis test.
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
                                       WebRtc_Word16 total_power, int frame_length)
{
    int n, k;
    WebRtc_Word16 backval;
    WebRtc_Word16 h0, h1;
    WebRtc_Word16 ratvec, xval;
    WebRtc_Word16 vadflag;
    WebRtc_Word16 shifts0, shifts1;
    WebRtc_Word16 tmp16, tmp16_1, tmp16_2;
    WebRtc_Word16 diff, nr, pos;
    WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
    WebRtc_Word16 delt, ndelt;
    WebRtc_Word16 maxspe, maxmu;
    WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES];
    WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES];
    WebRtc_Word32 h0test, h1test;
    WebRtc_Word32 tmp32_1, tmp32_2;
    WebRtc_Word32 dotVal;
    WebRtc_Word32 nmid, smid;
    WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS];
    WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr,
            *sstd1ptr, *sstd2ptr;
    WebRtc_Word16 overhead1, overhead2, individualTest, totalTest;

    // Set the thresholds to different values based on frame length
    if (frame_length == 80)
    {
        // 80 input samples (10 ms)
        overhead1 = inst->over_hang_max_1[0];
        overhead2 = inst->over_hang_max_2[0];
        individualTest = inst->individual[0];
        totalTest = inst->total[0];
    } else if (frame_length == 160)
    {
        // 160 input samples (20 ms)
        overhead1 = inst->over_hang_max_1[1];
        overhead2 = inst->over_hang_max_2[1];
        individualTest = inst->individual[1];
        totalTest = inst->total[1];
    } else
    {
        // 240 input samples (30 ms)
        overhead1 = inst->over_hang_max_1[2];
        overhead2 = inst->over_hang_max_2[2];
        individualTest = inst->individual[2];
        totalTest = inst->total[2];
    }

    if (total_power > MIN_ENERGY)
    { // If signal present at all

        // Set pointers to the Gaussian parameters
        nmean1ptr = &inst->noise_means[0];
        nmean2ptr = &inst->noise_means[NUM_CHANNELS];
        smean1ptr = &inst->speech_means[0];
        smean2ptr = &inst->speech_means[NUM_CHANNELS];
        nstd1ptr = &inst->noise_stds[0];
        nstd2ptr = &inst->noise_stds[NUM_CHANNELS];
        sstd1ptr = &inst->speech_stds[0];
        sstd2ptr = &inst->speech_stds[NUM_CHANNELS];

        vadflag = 0;
        dotVal = 0;
        for (n = 0; n < NUM_CHANNELS; n++)
        { // For all channels

            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
            xval = feature_vector[n];

            // Probability for noise, Q7 * Q20 = Q27
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++,
                                                    &deltaN[pos]);
            probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1);
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
                                                    &deltaN[pos + 1]);
            probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1);
            h0test = probn[0] + probn[1]; // Q27
            h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15

            // Probability for speech
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++,
                                                    &deltaS[pos]);
            probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1);
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
                                                    &deltaS[pos + 1]);
            probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1);
            h1test = probs[0] + probs[1]; // Q27
            h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15
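
            // Note on the shift-based likelihood ratio below: for a
            // positive 32-bit x, WebRtcSpl_NormW32(x) returns the number
            // of left shifts that normalizes x, i.e. 30 - floor(log2(x)).
            // Hence shifts0 - shifts1 equals
            // floor(log2(h1test)) - floor(log2(h0test)), an integer
            // approximation of log2(H1/H0); for example, h1test = 8 *
            // h0test gives ratvec of about 3.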

            // Get likelihood ratio. Approximate log2(H1/H0) with shifts0 - shifts1.
            shifts0 = WebRtcSpl_NormW32(h0test);
            shifts1 = WebRtcSpl_NormW32(h1test);

            if ((h0test > 0) && (h1test > 0))
            {
                ratvec = shifts0 - shifts1;
            } else if (h1test > 0)
            {
                ratvec = 31 - shifts1;
            } else if (h0test > 0)
            {
                ratvec = shifts0 - 31;
            } else
            {
                ratvec = 0;
            }

            // VAD decision with spectrum weighting
            dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]);

            // Individual channel test
            if ((ratvec << 2) > individualTest)
            {
                vadflag = 1;
            }

            // Probabilities used when updating the noise model
            if (h0 > 0)
            {
                tmp32_1 = probn[0] & 0xFFFFF000; // Q27
                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29
                ngprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h0);
                ngprvec[pos + 1] = 16384 - ngprvec[pos];
            } else
            {
                ngprvec[pos] = 16384;
                ngprvec[pos + 1] = 0;
            }

            // Probabilities used when updating the speech model
            if (h1 > 0)
            {
                tmp32_1 = probs[0] & 0xFFFFF000;
                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2);
                sgprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h1);
                sgprvec[pos + 1] = 16384 - sgprvec[pos];
            } else
            {
                sgprvec[pos] = 0;
                sgprvec[pos + 1] = 0;
            }
        }

        // Overall test
        if (dotVal >= totalTest)
        {
            vadflag |= 1;
        }

        // Set pointers to the means and standard deviations.
        nmean1ptr = &inst->noise_means[0];
        smean1ptr = &inst->speech_means[0];
        nstd1ptr = &inst->noise_stds[0];
        sstd1ptr = &inst->speech_stds[0];

        maxspe = 12800;

        // Update the model parameters
        for (n = 0; n < NUM_CHANNELS; n++)
        {
            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);

            // Get the minimum value of past frames, used for long-term correction
            backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4

            // Compute the "global" mean, i.e. the weighted sum of the two means
            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS],
                                         *(nmean1ptr+NUM_CHANNELS));
            tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8

            for (k = 0; k < NUM_MODELS; k++)
            {
                nr = pos + k;

                nmean2ptr = nmean1ptr + k * NUM_CHANNELS;
                smean2ptr = smean1ptr + k * NUM_CHANNELS;
                nstd2ptr = nstd1ptr + k * NUM_CHANNELS;
                sstd2ptr = sstd1ptr + k * NUM_CHANNELS;
                nmk = *nmean2ptr;
                smk = *smean2ptr;
                nsk = *nstd2ptr;
                ssk = *sstd2ptr;

                // Update the noise mean vector if the frame consists of noise only
                nmk2 = nmk;
                if (!vadflag)
                {
                    // deltaN = (x - mu) / sigma^2
                    // ngprvec[k] = probn[k] / (probn[0] + probn[1])

                    delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr],
                                                                    deltaN[nr],
                                                                    11); // (Q14 * Q11) >> 11 = Q14
                    nmk2 = nmk + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt,
                                                                          kNoiseUpdateConst,
                                                                          22); // Q7 + (Q14 * Q15 >> 22)
                }

                // Long-term correction of the noise mean
                ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4);
                ndelt -= tmp16_1; // Q8 - Q8
                nmk3 = nmk2 + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ndelt,
                                                                       kBackEta,
                                                                       9); // Q7 + (Q8 * Q8 >> 9)

                // Control that the noise mean does not drift too much
                tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7);
                if (nmk3 < tmp16)
                    nmk3 = tmp16;
                tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7);
                if (nmk3 > tmp16)
                    nmk3 = tmp16;
                *nmean2ptr = nmk3;
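
                // In floating point, the noise-mean update above is
                // roughly
                //     nmk += 0.02 * P(gaussian | x) * (x - nmk) / nsk^2
                // (0.02 is kNoiseUpdateConst in Q15), followed by a pull
                // towards the long-term feature minimum with weight
                // kBackEta, approximately 0.6. This reading is a sketch
                // that ignores Q-format rounding.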

                if (vadflag)
                {
                    // Update the speech mean vector:
                    // deltaS = (x - mu) / sigma^2
                    // sgprvec[k] = probs[k] / (probs[0] + probs[1])

                    delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr],
                                                                    deltaS[nr],
                                                                    11); // (Q14 * Q11) >> 11 = Q14
                    tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt,
                                                                     kSpeechUpdateConst,
                                                                     21) + 1;
                    smk2 = smk + (tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22)

                    // Control that the speech mean does not drift too much
                    maxmu = maxspe + 640;
                    if (smk2 < kMinimumMean[k])
                        smk2 = kMinimumMean[k];
                    if (smk2 > maxmu)
                        smk2 = maxmu;

                    *smean2ptr = smk2;

                    // (Q7 >> 3) = Q4
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3);

                    tmp16 = feature_vector[n] - tmp16; // Q4
                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3);
                    tmp32_2 = tmp32_1 - (WebRtc_Word32)4096; // Q12
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2);
                    tmp32_1 = (WebRtc_Word32)(tmp16 * tmp32_2); // (Q15 >> 3) * (Q14 >> 2) = Q12 * Q12 = Q24

                    tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20

                    // 0.1 * Q20 / Q7 = Q13
                    if (tmp32_2 > 0)
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10);
                    else
                    {
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10);
                        tmp16 = -tmp16;
                    }
                    // Divide by 4, giving an update factor of 0.025
                    tmp16 += 128; // Rounding
                    ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
                    // Division by 8 plus Q7
                    if (ssk < MIN_STD)
                        ssk = MIN_STD;
                    *sstd2ptr = ssk;
                } else
                {
                    // Update the GMM variance vectors
                    // deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4
                    tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3);

                    // (Q15 >> 3) * (Q14 >> 2) = Q12 * Q12 = Q24
                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096;
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2);
                    tmp32_2 = (WebRtc_Word32)(tmp16 * tmp32_1);
                    tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14);
                    // Q20 * approx 0.001 (2^-10 = 0.0009766)

                    // Q20 / Q7 = Q13
                    if (tmp32_1 > 0)
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk);
                    else
                    {
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_1, nsk);
                        tmp16 = -tmp16;
                    }
                    tmp16 += 32; // Rounding
                    nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);

                    if (nsk < MIN_STD)
                        nsk = MIN_STD;

                    *nstd2ptr = nsk;
                }
            }
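
            // The separation step below keeps the weighted ("global")
            // speech mean at least kMinimumDifference[n] (Q5) above the
            // weighted noise mean, moving the speech means up by about
            // 0.8 of the deficit (13/16 = 0.8125) and the noise means
            // down by about 0.2 of it (3/16 = 0.1875). For example, a
            // deficit of 44 in Q5 raises the speech means by
            // (13 * 44) >> 2 = 143 in Q7 and lowers the noise means by
            // (3 * 44) >> 2 = 33 in Q7; 143 + 33 = 176 in Q7 = 44 in Q5.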

            // Separate the models if they are too close; nmid in Q14
            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr);

            // smid in Q14
            smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
            smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr);

            // diff = "global" speech mean - "global" noise mean
            diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9);
            tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 9);
            diff -= tmp16;

            if (diff < kMinimumDifference[n])
            {
                tmp16 = kMinimumDifference[n] - diff; // Q5

                // tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7
                // tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7
                tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2);
                tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2);

                // First Gaussian, speech model
                tmp16 = tmp16_1 + *smean1ptr;
                *smean1ptr = tmp16;
                smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]);

                // Second Gaussian, speech model
                tmp16 = tmp16_1 + *smean2ptr;
                *smean2ptr = tmp16;
                smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]);

                // First Gaussian, noise model
                tmp16 = *nmean1ptr - tmp16_2;
                *nmean1ptr = tmp16;

                nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]);

                // Second Gaussian, noise model
                tmp16 = *nmean2ptr - tmp16_2;
                *nmean2ptr = tmp16;
                nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]);
            }

            // Control that the speech and noise means do not drift too much
            maxspe = kMaximumSpeech[n];
            tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 7);
            if (tmp16_2 > maxspe)
            { // Upper limit of the speech model
                tmp16_2 -= maxspe;

                *smean1ptr -= tmp16_2;
                *smean2ptr -= tmp16_2;
            }

            tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 7);
            if (tmp16_2 > kMaximumNoise[n])
            {
                tmp16_2 -= kMaximumNoise[n];

                *nmean1ptr -= tmp16_2;
                *nmean2ptr -= tmp16_2;
            }

            nmean1ptr++;
            smean1ptr++;
            nstd1ptr++;
            sstd1ptr++;
        }
        inst->frame_counter++;
    } else
    {
        vadflag = 0;
    }

    // Hangover smoothing
    if (!vadflag)
    {
        if (inst->over_hang > 0)
        {
            vadflag = 2 + inst->over_hang;
            inst->over_hang = inst->over_hang - 1;
        }
        inst->num_of_speech = 0;
    } else
    {
        inst->num_of_speech = inst->num_of_speech + 1;
        if (inst->num_of_speech > NSP_MAX)
        {
            inst->num_of_speech = NSP_MAX;
            inst->over_hang = overhead2;
        } else
            inst->over_hang = overhead1;
    }
    return vadflag;
}