1/***********************************************************************
2Copyright (c) 2006-2011, Skype Limited. All rights reserved.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "main.h"
33#include "stack_alloc.h"
34
35/* Silk VAD noise level estimation */
36static OPUS_INLINE void silk_VAD_GetNoiseLevels(
37    const opus_int32             pX[ VAD_N_BANDS ], /* I    subband energies                            */
38    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
39);
40
41/**********************************/
42/* Initialization of the Silk VAD */
43/**********************************/
44opus_int silk_VAD_Init(                                         /* O    Return value, 0 if success                  */
45    silk_VAD_state              *psSilk_VAD                     /* I/O  Pointer to Silk VAD state                   */
46)
47{
48    opus_int b, ret = 0;
49
50    /* reset state memory */
51    silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
52
53    /* init noise levels */
54    /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
55    for( b = 0; b < VAD_N_BANDS; b++ ) {
56        psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
57    }
58
59    /* Initialize state */
60    for( b = 0; b < VAD_N_BANDS; b++ ) {
61        psSilk_VAD->NL[ b ]     = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
62        psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
63    }
64    psSilk_VAD->counter = 15;
65
66    /* init smoothed energy-to-noise ratio*/
67    for( b = 0; b < VAD_N_BANDS; b++ ) {
68        psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256;       /* 100 * 256 --> 20 dB SNR */
69    }
70
71    return( ret );
72}
73
74/* Weighting factors for tilt measure */
75static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
76
77/***************************************/
78/* Get the speech activity level in Q8 */
79/***************************************/
80opus_int silk_VAD_GetSA_Q8(                                     /* O    Return value, 0 if success                  */
81    silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
82    const opus_int16            pIn[]                           /* I    PCM input                                   */
83)
84{
85    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
86    opus_int   decimated_framelength1, decimated_framelength2;
87    opus_int   decimated_framelength;
88    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
89    opus_int32 sumSquared, smooth_coef_Q16;
90    opus_int16 HPstateTmp;
91    VARDECL( opus_int16, X );
92    opus_int32 Xnrg[ VAD_N_BANDS ];
93    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
94    opus_int32 speech_nrg, x_tmp;
95    opus_int   X_offset[ VAD_N_BANDS ];
96    opus_int   ret = 0;
97    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
98    SAVE_STACK;
99
100    /* Safety checks */
101    silk_assert( VAD_N_BANDS == 4 );
102    silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
103    silk_assert( psEncC->frame_length <= 512 );
104    silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
105
106    /***********************/
107    /* Filter and Decimate */
108    /***********************/
109    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
110    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
111    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
112    /* Decimate into 4 bands:
113       0       L      3L       L              3L                             5L
114               -      --       -              --                             --
115               8       8       2               4                              4
116
117       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
118
119       They're arranged to allow the minimal ( frame_length / 4 ) extra
120       scratch space during the downsampling process */
121    X_offset[ 0 ] = 0;
122    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
123    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
124    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
125    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
126
127    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
128    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
129        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
130
131    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
132    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
133        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
134
135    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
136    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
137        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
138
139    /*********************************************/
140    /* HP filter on lowest band (differentiator) */
141    /*********************************************/
142    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
143    HPstateTmp = X[ decimated_framelength - 1 ];
144    for( i = decimated_framelength - 1; i > 0; i-- ) {
145        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
146        X[ i ]     -= X[ i - 1 ];
147    }
148    X[ 0 ] -= psSilk_VAD->HPstate;
149    psSilk_VAD->HPstate = HPstateTmp;
150
151    /*************************************/
152    /* Calculate the energy in each band */
153    /*************************************/
154    for( b = 0; b < VAD_N_BANDS; b++ ) {
155        /* Find the decimated framelength in the non-uniformly divided bands */
156        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
157
158        /* Split length into subframe lengths */
159        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
160        dec_subframe_offset = 0;
161
162        /* Compute energy per sub-frame */
163        /* initialize with summed energy of last subframe */
164        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
165        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
166            sumSquared = 0;
167            for( i = 0; i < dec_subframe_length; i++ ) {
168                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
169                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
170                x_tmp = silk_RSHIFT(
171                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
172                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
173
174                /* Safety check */
175                silk_assert( sumSquared >= 0 );
176            }
177
178            /* Add/saturate summed energy of current subframe */
179            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
180                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
181            } else {
182                /* Look-ahead subframe */
183                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
184            }
185
186            dec_subframe_offset += dec_subframe_length;
187        }
188        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
189    }
190
191    /********************/
192    /* Noise estimation */
193    /********************/
194    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
195
196    /***********************************************/
197    /* Signal-plus-noise to noise ratio estimation */
198    /***********************************************/
199    sumSquared = 0;
200    input_tilt = 0;
201    for( b = 0; b < VAD_N_BANDS; b++ ) {
202        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
203        if( speech_nrg > 0 ) {
204            /* Divide, with sufficient resolution */
205            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
206                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
207            } else {
208                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
209            }
210
211            /* Convert to log domain */
212            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
213
214            /* Sum-of-squares */
215            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
216
217            /* Tilt measure */
218            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
219                /* Scale down SNR value for small subband speech energies */
220                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
221            }
222            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
223        } else {
224            NrgToNoiseRatio_Q8[ b ] = 256;
225        }
226    }
227
228    /* Mean-of-squares */
229    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
230
231    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
232    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
233
234    /*********************************/
235    /* Speech Probability Estimation */
236    /*********************************/
237    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
238
239    /**************************/
240    /* Frequency Tilt Measure */
241    /**************************/
242    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
243
244    /**************************************************/
245    /* Scale the sigmoid output based on power levels */
246    /**************************************************/
247    speech_nrg = 0;
248    for( b = 0; b < VAD_N_BANDS; b++ ) {
249        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
250        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
251    }
252
253    /* Power scaling */
254    if( speech_nrg <= 0 ) {
255        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
256    } else if( speech_nrg < 32768 ) {
257        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
258            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
259        } else {
260            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
261        }
262
263        /* square-root */
264        speech_nrg = silk_SQRT_APPROX( speech_nrg );
265        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
266    }
267
268    /* Copy the resulting speech activity in Q8 */
269    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
270
271    /***********************************/
272    /* Energy Level and SNR estimation */
273    /***********************************/
274    /* Smoothing coefficient */
275    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
276
277    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
278        smooth_coef_Q16 >>= 1;
279    }
280
281    for( b = 0; b < VAD_N_BANDS; b++ ) {
282        /* compute smoothed energy-to-noise ratio per band */
283        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
284            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
285
286        /* signal to noise ratio in dB per band */
287        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
288        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
289        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
290    }
291
292    RESTORE_STACK;
293    return( ret );
294}
295
296/**************************/
297/* Noise level estimation */
298/**************************/
299static OPUS_INLINE void silk_VAD_GetNoiseLevels(
300    const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
301    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
302)
303{
304    opus_int   k;
305    opus_int32 nl, nrg, inv_nrg;
306    opus_int   coef, min_coef;
307
308    /* Initially faster smoothing */
309    if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
310        min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
311    } else {
312        min_coef = 0;
313    }
314
315    for( k = 0; k < VAD_N_BANDS; k++ ) {
316        /* Get old noise level estimate for current band */
317        nl = psSilk_VAD->NL[ k ];
318        silk_assert( nl >= 0 );
319
320        /* Add bias */
321        nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
322        silk_assert( nrg > 0 );
323
324        /* Invert energies */
325        inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
326        silk_assert( inv_nrg >= 0 );
327
328        /* Less update when subband energy is high */
329        if( nrg > silk_LSHIFT( nl, 3 ) ) {
330            coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
331        } else if( nrg < nl ) {
332            coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
333        } else {
334            coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
335        }
336
337        /* Initially faster smoothing */
338        coef = silk_max_int( coef, min_coef );
339
340        /* Smooth inverse energies */
341        psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
342        silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
343
344        /* Compute noise level by inverting again */
345        nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
346        silk_assert( nl >= 0 );
347
348        /* Limit noise levels (guarantee 7 bits of head room) */
349        nl = silk_min( nl, 0x00FFFFFF );
350
351        /* Store as part of state */
352        psSilk_VAD->NL[ k ] = nl;
353    }
354
355    /* Increment frame counter */
356    psSilk_VAD->counter++;
357}
358