1/***********************************************************************
2Copyright (c) 2006-2011, Skype Limited. All rights reserved.
3Redistribution and use in source and binary forms, with or without
4modification, are permitted provided that the following conditions
5are met:
6- Redistributions of source code must retain the above copyright notice,
7this list of conditions and the following disclaimer.
8- Redistributions in binary form must reproduce the above copyright
9notice, this list of conditions and the following disclaimer in the
10documentation and/or other materials provided with the distribution.
11- Neither the name of Internet Society, IETF or IETF Trust, nor the
12names of specific contributors, may be used to endorse or promote
13products derived from this software without specific prior written
14permission.
15THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
16AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25POSSIBILITY OF SUCH DAMAGE.
26***********************************************************************/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "main.h"
33
34/* Silk VAD noise level estimation */
35static inline void silk_VAD_GetNoiseLevels(
36    const opus_int32             pX[ VAD_N_BANDS ], /* I    subband energies                            */
37    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
38);
39
40/**********************************/
41/* Initialization of the Silk VAD */
42/**********************************/
43opus_int silk_VAD_Init(                                         /* O    Return value, 0 if success                  */
44    silk_VAD_state              *psSilk_VAD                     /* I/O  Pointer to Silk VAD state                   */
45)
46{
47    opus_int b, ret = 0;
48
49    /* reset state memory */
50    silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
51
52    /* init noise levels */
53    /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
54    for( b = 0; b < VAD_N_BANDS; b++ ) {
55        psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
56    }
57
58    /* Initialize state */
59    for( b = 0; b < VAD_N_BANDS; b++ ) {
60        psSilk_VAD->NL[ b ]     = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
61        psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
62    }
63    psSilk_VAD->counter = 15;
64
65    /* init smoothed energy-to-noise ratio*/
66    for( b = 0; b < VAD_N_BANDS; b++ ) {
67        psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256;       /* 100 * 256 --> 20 dB SNR */
68    }
69
70    return( ret );
71}
72
73/* Weighting factors for tilt measure */
74static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
75
76/***************************************/
77/* Get the speech activity level in Q8 */
78/***************************************/
79opus_int silk_VAD_GetSA_Q8(                                     /* O    Return value, 0 if success                  */
80    silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
81    const opus_int16            pIn[]                           /* I    PCM input                                   */
82)
83{
84    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
85    opus_int   decimated_framelength, dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
86    opus_int32 sumSquared, smooth_coef_Q16;
87    opus_int16 HPstateTmp;
88    opus_int16 X[ VAD_N_BANDS ][ MAX_FRAME_LENGTH / 2 ];
89    opus_int32 Xnrg[ VAD_N_BANDS ];
90    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
91    opus_int32 speech_nrg, x_tmp;
92    opus_int   ret = 0;
93    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
94
95    /* Safety checks */
96    silk_assert( VAD_N_BANDS == 4 );
97    silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
98    silk_assert( psEncC->frame_length <= 512 );
99    silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
100
101    /***********************/
102    /* Filter and Decimate */
103    /***********************/
104    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
105    silk_ana_filt_bank_1( pIn,          &psSilk_VAD->AnaState[  0 ], &X[ 0 ][ 0 ], &X[ 3 ][ 0 ], psEncC->frame_length );
106
107    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
108    silk_ana_filt_bank_1( &X[ 0 ][ 0 ], &psSilk_VAD->AnaState1[ 0 ], &X[ 0 ][ 0 ], &X[ 2 ][ 0 ], silk_RSHIFT( psEncC->frame_length, 1 ) );
109
110    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
111    silk_ana_filt_bank_1( &X[ 0 ][ 0 ], &psSilk_VAD->AnaState2[ 0 ], &X[ 0 ][ 0 ], &X[ 1 ][ 0 ], silk_RSHIFT( psEncC->frame_length, 2 ) );
112
113    /*********************************************/
114    /* HP filter on lowest band (differentiator) */
115    /*********************************************/
116    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
117    X[ 0 ][ decimated_framelength - 1 ] = silk_RSHIFT( X[ 0 ][ decimated_framelength - 1 ], 1 );
118    HPstateTmp = X[ 0 ][ decimated_framelength - 1 ];
119    for( i = decimated_framelength - 1; i > 0; i-- ) {
120        X[ 0 ][ i - 1 ]  = silk_RSHIFT( X[ 0 ][ i - 1 ], 1 );
121        X[ 0 ][ i ]     -= X[ 0 ][ i - 1 ];
122    }
123    X[ 0 ][ 0 ] -= psSilk_VAD->HPstate;
124    psSilk_VAD->HPstate = HPstateTmp;
125
126    /*************************************/
127    /* Calculate the energy in each band */
128    /*************************************/
129    for( b = 0; b < VAD_N_BANDS; b++ ) {
130        /* Find the decimated framelength in the non-uniformly divided bands */
131        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
132
133        /* Split length into subframe lengths */
134        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
135        dec_subframe_offset = 0;
136
137        /* Compute energy per sub-frame */
138        /* initialize with summed energy of last subframe */
139        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
140        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
141            sumSquared = 0;
142            for( i = 0; i < dec_subframe_length; i++ ) {
143                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
144                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
145                x_tmp = silk_RSHIFT( X[ b ][ i + dec_subframe_offset ], 3 );
146                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
147
148                /* Safety check */
149                silk_assert( sumSquared >= 0 );
150            }
151
152            /* Add/saturate summed energy of current subframe */
153            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
154                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
155            } else {
156                /* Look-ahead subframe */
157                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
158            }
159
160            dec_subframe_offset += dec_subframe_length;
161        }
162        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
163    }
164
165    /********************/
166    /* Noise estimation */
167    /********************/
168    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
169
170    /***********************************************/
171    /* Signal-plus-noise to noise ratio estimation */
172    /***********************************************/
173    sumSquared = 0;
174    input_tilt = 0;
175    for( b = 0; b < VAD_N_BANDS; b++ ) {
176        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
177        if( speech_nrg > 0 ) {
178            /* Divide, with sufficient resolution */
179            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
180                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
181            } else {
182                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
183            }
184
185            /* Convert to log domain */
186            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
187
188            /* Sum-of-squares */
189            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
190
191            /* Tilt measure */
192            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
193                /* Scale down SNR value for small subband speech energies */
194                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
195            }
196            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
197        } else {
198            NrgToNoiseRatio_Q8[ b ] = 256;
199        }
200    }
201
202    /* Mean-of-squares */
203    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
204
205    /* Root-mean-square approximation, scale to dBs, and write to output pointer */
206    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
207
208    /*********************************/
209    /* Speech Probability Estimation */
210    /*********************************/
211    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
212
213    /**************************/
214    /* Frequency Tilt Measure */
215    /**************************/
216    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
217
218    /**************************************************/
219    /* Scale the sigmoid output based on power levels */
220    /**************************************************/
221    speech_nrg = 0;
222    for( b = 0; b < VAD_N_BANDS; b++ ) {
223        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
224        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
225    }
226
227    /* Power scaling */
228    if( speech_nrg <= 0 ) {
229        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
230    } else if( speech_nrg < 32768 ) {
231        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
232            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
233        } else {
234            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
235        }
236
237        /* square-root */
238        speech_nrg = silk_SQRT_APPROX( speech_nrg );
239        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
240    }
241
242    /* Copy the resulting speech activity in Q8 */
243    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
244
245    /***********************************/
246    /* Energy Level and SNR estimation */
247    /***********************************/
248    /* Smoothing coefficient */
249    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
250
251    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
252        smooth_coef_Q16 >>= 1;
253    }
254
255    for( b = 0; b < VAD_N_BANDS; b++ ) {
256        /* compute smoothed energy-to-noise ratio per band */
257        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
258            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
259
260        /* signal to noise ratio in dB per band */
261        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
262        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
263        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
264    }
265
266    return( ret );
267}
268
269/**************************/
270/* Noise level estimation */
271/**************************/
272static inline void silk_VAD_GetNoiseLevels(
273    const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
274    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
275)
276{
277    opus_int   k;
278    opus_int32 nl, nrg, inv_nrg;
279    opus_int   coef, min_coef;
280
281    /* Initially faster smoothing */
282    if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
283        min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
284    } else {
285        min_coef = 0;
286    }
287
288    for( k = 0; k < VAD_N_BANDS; k++ ) {
289        /* Get old noise level estimate for current band */
290        nl = psSilk_VAD->NL[ k ];
291        silk_assert( nl >= 0 );
292
293        /* Add bias */
294        nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
295        silk_assert( nrg > 0 );
296
297        /* Invert energies */
298        inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
299        silk_assert( inv_nrg >= 0 );
300
301        /* Less update when subband energy is high */
302        if( nrg > silk_LSHIFT( nl, 3 ) ) {
303            coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
304        } else if( nrg < nl ) {
305            coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
306        } else {
307            coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
308        }
309
310        /* Initially faster smoothing */
311        coef = silk_max_int( coef, min_coef );
312
313        /* Smooth inverse energies */
314        psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
315        silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
316
317        /* Compute noise level by inverting again */
318        nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
319        silk_assert( nl >= 0 );
320
321        /* Limit noise levels (guarantee 7 bits of head room) */
322        nl = silk_min( nl, 0x00FFFFFF );
323
324        /* Store as part of state */
325        psSilk_VAD->NL[ k ] = nl;
326    }
327
328    /* Increment frame counter */
329    psSilk_VAD->counter++;
330}
331