1/* Copyright (c) 2011 Xiph.Org Foundation 2 Written by Jean-Marc Valin */ 3/* 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 - Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 11 - Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26*/ 27 28#ifdef HAVE_CONFIG_H 29#include "config.h" 30#endif 31 32#include "kiss_fft.h" 33#include "celt.h" 34#include "modes.h" 35#include "arch.h" 36#include "quant_bands.h" 37#include <stdio.h> 38#include "analysis.h" 39#include "mlp.h" 40#include "stack_alloc.h" 41 42#ifndef M_PI 43#define M_PI 3.141592653 44#endif 45 46static const float dct_table[128] = { 47 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 48 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 49 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f, 50 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f, 51 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f, 52 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f, 53 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f, 54 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f, 55 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f, 56 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f, 57 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f, 58 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f, 59 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f, 60 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f, 61 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f, 62 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f, 63}; 64 65static const float analysis_window[240] = { 66 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f, 67 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f, 68 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f, 69 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f, 70 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f, 71 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f, 72 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f, 73 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f, 74 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f, 75 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f, 76 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f, 77 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f, 78 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f, 79 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f, 80 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f, 81 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f, 82 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f, 83 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f, 84 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f, 85 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f, 86 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f, 87 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f, 88 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f, 89 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f, 90 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f, 91 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f, 92 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f, 93 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f, 94 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f, 95 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f, 96}; 97 98static const int tbands[NB_TBANDS+1] = { 99 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120 100}; 101 102static const int extra_bands[NB_TOT_BANDS+1] = { 103 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200 104}; 105 106/*static const float tweight[NB_TBANDS+1] = { 107 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 108};*/ 109 110#define NB_TONAL_SKIP_BANDS 9 111 112#define cA 0.43157974f 113#define cB 0.67848403f 114#define cC 0.08595542f 115#define cE ((float)M_PI/2) 116static OPUS_INLINE float fast_atan2f(float y, float x) { 117 float x2, y2; 118 /* Should avoid underflow on the values we'll get */ 119 if (ABS16(x)+ABS16(y)<1e-9f) 120 { 121 x*=1e12f; 122 y*=1e12f; 123 } 124 x2 = x*x; 125 y2 = y*y; 126 if(x2<y2){ 127 float den = (y2 + cB*x2) * (y2 + cC*x2); 128 if (den!=0) 129 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE); 130 else 131 return (y<0 ? -cE : cE); 132 }else{ 133 float den = (x2 + cB*y2) * (x2 + cC*y2); 134 if (den!=0) 135 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); 136 else 137 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); 138 } 139} 140 141void tonality_analysis_init(TonalityAnalysisState *tonal) 142{ 143 /* Initialize reusable fields. */ 144 tonal->arch = opus_select_arch(); 145 /* Clear remaining fields. */ 146 tonality_analysis_reset(tonal); 147} 148 149void tonality_analysis_reset(TonalityAnalysisState *tonal) 150{ 151 /* Clear non-reusable fields. */ 152 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; 153 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); 154} 155 156void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) 157{ 158 int pos; 159 int curr_lookahead; 160 float psum; 161 int i; 162 163 pos = tonal->read_pos; 164 curr_lookahead = tonal->write_pos-tonal->read_pos; 165 if (curr_lookahead<0) 166 curr_lookahead += DETECT_SIZE; 167 168 if (len > 480 && pos != tonal->write_pos) 169 { 170 pos++; 171 if (pos==DETECT_SIZE) 172 pos=0; 173 } 174 if (pos == tonal->write_pos) 175 pos--; 176 if (pos<0) 177 pos = DETECT_SIZE-1; 178 OPUS_COPY(info_out, &tonal->info[pos], 1); 179 tonal->read_subframe += len/120; 180 while (tonal->read_subframe>=4) 181 { 182 tonal->read_subframe -= 4; 183 tonal->read_pos++; 184 } 185 if (tonal->read_pos>=DETECT_SIZE) 186 tonal->read_pos-=DETECT_SIZE; 187 188 /* Compensate for the delay in the features themselves. 189 FIXME: Need a better estimate the 10 I just made up */ 190 curr_lookahead = IMAX(curr_lookahead-10, 0); 191 192 psum=0; 193 /* Summing the probability of transition patterns that involve music at 194 time (DETECT_SIZE-curr_lookahead-1) */ 195 for (i=0;i<DETECT_SIZE-curr_lookahead;i++) 196 psum += tonal->pmusic[i]; 197 for (;i<DETECT_SIZE;i++) 198 psum += tonal->pspeech[i]; 199 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; 200 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ 201 202 info_out->music_prob = psum; 203} 204 205static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix) 206{ 207 int i, b; 208 const kiss_fft_state *kfft; 209 VARDECL(kiss_fft_cpx, in); 210 VARDECL(kiss_fft_cpx, out); 211 int N = 480, N2=240; 212 float * OPUS_RESTRICT A = tonal->angle; 213 float * OPUS_RESTRICT dA = tonal->d_angle; 214 float * OPUS_RESTRICT d2A = tonal->d2_angle; 215 VARDECL(float, tonality); 216 VARDECL(float, noisiness); 217 float band_tonality[NB_TBANDS]; 218 float logE[NB_TBANDS]; 219 float BFCC[8]; 220 float features[25]; 221 float frame_tonality; 222 float max_frame_tonality; 223 /*float tw_sum=0;*/ 224 float frame_noisiness; 225 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI); 226 float slope=0; 227 float frame_stationarity; 228 float relativeE; 229 float frame_probs[2]; 230 float alpha, alphaE, alphaE2; 231 float frame_loudness; 232 float bandwidth_mask; 233 int bandwidth=0; 234 float maxE = 0; 235 float noise_floor; 236 int remaining; 237 AnalysisInfo *info; 238 SAVE_STACK; 239 240 tonal->last_transition++; 241 alpha = 1.f/IMIN(20, 1+tonal->count); 242 alphaE = 1.f/IMIN(50, 1+tonal->count); 243 alphaE2 = 1.f/IMIN(1000, 1+tonal->count); 244 245 if (tonal->count<4) 246 tonal->music_prob = .5; 247 kfft = celt_mode->mdct.kfft[0]; 248 if (tonal->count==0) 249 tonal->mem_fill = 240; 250 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C); 251 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) 252 { 253 tonal->mem_fill += len; 254 /* Don't have enough to update the analysis */ 255 RESTORE_STACK; 256 return; 257 } 258 info = &tonal->info[tonal->write_pos++]; 259 if (tonal->write_pos>=DETECT_SIZE) 260 tonal->write_pos-=DETECT_SIZE; 261 262 ALLOC(in, 480, kiss_fft_cpx); 263 ALLOC(out, 480, kiss_fft_cpx); 264 ALLOC(tonality, 240, float); 265 ALLOC(noisiness, 240, float); 266 for (i=0;i<N2;i++) 267 { 268 float w = analysis_window[i]; 269 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]); 270 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); 271 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); 272 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); 273 } 274 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); 275 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); 276 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C); 277 tonal->mem_fill = 240 + remaining; 278 opus_fft(kfft, in, out, tonal->arch); 279#ifndef FIXED_POINT 280 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */ 281 if (celt_isnan(out[0].r)) 282 { 283 info->valid = 0; 284 RESTORE_STACK; 285 return; 286 } 287#endif 288 289 for (i=1;i<N2;i++) 290 { 291 float X1r, X2r, X1i, X2i; 292 float angle, d_angle, d2_angle; 293 float angle2, d_angle2, d2_angle2; 294 float mod1, mod2, avg_mod; 295 X1r = (float)out[i].r+out[N-i].r; 296 X1i = (float)out[i].i-out[N-i].i; 297 X2r = (float)out[i].i+out[N-i].i; 298 X2i = (float)out[N-i].r-out[i].r; 299 300 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); 301 d_angle = angle - A[i]; 302 d2_angle = d_angle - dA[i]; 303 304 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); 305 d_angle2 = angle2 - angle; 306 d2_angle2 = d_angle2 - d_angle; 307 308 mod1 = d2_angle - (float)floor(.5+d2_angle); 309 noisiness[i] = ABS16(mod1); 310 mod1 *= mod1; 311 mod1 *= mod1; 312 313 mod2 = d2_angle2 - (float)floor(.5+d2_angle2); 314 noisiness[i] += ABS16(mod2); 315 mod2 *= mod2; 316 mod2 *= mod2; 317 318 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2); 319 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; 320 321 A[i] = angle2; 322 dA[i] = d_angle2; 323 d2A[i] = mod2; 324 } 325 326 frame_tonality = 0; 327 max_frame_tonality = 0; 328 /*tw_sum = 0;*/ 329 info->activity = 0; 330 frame_noisiness = 0; 331 frame_stationarity = 0; 332 if (!tonal->count) 333 { 334 for (b=0;b<NB_TBANDS;b++) 335 { 336 tonal->lowE[b] = 1e10; 337 tonal->highE[b] = -1e10; 338 } 339 } 340 relativeE = 0; 341 frame_loudness = 0; 342 for (b=0;b<NB_TBANDS;b++) 343 { 344 float E=0, tE=0, nE=0; 345 float L1, L2; 346 float stationarity; 347 for (i=tbands[b];i<tbands[b+1];i++) 348 { 349 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 350 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 351#ifdef FIXED_POINT 352 /* FIXME: It's probably best to change the BFCC filter initial state instead */ 353 binE *= 5.55e-17f; 354#endif 355 E += binE; 356 tE += binE*tonality[i]; 357 nE += binE*2.f*(.5f-noisiness[i]); 358 } 359#ifndef FIXED_POINT 360 /* Check for extreme band energies that could cause NaNs later. */ 361 if (!(E<1e9f) || celt_isnan(E)) 362 { 363 info->valid = 0; 364 RESTORE_STACK; 365 return; 366 } 367#endif 368 369 tonal->E[tonal->E_count][b] = E; 370 frame_noisiness += nE/(1e-15f+E); 371 372 frame_loudness += (float)sqrt(E+1e-10f); 373 logE[b] = (float)log(E+1e-10f); 374 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); 375 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); 376 if (tonal->highE[b] < tonal->lowE[b]+1.f) 377 { 378 tonal->highE[b]+=.5f; 379 tonal->lowE[b]-=.5f; 380 } 381 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]); 382 383 L1=L2=0; 384 for (i=0;i<NB_FRAMES;i++) 385 { 386 L1 += (float)sqrt(tonal->E[i][b]); 387 L2 += tonal->E[i][b]; 388 } 389 390 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); 391 stationarity *= stationarity; 392 stationarity *= stationarity; 393 frame_stationarity += stationarity; 394 /*band_tonality[b] = tE/(1e-15+E)*/; 395 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]); 396#if 0 397 if (b>=NB_TONAL_SKIP_BANDS) 398 { 399 frame_tonality += tweight[b]*band_tonality[b]; 400 tw_sum += tweight[b]; 401 } 402#else 403 frame_tonality += band_tonality[b]; 404 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) 405 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; 406#endif 407 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality); 408 slope += band_tonality[b]*(b-8); 409 /*printf("%f %f ", band_tonality[b], stationarity);*/ 410 tonal->prev_band_tonality[b] = band_tonality[b]; 411 } 412 413 bandwidth_mask = 0; 414 bandwidth = 0; 415 maxE = 0; 416 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); 417#ifdef FIXED_POINT 418 noise_floor *= 1<<(15+SIG_SHIFT); 419#endif 420 noise_floor *= noise_floor; 421 for (b=0;b<NB_TOT_BANDS;b++) 422 { 423 float E=0; 424 int band_start, band_end; 425 /* Keep a margin of 300 Hz for aliasing */ 426 band_start = extra_bands[b]; 427 band_end = extra_bands[b+1]; 428 for (i=band_start;i<band_end;i++) 429 { 430 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 431 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 432 E += binE; 433 } 434 maxE = MAX32(maxE, E); 435 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); 436 E = MAX32(E, tonal->meanE[b]); 437 /* Use a simple follower with 13 dB/Bark slope for spreading function */ 438 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); 439 /* Consider the band "active" only if all these conditions are met: 440 1) less than 10 dB below the simple follower 441 2) less than 90 dB below the peak band (maximal masking possible considering 442 both the ATH and the loudness-dependent slope of the spreading function) 443 3) above the PCM quantization noise floor 444 */ 445 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start)) 446 bandwidth = b; 447 } 448 if (tonal->count<=2) 449 bandwidth = 20; 450 frame_loudness = 20*(float)log10(frame_loudness); 451 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); 452 tonal->lowECount *= (1-alphaE); 453 if (frame_loudness < tonal->Etracker-30) 454 tonal->lowECount += alphaE; 455 456 for (i=0;i<8;i++) 457 { 458 float sum=0; 459 for (b=0;b<16;b++) 460 sum += dct_table[i*16+b]*logE[b]; 461 BFCC[i] = sum; 462 } 463 464 frame_stationarity /= NB_TBANDS; 465 relativeE /= NB_TBANDS; 466 if (tonal->count<10) 467 relativeE = .5; 468 frame_noisiness /= NB_TBANDS; 469#if 1 470 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; 471#else 472 info->activity = .5*(1+frame_noisiness-frame_stationarity); 473#endif 474 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); 475 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); 476 tonal->prev_tonality = frame_tonality; 477 478 slope /= 8*8; 479 info->tonality_slope = slope; 480 481 tonal->E_count = (tonal->E_count+1)%NB_FRAMES; 482 tonal->count++; 483 info->tonality = frame_tonality; 484 485 for (i=0;i<4;i++) 486 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; 487 488 for (i=0;i<4;i++) 489 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; 490 491 for (i=0;i<4;i++) 492 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]); 493 for (i=0;i<3;i++) 494 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; 495 496 if (tonal->count > 5) 497 { 498 for (i=0;i<9;i++) 499 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i]; 500 } 501 502 for (i=0;i<8;i++) 503 { 504 tonal->mem[i+24] = tonal->mem[i+16]; 505 tonal->mem[i+16] = tonal->mem[i+8]; 506 tonal->mem[i+8] = tonal->mem[i]; 507 tonal->mem[i] = BFCC[i]; 508 } 509 for (i=0;i<9;i++) 510 features[11+i] = (float)sqrt(tonal->std[i]); 511 features[20] = info->tonality; 512 features[21] = info->activity; 513 features[22] = frame_stationarity; 514 features[23] = info->tonality_slope; 515 features[24] = tonal->lowECount; 516 517#ifndef DISABLE_FLOAT_API 518 mlp_process(&net, features, frame_probs); 519 frame_probs[0] = .5f*(frame_probs[0]+1); 520 /* Curve fitting between the MLP probability and the actual probability */ 521 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10); 522 /* Probability of active audio (as opposed to silence) */ 523 frame_probs[1] = .5f*frame_probs[1]+.5f; 524 /* Consider that silence has a 50-50 probability. */ 525 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f; 526 527 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ 528 { 529 /* Probability of state transition */ 530 float tau; 531 /* Represents independence of the MLP probabilities, where 532 beta=1 means fully independent. */ 533 float beta; 534 /* Denormalized probability of speech (p0) and music (p1) after update */ 535 float p0, p1; 536 /* Probabilities for "all speech" and "all music" */ 537 float s0, m0; 538 /* Probability sum for renormalisation */ 539 float psum; 540 /* Instantaneous probability of speech and music, with beta pre-applied. */ 541 float speech0; 542 float music0; 543 float p, q; 544 545 /* One transition every 3 minutes of active audio */ 546 tau = .00005f*frame_probs[1]; 547 /* Adapt beta based on how "unexpected" the new prob is */ 548 p = MAX16(.05f,MIN16(.95f,frame_probs[0])); 549 q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); 550 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); 551 /* p0 and p1 are the probabilities of speech and music at this frame 552 using only information from previous frame and applying the 553 state transition model */ 554 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; 555 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; 556 /* We apply the current probability with exponent beta to work around 557 the fact that the probability estimates aren't independent. */ 558 p0 *= (float)pow(1-frame_probs[0], beta); 559 p1 *= (float)pow(frame_probs[0], beta); 560 /* Normalise the probabilities to get the Marokv probability of music. */ 561 tonal->music_prob = p1/(p0+p1); 562 info->music_prob = tonal->music_prob; 563 564 /* This chunk of code deals with delayed decision. */ 565 psum=1e-20f; 566 /* Instantaneous probability of speech and music, with beta pre-applied. */ 567 speech0 = (float)pow(1-frame_probs[0], beta); 568 music0 = (float)pow(frame_probs[0], beta); 569 if (tonal->count==1) 570 { 571 tonal->pspeech[0]=.5; 572 tonal->pmusic [0]=.5; 573 } 574 /* Updated probability of having only speech (s0) or only music (m0), 575 before considering the new observation. */ 576 s0 = tonal->pspeech[0] + tonal->pspeech[1]; 577 m0 = tonal->pmusic [0] + tonal->pmusic [1]; 578 /* Updates s0 and m0 with instantaneous probability. */ 579 tonal->pspeech[0] = s0*(1-tau)*speech0; 580 tonal->pmusic [0] = m0*(1-tau)*music0; 581 /* Propagate the transition probabilities */ 582 for (i=1;i<DETECT_SIZE-1;i++) 583 { 584 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0; 585 tonal->pmusic [i] = tonal->pmusic [i+1]*music0; 586 } 587 /* Probability that the latest frame is speech, when all the previous ones were music. */ 588 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0; 589 /* Probability that the latest frame is music, when all the previous ones were speech. */ 590 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0; 591 592 /* Renormalise probabilities to 1 */ 593 for (i=0;i<DETECT_SIZE;i++) 594 psum += tonal->pspeech[i] + tonal->pmusic[i]; 595 psum = 1.f/psum; 596 for (i=0;i<DETECT_SIZE;i++) 597 { 598 tonal->pspeech[i] *= psum; 599 tonal->pmusic [i] *= psum; 600 } 601 psum = tonal->pmusic[0]; 602 for (i=1;i<DETECT_SIZE;i++) 603 psum += tonal->pspeech[i]; 604 605 /* Estimate our confidence in the speech/music decisions */ 606 if (frame_probs[1]>.75) 607 { 608 if (tonal->music_prob>.9) 609 { 610 float adapt; 611 adapt = 1.f/(++tonal->music_confidence_count); 612 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500); 613 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence); 614 } 615 if (tonal->music_prob<.1) 616 { 617 float adapt; 618 adapt = 1.f/(++tonal->speech_confidence_count); 619 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500); 620 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence); 621 } 622 } else { 623 if (tonal->music_confidence_count==0) 624 tonal->music_confidence = .9f; 625 if (tonal->speech_confidence_count==0) 626 tonal->speech_confidence = .1f; 627 } 628 } 629 if (tonal->last_music != (tonal->music_prob>.5f)) 630 tonal->last_transition=0; 631 tonal->last_music = tonal->music_prob>.5f; 632#else 633 info->music_prob = 0; 634#endif 635 /*for (i=0;i<25;i++) 636 printf("%f ", features[i]); 637 printf("\n");*/ 638 639 info->bandwidth = bandwidth; 640 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ 641 info->noisiness = frame_noisiness; 642 info->valid = 1; 643 RESTORE_STACK; 644} 645 646void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm, 647 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs, 648 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info) 649{ 650 int offset; 651 int pcm_len; 652 653 if (analysis_pcm != NULL) 654 { 655 /* Avoid overflow/wrap-around of the analysis buffer */ 656 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); 657 658 pcm_len = analysis_frame_size - analysis->analysis_offset; 659 offset = analysis->analysis_offset; 660 do { 661 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix); 662 offset += 480; 663 pcm_len -= 480; 664 } while (pcm_len>0); 665 analysis->analysis_offset = analysis_frame_size; 666 667 analysis->analysis_offset -= frame_size; 668 } 669 670 analysis_info->valid = 0; 671 tonality_get_info(analysis, analysis_info, frame_size); 672} 673