1/* Copyright (c) 2011 Xiph.Org Foundation
2   Written by Jean-Marc Valin */
3/*
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions
6   are met:
7
8   - Redistributions of source code must retain the above copyright
9   notice, this list of conditions and the following disclaimer.
10
11   - Redistributions in binary form must reproduce the above copyright
12   notice, this list of conditions and the following disclaimer in the
13   documentation and/or other materials provided with the distribution.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "kiss_fft.h"
33#include "celt.h"
34#include "modes.h"
35#include "arch.h"
36#include "quant_bands.h"
37#include <stdio.h>
38#include "analysis.h"
39#include "mlp.h"
40#include "stack_alloc.h"
41
42#ifndef M_PI
43#define M_PI 3.141592653
44#endif
45
46static const float dct_table[128] = {
47        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49        0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50       -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51        0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52       -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53        0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54        0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57        0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58       -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59        0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60       -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61        0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62        0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63};
64
65static const float analysis_window[240] = {
66      0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67      0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68      0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69      0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70      0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71      0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72      0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73      0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74      0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75      0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76      0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77      0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78      0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79      0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80      0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81      0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82      0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83      0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84      0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85      0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86      0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87      0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88      0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89      0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90      0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91      0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92      0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93      0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94      0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95      0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96};
97
98static const int tbands[NB_TBANDS+1] = {
99       2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100};
101
102static const int extra_bands[NB_TOT_BANDS+1] = {
103      1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104};
105
106/*static const float tweight[NB_TBANDS+1] = {
107      .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108};*/
109
110#define NB_TONAL_SKIP_BANDS 9
111
112#define cA 0.43157974f
113#define cB 0.67848403f
114#define cC 0.08595542f
115#define cE ((float)M_PI/2)
116static OPUS_INLINE float fast_atan2f(float y, float x) {
117   float x2, y2;
118   /* Should avoid underflow on the values we'll get */
119   if (ABS16(x)+ABS16(y)<1e-9f)
120   {
121      x*=1e12f;
122      y*=1e12f;
123   }
124   x2 = x*x;
125   y2 = y*y;
126   if(x2<y2){
127      float den = (y2 + cB*x2) * (y2 + cC*x2);
128      if (den!=0)
129         return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
130      else
131         return (y<0 ? -cE : cE);
132   }else{
133      float den = (x2 + cB*y2) * (x2 + cC*y2);
134      if (den!=0)
135         return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
136      else
137         return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138   }
139}
140
141void tonality_analysis_init(TonalityAnalysisState *tonal)
142{
143  /* Initialize reusable fields. */
144  tonal->arch = opus_select_arch();
145  /* Clear remaining fields. */
146  tonality_analysis_reset(tonal);
147}
148
149void tonality_analysis_reset(TonalityAnalysisState *tonal)
150{
151  /* Clear non-reusable fields. */
152  char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
153  OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
154}
155
156void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
157{
158   int pos;
159   int curr_lookahead;
160   float psum;
161   int i;
162
163   pos = tonal->read_pos;
164   curr_lookahead = tonal->write_pos-tonal->read_pos;
165   if (curr_lookahead<0)
166      curr_lookahead += DETECT_SIZE;
167
168   if (len > 480 && pos != tonal->write_pos)
169   {
170      pos++;
171      if (pos==DETECT_SIZE)
172         pos=0;
173   }
174   if (pos == tonal->write_pos)
175      pos--;
176   if (pos<0)
177      pos = DETECT_SIZE-1;
178   OPUS_COPY(info_out, &tonal->info[pos], 1);
179   tonal->read_subframe += len/120;
180   while (tonal->read_subframe>=4)
181   {
182      tonal->read_subframe -= 4;
183      tonal->read_pos++;
184   }
185   if (tonal->read_pos>=DETECT_SIZE)
186      tonal->read_pos-=DETECT_SIZE;
187
188   /* Compensate for the delay in the features themselves.
189      FIXME: Need a better estimate the 10 I just made up */
190   curr_lookahead = IMAX(curr_lookahead-10, 0);
191
192   psum=0;
193   /* Summing the probability of transition patterns that involve music at
194      time (DETECT_SIZE-curr_lookahead-1) */
195   for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
196      psum += tonal->pmusic[i];
197   for (;i<DETECT_SIZE;i++)
198      psum += tonal->pspeech[i];
199   psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
200   /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
201
202   info_out->music_prob = psum;
203}
204
205static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
206{
207    int i, b;
208    const kiss_fft_state *kfft;
209    VARDECL(kiss_fft_cpx, in);
210    VARDECL(kiss_fft_cpx, out);
211    int N = 480, N2=240;
212    float * OPUS_RESTRICT A = tonal->angle;
213    float * OPUS_RESTRICT dA = tonal->d_angle;
214    float * OPUS_RESTRICT d2A = tonal->d2_angle;
215    VARDECL(float, tonality);
216    VARDECL(float, noisiness);
217    float band_tonality[NB_TBANDS];
218    float logE[NB_TBANDS];
219    float BFCC[8];
220    float features[25];
221    float frame_tonality;
222    float max_frame_tonality;
223    /*float tw_sum=0;*/
224    float frame_noisiness;
225    const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
226    float slope=0;
227    float frame_stationarity;
228    float relativeE;
229    float frame_probs[2];
230    float alpha, alphaE, alphaE2;
231    float frame_loudness;
232    float bandwidth_mask;
233    int bandwidth=0;
234    float maxE = 0;
235    float noise_floor;
236    int remaining;
237    AnalysisInfo *info;
238    SAVE_STACK;
239
240    tonal->last_transition++;
241    alpha = 1.f/IMIN(20, 1+tonal->count);
242    alphaE = 1.f/IMIN(50, 1+tonal->count);
243    alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
244
245    if (tonal->count<4)
246       tonal->music_prob = .5;
247    kfft = celt_mode->mdct.kfft[0];
248    if (tonal->count==0)
249       tonal->mem_fill = 240;
250    downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
251    if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
252    {
253       tonal->mem_fill += len;
254       /* Don't have enough to update the analysis */
255       RESTORE_STACK;
256       return;
257    }
258    info = &tonal->info[tonal->write_pos++];
259    if (tonal->write_pos>=DETECT_SIZE)
260       tonal->write_pos-=DETECT_SIZE;
261
262    ALLOC(in, 480, kiss_fft_cpx);
263    ALLOC(out, 480, kiss_fft_cpx);
264    ALLOC(tonality, 240, float);
265    ALLOC(noisiness, 240, float);
266    for (i=0;i<N2;i++)
267    {
268       float w = analysis_window[i];
269       in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
270       in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
271       in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
272       in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
273    }
274    OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
275    remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
276    downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
277    tonal->mem_fill = 240 + remaining;
278    opus_fft(kfft, in, out, tonal->arch);
279#ifndef FIXED_POINT
280    /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
281    if (celt_isnan(out[0].r))
282    {
283       info->valid = 0;
284       RESTORE_STACK;
285       return;
286    }
287#endif
288
289    for (i=1;i<N2;i++)
290    {
291       float X1r, X2r, X1i, X2i;
292       float angle, d_angle, d2_angle;
293       float angle2, d_angle2, d2_angle2;
294       float mod1, mod2, avg_mod;
295       X1r = (float)out[i].r+out[N-i].r;
296       X1i = (float)out[i].i-out[N-i].i;
297       X2r = (float)out[i].i+out[N-i].i;
298       X2i = (float)out[N-i].r-out[i].r;
299
300       angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
301       d_angle = angle - A[i];
302       d2_angle = d_angle - dA[i];
303
304       angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
305       d_angle2 = angle2 - angle;
306       d2_angle2 = d_angle2 - d_angle;
307
308       mod1 = d2_angle - (float)floor(.5+d2_angle);
309       noisiness[i] = ABS16(mod1);
310       mod1 *= mod1;
311       mod1 *= mod1;
312
313       mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
314       noisiness[i] += ABS16(mod2);
315       mod2 *= mod2;
316       mod2 *= mod2;
317
318       avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
319       tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
320
321       A[i] = angle2;
322       dA[i] = d_angle2;
323       d2A[i] = mod2;
324    }
325
326    frame_tonality = 0;
327    max_frame_tonality = 0;
328    /*tw_sum = 0;*/
329    info->activity = 0;
330    frame_noisiness = 0;
331    frame_stationarity = 0;
332    if (!tonal->count)
333    {
334       for (b=0;b<NB_TBANDS;b++)
335       {
336          tonal->lowE[b] = 1e10;
337          tonal->highE[b] = -1e10;
338       }
339    }
340    relativeE = 0;
341    frame_loudness = 0;
342    for (b=0;b<NB_TBANDS;b++)
343    {
344       float E=0, tE=0, nE=0;
345       float L1, L2;
346       float stationarity;
347       for (i=tbands[b];i<tbands[b+1];i++)
348       {
349          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
350                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
351#ifdef FIXED_POINT
352          /* FIXME: It's probably best to change the BFCC filter initial state instead */
353          binE *= 5.55e-17f;
354#endif
355          E += binE;
356          tE += binE*tonality[i];
357          nE += binE*2.f*(.5f-noisiness[i]);
358       }
359#ifndef FIXED_POINT
360       /* Check for extreme band energies that could cause NaNs later. */
361       if (!(E<1e9f) || celt_isnan(E))
362       {
363          info->valid = 0;
364          RESTORE_STACK;
365          return;
366       }
367#endif
368
369       tonal->E[tonal->E_count][b] = E;
370       frame_noisiness += nE/(1e-15f+E);
371
372       frame_loudness += (float)sqrt(E+1e-10f);
373       logE[b] = (float)log(E+1e-10f);
374       tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
375       tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
376       if (tonal->highE[b] < tonal->lowE[b]+1.f)
377       {
378          tonal->highE[b]+=.5f;
379          tonal->lowE[b]-=.5f;
380       }
381       relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
382
383       L1=L2=0;
384       for (i=0;i<NB_FRAMES;i++)
385       {
386          L1 += (float)sqrt(tonal->E[i][b]);
387          L2 += tonal->E[i][b];
388       }
389
390       stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
391       stationarity *= stationarity;
392       stationarity *= stationarity;
393       frame_stationarity += stationarity;
394       /*band_tonality[b] = tE/(1e-15+E)*/;
395       band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
396#if 0
397       if (b>=NB_TONAL_SKIP_BANDS)
398       {
399          frame_tonality += tweight[b]*band_tonality[b];
400          tw_sum += tweight[b];
401       }
402#else
403       frame_tonality += band_tonality[b];
404       if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
405          frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
406#endif
407       max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
408       slope += band_tonality[b]*(b-8);
409       /*printf("%f %f ", band_tonality[b], stationarity);*/
410       tonal->prev_band_tonality[b] = band_tonality[b];
411    }
412
413    bandwidth_mask = 0;
414    bandwidth = 0;
415    maxE = 0;
416    noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
417#ifdef FIXED_POINT
418    noise_floor *= 1<<(15+SIG_SHIFT);
419#endif
420    noise_floor *= noise_floor;
421    for (b=0;b<NB_TOT_BANDS;b++)
422    {
423       float E=0;
424       int band_start, band_end;
425       /* Keep a margin of 300 Hz for aliasing */
426       band_start = extra_bands[b];
427       band_end = extra_bands[b+1];
428       for (i=band_start;i<band_end;i++)
429       {
430          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
431                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
432          E += binE;
433       }
434       maxE = MAX32(maxE, E);
435       tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
436       E = MAX32(E, tonal->meanE[b]);
437       /* Use a simple follower with 13 dB/Bark slope for spreading function */
438       bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
439       /* Consider the band "active" only if all these conditions are met:
440          1) less than 10 dB below the simple follower
441          2) less than 90 dB below the peak band (maximal masking possible considering
442             both the ATH and the loudness-dependent slope of the spreading function)
443          3) above the PCM quantization noise floor
444       */
445       if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
446          bandwidth = b;
447    }
448    if (tonal->count<=2)
449       bandwidth = 20;
450    frame_loudness = 20*(float)log10(frame_loudness);
451    tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
452    tonal->lowECount *= (1-alphaE);
453    if (frame_loudness < tonal->Etracker-30)
454       tonal->lowECount += alphaE;
455
456    for (i=0;i<8;i++)
457    {
458       float sum=0;
459       for (b=0;b<16;b++)
460          sum += dct_table[i*16+b]*logE[b];
461       BFCC[i] = sum;
462    }
463
464    frame_stationarity /= NB_TBANDS;
465    relativeE /= NB_TBANDS;
466    if (tonal->count<10)
467       relativeE = .5;
468    frame_noisiness /= NB_TBANDS;
469#if 1
470    info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
471#else
472    info->activity = .5*(1+frame_noisiness-frame_stationarity);
473#endif
474    frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
475    frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
476    tonal->prev_tonality = frame_tonality;
477
478    slope /= 8*8;
479    info->tonality_slope = slope;
480
481    tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
482    tonal->count++;
483    info->tonality = frame_tonality;
484
485    for (i=0;i<4;i++)
486       features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
487
488    for (i=0;i<4;i++)
489       tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
490
491    for (i=0;i<4;i++)
492        features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
493    for (i=0;i<3;i++)
494        features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
495
496    if (tonal->count > 5)
497    {
498       for (i=0;i<9;i++)
499          tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
500    }
501
502    for (i=0;i<8;i++)
503    {
504       tonal->mem[i+24] = tonal->mem[i+16];
505       tonal->mem[i+16] = tonal->mem[i+8];
506       tonal->mem[i+8] = tonal->mem[i];
507       tonal->mem[i] = BFCC[i];
508    }
509    for (i=0;i<9;i++)
510       features[11+i] = (float)sqrt(tonal->std[i]);
511    features[20] = info->tonality;
512    features[21] = info->activity;
513    features[22] = frame_stationarity;
514    features[23] = info->tonality_slope;
515    features[24] = tonal->lowECount;
516
517#ifndef DISABLE_FLOAT_API
518    mlp_process(&net, features, frame_probs);
519    frame_probs[0] = .5f*(frame_probs[0]+1);
520    /* Curve fitting between the MLP probability and the actual probability */
521    frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
522    /* Probability of active audio (as opposed to silence) */
523    frame_probs[1] = .5f*frame_probs[1]+.5f;
524    /* Consider that silence has a 50-50 probability. */
525    frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
526
527    /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
528    {
529       /* Probability of state transition */
530       float tau;
531       /* Represents independence of the MLP probabilities, where
532          beta=1 means fully independent. */
533       float beta;
534       /* Denormalized probability of speech (p0) and music (p1) after update */
535       float p0, p1;
536       /* Probabilities for "all speech" and "all music" */
537       float s0, m0;
538       /* Probability sum for renormalisation */
539       float psum;
540       /* Instantaneous probability of speech and music, with beta pre-applied. */
541       float speech0;
542       float music0;
543       float p, q;
544
545       /* One transition every 3 minutes of active audio */
546       tau = .00005f*frame_probs[1];
547       /* Adapt beta based on how "unexpected" the new prob is */
548       p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
549       q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
550       beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
551       /* p0 and p1 are the probabilities of speech and music at this frame
552          using only information from previous frame and applying the
553          state transition model */
554       p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
555       p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
556       /* We apply the current probability with exponent beta to work around
557          the fact that the probability estimates aren't independent. */
558       p0 *= (float)pow(1-frame_probs[0], beta);
559       p1 *= (float)pow(frame_probs[0], beta);
560       /* Normalise the probabilities to get the Marokv probability of music. */
561       tonal->music_prob = p1/(p0+p1);
562       info->music_prob = tonal->music_prob;
563
564       /* This chunk of code deals with delayed decision. */
565       psum=1e-20f;
566       /* Instantaneous probability of speech and music, with beta pre-applied. */
567       speech0 = (float)pow(1-frame_probs[0], beta);
568       music0  = (float)pow(frame_probs[0], beta);
569       if (tonal->count==1)
570       {
571          tonal->pspeech[0]=.5;
572          tonal->pmusic [0]=.5;
573       }
574       /* Updated probability of having only speech (s0) or only music (m0),
575          before considering the new observation. */
576       s0 = tonal->pspeech[0] + tonal->pspeech[1];
577       m0 = tonal->pmusic [0] + tonal->pmusic [1];
578       /* Updates s0 and m0 with instantaneous probability. */
579       tonal->pspeech[0] = s0*(1-tau)*speech0;
580       tonal->pmusic [0] = m0*(1-tau)*music0;
581       /* Propagate the transition probabilities */
582       for (i=1;i<DETECT_SIZE-1;i++)
583       {
584          tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
585          tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
586       }
587       /* Probability that the latest frame is speech, when all the previous ones were music. */
588       tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
589       /* Probability that the latest frame is music, when all the previous ones were speech. */
590       tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
591
592       /* Renormalise probabilities to 1 */
593       for (i=0;i<DETECT_SIZE;i++)
594          psum += tonal->pspeech[i] + tonal->pmusic[i];
595       psum = 1.f/psum;
596       for (i=0;i<DETECT_SIZE;i++)
597       {
598          tonal->pspeech[i] *= psum;
599          tonal->pmusic [i] *= psum;
600       }
601       psum = tonal->pmusic[0];
602       for (i=1;i<DETECT_SIZE;i++)
603          psum += tonal->pspeech[i];
604
605       /* Estimate our confidence in the speech/music decisions */
606       if (frame_probs[1]>.75)
607       {
608          if (tonal->music_prob>.9)
609          {
610             float adapt;
611             adapt = 1.f/(++tonal->music_confidence_count);
612             tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
613             tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
614          }
615          if (tonal->music_prob<.1)
616          {
617             float adapt;
618             adapt = 1.f/(++tonal->speech_confidence_count);
619             tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
620             tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
621          }
622       } else {
623          if (tonal->music_confidence_count==0)
624             tonal->music_confidence = .9f;
625          if (tonal->speech_confidence_count==0)
626             tonal->speech_confidence = .1f;
627       }
628    }
629    if (tonal->last_music != (tonal->music_prob>.5f))
630       tonal->last_transition=0;
631    tonal->last_music = tonal->music_prob>.5f;
632#else
633    info->music_prob = 0;
634#endif
635    /*for (i=0;i<25;i++)
636       printf("%f ", features[i]);
637    printf("\n");*/
638
639    info->bandwidth = bandwidth;
640    /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
641    info->noisiness = frame_noisiness;
642    info->valid = 1;
643    RESTORE_STACK;
644}
645
646void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
647                 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
648                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
649{
650   int offset;
651   int pcm_len;
652
653   if (analysis_pcm != NULL)
654   {
655      /* Avoid overflow/wrap-around of the analysis buffer */
656      analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
657
658      pcm_len = analysis_frame_size - analysis->analysis_offset;
659      offset = analysis->analysis_offset;
660      do {
661         tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
662         offset += 480;
663         pcm_len -= 480;
664      } while (pcm_len>0);
665      analysis->analysis_offset = analysis_frame_size;
666
667      analysis->analysis_offset -= frame_size;
668   }
669
670   analysis_info->valid = 0;
671   tonality_get_info(analysis, analysis_info, frame_size);
672}
673