1/* Copyright (c) 2011 Xiph.Org Foundation
2   Written by Jean-Marc Valin */
3/*
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions
6   are met:
7
8   - Redistributions of source code must retain the above copyright
9   notice, this list of conditions and the following disclaimer.
10
11   - Redistributions in binary form must reproduce the above copyright
12   notice, this list of conditions and the following disclaimer in the
13   documentation and/or other materials provided with the distribution.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28#ifdef HAVE_CONFIG_H
29#include "config.h"
30#endif
31
32#include "kiss_fft.h"
33#include "celt.h"
34#include "modes.h"
35#include "arch.h"
36#include "quant_bands.h"
37#include <stdio.h>
38#include "analysis.h"
39#include "mlp.h"
40#include "stack_alloc.h"
41
42extern const MLP net;
43
44#ifndef M_PI
45#define M_PI 3.141592653
46#endif
47
48static const float dct_table[128] = {
49        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
50        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
51        0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
52       -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
53        0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
54       -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
55        0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
56        0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
57        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
58        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
59        0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
60       -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
61        0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
62       -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
63        0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
64        0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
65};
66
67static const float analysis_window[240] = {
68      0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
69      0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
70      0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
71      0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
72      0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
73      0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
74      0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
75      0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
76      0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
77      0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
78      0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
79      0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
80      0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
81      0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
82      0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
83      0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
84      0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
85      0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
86      0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
87      0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
88      0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
89      0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
90      0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
91      0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
92      0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
93      0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
94      0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
95      0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
96      0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
97      0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
98};
99
100static const int tbands[NB_TBANDS+1] = {
101       2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
102};
103
104static const int extra_bands[NB_TOT_BANDS+1] = {
105      1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
106};
107
108/*static const float tweight[NB_TBANDS+1] = {
109      .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
110};*/
111
112#define NB_TONAL_SKIP_BANDS 9
113
114#define cA 0.43157974f
115#define cB 0.67848403f
116#define cC 0.08595542f
117#define cE ((float)M_PI/2)
118static OPUS_INLINE float fast_atan2f(float y, float x) {
119   float x2, y2;
120   /* Should avoid underflow on the values we'll get */
121   if (ABS16(x)+ABS16(y)<1e-9f)
122   {
123      x*=1e12f;
124      y*=1e12f;
125   }
126   x2 = x*x;
127   y2 = y*y;
128   if(x2<y2){
129      float den = (y2 + cB*x2) * (y2 + cC*x2);
130      if (den!=0)
131         return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
132      else
133         return (y<0 ? -cE : cE);
134   }else{
135      float den = (x2 + cB*y2) * (x2 + cC*y2);
136      if (den!=0)
137         return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138      else
139         return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
140   }
141}
142
143void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
144{
145   int pos;
146   int curr_lookahead;
147   float psum;
148   int i;
149
150   pos = tonal->read_pos;
151   curr_lookahead = tonal->write_pos-tonal->read_pos;
152   if (curr_lookahead<0)
153      curr_lookahead += DETECT_SIZE;
154
155   if (len > 480 && pos != tonal->write_pos)
156   {
157      pos++;
158      if (pos==DETECT_SIZE)
159         pos=0;
160   }
161   if (pos == tonal->write_pos)
162      pos--;
163   if (pos<0)
164      pos = DETECT_SIZE-1;
165   OPUS_COPY(info_out, &tonal->info[pos], 1);
166   tonal->read_subframe += len/120;
167   while (tonal->read_subframe>=4)
168   {
169      tonal->read_subframe -= 4;
170      tonal->read_pos++;
171   }
172   if (tonal->read_pos>=DETECT_SIZE)
173      tonal->read_pos-=DETECT_SIZE;
174
175   /* Compensate for the delay in the features themselves.
176      FIXME: Need a better estimate the 10 I just made up */
177   curr_lookahead = IMAX(curr_lookahead-10, 0);
178
179   psum=0;
180   /* Summing the probability of transition patterns that involve music at
181      time (DETECT_SIZE-curr_lookahead-1) */
182   for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
183      psum += tonal->pmusic[i];
184   for (;i<DETECT_SIZE;i++)
185      psum += tonal->pspeech[i];
186   psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
187   /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
188
189   info_out->music_prob = psum;
190}
191
192void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
193{
194    int i, b;
195    const kiss_fft_state *kfft;
196    VARDECL(kiss_fft_cpx, in);
197    VARDECL(kiss_fft_cpx, out);
198    int N = 480, N2=240;
199    float * OPUS_RESTRICT A = tonal->angle;
200    float * OPUS_RESTRICT dA = tonal->d_angle;
201    float * OPUS_RESTRICT d2A = tonal->d2_angle;
202    VARDECL(float, tonality);
203    VARDECL(float, noisiness);
204    float band_tonality[NB_TBANDS];
205    float logE[NB_TBANDS];
206    float BFCC[8];
207    float features[25];
208    float frame_tonality;
209    float max_frame_tonality;
210    /*float tw_sum=0;*/
211    float frame_noisiness;
212    const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
213    float slope=0;
214    float frame_stationarity;
215    float relativeE;
216    float frame_probs[2];
217    float alpha, alphaE, alphaE2;
218    float frame_loudness;
219    float bandwidth_mask;
220    int bandwidth=0;
221    float maxE = 0;
222    float noise_floor;
223    int remaining;
224    AnalysisInfo *info;
225    SAVE_STACK;
226
227    tonal->last_transition++;
228    alpha = 1.f/IMIN(20, 1+tonal->count);
229    alphaE = 1.f/IMIN(50, 1+tonal->count);
230    alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
231
232    if (tonal->count<4)
233       tonal->music_prob = .5;
234    kfft = celt_mode->mdct.kfft[0];
235    if (tonal->count==0)
236       tonal->mem_fill = 240;
237    downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
238    if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
239    {
240       tonal->mem_fill += len;
241       /* Don't have enough to update the analysis */
242       RESTORE_STACK;
243       return;
244    }
245    info = &tonal->info[tonal->write_pos++];
246    if (tonal->write_pos>=DETECT_SIZE)
247       tonal->write_pos-=DETECT_SIZE;
248
249    ALLOC(in, 480, kiss_fft_cpx);
250    ALLOC(out, 480, kiss_fft_cpx);
251    ALLOC(tonality, 240, float);
252    ALLOC(noisiness, 240, float);
253    for (i=0;i<N2;i++)
254    {
255       float w = analysis_window[i];
256       in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
257       in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
258       in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
259       in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
260    }
261    OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
262    remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
263    downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
264    tonal->mem_fill = 240 + remaining;
265    opus_fft(kfft, in, out);
266
267    for (i=1;i<N2;i++)
268    {
269       float X1r, X2r, X1i, X2i;
270       float angle, d_angle, d2_angle;
271       float angle2, d_angle2, d2_angle2;
272       float mod1, mod2, avg_mod;
273       X1r = (float)out[i].r+out[N-i].r;
274       X1i = (float)out[i].i-out[N-i].i;
275       X2r = (float)out[i].i+out[N-i].i;
276       X2i = (float)out[N-i].r-out[i].r;
277
278       angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
279       d_angle = angle - A[i];
280       d2_angle = d_angle - dA[i];
281
282       angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
283       d_angle2 = angle2 - angle;
284       d2_angle2 = d_angle2 - d_angle;
285
286       mod1 = d2_angle - (float)floor(.5+d2_angle);
287       noisiness[i] = ABS16(mod1);
288       mod1 *= mod1;
289       mod1 *= mod1;
290
291       mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
292       noisiness[i] += ABS16(mod2);
293       mod2 *= mod2;
294       mod2 *= mod2;
295
296       avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
297       tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
298
299       A[i] = angle2;
300       dA[i] = d_angle2;
301       d2A[i] = mod2;
302    }
303
304    frame_tonality = 0;
305    max_frame_tonality = 0;
306    /*tw_sum = 0;*/
307    info->activity = 0;
308    frame_noisiness = 0;
309    frame_stationarity = 0;
310    if (!tonal->count)
311    {
312       for (b=0;b<NB_TBANDS;b++)
313       {
314          tonal->lowE[b] = 1e10;
315          tonal->highE[b] = -1e10;
316       }
317    }
318    relativeE = 0;
319    frame_loudness = 0;
320    for (b=0;b<NB_TBANDS;b++)
321    {
322       float E=0, tE=0, nE=0;
323       float L1, L2;
324       float stationarity;
325       for (i=tbands[b];i<tbands[b+1];i++)
326       {
327          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
328                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
329#ifdef FIXED_POINT
330          /* FIXME: It's probably best to change the BFCC filter initial state instead */
331          binE *= 5.55e-17f;
332#endif
333          E += binE;
334          tE += binE*tonality[i];
335          nE += binE*2.f*(.5f-noisiness[i]);
336       }
337       tonal->E[tonal->E_count][b] = E;
338       frame_noisiness += nE/(1e-15f+E);
339
340       frame_loudness += (float)sqrt(E+1e-10f);
341       logE[b] = (float)log(E+1e-10f);
342       tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
343       tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
344       if (tonal->highE[b] < tonal->lowE[b]+1.f)
345       {
346          tonal->highE[b]+=.5f;
347          tonal->lowE[b]-=.5f;
348       }
349       relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
350
351       L1=L2=0;
352       for (i=0;i<NB_FRAMES;i++)
353       {
354          L1 += (float)sqrt(tonal->E[i][b]);
355          L2 += tonal->E[i][b];
356       }
357
358       stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
359       stationarity *= stationarity;
360       stationarity *= stationarity;
361       frame_stationarity += stationarity;
362       /*band_tonality[b] = tE/(1e-15+E)*/;
363       band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
364#if 0
365       if (b>=NB_TONAL_SKIP_BANDS)
366       {
367          frame_tonality += tweight[b]*band_tonality[b];
368          tw_sum += tweight[b];
369       }
370#else
371       frame_tonality += band_tonality[b];
372       if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
373          frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
374#endif
375       max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
376       slope += band_tonality[b]*(b-8);
377       /*printf("%f %f ", band_tonality[b], stationarity);*/
378       tonal->prev_band_tonality[b] = band_tonality[b];
379    }
380
381    bandwidth_mask = 0;
382    bandwidth = 0;
383    maxE = 0;
384    noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
385#ifdef FIXED_POINT
386    noise_floor *= 1<<(15+SIG_SHIFT);
387#endif
388    noise_floor *= noise_floor;
389    for (b=0;b<NB_TOT_BANDS;b++)
390    {
391       float E=0;
392       int band_start, band_end;
393       /* Keep a margin of 300 Hz for aliasing */
394       band_start = extra_bands[b];
395       band_end = extra_bands[b+1];
396       for (i=band_start;i<band_end;i++)
397       {
398          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
399                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
400          E += binE;
401       }
402       maxE = MAX32(maxE, E);
403       tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
404       E = MAX32(E, tonal->meanE[b]);
405       /* Use a simple follower with 13 dB/Bark slope for spreading function */
406       bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
407       /* Consider the band "active" only if all these conditions are met:
408          1) less than 10 dB below the simple follower
409          2) less than 90 dB below the peak band (maximal masking possible considering
410             both the ATH and the loudness-dependent slope of the spreading function)
411          3) above the PCM quantization noise floor
412       */
413       if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
414          bandwidth = b;
415    }
416    if (tonal->count<=2)
417       bandwidth = 20;
418    frame_loudness = 20*(float)log10(frame_loudness);
419    tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
420    tonal->lowECount *= (1-alphaE);
421    if (frame_loudness < tonal->Etracker-30)
422       tonal->lowECount += alphaE;
423
424    for (i=0;i<8;i++)
425    {
426       float sum=0;
427       for (b=0;b<16;b++)
428          sum += dct_table[i*16+b]*logE[b];
429       BFCC[i] = sum;
430    }
431
432    frame_stationarity /= NB_TBANDS;
433    relativeE /= NB_TBANDS;
434    if (tonal->count<10)
435       relativeE = .5;
436    frame_noisiness /= NB_TBANDS;
437#if 1
438    info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
439#else
440    info->activity = .5*(1+frame_noisiness-frame_stationarity);
441#endif
442    frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
443    frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
444    tonal->prev_tonality = frame_tonality;
445
446    slope /= 8*8;
447    info->tonality_slope = slope;
448
449    tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
450    tonal->count++;
451    info->tonality = frame_tonality;
452
453    for (i=0;i<4;i++)
454       features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
455
456    for (i=0;i<4;i++)
457       tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
458
459    for (i=0;i<4;i++)
460        features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
461    for (i=0;i<3;i++)
462        features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
463
464    if (tonal->count > 5)
465    {
466       for (i=0;i<9;i++)
467          tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
468    }
469
470    for (i=0;i<8;i++)
471    {
472       tonal->mem[i+24] = tonal->mem[i+16];
473       tonal->mem[i+16] = tonal->mem[i+8];
474       tonal->mem[i+8] = tonal->mem[i];
475       tonal->mem[i] = BFCC[i];
476    }
477    for (i=0;i<9;i++)
478       features[11+i] = (float)sqrt(tonal->std[i]);
479    features[20] = info->tonality;
480    features[21] = info->activity;
481    features[22] = frame_stationarity;
482    features[23] = info->tonality_slope;
483    features[24] = tonal->lowECount;
484
485#ifndef DISABLE_FLOAT_API
486    mlp_process(&net, features, frame_probs);
487    frame_probs[0] = .5f*(frame_probs[0]+1);
488    /* Curve fitting between the MLP probability and the actual probability */
489    frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
490    /* Probability of active audio (as opposed to silence) */
491    frame_probs[1] = .5f*frame_probs[1]+.5f;
492    /* Consider that silence has a 50-50 probability. */
493    frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
494
495    /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
496    {
497       /* Probability of state transition */
498       float tau;
499       /* Represents independence of the MLP probabilities, where
500          beta=1 means fully independent. */
501       float beta;
502       /* Denormalized probability of speech (p0) and music (p1) after update */
503       float p0, p1;
504       /* Probabilities for "all speech" and "all music" */
505       float s0, m0;
506       /* Probability sum for renormalisation */
507       float psum;
508       /* Instantaneous probability of speech and music, with beta pre-applied. */
509       float speech0;
510       float music0;
511
512       /* One transition every 3 minutes of active audio */
513       tau = .00005f*frame_probs[1];
514       beta = .05f;
515       if (1) {
516          /* Adapt beta based on how "unexpected" the new prob is */
517          float p, q;
518          p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
519          q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
520          beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
521       }
522       /* p0 and p1 are the probabilities of speech and music at this frame
523          using only information from previous frame and applying the
524          state transition model */
525       p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
526       p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
527       /* We apply the current probability with exponent beta to work around
528          the fact that the probability estimates aren't independent. */
529       p0 *= (float)pow(1-frame_probs[0], beta);
530       p1 *= (float)pow(frame_probs[0], beta);
531       /* Normalise the probabilities to get the Marokv probability of music. */
532       tonal->music_prob = p1/(p0+p1);
533       info->music_prob = tonal->music_prob;
534
535       /* This chunk of code deals with delayed decision. */
536       psum=1e-20f;
537       /* Instantaneous probability of speech and music, with beta pre-applied. */
538       speech0 = (float)pow(1-frame_probs[0], beta);
539       music0  = (float)pow(frame_probs[0], beta);
540       if (tonal->count==1)
541       {
542          tonal->pspeech[0]=.5;
543          tonal->pmusic [0]=.5;
544       }
545       /* Updated probability of having only speech (s0) or only music (m0),
546          before considering the new observation. */
547       s0 = tonal->pspeech[0] + tonal->pspeech[1];
548       m0 = tonal->pmusic [0] + tonal->pmusic [1];
549       /* Updates s0 and m0 with instantaneous probability. */
550       tonal->pspeech[0] = s0*(1-tau)*speech0;
551       tonal->pmusic [0] = m0*(1-tau)*music0;
552       /* Propagate the transition probabilities */
553       for (i=1;i<DETECT_SIZE-1;i++)
554       {
555          tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
556          tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
557       }
558       /* Probability that the latest frame is speech, when all the previous ones were music. */
559       tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
560       /* Probability that the latest frame is music, when all the previous ones were speech. */
561       tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
562
563       /* Renormalise probabilities to 1 */
564       for (i=0;i<DETECT_SIZE;i++)
565          psum += tonal->pspeech[i] + tonal->pmusic[i];
566       psum = 1.f/psum;
567       for (i=0;i<DETECT_SIZE;i++)
568       {
569          tonal->pspeech[i] *= psum;
570          tonal->pmusic [i] *= psum;
571       }
572       psum = tonal->pmusic[0];
573       for (i=1;i<DETECT_SIZE;i++)
574          psum += tonal->pspeech[i];
575
576       /* Estimate our confidence in the speech/music decisions */
577       if (frame_probs[1]>.75)
578       {
579          if (tonal->music_prob>.9)
580          {
581             float adapt;
582             adapt = 1.f/(++tonal->music_confidence_count);
583             tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
584             tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
585          }
586          if (tonal->music_prob<.1)
587          {
588             float adapt;
589             adapt = 1.f/(++tonal->speech_confidence_count);
590             tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
591             tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
592          }
593       } else {
594          if (tonal->music_confidence_count==0)
595             tonal->music_confidence = .9f;
596          if (tonal->speech_confidence_count==0)
597             tonal->speech_confidence = .1f;
598       }
599    }
600    if (tonal->last_music != (tonal->music_prob>.5f))
601       tonal->last_transition=0;
602    tonal->last_music = tonal->music_prob>.5f;
603#else
604    info->music_prob = 0;
605#endif
606    /*for (i=0;i<25;i++)
607       printf("%f ", features[i]);
608    printf("\n");*/
609
610    info->bandwidth = bandwidth;
611    /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
612    info->noisiness = frame_noisiness;
613    info->valid = 1;
614    if (info_out!=NULL)
615       OPUS_COPY(info_out, info, 1);
616    RESTORE_STACK;
617}
618
619void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
620                 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
621                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
622{
623   int offset;
624   int pcm_len;
625
626   if (analysis_pcm != NULL)
627   {
628      /* Avoid overflow/wrap-around of the analysis buffer */
629      analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
630
631      pcm_len = analysis_frame_size - analysis->analysis_offset;
632      offset = analysis->analysis_offset;
633      do {
634         tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
635         offset += 480;
636         pcm_len -= 480;
637      } while (pcm_len>0);
638      analysis->analysis_offset = analysis_frame_size;
639
640      analysis->analysis_offset -= frame_size;
641   }
642
643   analysis_info->valid = 0;
644   tonality_get_info(analysis, analysis_info, frame_size);
645}
646