/* Copyright (c) 2011 Xiph.Org Foundation
   Written by Jean-Marc Valin */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#define ANALYSIS_C

#include <stdio.h>

#include "mathops.h"
#include "kiss_fft.h"
#include "celt.h"
#include "modes.h"
#include "arch.h"
#include "quant_bands.h"
#include "analysis.h"
#include "mlp.h"
#include "stack_alloc.h"
#include "float_cast.h"

#ifndef M_PI
#define M_PI 3.141592653
#endif

#ifndef DISABLE_FLOAT_API

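/* First 8 rows of an orthonormal 16-point DCT-II basis (dct_table[i*16+b] is
   basis function i evaluated at band b). It is used further down to turn the
   16 lowest log band energies into the cepstrum-like BFCC features. */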
static const float dct_table[128] = {
        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
        0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
        0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
       -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
        0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
       -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
        0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
        0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
        0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
        0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
       -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
        0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
       -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
        0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
        0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
};

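/* Rising half of a squared-sine (Hann-like) window: entry i is approximately
   sin(pi*(i+1)/480)^2, reaching 1.0 at the last sample. The falling half is
   obtained by indexing the same table backwards when filling the FFT input. */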
static const float analysis_window[240] = {
      0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
      0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
      0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
      0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
      0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
      0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
      0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
      0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
      0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
      0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
      0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
      0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
      0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
      0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
      0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
      0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
      0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
      0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
      0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
      0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
      0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
      0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
      0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
      0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
      0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
      0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
      0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
      0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
      0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
      0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
};

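/* Boundaries of the analysis bands, in bins of the 480-point FFT at 24 kHz
   (50 Hz per bin), i.e. roughly critical-band-spaced edges from 200 Hz up to
   12 kHz. */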
static const int tbands[NB_TBANDS+1] = {
      4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
};

#define NB_TONAL_SKIP_BANDS 9

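/* SILK-derived 2x decimator: two first-order all-pass sections produce the
   low-pass (downsampled) output, while a mirrored section on the negated odd
   samples yields a complementary high-pass branch. The energy of that branch
   (roughly the content above a quarter of the input rate, i.e. above 12 kHz
   for 48 kHz input) is accumulated and returned for the bandwidth decision. */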
static opus_val32 silk_resampler_down2_hp(
    opus_val32                  *S,                 /* I/O  State vector [ 2 ]                                          */
    opus_val32                  *out,               /* O    Output signal [ floor(len/2) ]                              */
    const opus_val32            *in,                /* I    Input signal [ len ]                                        */
    int                         inLen               /* I    Number of input samples                                     */
)
{
    int k, len2 = inLen/2;
    opus_val32 in32, out32, out32_hp, Y, X;
    opus_val64 hp_ener = 0;
    /* Internal variables and state are in Q10 format */
    for( k = 0; k < len2; k++ ) {
        /* Convert to Q10 */
        in32 = in[ 2 * k ];

        /* All-pass section for even input sample */
        Y      = SUB32( in32, S[ 0 ] );
        X      = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
        out32  = ADD32( S[ 0 ], X );
        S[ 0 ] = ADD32( in32, X );
        out32_hp = out32;
        /* Convert to Q10 */
        in32 = in[ 2 * k + 1 ];

        /* All-pass section for odd input sample, and add to output of previous section */
        Y      = SUB32( in32, S[ 1 ] );
        X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
        out32  = ADD32( out32, S[ 1 ] );
        out32  = ADD32( out32, X );
        S[ 1 ] = ADD32( in32, X );

        Y      = SUB32( -in32, S[ 2 ] );
        X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
        out32_hp  = ADD32( out32_hp, S[ 2 ] );
        out32_hp  = ADD32( out32_hp, X );
        S[ 2 ] = ADD32( -in32, X );

        hp_ener += out32_hp*(opus_val64)out32_hp;
        /* Add, convert back to int16 and store to output */
        out[ k ] = HALF32(out32);
    }
#ifdef FIXED_POINT
    /* len2 can be up to 480, so we shift by 8 more to make it fit. */
    hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
#endif
    return (opus_val32)hp_ener;
}

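/* Downmix the requested channel(s) to mono, rescale to the analysis range,
   and bring everything to 24 kHz: 48 kHz input is halved with the resampler
   above, 24 kHz is copied as-is, and 16 kHz is crudely raised to 48 kHz by
   repeating each sample three times before the same 2x decimator. */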
static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
{
   VARDECL(opus_val32, tmp);
   opus_val32 scale;
   int j;
   opus_val32 ret = 0;
   SAVE_STACK;

   if (subframe==0) return 0;
   if (Fs == 48000)
   {
      subframe *= 2;
      offset *= 2;
   } else if (Fs == 16000) {
      subframe = subframe*2/3;
      offset = offset*2/3;
   }
   ALLOC(tmp, subframe, opus_val32);

   downmix(_x, tmp, subframe, offset, c1, c2, C);
#ifdef FIXED_POINT
   scale = (1<<SIG_SHIFT);
#else
   scale = 1.f/32768;
#endif
   if (c2==-2)
      scale /= C;
   else if (c2>-1)
      scale /= 2;
   for (j=0;j<subframe;j++)
      tmp[j] *= scale;
   if (Fs == 48000)
   {
      ret = silk_resampler_down2_hp(S, y, tmp, subframe);
   } else if (Fs == 24000) {
      OPUS_COPY(y, tmp, subframe);
   } else if (Fs == 16000) {
      VARDECL(opus_val32, tmp3x);
      ALLOC(tmp3x, 3*subframe, opus_val32);
      /* Don't do this at home! This resampler is horrible and it's only (barely)
         usable for the purpose of the analysis because we don't care about all
         the aliasing between 8 kHz and 12 kHz. */
      for (j=0;j<subframe;j++)
      {
         tmp3x[3*j] = tmp[j];
         tmp3x[3*j+1] = tmp[j];
         tmp3x[3*j+2] = tmp[j];
      }
      silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
   }
   RESTORE_STACK;
   return ret;
}

void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
{
  /* Initialize reusable fields. */
  tonal->arch = opus_select_arch();
  tonal->Fs = Fs;
  /* Clear remaining fields. */
  tonality_analysis_reset(tonal);
}

void tonality_analysis_reset(TonalityAnalysisState *tonal)
{
  /* Clear non-reusable fields. */
  char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
  OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
  tonal->music_confidence = .9f;
  tonal->speech_confidence = .1f;
}

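/* Report the analysis results for the frame about to be encoded. Reads from
   the circular info[] buffer (filled by tonality_analysis() below), combines
   the tonality values available in the lookahead, and collapses the
   delayed-decision speech/music state into a single music probability. */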
void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
{
   int pos;
   int curr_lookahead;
   float psum;
   float tonality_max;
   float tonality_avg;
   int tonality_count;
   int i;

   pos = tonal->read_pos;
   curr_lookahead = tonal->write_pos-tonal->read_pos;
   if (curr_lookahead<0)
      curr_lookahead += DETECT_SIZE;

   /* On long frames, look at the second analysis window rather than the first. */
   if (len > tonal->Fs/50 && pos != tonal->write_pos)
   {
      pos++;
      if (pos==DETECT_SIZE)
         pos=0;
   }
   if (pos == tonal->write_pos)
      pos--;
   if (pos<0)
      pos = DETECT_SIZE-1;
   OPUS_COPY(info_out, &tonal->info[pos], 1);
   tonality_max = tonality_avg = info_out->tonality;
   tonality_count = 1;
   /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
   for (i=0;i<3;i++)
   {
      pos++;
      if (pos==DETECT_SIZE)
         pos = 0;
      if (pos == tonal->write_pos)
         break;
      tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
      tonality_avg += tonal->info[pos].tonality;
      tonality_count++;
   }
   info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
   tonal->read_subframe += len/(tonal->Fs/400);
   while (tonal->read_subframe>=8)
   {
      tonal->read_subframe -= 8;
      tonal->read_pos++;
   }
   if (tonal->read_pos>=DETECT_SIZE)
      tonal->read_pos-=DETECT_SIZE;

   /* The -1 is to compensate for the delay in the features themselves. */
   curr_lookahead = IMAX(curr_lookahead-1, 0);

   psum=0;
   /* Summing the probability of transition patterns that involve music at
      time (DETECT_SIZE-curr_lookahead-1) */
   for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
      psum += tonal->pmusic[i];
   for (;i<DETECT_SIZE;i++)
      psum += tonal->pspeech[i];
   psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
   /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/

   info_out->music_prob = psum;
}

static const float std_feature_bias[9] = {
      5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
      2.163313f, 1.260756f, 1.116868f, 1.918795f
};

#define LEAKAGE_OFFSET 2.5f
#define LEAKAGE_SLOPE 2.f

#ifdef FIXED_POINT
/* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
   compensate for that in the energy. */
#define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
#define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
#else
#define SCALE_ENER(e) (e)
#endif

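/* Core analysis pass: accumulate downmixed 24 kHz audio until a full
   480-sample (20 ms) analysis window is available, apply the window and an
   FFT, then derive per-band energy, tonality and noisiness, BFCC-style
   cepstral features and a bandwidth estimate, and feed the feature vector to
   a small MLP to update the speech/music and activity probabilities stored
   in the corresponding tonal->info[] entry. */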
static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
{
    int i, b;
    const kiss_fft_state *kfft;
    VARDECL(kiss_fft_cpx, in);
    VARDECL(kiss_fft_cpx, out);
    int N = 480, N2=240;
    float * OPUS_RESTRICT A = tonal->angle;
    float * OPUS_RESTRICT dA = tonal->d_angle;
    float * OPUS_RESTRICT d2A = tonal->d2_angle;
    VARDECL(float, tonality);
    VARDECL(float, noisiness);
    float band_tonality[NB_TBANDS];
    float logE[NB_TBANDS];
    float BFCC[8];
    float features[25];
    float frame_tonality;
    float max_frame_tonality;
    /*float tw_sum=0;*/
    float frame_noisiness;
    const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
    float slope=0;
    float frame_stationarity;
    float relativeE;
    float frame_probs[2];
    float alpha, alphaE, alphaE2;
    float frame_loudness;
    float bandwidth_mask;
    int bandwidth=0;
    float maxE = 0;
    float noise_floor;
    int remaining;
    AnalysisInfo *info;
    float hp_ener;
    float tonality2[240];
    float midE[8];
    float spec_variability=0;
    float band_log2[NB_TBANDS+1];
    float leakage_from[NB_TBANDS+1];
    float leakage_to[NB_TBANDS+1];
    SAVE_STACK;

    alpha = 1.f/IMIN(10, 1+tonal->count);
    alphaE = 1.f/IMIN(25, 1+tonal->count);
    alphaE2 = 1.f/IMIN(500, 1+tonal->count);

    if (tonal->Fs == 48000)
    {
       /* len and offset are now at 24 kHz. */
       len/= 2;
       offset /= 2;
    } else if (tonal->Fs == 16000) {
       len = 3*len/2;
       offset = 3*offset/2;
    }

    if (tonal->count<4) {
       if (tonal->application == OPUS_APPLICATION_VOIP)
          tonal->music_prob = .1f;
       else
          tonal->music_prob = .625f;
    }
    kfft = celt_mode->mdct.kfft[0];
    if (tonal->count==0)
       tonal->mem_fill = 240;
    tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
          &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
          IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
    if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
    {
       tonal->mem_fill += len;
       /* Don't have enough to update the analysis */
       RESTORE_STACK;
       return;
    }
    hp_ener = tonal->hp_ener_accum;
    info = &tonal->info[tonal->write_pos++];
    if (tonal->write_pos>=DETECT_SIZE)
       tonal->write_pos-=DETECT_SIZE;

    ALLOC(in, 480, kiss_fft_cpx);
    ALLOC(out, 480, kiss_fft_cpx);
    ALLOC(tonality, 240, float);
    ALLOC(noisiness, 240, float);
    for (i=0;i<N2;i++)
    {
       float w = analysis_window[i];
       in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
       in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
       in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
       in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
    }
    OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
    remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
    tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
          &tonal->inmem[240], tonal->downmix_state, remaining,
          offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
    tonal->mem_fill = 240 + remaining;
    opus_fft(kfft, in, out, tonal->arch);
#ifndef FIXED_POINT
    /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
    if (celt_isnan(out[0].r))
    {
       info->valid = 0;
       RESTORE_STACK;
       return;
    }
#endif

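    /* Per-bin tonality estimate: for a stable sinusoid the phase advances
       linearly, so the second difference of the phase (with whole cycles
       removed) stays near zero. Two estimates are formed, one across the two
       half-windows of this frame (tonality2, no delay) and one against the
       previous frames (tonality, two frames of delay); they are combined in
       the loop that follows. */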
    for (i=1;i<N2;i++)
    {
       float X1r, X2r, X1i, X2i;
       float angle, d_angle, d2_angle;
       float angle2, d_angle2, d2_angle2;
       float mod1, mod2, avg_mod;
       X1r = (float)out[i].r+out[N-i].r;
       X1i = (float)out[i].i-out[N-i].i;
       X2r = (float)out[i].i+out[N-i].i;
       X2i = (float)out[N-i].r-out[i].r;

       angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
       d_angle = angle - A[i];
       d2_angle = d_angle - dA[i];

       angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
       d_angle2 = angle2 - angle;
       d2_angle2 = d_angle2 - d_angle;

       mod1 = d2_angle - (float)float2int(d2_angle);
       noisiness[i] = ABS16(mod1);
       mod1 *= mod1;
       mod1 *= mod1;

       mod2 = d2_angle2 - (float)float2int(d2_angle2);
       noisiness[i] += ABS16(mod2);
       mod2 *= mod2;
       mod2 *= mod2;

       avg_mod = .25f*(d2A[i]+mod1+2*mod2);
       /* This introduces an extra delay of 2 frames in the detection. */
       tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
       /* No delay on this detection, but it's less reliable. */
       tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;

       A[i] = angle2;
       dA[i] = d_angle2;
       d2A[i] = mod2;
    }
    for (i=2;i<N2-1;i++)
    {
       float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
       tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
    }
    frame_tonality = 0;
    max_frame_tonality = 0;
    /*tw_sum = 0;*/
    info->activity = 0;
    frame_noisiness = 0;
    frame_stationarity = 0;
    if (!tonal->count)
    {
       for (b=0;b<NB_TBANDS;b++)
       {
          tonal->lowE[b] = 1e10;
          tonal->highE[b] = -1e10;
       }
    }
    relativeE = 0;
    frame_loudness = 0;
    /* The energy of the very first band is special because of DC. */
    {
       float E = 0;
       float X1r, X2r;
       X1r = 2*(float)out[0].r;
       X2r = 2*(float)out[0].i;
       E = X1r*X1r + X2r*X2r;
       for (i=1;i<4;i++)
       {
          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
          E += binE;
       }
       E = SCALE_ENER(E);
       band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
    }
    for (b=0;b<NB_TBANDS;b++)
    {
       float E=0, tE=0, nE=0;
       float L1, L2;
       float stationarity;
       for (i=tbands[b];i<tbands[b+1];i++)
       {
          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
          binE = SCALE_ENER(binE);
          E += binE;
          tE += binE*MAX32(0, tonality[i]);
          nE += binE*2.f*(.5f-noisiness[i]);
       }
#ifndef FIXED_POINT
       /* Check for extreme band energies that could cause NaNs later. */
       if (!(E<1e9f) || celt_isnan(E))
       {
          info->valid = 0;
          RESTORE_STACK;
          return;
       }
#endif

       tonal->E[tonal->E_count][b] = E;
       frame_noisiness += nE/(1e-15f+E);

       frame_loudness += (float)sqrt(E+1e-10f);
       logE[b] = (float)log(E+1e-10f);
       band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
       tonal->logE[tonal->E_count][b] = logE[b];
       if (tonal->count==0)
          tonal->highE[b] = tonal->lowE[b] = logE[b];
       if (tonal->highE[b] > tonal->lowE[b] + 7.5)
       {
          if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
             tonal->highE[b] -= .01f;
          else
             tonal->lowE[b] += .01f;
       }
       if (logE[b] > tonal->highE[b])
       {
          tonal->highE[b] = logE[b];
          tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
       } else if (logE[b] < tonal->lowE[b])
       {
          tonal->lowE[b] = logE[b];
          tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
       }
       relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));

       L1=L2=0;
       for (i=0;i<NB_FRAMES;i++)
       {
          L1 += (float)sqrt(tonal->E[i][b]);
          L2 += tonal->E[i][b];
       }

       stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
       stationarity *= stationarity;
       stationarity *= stationarity;
       frame_stationarity += stationarity;
       /*band_tonality[b] = tE/(1e-15+E)*/;
       band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
#if 0
       if (b>=NB_TONAL_SKIP_BANDS)
       {
          frame_tonality += tweight[b]*band_tonality[b];
          tw_sum += tweight[b];
       }
#else
       frame_tonality += band_tonality[b];
       if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
          frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
#endif
       max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
       slope += band_tonality[b]*(b-8);
       /*printf("%f %f ", band_tonality[b], stationarity);*/
       tonal->prev_band_tonality[b] = band_tonality[b];
    }

    leakage_from[0] = band_log2[0];
    leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
    for (b=1;b<NB_TBANDS+1;b++)
    {
       float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
       leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
       leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
    }
    for (b=NB_TBANDS-2;b>=0;b--)
    {
       float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
       leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
       leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
    }
    celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
    for (b=0;b<NB_TBANDS+1;b++)
    {
       /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
          represents the boost needed to overcome the amount of analysis leakage
          caused in a weaker band b by louder neighbouring bands.
          The second, based on leakage_from[], applies to a loud band b for
          which the quantization noise causes synthesis leakage to the weaker
          neighbouring bands. */
       float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
             MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
       info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
    }
    for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;

    for (i=0;i<NB_FRAMES;i++)
    {
       int j;
       float mindist = 1e15f;
       for (j=0;j<NB_FRAMES;j++)
       {
          int k;
          float dist=0;
          for (k=0;k<NB_TBANDS;k++)
          {
             float tmp;
             tmp = tonal->logE[i][k] - tonal->logE[j][k];
             dist += tmp*tmp;
          }
          if (j!=i)
             mindist = MIN32(mindist, dist);
       }
       spec_variability += mindist;
    }
    spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
    bandwidth_mask = 0;
    bandwidth = 0;
    maxE = 0;
    noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
    noise_floor *= noise_floor;
    for (b=0;b<NB_TBANDS;b++)
    {
       float E=0;
       int band_start, band_end;
       /* Keep a margin of 300 Hz for aliasing */
       band_start = tbands[b];
       band_end = tbands[b+1];
       for (i=band_start;i<band_end;i++)
       {
          float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
                     + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
          E += binE;
       }
       E = SCALE_ENER(E);
       maxE = MAX32(maxE, E);
       tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
       E = MAX32(E, tonal->meanE[b]);
       /* Use a simple follower with 13 dB/Bark slope for spreading function */
       bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
       /* Consider the band "active" only if all these conditions are met:
          1) less than 10 dB below the simple follower
          2) less than 90 dB below the peak band (maximal masking possible considering
             both the ATH and the loudness-dependent slope of the spreading function)
          3) above the PCM quantization noise floor
          We use b+1 because the first CELT band isn't included in tbands[]
       */
       if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
          bandwidth = b+1;
    }
    /* Special case for the last two bands, for which we don't have spectrum but only
       the energy above 12 kHz. */
    if (tonal->Fs == 48000) {
       float ratio;
       float E = hp_ener*(1.f/(240*240));
       ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f;
#ifdef FIXED_POINT
       /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
       E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
#endif
       maxE = MAX32(maxE, E);
       tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
       E = MAX32(E, tonal->meanE[b]);
       /* Use a simple follower with 13 dB/Bark slope for spreading function */
       bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
       if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
          bandwidth = 20;
       /* This detector is unreliable, so if the bandwidth is close to SWB, assume it's FB. */
       if (bandwidth >= 17)
          bandwidth = 20;
    }
    if (tonal->count<=2)
       bandwidth = 20;
    frame_loudness = 20*(float)log10(frame_loudness);
    tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
    tonal->lowECount *= (1-alphaE);
    if (frame_loudness < tonal->Etracker-30)
       tonal->lowECount += alphaE;

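    /* Bark-frequency cepstral coefficients: DCT of the log energies of the
       16 lowest bands, plus the same transform applied to the per-band
       mid-point between the long-term high and low energy envelopes (midE). */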
    for (i=0;i<8;i++)
    {
       float sum=0;
       for (b=0;b<16;b++)
          sum += dct_table[i*16+b]*logE[b];
       BFCC[i] = sum;
    }
    for (i=0;i<8;i++)
    {
       float sum=0;
       for (b=0;b<16;b++)
          sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
       midE[i] = sum;
    }

    frame_stationarity /= NB_TBANDS;
    relativeE /= NB_TBANDS;
    if (tonal->count<10)
       relativeE = .5f;
    frame_noisiness /= NB_TBANDS;
#if 1
    info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
#else
    info->activity = .5*(1+frame_noisiness-frame_stationarity);
#endif
    frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
    frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
    tonal->prev_tonality = frame_tonality;

    slope /= 8*8;
    info->tonality_slope = slope;

    tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
    tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
    info->tonality = frame_tonality;

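    /* Build the MLP input vector: smoothed and differenced combinations of
       the current and four previous BFCC frames feed the running variance
       estimates tonal->std[], the final values of features[0..3] are BFCC
       minus the long-term envelope mid-point midE, and the remaining entries
       are the scalar descriptors computed above. */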
    for (i=0;i<4;i++)
       features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];

    for (i=0;i<4;i++)
       tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];

    for (i=0;i<4;i++)
        features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
    for (i=0;i<3;i++)
        features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];

    if (tonal->count > 5)
    {
       for (i=0;i<9;i++)
          tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
    }
    for (i=0;i<4;i++)
       features[i] = BFCC[i]-midE[i];

    for (i=0;i<8;i++)
    {
       tonal->mem[i+24] = tonal->mem[i+16];
       tonal->mem[i+16] = tonal->mem[i+8];
       tonal->mem[i+8] = tonal->mem[i];
       tonal->mem[i] = BFCC[i];
    }
    for (i=0;i<9;i++)
       features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
    features[18] = spec_variability - 0.78f;
    features[20] = info->tonality - 0.154723f;
    features[21] = info->activity - 0.724643f;
    features[22] = frame_stationarity - 0.743717f;
    features[23] = info->tonality_slope + 0.069216f;
    features[24] = tonal->lowECount - 0.067930f;

    mlp_process(&net, features, frame_probs);
    frame_probs[0] = .5f*(frame_probs[0]+1);
    /* Curve fitting between the MLP probability and the actual probability */
    /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
    /* Probability of active audio (as opposed to silence) */
    frame_probs[1] = .5f*frame_probs[1]+.5f;
    frame_probs[1] *= frame_probs[1];

    /* Probability of speech or music vs noise */
    info->activity_probability = frame_probs[1];

    /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
    {
       /* Probability of state transition */
       float tau;
       /* Represents independence of the MLP probabilities, where
          beta=1 means fully independent. */
       float beta;
       /* Denormalized probability of speech (p0) and music (p1) after update */
       float p0, p1;
       /* Probabilities for "all speech" and "all music" */
       float s0, m0;
       /* Probability sum for renormalisation */
       float psum;
       /* Instantaneous probability of speech and music, with beta pre-applied. */
       float speech0;
       float music0;
       float p, q;

       /* More silence transitions for speech than for music. */
       tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
       p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
       q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
       beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
       /* p0 and p1 are the probabilities of speech and music at this frame
          using only information from previous frame and applying the
          state transition model */
       p0 = (1-tonal->vad_prob)*(1-tau) +    tonal->vad_prob *tau;
       p1 =    tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
       /* We apply the current probability with exponent beta to work around
          the fact that the probability estimates aren't independent. */
       p0 *= (float)pow(1-frame_probs[1], beta);
       p1 *= (float)pow(frame_probs[1], beta);
       /* Normalise the probabilities to get the Markov probability of music. */
       tonal->vad_prob = p1/(p0+p1);
       info->vad_prob = tonal->vad_prob;
       /* Consider that silence has a 50-50 probability of being speech or music. */
       frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;

       /* One transition every 3 minutes of active audio */
       tau = .0001f;
       /* Adapt beta based on how "unexpected" the new prob is */
       p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
       q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
       beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
       /* p0 and p1 are the probabilities of speech and music at this frame
          using only information from previous frame and applying the
          state transition model */
       p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
       p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
       /* We apply the current probability with exponent beta to work around
          the fact that the probability estimates aren't independent. */
       p0 *= (float)pow(1-frame_probs[0], beta);
       p1 *= (float)pow(frame_probs[0], beta);
       /* Normalise the probabilities to get the Markov probability of music. */
       tonal->music_prob = p1/(p0+p1);
       info->music_prob = tonal->music_prob;

       /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
       /* This chunk of code deals with delayed decision. */
       psum=1e-20f;
       /* Instantaneous probability of speech and music, with beta pre-applied. */
       speech0 = (float)pow(1-frame_probs[0], beta);
       music0  = (float)pow(frame_probs[0], beta);
       if (tonal->count==1)
       {
          if (tonal->application == OPUS_APPLICATION_VOIP)
             tonal->pmusic[0] = .1f;
          else
             tonal->pmusic[0] = .625f;
          tonal->pspeech[0] = 1-tonal->pmusic[0];
       }
       /* Updated probability of having only speech (s0) or only music (m0),
          before considering the new observation. */
       s0 = tonal->pspeech[0] + tonal->pspeech[1];
       m0 = tonal->pmusic [0] + tonal->pmusic [1];
       /* Updates s0 and m0 with instantaneous probability. */
       tonal->pspeech[0] = s0*(1-tau)*speech0;
       tonal->pmusic [0] = m0*(1-tau)*music0;
       /* Propagate the transition probabilities */
       for (i=1;i<DETECT_SIZE-1;i++)
       {
          tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
          tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
       }
       /* Probability that the latest frame is speech, when all the previous ones were music. */
       tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
       /* Probability that the latest frame is music, when all the previous ones were speech. */
       tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;

       /* Renormalise probabilities to 1 */
       for (i=0;i<DETECT_SIZE;i++)
          psum += tonal->pspeech[i] + tonal->pmusic[i];
       psum = 1.f/psum;
       for (i=0;i<DETECT_SIZE;i++)
       {
          tonal->pspeech[i] *= psum;
          tonal->pmusic [i] *= psum;
       }
       psum = tonal->pmusic[0];
       for (i=1;i<DETECT_SIZE;i++)
          psum += tonal->pspeech[i];

       /* Estimate our confidence in the speech/music decisions */
       if (frame_probs[1]>.75)
       {
          if (tonal->music_prob>.9)
          {
             float adapt;
             adapt = 1.f/(++tonal->music_confidence_count);
             tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
             tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
          }
          if (tonal->music_prob<.1)
          {
             float adapt;
             adapt = 1.f/(++tonal->speech_confidence_count);
             tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
             tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
          }
       }
    }
    tonal->last_music = tonal->music_prob>.5f;
#ifdef MLP_TRAINING
    for (i=0;i<25;i++)
       printf("%f ", features[i]);
    printf("\n");
#endif

    info->bandwidth = bandwidth;
    tonal->prev_bandwidth = bandwidth;
    /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
    info->noisiness = frame_noisiness;
    info->valid = 1;
    RESTORE_STACK;
}

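/* Feed the PCM that is about to be encoded to the tonality analysis in 20 ms
   (Fs/50) slices, keeping track across calls of how much of it has already
   been analysed, then fetch the information relevant to the current frame. */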
void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
                 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
                 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
{
   int offset;
   int pcm_len;

   analysis_frame_size -= analysis_frame_size&1;
   if (analysis_pcm != NULL)
   {
      /* Avoid overflow/wrap-around of the analysis buffer */
      analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);

      pcm_len = analysis_frame_size - analysis->analysis_offset;
      offset = analysis->analysis_offset;
      while (pcm_len>0) {
         tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
         offset += Fs/50;
         pcm_len -= Fs/50;
      }
      analysis->analysis_offset = analysis_frame_size;

      analysis->analysis_offset -= frame_size;
   }

   analysis_info->valid = 0;
   tonality_get_info(analysis, analysis_info, frame_size);
}

#endif /* DISABLE_FLOAT_API */