1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #define ANALYSIS_C
33 
34 #ifdef MLP_TRAINING
35 #include <stdio.h>
36 #endif
37 
38 #include "mathops.h"
39 #include "kiss_fft.h"
40 #include "celt.h"
41 #include "modes.h"
42 #include "arch.h"
43 #include "quant_bands.h"
44 #include "analysis.h"
45 #include "mlp.h"
46 #include "stack_alloc.h"
47 #include "float_cast.h"
48 
49 #ifndef M_PI
50 #define M_PI 3.141592653
51 #endif
52 
53 #ifndef DISABLE_FLOAT_API
54 
55 #define TRANSITION_PENALTY 10
56 
57 static const float dct_table[128] = {
58         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
59         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
60         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
61        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
62         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
63        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
64         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
65         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
66         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
67         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
68         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
69        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
70         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
71        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
72         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
73         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
74 };
75 
76 static const float analysis_window[240] = {
77       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
78       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
79       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
80       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
81       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
82       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
83       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
84       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
85       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
86       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
87       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
88       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
89       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
90       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
91       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
92       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
93       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
94       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
95       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
96       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
97       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
98       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
99       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
100       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
101       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
102       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
103       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
104       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
105       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
106       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
107 };
108 
109 static const int tbands[NB_TBANDS+1] = {
110       4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
111 };
112 
113 #define NB_TONAL_SKIP_BANDS 9
114 
silk_resampler_down2_hp(opus_val32 * S,opus_val32 * out,const opus_val32 * in,int inLen)115 static opus_val32 silk_resampler_down2_hp(
116     opus_val32                  *S,                 /* I/O  State vector [ 2 ]                                          */
117     opus_val32                  *out,               /* O    Output signal [ floor(len/2) ]                              */
118     const opus_val32            *in,                /* I    Input signal [ len ]                                        */
119     int                         inLen               /* I    Number of input samples                                     */
120 )
121 {
122     int k, len2 = inLen/2;
123     opus_val32 in32, out32, out32_hp, Y, X;
124     opus_val64 hp_ener = 0;
125     /* Internal variables and state are in Q10 format */
126     for( k = 0; k < len2; k++ ) {
127         /* Convert to Q10 */
128         in32 = in[ 2 * k ];
129 
130         /* All-pass section for even input sample */
131         Y      = SUB32( in32, S[ 0 ] );
132         X      = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
133         out32  = ADD32( S[ 0 ], X );
134         S[ 0 ] = ADD32( in32, X );
135         out32_hp = out32;
136         /* Convert to Q10 */
137         in32 = in[ 2 * k + 1 ];
138 
139         /* All-pass section for odd input sample, and add to output of previous section */
140         Y      = SUB32( in32, S[ 1 ] );
141         X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
142         out32  = ADD32( out32, S[ 1 ] );
143         out32  = ADD32( out32, X );
144         S[ 1 ] = ADD32( in32, X );
145 
146         Y      = SUB32( -in32, S[ 2 ] );
147         X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
148         out32_hp  = ADD32( out32_hp, S[ 2 ] );
149         out32_hp  = ADD32( out32_hp, X );
150         S[ 2 ] = ADD32( -in32, X );
151 
152         hp_ener += out32_hp*(opus_val64)out32_hp;
153         /* Add, convert back to int16 and store to output */
154         out[ k ] = HALF32(out32);
155     }
156 #ifdef FIXED_POINT
157     /* len2 can be up to 480, so we shift by 8 more to make it fit. */
158     hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
159 #endif
160     return (opus_val32)hp_ener;
161 }
162 
downmix_and_resample(downmix_func downmix,const void * _x,opus_val32 * y,opus_val32 S[3],int subframe,int offset,int c1,int c2,int C,int Fs)163 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
164 {
165    VARDECL(opus_val32, tmp);
166    opus_val32 scale;
167    int j;
168    opus_val32 ret = 0;
169    SAVE_STACK;
170 
171    if (subframe==0) return 0;
172    if (Fs == 48000)
173    {
174       subframe *= 2;
175       offset *= 2;
176    } else if (Fs == 16000) {
177       subframe = subframe*2/3;
178       offset = offset*2/3;
179    }
180    ALLOC(tmp, subframe, opus_val32);
181 
182    downmix(_x, tmp, subframe, offset, c1, c2, C);
183 #ifdef FIXED_POINT
184    scale = (1<<SIG_SHIFT);
185 #else
186    scale = 1.f/32768;
187 #endif
188    if (c2==-2)
189       scale /= C;
190    else if (c2>-1)
191       scale /= 2;
192    for (j=0;j<subframe;j++)
193       tmp[j] *= scale;
194    if (Fs == 48000)
195    {
196       ret = silk_resampler_down2_hp(S, y, tmp, subframe);
197    } else if (Fs == 24000) {
198       OPUS_COPY(y, tmp, subframe);
199    } else if (Fs == 16000) {
200       VARDECL(opus_val32, tmp3x);
201       ALLOC(tmp3x, 3*subframe, opus_val32);
202       /* Don't do this at home! This resampler is horrible and it's only (barely)
203          usable for the purpose of the analysis because we don't care about all
204          the aliasing between 8 kHz and 12 kHz. */
205       for (j=0;j<subframe;j++)
206       {
207          tmp3x[3*j] = tmp[j];
208          tmp3x[3*j+1] = tmp[j];
209          tmp3x[3*j+2] = tmp[j];
210       }
211       silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
212    }
213    RESTORE_STACK;
214    return ret;
215 }
216 
tonality_analysis_init(TonalityAnalysisState * tonal,opus_int32 Fs)217 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
218 {
219   /* Initialize reusable fields. */
220   tonal->arch = opus_select_arch();
221   tonal->Fs = Fs;
222   /* Clear remaining fields. */
223   tonality_analysis_reset(tonal);
224 }
225 
tonality_analysis_reset(TonalityAnalysisState * tonal)226 void tonality_analysis_reset(TonalityAnalysisState *tonal)
227 {
228   /* Clear non-reusable fields. */
229   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
230   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
231 }
232 
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)233 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
234 {
235    int pos;
236    int curr_lookahead;
237    float tonality_max;
238    float tonality_avg;
239    int tonality_count;
240    int i;
241    int pos0;
242    float prob_avg;
243    float prob_count;
244    float prob_min, prob_max;
245    float vad_prob;
246    int mpos, vpos;
247    int bandwidth_span;
248 
249    pos = tonal->read_pos;
250    curr_lookahead = tonal->write_pos-tonal->read_pos;
251    if (curr_lookahead<0)
252       curr_lookahead += DETECT_SIZE;
253 
254    tonal->read_subframe += len/(tonal->Fs/400);
255    while (tonal->read_subframe>=8)
256    {
257       tonal->read_subframe -= 8;
258       tonal->read_pos++;
259    }
260    if (tonal->read_pos>=DETECT_SIZE)
261       tonal->read_pos-=DETECT_SIZE;
262 
263    /* On long frames, look at the second analysis window rather than the first. */
264    if (len > tonal->Fs/50 && pos != tonal->write_pos)
265    {
266       pos++;
267       if (pos==DETECT_SIZE)
268          pos=0;
269    }
270    if (pos == tonal->write_pos)
271       pos--;
272    if (pos<0)
273       pos = DETECT_SIZE-1;
274    pos0 = pos;
275    OPUS_COPY(info_out, &tonal->info[pos], 1);
276    if (!info_out->valid)
277       return;
278    tonality_max = tonality_avg = info_out->tonality;
279    tonality_count = 1;
280    /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
281    bandwidth_span = 6;
282    /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
283    for (i=0;i<3;i++)
284    {
285       pos++;
286       if (pos==DETECT_SIZE)
287          pos = 0;
288       if (pos == tonal->write_pos)
289          break;
290       tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
291       tonality_avg += tonal->info[pos].tonality;
292       tonality_count++;
293       info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
294       bandwidth_span--;
295    }
296    pos = pos0;
297    /* Look back in time to see if any has a wider bandwidth than the current frame. */
298    for (i=0;i<bandwidth_span;i++)
299    {
300       pos--;
301       if (pos < 0)
302          pos = DETECT_SIZE-1;
303       if (pos == tonal->write_pos)
304          break;
305       info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
306    }
307    info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
308 
309    mpos = vpos = pos0;
310    /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
311       ~1 frame delay in the VAD prob. */
312    if (curr_lookahead > 15)
313    {
314       mpos += 5;
315       if (mpos>=DETECT_SIZE)
316          mpos -= DETECT_SIZE;
317       vpos += 1;
318       if (vpos>=DETECT_SIZE)
319          vpos -= DETECT_SIZE;
320    }
321 
322    /* The following calculations attempt to minimize a "badness function"
323       for the transition. When switching from speech to music, the badness
324       of switching at frame k is
325       b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
326       where
327       v_i is the activity probability (VAD) at frame i,
328       p_i is the music probability at frame i
329       T is the probability threshold for switching
330       S is the penalty for switching during active audio rather than silence
331       the current frame has index i=0
332 
333       Rather than apply badness to directly decide when to switch, what we compute
334       instead is the threshold for which the optimal switching point is now. When
335       considering whether to switch now (frame 0) or at frame k, we have:
336       S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
337       which gives us:
338       T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
339       We take the min threshold across all positive values of k (up to the maximum
340       amount of lookahead we have) to give us the threshold for which the current
341       frame is the optimal switch point.
342 
343       The last step is that we need to consider whether we want to switch at all.
344       For that we use the average of the music probability over the entire window.
345       If the threshold is higher than that average we're not going to
346       switch, so we compute a min with the average as well. The result of all these
347       min operations is music_prob_min, which gives the threshold for switching to music
348       if we're currently encoding for speech.
349 
350       We do the exact opposite to compute music_prob_max which is used for switching
351       from music to speech.
352     */
353    prob_min = 1.f;
354    prob_max = 0.f;
355    vad_prob = tonal->info[vpos].activity_probability;
356    prob_count = MAX16(.1f, vad_prob);
357    prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
358    while (1)
359    {
360       float pos_vad;
361       mpos++;
362       if (mpos==DETECT_SIZE)
363          mpos = 0;
364       if (mpos == tonal->write_pos)
365          break;
366       vpos++;
367       if (vpos==DETECT_SIZE)
368          vpos = 0;
369       if (vpos == tonal->write_pos)
370          break;
371       pos_vad = tonal->info[vpos].activity_probability;
372       prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
373       prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
374       prob_count += MAX16(.1f, pos_vad);
375       prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
376    }
377    info_out->music_prob = prob_avg/prob_count;
378    prob_min = MIN16(prob_avg/prob_count, prob_min);
379    prob_max = MAX16(prob_avg/prob_count, prob_max);
380    prob_min = MAX16(prob_min, 0.f);
381    prob_max = MIN16(prob_max, 1.f);
382 
383    /* If we don't have enough look-ahead, do our best to make a decent decision. */
384    if (curr_lookahead < 10)
385    {
386       float pmin, pmax;
387       pmin = prob_min;
388       pmax = prob_max;
389       pos = pos0;
390       /* Look for min/max in the past. */
391       for (i=0;i<IMIN(tonal->count-1, 15);i++)
392       {
393          pos--;
394          if (pos < 0)
395             pos = DETECT_SIZE-1;
396          pmin = MIN16(pmin, tonal->info[pos].music_prob);
397          pmax = MAX16(pmax, tonal->info[pos].music_prob);
398       }
399       /* Bias against switching on active audio. */
400       pmin = MAX16(0.f, pmin - .1f*vad_prob);
401       pmax = MIN16(1.f, pmax + .1f*vad_prob);
402       prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
403       prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
404    }
405    info_out->music_prob_min = prob_min;
406    info_out->music_prob_max = prob_max;
407 
408    /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
409 }
410 
411 static const float std_feature_bias[9] = {
412       5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
413       2.163313f, 1.260756f, 1.116868f, 1.918795f
414 };
415 
416 #define LEAKAGE_OFFSET 2.5f
417 #define LEAKAGE_SLOPE 2.f
418 
419 #ifdef FIXED_POINT
420 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
421    compensate for that in the energy. */
422 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
423 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
424 #else
425 #define SCALE_ENER(e) (e)
426 #endif
427 
428 #ifdef FIXED_POINT
is_digital_silence32(const opus_val32 * pcm,int frame_size,int channels,int lsb_depth)429 static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth)
430 {
431    int silence = 0;
432    opus_val32 sample_max = 0;
433 #ifdef MLP_TRAINING
434    return 0;
435 #endif
436    sample_max = celt_maxabs32(pcm, frame_size*channels);
437 
438    silence = (sample_max == 0);
439    (void)lsb_depth;
440    return silence;
441 }
442 #else
443 #define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth)
444 #endif
445 
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)446 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
447 {
448     int i, b;
449     const kiss_fft_state *kfft;
450     VARDECL(kiss_fft_cpx, in);
451     VARDECL(kiss_fft_cpx, out);
452     int N = 480, N2=240;
453     float * OPUS_RESTRICT A = tonal->angle;
454     float * OPUS_RESTRICT dA = tonal->d_angle;
455     float * OPUS_RESTRICT d2A = tonal->d2_angle;
456     VARDECL(float, tonality);
457     VARDECL(float, noisiness);
458     float band_tonality[NB_TBANDS];
459     float logE[NB_TBANDS];
460     float BFCC[8];
461     float features[25];
462     float frame_tonality;
463     float max_frame_tonality;
464     /*float tw_sum=0;*/
465     float frame_noisiness;
466     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
467     float slope=0;
468     float frame_stationarity;
469     float relativeE;
470     float frame_probs[2];
471     float alpha, alphaE, alphaE2;
472     float frame_loudness;
473     float bandwidth_mask;
474     int is_masked[NB_TBANDS+1];
475     int bandwidth=0;
476     float maxE = 0;
477     float noise_floor;
478     int remaining;
479     AnalysisInfo *info;
480     float hp_ener;
481     float tonality2[240];
482     float midE[8];
483     float spec_variability=0;
484     float band_log2[NB_TBANDS+1];
485     float leakage_from[NB_TBANDS+1];
486     float leakage_to[NB_TBANDS+1];
487     float layer_out[MAX_NEURONS];
488     float below_max_pitch;
489     float above_max_pitch;
490     int is_silence;
491     SAVE_STACK;
492 
493     if (!tonal->initialized)
494     {
495        tonal->mem_fill = 240;
496        tonal->initialized = 1;
497     }
498     alpha = 1.f/IMIN(10, 1+tonal->count);
499     alphaE = 1.f/IMIN(25, 1+tonal->count);
500     /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
501     alphaE2 = 1.f/IMIN(100, 1+tonal->count);
502     if (tonal->count <= 1) alphaE2 = 1;
503 
504     if (tonal->Fs == 48000)
505     {
506        /* len and offset are now at 24 kHz. */
507        len/= 2;
508        offset /= 2;
509     } else if (tonal->Fs == 16000) {
510        len = 3*len/2;
511        offset = 3*offset/2;
512     }
513 
514     kfft = celt_mode->mdct.kfft[0];
515     tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
516           &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
517           IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
518     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
519     {
520        tonal->mem_fill += len;
521        /* Don't have enough to update the analysis */
522        RESTORE_STACK;
523        return;
524     }
525     hp_ener = tonal->hp_ener_accum;
526     info = &tonal->info[tonal->write_pos++];
527     if (tonal->write_pos>=DETECT_SIZE)
528        tonal->write_pos-=DETECT_SIZE;
529 
530     is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth);
531 
532     ALLOC(in, 480, kiss_fft_cpx);
533     ALLOC(out, 480, kiss_fft_cpx);
534     ALLOC(tonality, 240, float);
535     ALLOC(noisiness, 240, float);
536     for (i=0;i<N2;i++)
537     {
538        float w = analysis_window[i];
539        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
540        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
541        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
542        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
543     }
544     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
545     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
546     tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
547           &tonal->inmem[240], tonal->downmix_state, remaining,
548           offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
549     tonal->mem_fill = 240 + remaining;
550     if (is_silence)
551     {
552        /* On silence, copy the previous analysis. */
553        int prev_pos = tonal->write_pos-2;
554        if (prev_pos < 0)
555           prev_pos += DETECT_SIZE;
556        OPUS_COPY(info, &tonal->info[prev_pos], 1);
557        RESTORE_STACK;
558        return;
559     }
560     opus_fft(kfft, in, out, tonal->arch);
561 #ifndef FIXED_POINT
562     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
563     if (celt_isnan(out[0].r))
564     {
565        info->valid = 0;
566        RESTORE_STACK;
567        return;
568     }
569 #endif
570 
571     for (i=1;i<N2;i++)
572     {
573        float X1r, X2r, X1i, X2i;
574        float angle, d_angle, d2_angle;
575        float angle2, d_angle2, d2_angle2;
576        float mod1, mod2, avg_mod;
577        X1r = (float)out[i].r+out[N-i].r;
578        X1i = (float)out[i].i-out[N-i].i;
579        X2r = (float)out[i].i+out[N-i].i;
580        X2i = (float)out[N-i].r-out[i].r;
581 
582        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
583        d_angle = angle - A[i];
584        d2_angle = d_angle - dA[i];
585 
586        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
587        d_angle2 = angle2 - angle;
588        d2_angle2 = d_angle2 - d_angle;
589 
590        mod1 = d2_angle - (float)float2int(d2_angle);
591        noisiness[i] = ABS16(mod1);
592        mod1 *= mod1;
593        mod1 *= mod1;
594 
595        mod2 = d2_angle2 - (float)float2int(d2_angle2);
596        noisiness[i] += ABS16(mod2);
597        mod2 *= mod2;
598        mod2 *= mod2;
599 
600        avg_mod = .25f*(d2A[i]+mod1+2*mod2);
601        /* This introduces an extra delay of 2 frames in the detection. */
602        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
603        /* No delay on this detection, but it's less reliable. */
604        tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
605 
606        A[i] = angle2;
607        dA[i] = d_angle2;
608        d2A[i] = mod2;
609     }
610     for (i=2;i<N2-1;i++)
611     {
612        float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
613        tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
614     }
615     frame_tonality = 0;
616     max_frame_tonality = 0;
617     /*tw_sum = 0;*/
618     info->activity = 0;
619     frame_noisiness = 0;
620     frame_stationarity = 0;
621     if (!tonal->count)
622     {
623        for (b=0;b<NB_TBANDS;b++)
624        {
625           tonal->lowE[b] = 1e10;
626           tonal->highE[b] = -1e10;
627        }
628     }
629     relativeE = 0;
630     frame_loudness = 0;
631     /* The energy of the very first band is special because of DC. */
632     {
633        float E = 0;
634        float X1r, X2r;
635        X1r = 2*(float)out[0].r;
636        X2r = 2*(float)out[0].i;
637        E = X1r*X1r + X2r*X2r;
638        for (i=1;i<4;i++)
639        {
640           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
641                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
642           E += binE;
643        }
644        E = SCALE_ENER(E);
645        band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
646     }
647     for (b=0;b<NB_TBANDS;b++)
648     {
649        float E=0, tE=0, nE=0;
650        float L1, L2;
651        float stationarity;
652        for (i=tbands[b];i<tbands[b+1];i++)
653        {
654           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
655                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
656           binE = SCALE_ENER(binE);
657           E += binE;
658           tE += binE*MAX32(0, tonality[i]);
659           nE += binE*2.f*(.5f-noisiness[i]);
660        }
661 #ifndef FIXED_POINT
662        /* Check for extreme band energies that could cause NaNs later. */
663        if (!(E<1e9f) || celt_isnan(E))
664        {
665           info->valid = 0;
666           RESTORE_STACK;
667           return;
668        }
669 #endif
670 
671        tonal->E[tonal->E_count][b] = E;
672        frame_noisiness += nE/(1e-15f+E);
673 
674        frame_loudness += (float)sqrt(E+1e-10f);
675        logE[b] = (float)log(E+1e-10f);
676        band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
677        tonal->logE[tonal->E_count][b] = logE[b];
678        if (tonal->count==0)
679           tonal->highE[b] = tonal->lowE[b] = logE[b];
680        if (tonal->highE[b] > tonal->lowE[b] + 7.5)
681        {
682           if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
683              tonal->highE[b] -= .01f;
684           else
685              tonal->lowE[b] += .01f;
686        }
687        if (logE[b] > tonal->highE[b])
688        {
689           tonal->highE[b] = logE[b];
690           tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
691        } else if (logE[b] < tonal->lowE[b])
692        {
693           tonal->lowE[b] = logE[b];
694           tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
695        }
696        relativeE += (logE[b]-tonal->lowE[b])/(1e-5f + (tonal->highE[b]-tonal->lowE[b]));
697 
698        L1=L2=0;
699        for (i=0;i<NB_FRAMES;i++)
700        {
701           L1 += (float)sqrt(tonal->E[i][b]);
702           L2 += tonal->E[i][b];
703        }
704 
705        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
706        stationarity *= stationarity;
707        stationarity *= stationarity;
708        frame_stationarity += stationarity;
709        /*band_tonality[b] = tE/(1e-15+E)*/;
710        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
711 #if 0
712        if (b>=NB_TONAL_SKIP_BANDS)
713        {
714           frame_tonality += tweight[b]*band_tonality[b];
715           tw_sum += tweight[b];
716        }
717 #else
718        frame_tonality += band_tonality[b];
719        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
720           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
721 #endif
722        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
723        slope += band_tonality[b]*(b-8);
724        /*printf("%f %f ", band_tonality[b], stationarity);*/
725        tonal->prev_band_tonality[b] = band_tonality[b];
726     }
727 
728     leakage_from[0] = band_log2[0];
729     leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
730     for (b=1;b<NB_TBANDS+1;b++)
731     {
732        float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
733        leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
734        leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
735     }
736     for (b=NB_TBANDS-2;b>=0;b--)
737     {
738        float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
739        leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
740        leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
741     }
742     celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
743     for (b=0;b<NB_TBANDS+1;b++)
744     {
745        /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
746           represents the boost needed to overcome the amount of analysis leakage
747           cause in a weaker band b by louder neighbouring bands.
748           The second, based on leakage_from[], applies to a loud band b for
749           which the quantization noise causes synthesis leakage to the weaker
750           neighbouring bands. */
751        float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
752              MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
753        info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
754     }
755     for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
756 
757     for (i=0;i<NB_FRAMES;i++)
758     {
759        int j;
760        float mindist = 1e15f;
761        for (j=0;j<NB_FRAMES;j++)
762        {
763           int k;
764           float dist=0;
765           for (k=0;k<NB_TBANDS;k++)
766           {
767              float tmp;
768              tmp = tonal->logE[i][k] - tonal->logE[j][k];
769              dist += tmp*tmp;
770           }
771           if (j!=i)
772              mindist = MIN32(mindist, dist);
773        }
774        spec_variability += mindist;
775     }
776     spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
777     bandwidth_mask = 0;
778     bandwidth = 0;
779     maxE = 0;
780     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
781     noise_floor *= noise_floor;
782     below_max_pitch=0;
783     above_max_pitch=0;
784     for (b=0;b<NB_TBANDS;b++)
785     {
786        float E=0;
787        float Em;
788        int band_start, band_end;
789        /* Keep a margin of 300 Hz for aliasing */
790        band_start = tbands[b];
791        band_end = tbands[b+1];
792        for (i=band_start;i<band_end;i++)
793        {
794           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
795                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
796           E += binE;
797        }
798        E = SCALE_ENER(E);
799        maxE = MAX32(maxE, E);
800        if (band_start < 64)
801        {
802           below_max_pitch += E;
803        } else {
804           above_max_pitch += E;
805        }
806        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
807        Em = MAX32(E, tonal->meanE[b]);
808        /* Consider the band "active" only if all these conditions are met:
809           1) less than 90 dB below the peak band (maximal masking possible considering
810              both the ATH and the loudness-dependent slope of the spreading function)
811           2) above the PCM quantization noise floor
812           We use b+1 because the first CELT band isn't included in tbands[]
813        */
814        if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
815           bandwidth = b+1;
816        /* Check if the band is masked (see below). */
817        is_masked[b] = E < (tonal->prev_bandwidth >= b+1  ? .01f : .05f)*bandwidth_mask;
818        /* Use a simple follower with 13 dB/Bark slope for spreading function. */
819        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
820     }
821     /* Special case for the last two bands, for which we don't have spectrum but only
822        the energy above 12 kHz. The difficulty here is that the high-pass we use
823        leaks some LF energy, so we need to increase the threshold without accidentally cutting
824        off the band. */
825     if (tonal->Fs == 48000) {
826        float noise_ratio;
827        float Em;
828        float E = hp_ener*(1.f/(60*60));
829        noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
830 
831 #ifdef FIXED_POINT
832        /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
833        E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
834 #endif
835        above_max_pitch += E;
836        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
837        Em = MAX32(E, tonal->meanE[b]);
838        if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
839           bandwidth = 20;
840        /* Check if the band is masked (see below). */
841        is_masked[b] = E < (tonal->prev_bandwidth == 20  ? .01f : .05f)*bandwidth_mask;
842     }
843     if (above_max_pitch > below_max_pitch)
844        info->max_pitch_ratio = below_max_pitch/above_max_pitch;
845     else
846        info->max_pitch_ratio = 1;
847     /* In some cases, resampling aliasing can create a small amount of energy in the first band
848        being cut. So if the last band is masked, we don't include it.  */
849     if (bandwidth == 20 && is_masked[NB_TBANDS])
850        bandwidth-=2;
851     else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
852        bandwidth--;
853     if (tonal->count<=2)
854        bandwidth = 20;
855     frame_loudness = 20*(float)log10(frame_loudness);
856     tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
857     tonal->lowECount *= (1-alphaE);
858     if (frame_loudness < tonal->Etracker-30)
859        tonal->lowECount += alphaE;
860 
861     for (i=0;i<8;i++)
862     {
863        float sum=0;
864        for (b=0;b<16;b++)
865           sum += dct_table[i*16+b]*logE[b];
866        BFCC[i] = sum;
867     }
868     for (i=0;i<8;i++)
869     {
870        float sum=0;
871        for (b=0;b<16;b++)
872           sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
873        midE[i] = sum;
874     }
875 
876     frame_stationarity /= NB_TBANDS;
877     relativeE /= NB_TBANDS;
878     if (tonal->count<10)
879        relativeE = .5f;
880     frame_noisiness /= NB_TBANDS;
881 #if 1
882     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
883 #else
884     info->activity = .5*(1+frame_noisiness-frame_stationarity);
885 #endif
886     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
887     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
888     tonal->prev_tonality = frame_tonality;
889 
890     slope /= 8*8;
891     info->tonality_slope = slope;
892 
893     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
894     tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
895     info->tonality = frame_tonality;
896 
897     for (i=0;i<4;i++)
898        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
899 
900     for (i=0;i<4;i++)
901        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
902 
903     for (i=0;i<4;i++)
904         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
905     for (i=0;i<3;i++)
906         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
907 
908     if (tonal->count > 5)
909     {
910        for (i=0;i<9;i++)
911           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
912     }
913     for (i=0;i<4;i++)
914        features[i] = BFCC[i]-midE[i];
915 
916     for (i=0;i<8;i++)
917     {
918        tonal->mem[i+24] = tonal->mem[i+16];
919        tonal->mem[i+16] = tonal->mem[i+8];
920        tonal->mem[i+8] = tonal->mem[i];
921        tonal->mem[i] = BFCC[i];
922     }
923     for (i=0;i<9;i++)
924        features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
925     features[18] = spec_variability - 0.78f;
926     features[20] = info->tonality - 0.154723f;
927     features[21] = info->activity - 0.724643f;
928     features[22] = frame_stationarity - 0.743717f;
929     features[23] = info->tonality_slope + 0.069216f;
930     features[24] = tonal->lowECount - 0.067930f;
931 
932     compute_dense(&layer0, layer_out, features);
933     compute_gru(&layer1, tonal->rnn_state, layer_out);
934     compute_dense(&layer2, frame_probs, tonal->rnn_state);
935 
936     /* Probability of speech or music vs noise */
937     info->activity_probability = frame_probs[1];
938     info->music_prob = frame_probs[0];
939 
940     /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
941 #ifdef MLP_TRAINING
942     for (i=0;i<25;i++)
943        printf("%f ", features[i]);
944     printf("\n");
945 #endif
946 
947     info->bandwidth = bandwidth;
948     tonal->prev_bandwidth = bandwidth;
949     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
950     info->noisiness = frame_noisiness;
951     info->valid = 1;
952     RESTORE_STACK;
953 }
954 
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)955 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
956                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
957                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
958 {
959    int offset;
960    int pcm_len;
961 
962    analysis_frame_size -= analysis_frame_size&1;
963    if (analysis_pcm != NULL)
964    {
965       /* Avoid overflow/wrap-around of the analysis buffer */
966       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
967 
968       pcm_len = analysis_frame_size - analysis->analysis_offset;
969       offset = analysis->analysis_offset;
970       while (pcm_len>0) {
971          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
972          offset += Fs/50;
973          pcm_len -= Fs/50;
974       }
975       analysis->analysis_offset = analysis_frame_size;
976 
977       analysis->analysis_offset -= frame_size;
978    }
979 
980    tonality_get_info(analysis, analysis_info, frame_size);
981 }
982 
983 #endif /* DISABLE_FLOAT_API */
984