1 /* Copyright (c) 2018 Gregor Richards
2  * Copyright (c) 2017 Mozilla */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include <stdlib.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include "kiss_fft.h"
36 #include "common.h"
37 #include <math.h>
38 #include "rnnoise.h"
39 #include "pitch.h"
40 #include "arch.h"
41 #include "rnn.h"
42 #include "rnn_data.h"
43 
44 #define FRAME_SIZE_SHIFT 2
45 #define FRAME_SIZE (120<<FRAME_SIZE_SHIFT)
46 #define WINDOW_SIZE (2*FRAME_SIZE)
47 #define FREQ_SIZE (FRAME_SIZE + 1)
48 
49 #define PITCH_MIN_PERIOD 60
50 #define PITCH_MAX_PERIOD 768
51 #define PITCH_FRAME_SIZE 960
52 #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)
53 
54 #define SQUARE(x) ((x)*(x))
55 
56 #define NB_BANDS 22
57 
58 #define CEPS_MEM 8
59 #define NB_DELTA_CEPS 6
60 
61 #define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2)
62 
63 
64 #ifndef TRAINING
65 #define TRAINING 0
66 #endif
67 
68 
69 /* The built-in model, used if no file is given as input */
70 extern const struct RNNModel rnnoise_model_orig;
71 
72 
73 static const opus_int16 eband5ms[] = {
74 /*0  200 400 600 800  1k 1.2 1.4 1.6  2k 2.4 2.8 3.2  4k 4.8 5.6 6.8  8k 9.6 12k 15.6 20k*/
75   0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
76 };
77 
78 
79 typedef struct {
80   int init;
81   kiss_fft_state *kfft;
82   float half_window[FRAME_SIZE];
83   float dct_table[NB_BANDS*NB_BANDS];
84 } CommonState;
85 
86 struct DenoiseState {
87   float analysis_mem[FRAME_SIZE];
88   float cepstral_mem[CEPS_MEM][NB_BANDS];
89   int memid;
90   float synthesis_mem[FRAME_SIZE];
91   float pitch_buf[PITCH_BUF_SIZE];
92   float pitch_enh_buf[PITCH_BUF_SIZE];
93   float last_gain;
94   int last_period;
95   float mem_hp_x[2];
96   float lastg[NB_BANDS];
97   RNNState rnn;
98 };
99 
compute_band_energy(float * bandE,const kiss_fft_cpx * X)100 void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
101   int i;
102   float sum[NB_BANDS] = {0};
103   for (i=0;i<NB_BANDS-1;i++)
104   {
105     int j;
106     int band_size;
107     band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
108     for (j=0;j<band_size;j++) {
109       float tmp;
110       float frac = (float)j/band_size;
111       tmp = SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r);
112       tmp += SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i);
113       sum[i] += (1-frac)*tmp;
114       sum[i+1] += frac*tmp;
115     }
116   }
117   sum[0] *= 2;
118   sum[NB_BANDS-1] *= 2;
119   for (i=0;i<NB_BANDS;i++)
120   {
121     bandE[i] = sum[i];
122   }
123 }
124 
compute_band_corr(float * bandE,const kiss_fft_cpx * X,const kiss_fft_cpx * P)125 void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) {
126   int i;
127   float sum[NB_BANDS] = {0};
128   for (i=0;i<NB_BANDS-1;i++)
129   {
130     int j;
131     int band_size;
132     band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
133     for (j=0;j<band_size;j++) {
134       float tmp;
135       float frac = (float)j/band_size;
136       tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r;
137       tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i;
138       sum[i] += (1-frac)*tmp;
139       sum[i+1] += frac*tmp;
140     }
141   }
142   sum[0] *= 2;
143   sum[NB_BANDS-1] *= 2;
144   for (i=0;i<NB_BANDS;i++)
145   {
146     bandE[i] = sum[i];
147   }
148 }
149 
interp_band_gain(float * g,const float * bandE)150 void interp_band_gain(float *g, const float *bandE) {
151   int i;
152   memset(g, 0, FREQ_SIZE);
153   for (i=0;i<NB_BANDS-1;i++)
154   {
155     int j;
156     int band_size;
157     band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
158     for (j=0;j<band_size;j++) {
159       float frac = (float)j/band_size;
160       g[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j] = (1-frac)*bandE[i] + frac*bandE[i+1];
161     }
162   }
163 }
164 
165 
166 CommonState common;
167 
check_init()168 static void check_init() {
169   int i;
170   if (common.init) return;
171   common.kfft = opus_fft_alloc_twiddles(2*FRAME_SIZE, NULL, NULL, NULL, 0);
172   for (i=0;i<FRAME_SIZE;i++)
173     common.half_window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
174   for (i=0;i<NB_BANDS;i++) {
175     int j;
176     for (j=0;j<NB_BANDS;j++) {
177       common.dct_table[i*NB_BANDS + j] = cos((i+.5)*j*M_PI/NB_BANDS);
178       if (j==0) common.dct_table[i*NB_BANDS + j] *= sqrt(.5);
179     }
180   }
181   common.init = 1;
182 }
183 
dct(float * out,const float * in)184 static void dct(float *out, const float *in) {
185   int i;
186   check_init();
187   for (i=0;i<NB_BANDS;i++) {
188     int j;
189     float sum = 0;
190     for (j=0;j<NB_BANDS;j++) {
191       sum += in[j] * common.dct_table[j*NB_BANDS + i];
192     }
193     out[i] = sum*sqrt(2./22);
194   }
195 }
196 
197 #if 0
198 static void idct(float *out, const float *in) {
199   int i;
200   check_init();
201   for (i=0;i<NB_BANDS;i++) {
202     int j;
203     float sum = 0;
204     for (j=0;j<NB_BANDS;j++) {
205       sum += in[j] * common.dct_table[i*NB_BANDS + j];
206     }
207     out[i] = sum*sqrt(2./22);
208   }
209 }
210 #endif
211 
forward_transform(kiss_fft_cpx * out,const float * in)212 static void forward_transform(kiss_fft_cpx *out, const float *in) {
213   int i;
214   kiss_fft_cpx x[WINDOW_SIZE];
215   kiss_fft_cpx y[WINDOW_SIZE];
216   check_init();
217   for (i=0;i<WINDOW_SIZE;i++) {
218     x[i].r = in[i];
219     x[i].i = 0;
220   }
221   opus_fft(common.kfft, x, y, 0);
222   for (i=0;i<FREQ_SIZE;i++) {
223     out[i] = y[i];
224   }
225 }
226 
inverse_transform(float * out,const kiss_fft_cpx * in)227 static void inverse_transform(float *out, const kiss_fft_cpx *in) {
228   int i;
229   kiss_fft_cpx x[WINDOW_SIZE];
230   kiss_fft_cpx y[WINDOW_SIZE];
231   check_init();
232   for (i=0;i<FREQ_SIZE;i++) {
233     x[i] = in[i];
234   }
235   for (;i<WINDOW_SIZE;i++) {
236     x[i].r = x[WINDOW_SIZE - i].r;
237     x[i].i = -x[WINDOW_SIZE - i].i;
238   }
239   opus_fft(common.kfft, x, y, 0);
240   /* output in reverse order for IFFT. */
241   out[0] = WINDOW_SIZE*y[0].r;
242   for (i=1;i<WINDOW_SIZE;i++) {
243     out[i] = WINDOW_SIZE*y[WINDOW_SIZE - i].r;
244   }
245 }
246 
apply_window(float * x)247 static void apply_window(float *x) {
248   int i;
249   check_init();
250   for (i=0;i<FRAME_SIZE;i++) {
251     x[i] *= common.half_window[i];
252     x[WINDOW_SIZE - 1 - i] *= common.half_window[i];
253   }
254 }
255 
rnnoise_get_size()256 int rnnoise_get_size() {
257   return sizeof(DenoiseState);
258 }
259 
rnnoise_get_frame_size()260 int rnnoise_get_frame_size() {
261   return FRAME_SIZE;
262 }
263 
rnnoise_init(DenoiseState * st,RNNModel * model)264 int rnnoise_init(DenoiseState *st, RNNModel *model) {
265   memset(st, 0, sizeof(*st));
266   if (model)
267     st->rnn.model = model;
268   else
269     st->rnn.model = &rnnoise_model_orig;
270   st->rnn.vad_gru_state = calloc(sizeof(float), st->rnn.model->vad_gru_size);
271   st->rnn.noise_gru_state = calloc(sizeof(float), st->rnn.model->noise_gru_size);
272   st->rnn.denoise_gru_state = calloc(sizeof(float), st->rnn.model->denoise_gru_size);
273   return 0;
274 }
275 
rnnoise_create(RNNModel * model)276 DenoiseState *rnnoise_create(RNNModel *model) {
277   DenoiseState *st;
278   st = malloc(rnnoise_get_size());
279   rnnoise_init(st, model);
280   return st;
281 }
282 
rnnoise_destroy(DenoiseState * st)283 void rnnoise_destroy(DenoiseState *st) {
284   free(st->rnn.vad_gru_state);
285   free(st->rnn.noise_gru_state);
286   free(st->rnn.denoise_gru_state);
287   free(st);
288 }
289 
290 #if TRAINING
291 int lowpass = FREQ_SIZE;
292 int band_lp = NB_BANDS;
293 #endif
294 
frame_analysis(DenoiseState * st,kiss_fft_cpx * X,float * Ex,const float * in)295 static void frame_analysis(DenoiseState *st, kiss_fft_cpx *X, float *Ex, const float *in) {
296   int i;
297   float x[WINDOW_SIZE];
298   RNN_COPY(x, st->analysis_mem, FRAME_SIZE);
299   for (i=0;i<FRAME_SIZE;i++) x[FRAME_SIZE + i] = in[i];
300   RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
301   apply_window(x);
302   forward_transform(X, x);
303 #if TRAINING
304   for (i=lowpass;i<FREQ_SIZE;i++)
305     X[i].r = X[i].i = 0;
306 #endif
307   compute_band_energy(Ex, X);
308 }
309 
compute_frame_features(DenoiseState * st,kiss_fft_cpx * X,kiss_fft_cpx * P,float * Ex,float * Ep,float * Exp,float * features,const float * in)310 static int compute_frame_features(DenoiseState *st, kiss_fft_cpx *X, kiss_fft_cpx *P,
311                                   float *Ex, float *Ep, float *Exp, float *features, const float *in) {
312   int i;
313   float E = 0;
314   float *ceps_0, *ceps_1, *ceps_2;
315   float spec_variability = 0;
316   float Ly[NB_BANDS];
317   float p[WINDOW_SIZE];
318   float pitch_buf[PITCH_BUF_SIZE>>1];
319   int pitch_index;
320   float gain;
321   float *(pre[1]);
322   float tmp[NB_BANDS];
323   float follow, logMax;
324   frame_analysis(st, X, Ex, in);
325   RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);
326   RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);
327   pre[0] = &st->pitch_buf[0];
328   pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
329   pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
330                PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
331   pitch_index = PITCH_MAX_PERIOD-pitch_index;
332 
333   gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD,
334           PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
335   st->last_period = pitch_index;
336   st->last_gain = gain;
337   for (i=0;i<WINDOW_SIZE;i++)
338     p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];
339   apply_window(p);
340   forward_transform(P, p);
341   compute_band_energy(Ep, P);
342   compute_band_corr(Exp, X, P);
343   for (i=0;i<NB_BANDS;i++) Exp[i] = Exp[i]/sqrt(.001+Ex[i]*Ep[i]);
344   dct(tmp, Exp);
345   for (i=0;i<NB_DELTA_CEPS;i++) features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i];
346   features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3;
347   features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9;
348   features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300);
349   logMax = -2;
350   follow = -2;
351   for (i=0;i<NB_BANDS;i++) {
352     Ly[i] = log10(1e-2+Ex[i]);
353     Ly[i] = MAX16(logMax-7, MAX16(follow-1.5, Ly[i]));
354     logMax = MAX16(logMax, Ly[i]);
355     follow = MAX16(follow-1.5, Ly[i]);
356     E += Ex[i];
357   }
358   if (!TRAINING && E < 0.04) {
359     /* If there's no audio, avoid messing up the state. */
360     RNN_CLEAR(features, NB_FEATURES);
361     return 1;
362   }
363   dct(features, Ly);
364   features[0] -= 12;
365   features[1] -= 4;
366   ceps_0 = st->cepstral_mem[st->memid];
367   ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
368   ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];
369   for (i=0;i<NB_BANDS;i++) ceps_0[i] = features[i];
370   st->memid++;
371   for (i=0;i<NB_DELTA_CEPS;i++) {
372     features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i];
373     features[NB_BANDS+i] = ceps_0[i] - ceps_2[i];
374     features[NB_BANDS+NB_DELTA_CEPS+i] =  ceps_0[i] - 2*ceps_1[i] + ceps_2[i];
375   }
376   /* Spectral variability features. */
377   if (st->memid == CEPS_MEM) st->memid = 0;
378   for (i=0;i<CEPS_MEM;i++)
379   {
380     int j;
381     float mindist = 1e15f;
382     for (j=0;j<CEPS_MEM;j++)
383     {
384       int k;
385       float dist=0;
386       for (k=0;k<NB_BANDS;k++)
387       {
388         float tmp;
389         tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k];
390         dist += tmp*tmp;
391       }
392       if (j!=i)
393         mindist = MIN32(mindist, dist);
394     }
395     spec_variability += mindist;
396   }
397   features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;
398   return TRAINING && E < 0.1;
399 }
400 
frame_synthesis(DenoiseState * st,float * out,const kiss_fft_cpx * y)401 static void frame_synthesis(DenoiseState *st, float *out, const kiss_fft_cpx *y) {
402   float x[WINDOW_SIZE];
403   int i;
404   inverse_transform(x, y);
405   apply_window(x);
406   for (i=0;i<FRAME_SIZE;i++) out[i] = x[i] + st->synthesis_mem[i];
407   RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE);
408 }
409 
biquad(float * y,float mem[2],const float * x,const float * b,const float * a,int N)410 static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
411   int i;
412   for (i=0;i<N;i++) {
413     float xi, yi;
414     xi = x[i];
415     yi = x[i] + mem[0];
416     mem[0] = mem[1] + (b[0]*(double)xi - a[0]*(double)yi);
417     mem[1] = (b[1]*(double)xi - a[1]*(double)yi);
418     y[i] = yi;
419   }
420 }
421 
pitch_filter(kiss_fft_cpx * X,const kiss_fft_cpx * P,const float * Ex,const float * Ep,const float * Exp,const float * g)422 void pitch_filter(kiss_fft_cpx *X, const kiss_fft_cpx *P, const float *Ex, const float *Ep,
423                   const float *Exp, const float *g) {
424   int i;
425   float r[NB_BANDS];
426   float rf[FREQ_SIZE] = {0};
427   for (i=0;i<NB_BANDS;i++) {
428 #if 0
429     if (Exp[i]>g[i]) r[i] = 1;
430     else r[i] = Exp[i]*(1-g[i])/(.001 + g[i]*(1-Exp[i]));
431     r[i] = MIN16(1, MAX16(0, r[i]));
432 #else
433     if (Exp[i]>g[i]) r[i] = 1;
434     else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
435     r[i] = sqrt(MIN16(1, MAX16(0, r[i])));
436 #endif
437     r[i] *= sqrt(Ex[i]/(1e-8+Ep[i]));
438   }
439   interp_band_gain(rf, r);
440   for (i=0;i<FREQ_SIZE;i++) {
441     X[i].r += rf[i]*P[i].r;
442     X[i].i += rf[i]*P[i].i;
443   }
444   float newE[NB_BANDS];
445   compute_band_energy(newE, X);
446   float norm[NB_BANDS];
447   float normf[FREQ_SIZE]={0};
448   for (i=0;i<NB_BANDS;i++) {
449     norm[i] = sqrt(Ex[i]/(1e-8+newE[i]));
450   }
451   interp_band_gain(normf, norm);
452   for (i=0;i<FREQ_SIZE;i++) {
453     X[i].r *= normf[i];
454     X[i].i *= normf[i];
455   }
456 }
457 
rnnoise_process_frame(DenoiseState * st,float * out,const float * in)458 float rnnoise_process_frame(DenoiseState *st, float *out, const float *in) {
459   int i;
460   kiss_fft_cpx X[FREQ_SIZE];
461   kiss_fft_cpx P[WINDOW_SIZE];
462   float x[FRAME_SIZE];
463   float Ex[NB_BANDS], Ep[NB_BANDS];
464   float Exp[NB_BANDS];
465   float features[NB_FEATURES];
466   float g[NB_BANDS];
467   float gf[FREQ_SIZE]={1};
468   float vad_prob = 0;
469   int silence;
470   static const float a_hp[2] = {-1.99599, 0.99600};
471   static const float b_hp[2] = {-2, 1};
472   biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
473   silence = compute_frame_features(st, X, P, Ex, Ep, Exp, features, x);
474 
475   if (!silence) {
476     compute_rnn(&st->rnn, g, &vad_prob, features);
477     pitch_filter(X, P, Ex, Ep, Exp, g);
478     for (i=0;i<NB_BANDS;i++) {
479       float alpha = .6f;
480       g[i] = MAX16(g[i], alpha*st->lastg[i]);
481       st->lastg[i] = g[i];
482     }
483     interp_band_gain(gf, g);
484 #if 1
485     for (i=0;i<FREQ_SIZE;i++) {
486       X[i].r *= gf[i];
487       X[i].i *= gf[i];
488     }
489 #endif
490   }
491 
492   frame_synthesis(st, out, X);
493   return vad_prob;
494 }
495 
496 #if TRAINING
497 
uni_rand()498 static float uni_rand() {
499   return rand()/(double)RAND_MAX-.5;
500 }
501 
rand_resp(float * a,float * b)502 static void rand_resp(float *a, float *b) {
503   a[0] = .75*uni_rand();
504   a[1] = .75*uni_rand();
505   b[0] = .75*uni_rand();
506   b[1] = .75*uni_rand();
507 }
508 
main(int argc,char ** argv)509 int main(int argc, char **argv) {
510   int i;
511   int count=0;
512   static const float a_hp[2] = {-1.99599, 0.99600};
513   static const float b_hp[2] = {-2, 1};
514   float a_noise[2] = {0};
515   float b_noise[2] = {0};
516   float a_sig[2] = {0};
517   float b_sig[2] = {0};
518   float mem_hp_x[2]={0};
519   float mem_hp_n[2]={0};
520   float mem_resp_x[2]={0};
521   float mem_resp_n[2]={0};
522   float x[FRAME_SIZE];
523   float n[FRAME_SIZE];
524   float xn[FRAME_SIZE];
525   int vad_cnt=0;
526   int gain_change_count=0;
527   float speech_gain = 1, noise_gain = 1;
528   FILE *f1, *f2;
529   int maxCount;
530   DenoiseState *st;
531   DenoiseState *noise_state;
532   DenoiseState *noisy;
533   st = rnnoise_create(NULL);
534   noise_state = rnnoise_create(NULL);
535   noisy = rnnoise_create(NULL);
536   if (argc!=4) {
537     fprintf(stderr, "usage: %s <speech> <noise> <count>\n", argv[0]);
538     return 1;
539   }
540   f1 = fopen(argv[1], "r");
541   f2 = fopen(argv[2], "r");
542   maxCount = atoi(argv[3]);
543   for(i=0;i<150;i++) {
544     short tmp[FRAME_SIZE];
545     fread(tmp, sizeof(short), FRAME_SIZE, f2);
546   }
547   while (1) {
548     kiss_fft_cpx X[FREQ_SIZE], Y[FREQ_SIZE], N[FREQ_SIZE], P[WINDOW_SIZE];
549     float Ex[NB_BANDS], Ey[NB_BANDS], En[NB_BANDS], Ep[NB_BANDS];
550     float Exp[NB_BANDS];
551     float Ln[NB_BANDS];
552     float features[NB_FEATURES];
553     float g[NB_BANDS];
554     short tmp[FRAME_SIZE];
555     float vad=0;
556     float E=0;
557     if (count==maxCount) break;
558     if ((count%1000)==0) fprintf(stderr, "%d\r", count);
559     if (++gain_change_count > 2821) {
560       speech_gain = pow(10., (-40+(rand()%60))/20.);
561       noise_gain = pow(10., (-30+(rand()%50))/20.);
562       if (rand()%10==0) noise_gain = 0;
563       noise_gain *= speech_gain;
564       if (rand()%10==0) speech_gain = 0;
565       gain_change_count = 0;
566       rand_resp(a_noise, b_noise);
567       rand_resp(a_sig, b_sig);
568       lowpass = FREQ_SIZE * 3000./24000. * pow(50., rand()/(double)RAND_MAX);
569       for (i=0;i<NB_BANDS;i++) {
570         if (eband5ms[i]<<FRAME_SIZE_SHIFT > lowpass) {
571           band_lp = i;
572           break;
573         }
574       }
575     }
576     if (speech_gain != 0) {
577       fread(tmp, sizeof(short), FRAME_SIZE, f1);
578       if (feof(f1)) {
579         rewind(f1);
580         fread(tmp, sizeof(short), FRAME_SIZE, f1);
581       }
582       for (i=0;i<FRAME_SIZE;i++) x[i] = speech_gain*tmp[i];
583       for (i=0;i<FRAME_SIZE;i++) E += tmp[i]*(float)tmp[i];
584     } else {
585       for (i=0;i<FRAME_SIZE;i++) x[i] = 0;
586       E = 0;
587     }
588     if (noise_gain!=0) {
589       fread(tmp, sizeof(short), FRAME_SIZE, f2);
590       if (feof(f2)) {
591         rewind(f2);
592         fread(tmp, sizeof(short), FRAME_SIZE, f2);
593       }
594       for (i=0;i<FRAME_SIZE;i++) n[i] = noise_gain*tmp[i];
595     } else {
596       for (i=0;i<FRAME_SIZE;i++) n[i] = 0;
597     }
598     biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
599     biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
600     biquad(n, mem_hp_n, n, b_hp, a_hp, FRAME_SIZE);
601     biquad(n, mem_resp_n, n, b_noise, a_noise, FRAME_SIZE);
602     for (i=0;i<FRAME_SIZE;i++) xn[i] = x[i] + n[i];
603     if (E > 1e9f) {
604       vad_cnt=0;
605     } else if (E > 1e8f) {
606       vad_cnt -= 5;
607     } else if (E > 1e7f) {
608       vad_cnt++;
609     } else {
610       vad_cnt+=2;
611     }
612     if (vad_cnt < 0) vad_cnt = 0;
613     if (vad_cnt > 15) vad_cnt = 15;
614 
615     if (vad_cnt >= 10) vad = 0;
616     else if (vad_cnt > 0) vad = 0.5f;
617     else vad = 1.f;
618 
619     frame_analysis(st, Y, Ey, x);
620     frame_analysis(noise_state, N, En, n);
621     for (i=0;i<NB_BANDS;i++) Ln[i] = log10(1e-2+En[i]);
622     int silence = compute_frame_features(noisy, X, P, Ex, Ep, Exp, features, xn);
623     pitch_filter(X, P, Ex, Ep, Exp, g);
624     //printf("%f %d\n", noisy->last_gain, noisy->last_period);
625     for (i=0;i<NB_BANDS;i++) {
626       g[i] = sqrt((Ey[i]+1e-3)/(Ex[i]+1e-3));
627       if (g[i] > 1) g[i] = 1;
628       if (silence || i > band_lp) g[i] = -1;
629       if (Ey[i] < 5e-2 && Ex[i] < 5e-2) g[i] = -1;
630       if (vad==0 && noise_gain==0) g[i] = -1;
631     }
632     count++;
633 #if 1
634     fwrite(features, sizeof(float), NB_FEATURES, stdout);
635     fwrite(g, sizeof(float), NB_BANDS, stdout);
636     fwrite(Ln, sizeof(float), NB_BANDS, stdout);
637     fwrite(&vad, sizeof(float), 1, stdout);
638 #endif
639   }
640   fprintf(stderr, "matrix size: %d x %d\n", count, NB_FEATURES + 2*NB_BANDS + 1);
641   fclose(f1);
642   fclose(f2);
643   return 0;
644 }
645 
646 #endif
647