/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/*
 * This file includes the implementation of the core functionality in VAD.
 * For function description, see vad_core.h.
 */

#include "vad_core.h"

#include "signal_processing_library.h"
#include "typedefs.h"
#include "vad_defines.h"
#include "vad_filterbank.h"
#include "vad_gmm.h"
#include "vad_sp.h"

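// The constants below are given in Qx fixed-point notation: a value tagged Qx
// represents the real number value / 2^x. For example, kNoiseUpdateConst = 655
// in Q15 corresponds to 655 / 32768, i.e. roughly 0.02.
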
// Spectrum Weighting
static const WebRtc_Word16 kSpectrumWeight[6] = { 6, 8, 10, 12, 14, 16 };
static const WebRtc_Word16 kNoiseUpdateConst = 655; // Q15
static const WebRtc_Word16 kSpeechUpdateConst = 6554; // Q15
static const WebRtc_Word16 kBackEta = 154; // Q8
// Minimum difference between the two models, Q5
static const WebRtc_Word16 kMinimumDifference[6] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const WebRtc_Word16 kMaximumSpeech[6] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum mean value for the speech model, Q7
static const WebRtc_Word16 kMinimumMean[2] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const WebRtc_Word16 kMaximumNoise[6] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataWeights[12] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataWeights[12] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataMeans[12] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataMeans[12] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const WebRtc_Word16 kNoiseDataStds[12] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const WebRtc_Word16 kSpeechDataStds[12] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };

static const int kInitCheck = 42;

// Initialize VAD
int WebRtcVad_InitCore(VadInstT *inst, short mode)
{
    int i;

    // Initialization of struct
    inst->vad = 1;
    inst->frame_counter = 0;
    inst->over_hang = 0;
    inst->num_of_speech = 0;

    // Initialization of downsampling filter state
    inst->downsampling_filter_states[0] = 0;
    inst->downsampling_filter_states[1] = 0;
    inst->downsampling_filter_states[2] = 0;
    inst->downsampling_filter_states[3] = 0;

    // Read initial PDF parameters
    for (i = 0; i < NUM_TABLE_VALUES; i++)
    {
        inst->noise_means[i] = kNoiseDataMeans[i];
        inst->speech_means[i] = kSpeechDataMeans[i];
        inst->noise_stds[i] = kNoiseDataStds[i];
        inst->speech_stds[i] = kSpeechDataStds[i];
    }

    // Index and Minimum value vectors are initialized
    for (i = 0; i < 16 * NUM_CHANNELS; i++)
    {
        inst->low_value_vector[i] = 10000;
        inst->index_vector[i] = 0;
    }

    for (i = 0; i < 5; i++)
    {
        inst->upper_state[i] = 0;
        inst->lower_state[i] = 0;
    }

    for (i = 0; i < 4; i++)
    {
        inst->hp_filter_state[i] = 0;
    }

    // Init mean value memory, for FindMin function
    inst->mean_value[0] = 1600;
    inst->mean_value[1] = 1600;
    inst->mean_value[2] = 1600;
    inst->mean_value[3] = 1600;
    inst->mean_value[4] = 1600;
    inst->mean_value[5] = 1600;

    if (mode == 0)
    {
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;

        inst->total[0] = TOTAL_10MS_Q;
        inst->total[1] = TOTAL_20MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
    } else if (mode == 1)
    {
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;

        inst->total[0] = TOTAL_10MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
    } else if (mode == 2)
    {
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;

        inst->total[0] = TOTAL_10MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
    } else
    {
        // Very aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;

        inst->total[0] = TOTAL_10MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
    }

    inst->init_flag = kInitCheck;

    return 0;
}

// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT *inst, short mode)
{

    if (mode == 0)
    {
        // Quality mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_Q;
        inst->individual[1] = INDIVIDUAL_20MS_Q;
        inst->individual[2] = INDIVIDUAL_30MS_Q;

        inst->total[0] = TOTAL_10MS_Q;
        inst->total[1] = TOTAL_20MS_Q;
        inst->total[2] = TOTAL_30MS_Q;
    } else if (mode == 1)
    {
        // Low bitrate mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_LBR;
        inst->individual[1] = INDIVIDUAL_20MS_LBR;
        inst->individual[2] = INDIVIDUAL_30MS_LBR;

        inst->total[0] = TOTAL_10MS_LBR;
        inst->total[1] = TOTAL_20MS_LBR;
        inst->total[2] = TOTAL_30MS_LBR;
    } else if (mode == 2)
    {
        // Aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_AGG;
        inst->individual[1] = INDIVIDUAL_20MS_AGG;
        inst->individual[2] = INDIVIDUAL_30MS_AGG;

        inst->total[0] = TOTAL_10MS_AGG;
        inst->total[1] = TOTAL_20MS_AGG;
        inst->total[2] = TOTAL_30MS_AGG;
    } else if (mode == 3)
    {
        // Very aggressive mode
        inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst
        inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst
        inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst
        inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst

        inst->individual[0] = INDIVIDUAL_10MS_VAG;
        inst->individual[1] = INDIVIDUAL_20MS_VAG;
        inst->individual[2] = INDIVIDUAL_30MS_VAG;

        inst->total[0] = TOTAL_10MS_VAG;
        inst->total[1] = TOTAL_20MS_VAG;
        inst->total[2] = TOTAL_30MS_VAG;
    } else
    {
        return -1;
    }

    return 0;
}

// Calculate the VAD decision by first extracting feature values and then
// computing the probability of both speech and background noise.

WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 len, vad;
    WebRtc_Word16 speechWB[480]; // Downsampled frame: a 30 ms SWB frame (960 samples) becomes 480
    WebRtc_Word16 speechNB[240]; // Downsampled frame: a 30 ms WB frame (480 samples) becomes 240


    // Downsample signal 32->16->8 before doing VAD
    WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]),
                           frame_length);
    len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);

    WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len);
    len = WEBRTC_SPL_RSHIFT_W16(len, 1);

    // Do VAD on an 8 kHz signal
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                     int frame_length)
{
    WebRtc_Word16 len, vad;
    WebRtc_Word16 speechNB[240]; // Downsampled frame: a 30 ms WB frame (480 samples) becomes 240

    // Wideband: Downsample signal before doing VAD
    WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states,
                           frame_length);

    len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1);
    vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

    return vad;
}

WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame,
                                    int frame_length)
{
    WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power;

    // Get power in the bands
    total_power = WebRtcVad_get_features(inst, speech_frame, frame_length, feature_vector);

    // Make a VAD decision
    inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power, frame_length);

    return inst->vad;
}

// Calculate probabilities for both speech and background noise, and perform a
// hypothesis test.
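// The model is a two-Gaussian mixture per frequency channel for each of noise
// and speech (weights, means and standard deviations in the kNoiseData* /
// kSpeechData* tables above). Each channel contributes a log-likelihood ratio;
// the ratios are combined with kSpectrumWeight into the overall decision, after
// which the Gaussian parameters are adapted toward the current frame.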
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector,
                                       WebRtc_Word16 total_power, int frame_length)
{
    int n, k;
    WebRtc_Word16 backval;
    WebRtc_Word16 h0, h1;
    WebRtc_Word16 ratvec, xval;
    WebRtc_Word16 vadflag;
    WebRtc_Word16 shifts0, shifts1;
    WebRtc_Word16 tmp16, tmp16_1, tmp16_2;
    WebRtc_Word16 diff, nr, pos;
    WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
    WebRtc_Word16 delt, ndelt;
    WebRtc_Word16 maxspe, maxmu;
    WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES];
    WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES];
    WebRtc_Word32 h0test, h1test;
    WebRtc_Word32 tmp32_1, tmp32_2;
    WebRtc_Word32 dotVal;
    WebRtc_Word32 nmid, smid;
    WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS];
    WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr,
            *sstd1ptr, *sstd2ptr;
    WebRtc_Word16 overhead1, overhead2, individualTest, totalTest;

    // Set the thresholds to different values based on frame length
    if (frame_length == 80)
    {
        // 80 input samples
        overhead1 = inst->over_hang_max_1[0];
        overhead2 = inst->over_hang_max_2[0];
        individualTest = inst->individual[0];
        totalTest = inst->total[0];
    } else if (frame_length == 160)
    {
        // 160 input samples
        overhead1 = inst->over_hang_max_1[1];
        overhead2 = inst->over_hang_max_2[1];
        individualTest = inst->individual[1];
        totalTest = inst->total[1];
    } else
    {
        // 240 input samples
        overhead1 = inst->over_hang_max_1[2];
        overhead2 = inst->over_hang_max_2[2];
        individualTest = inst->individual[2];
        totalTest = inst->total[2];
    }

    if (total_power > MIN_ENERGY)
    { // If signal present at all

        // Set pointers to the gaussian parameters
        nmean1ptr = &inst->noise_means[0];
        nmean2ptr = &inst->noise_means[NUM_CHANNELS];
        smean1ptr = &inst->speech_means[0];
        smean2ptr = &inst->speech_means[NUM_CHANNELS];
        nstd1ptr = &inst->noise_stds[0];
        nstd2ptr = &inst->noise_stds[NUM_CHANNELS];
        sstd1ptr = &inst->speech_stds[0];
        sstd2ptr = &inst->speech_stds[NUM_CHANNELS];

        vadflag = 0;
        dotVal = 0;
        for (n = 0; n < NUM_CHANNELS; n++)
        { // For all channels

            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);
            xval = feature_vector[n];

            // Probability for Noise, Q7 * Q20 = Q27
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++,
                                                    &deltaN[pos]);
            probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1);
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++,
                                                    &deltaN[pos + 1]);
            probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1);
            h0test = probn[0] + probn[1]; // Q27
            h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15

            // Probability for Speech
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++,
                                                    &deltaS[pos]);
            probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1);
            tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++,
                                                    &deltaS[pos + 1]);
            probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1);
            h1test = probs[0] + probs[1]; // Q27
            h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15

            // Get likelihood ratio. Approximate log2(H1/H0) with shifts0 - shifts1
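            // WebRtcSpl_NormW32(x) returns the number of left shifts needed to
            // normalize x (roughly 30 - log2(x) for positive x), so the difference
            // shifts0 - shifts1 approximates log2(h1test) - log2(h0test).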
            shifts0 = WebRtcSpl_NormW32(h0test);
            shifts1 = WebRtcSpl_NormW32(h1test);

            if ((h0test > 0) && (h1test > 0))
            {
                ratvec = shifts0 - shifts1;
            } else if (h1test > 0)
            {
                ratvec = 31 - shifts1;
            } else if (h0test > 0)
            {
                ratvec = shifts0 - 31;
            } else
            {
                ratvec = 0;
            }

            // VAD decision with spectrum weighting
            dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]);

            // Individual channel test
            if ((ratvec << 2) > individualTest)
            {
                vadflag = 1;
            }

            // Probabilities used when updating model
            if (h0 > 0)
            {
                tmp32_1 = probn[0] & 0xFFFFF000; // Q27
                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29
                ngprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h0);
                ngprvec[pos + 1] = 16384 - ngprvec[pos];
            } else
            {
                ngprvec[pos] = 16384;
                ngprvec[pos + 1] = 0;
            }

            // Probabilities used when updating model
            if (h1 > 0)
            {
                tmp32_1 = probs[0] & 0xFFFFF000;
                tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2);
                sgprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h1);
                sgprvec[pos + 1] = 16384 - sgprvec[pos];
            } else
            {
                sgprvec[pos] = 0;
                sgprvec[pos + 1] = 0;
            }
        }

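        // The frame is marked as speech if either a single channel's likelihood
        // ratio exceeds its individual threshold (above) or the spectrum-weighted
        // sum of all channel ratios exceeds the total threshold (below).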
        // Overall test
        if (dotVal >= totalTest)
        {
            vadflag |= 1;
        }

        // Set pointers to the means and standard deviations.
        nmean1ptr = &inst->noise_means[0];
        smean1ptr = &inst->speech_means[0];
        nstd1ptr = &inst->noise_stds[0];
        sstd1ptr = &inst->speech_stds[0];

        maxspe = 12800;

        // Update the model's parameters
        for (n = 0; n < NUM_CHANNELS; n++)
        {

            pos = WEBRTC_SPL_LSHIFT_W16(n, 1);

            // Get the minimum value from the recent past, used for long-term correction
            backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4

            // Compute the "global" mean, i.e. the weighted sum of the two Gaussian means
            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7
            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS],
                    *(nmean1ptr+NUM_CHANNELS));
            tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8

            for (k = 0; k < NUM_MODELS; k++)
            {

                nr = pos + k;

                nmean2ptr = nmean1ptr + k * NUM_CHANNELS;
                smean2ptr = smean1ptr + k * NUM_CHANNELS;
                nstd2ptr = nstd1ptr + k * NUM_CHANNELS;
                sstd2ptr = sstd1ptr + k * NUM_CHANNELS;
                nmk = *nmean2ptr;
                smk = *smean2ptr;
                nsk = *nstd2ptr;
                ssk = *sstd2ptr;

                // Update noise mean vector if the frame consists of noise only
                nmk2 = nmk;
                if (!vadflag)
                {
                    // deltaN = (x-mu)/sigma^2
                    // ngprvec[k] = probn[k]/(probn[0] + probn[1])

                    delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr],
                            deltaN[nr], 11); // Q14*Q11
                    nmk2 = nmk + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt,
                            kNoiseUpdateConst,
                            22); // Q7+(Q14*Q15>>22)
                }

                // Long term correction of the noise mean
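                // backval (Q4, shifted to Q8 below) is the tracked long-term minimum of
                // this feature; kBackEta = 154 in Q8 (roughly 0.6) scales how strongly the
                // difference between that minimum and the global noise mean is fed back.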
                ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4);
                ndelt -= tmp16_1; // Q8 - Q8
                nmk3 = nmk2 + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ndelt,
                        kBackEta,
                        9); // Q7+(Q8*Q8)>>9

                // Make sure the noise mean does not drift too much
                tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7);
                if (nmk3 < tmp16)
                    nmk3 = tmp16;
                tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7);
                if (nmk3 > tmp16)
                    nmk3 = tmp16;
                *nmean2ptr = nmk3;

                if (vadflag)
                {
                    // Update speech mean vector:
                    // deltaS = (x-mu)/sigma^2
                    // sgprvec[k] = probs[k]/(probs[0] + probs[1])

                    delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr],
                            deltaS[nr],
                            11); // (Q14*Q11)>>11=Q14
                    tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt,
                            kSpeechUpdateConst,
                            21) + 1;
                    smk2 = smk + (tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22)

                    // Make sure the speech mean does not drift too much
                    maxmu = maxspe + 640;
                    if (smk2 < kMinimumMean[k])
                        smk2 = kMinimumMean[k];
                    if (smk2 > maxmu)
                        smk2 = maxmu;

                    *smean2ptr = smk2;

                    // (Q7>>3) = Q4
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3);

                    tmp16 = feature_vector[n] - tmp16; // Q4
                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3);
                    tmp32_2 = tmp32_1 - (WebRtc_Word32)4096; // Q12
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2);
                    tmp32_1 = (WebRtc_Word32)(tmp16 * tmp32_2);// (Q15>>3)*(Q14>>2)=Q12*Q12=Q24

                    tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20

                    // 0.1 * Q20 / Q7 = Q13
                    if (tmp32_2 > 0)
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10);
                    else
                    {
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10);
                        tmp16 = -tmp16;
                    }
                    // divide by 4 giving an update factor of 0.025
                    tmp16 += 128; // Rounding
                    ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8);
                    // Division with 8 plus Q7
                    if (ssk < MIN_STD)
                        ssk = MIN_STD;
                    *sstd2ptr = ssk;
                } else
                {
                    // Update GMM variance vectors
                    // deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4
                    tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3);

                    // (Q15>>3) * (Q14>>2) = Q12 * Q12 = Q24
                    tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096;
                    tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2);
                    tmp32_2 = (WebRtc_Word32)(tmp16 * tmp32_1);
                    tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14);
                    // Q20  * approx 0.001 (2^-10=0.0009766)

                    // Q20 / Q7 = Q13
                    if (tmp32_1 > 0)
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk);
                    else
                    {
                        tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_1, nsk);
                        tmp16 = -tmp16;
                    }
                    tmp16 += 32; // Rounding
                    nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6);

                    if (nsk < MIN_STD)
                        nsk = MIN_STD;

                    *nstd2ptr = nsk;
                }
            }

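            // Keep the speech and noise models apart: if the weighted ("global")
            // means get closer than kMinimumDifference, the speech means are pushed
            // up by roughly 80% of the deficit and the noise means down by roughly 20%.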
            // Separate models if they are too close - nmid in Q14
            nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr);
            nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr);

            // smid in Q14
            smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr);
            smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr);

            // diff = "global" speech mean - "global" noise mean
            diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9);
            tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 9);
            diff -= tmp16;

            if (diff < kMinimumDifference[n])
            {

                tmp16 = kMinimumDifference[n] - diff; // Q5

                // tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7
                // tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7
                tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2);
                tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2);

                // First Gauss, speech model
                tmp16 = tmp16_1 + *smean1ptr;
                *smean1ptr = tmp16;
                smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]);

                // Second Gauss, speech model
                tmp16 = tmp16_1 + *smean2ptr;
                *smean2ptr = tmp16;
                smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]);

                // First Gauss, noise model
                tmp16 = *nmean1ptr - tmp16_2;
                *nmean1ptr = tmp16;

                nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]);

                // Second Gauss, noise model
                tmp16 = *nmean2ptr - tmp16_2;
                *nmean2ptr = tmp16;
                nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]);
            }

            // Make sure the speech & noise means do not drift too much
            maxspe = kMaximumSpeech[n];
            tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 7);
            if (tmp16_2 > maxspe)
            { // Upper limit of speech model
                tmp16_2 -= maxspe;

                *smean1ptr -= tmp16_2;
                *smean2ptr -= tmp16_2;
            }

            tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 7);
            if (tmp16_2 > kMaximumNoise[n])
            {
                tmp16_2 -= kMaximumNoise[n];

                *nmean1ptr -= tmp16_2;
                *nmean2ptr -= tmp16_2;
            }

            nmean1ptr++;
            smean1ptr++;
            nstd1ptr++;
            sstd1ptr++;
        }
        inst->frame_counter++;
    } else
    {
        vadflag = 0;
    }

    // Hangover smoothing
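    // For a few frames after a speech burst the function keeps returning a nonzero
    // flag (2 + remaining overhang) so that word endings are not cut off abruptly.
    // Short bursts get the shorter overhang (overhead1); bursts of more than NSP_MAX
    // frames get the longer one (overhead2).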
    if (!vadflag)
    {
        if (inst->over_hang > 0)
        {
            vadflag = 2 + inst->over_hang;
            inst->over_hang = inst->over_hang - 1;
        }
        inst->num_of_speech = 0;
    } else
    {
        inst->num_of_speech = inst->num_of_speech + 1;
        if (inst->num_of_speech > NSP_MAX)
        {
            inst->num_of_speech = NSP_MAX;
            inst->over_hang = overhead2;
        } else
            inst->over_hang = overhead1;
    }
    return vadflag;
}