1 /*
2  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 
12 /*
13  * This header file includes the descriptions of the core VAD calls.
14  */
15 
16 #ifndef WEBRTC_VAD_CORE_H_
17 #define WEBRTC_VAD_CORE_H_
18 
19 #include "typedefs.h"
20 #include "vad_defines.h"
21 
22 typedef struct VadInstT_
23 {
24 
25     WebRtc_Word16 vad;
26     WebRtc_Word32 downsampling_filter_states[4];
27     WebRtc_Word16 noise_means[NUM_TABLE_VALUES];
28     WebRtc_Word16 speech_means[NUM_TABLE_VALUES];
29     WebRtc_Word16 noise_stds[NUM_TABLE_VALUES];
30     WebRtc_Word16 speech_stds[NUM_TABLE_VALUES];
31     // TODO(bjornv): Change to |frame_count|.
32     WebRtc_Word32 frame_counter;
33     WebRtc_Word16 over_hang; // Over Hang
34     WebRtc_Word16 num_of_speech;
35     // TODO(bjornv): Change to |age_vector|.
36     WebRtc_Word16 index_vector[16 * NUM_CHANNELS];
37     WebRtc_Word16 low_value_vector[16 * NUM_CHANNELS];
38     // TODO(bjornv): Change to |median|.
39     WebRtc_Word16 mean_value[NUM_CHANNELS];
40     WebRtc_Word16 upper_state[5];
41     WebRtc_Word16 lower_state[5];
42     WebRtc_Word16 hp_filter_state[4];
43     WebRtc_Word16 over_hang_max_1[3];
44     WebRtc_Word16 over_hang_max_2[3];
45     WebRtc_Word16 individual[3];
46     WebRtc_Word16 total[3];
47 
48     short init_flag;
49 
50 } VadInstT;
51 
52 /****************************************************************************
53  * WebRtcVad_InitCore(...)
54  *
55  * This function initializes a VAD instance
56  *
57  * Input:
58  *      - inst      : Instance that should be initialized
59  *      - mode      : Aggressiveness degree
60  *                    0 (High quality) - 3 (Highly aggressive)
61  *
62  * Output:
63  *      - inst      : Initialized instance
64  *
65  * Return value     :  0 - Ok
66  *                    -1 - Error
67  */
68 int WebRtcVad_InitCore(VadInstT* inst, short mode);
69 
70 /****************************************************************************
71  * WebRtcVad_set_mode_core(...)
72  *
73  * This function changes the VAD settings
74  *
75  * Input:
76  *      - inst      : VAD instance
77  *      - mode      : Aggressiveness degree
78  *                    0 (High quality) - 3 (Highly aggressive)
79  *
80  * Output:
81  *      - inst      : Changed  instance
82  *
83  * Return value     :  0 - Ok
84  *                    -1 - Error
85  */
86 
87 int WebRtcVad_set_mode_core(VadInstT* inst, short mode);
88 
89 /****************************************************************************
90  * WebRtcVad_CalcVad32khz(...)
91  * WebRtcVad_CalcVad16khz(...)
92  * WebRtcVad_CalcVad8khz(...)
93  *
94  * Calculate probability for active speech and make VAD decision.
95  *
96  * Input:
97  *      - inst          : Instance that should be initialized
98  *      - speech_frame  : Input speech frame
99  *      - frame_length  : Number of input samples
100  *
101  * Output:
102  *      - inst          : Updated filter states etc.
103  *
104  * Return value         : VAD decision
105  *                        0 - No active speech
106  *                        1-6 - Active speech
107  */
108 WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT* inst, WebRtc_Word16* speech_frame,
109                                      int frame_length);
110 WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT* inst, WebRtc_Word16* speech_frame,
111                                      int frame_length);
112 WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT* inst, WebRtc_Word16* speech_frame,
113                                     int frame_length);
114 
115 /****************************************************************************
116  * WebRtcVad_GmmProbability(...)
117  *
118  * This function calculates the probabilities for background noise and
119  * speech using Gaussian Mixture Models. A hypothesis-test is performed to decide
120  * which type of signal is most probable.
121  *
122  * Input:
123  *      - inst              : Pointer to VAD instance
124  *      - feature_vector    : Feature vector = log10(energy in frequency band)
125  *      - total_power       : Total power in frame.
126  *      - frame_length      : Number of input samples
127  *
128  * Output:
129  *      VAD decision        : 0 - noise, 1 - speech
130  *
131  */
132 WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT* inst, WebRtc_Word16* feature_vector,
133                                        WebRtc_Word16 total_power, int frame_length);
134 
135 #endif // WEBRTC_VAD_CORE_H_
136