1 /*
2 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "modules/audio_processing/agc2/rnn_vad/spectral_features.h"
12
13 #include <algorithm>
14 #include <cmath>
15 #include <limits>
16 #include <numeric>
17
18 #include "rtc_base/checks.h"
19
20 namespace webrtc {
21 namespace rnn_vad {
22 namespace {
23
// Silence threshold: CheckSilenceComputeFeatures() sums the band energies of
// the reference frame and reports silence (returns true) when that sum is
// strictly below this value.
24 constexpr float kSilenceThreshold = 0.04f;
25
26 // Computes the new cepstral difference stats and pushes them into the passed
27 // symmetric matrix buffer.
UpdateCepstralDifferenceStats(rtc::ArrayView<const float,kNumBands> new_cepstral_coeffs,const RingBuffer<float,kNumBands,kCepstralCoeffsHistorySize> & ring_buf,SymmetricMatrixBuffer<float,kCepstralCoeffsHistorySize> * sym_matrix_buf)28 void UpdateCepstralDifferenceStats(
29 rtc::ArrayView<const float, kNumBands> new_cepstral_coeffs,
30 const RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>& ring_buf,
31 SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize>* sym_matrix_buf) {
32 RTC_DCHECK(sym_matrix_buf);
33 // Compute the new cepstral distance stats.
34 std::array<float, kCepstralCoeffsHistorySize - 1> distances;
35 for (size_t i = 0; i < kCepstralCoeffsHistorySize - 1; ++i) {
36 const size_t delay = i + 1;
37 auto old_cepstral_coeffs = ring_buf.GetArrayView(delay);
38 distances[i] = 0.f;
39 for (size_t k = 0; k < kNumBands; ++k) {
40 const float c = new_cepstral_coeffs[k] - old_cepstral_coeffs[k];
41 distances[i] += c * c;
42 }
43 }
44 // Push the new spectral distance stats into the symmetric matrix buffer.
45 sym_matrix_buf->Push(distances);
46 }
47
48 // Computes the first half of the Vorbis window.
ComputeScaledHalfVorbisWindow(float scaling=1.f)49 std::array<float, kFrameSize20ms24kHz / 2> ComputeScaledHalfVorbisWindow(
50 float scaling = 1.f) {
51 constexpr size_t kHalfSize = kFrameSize20ms24kHz / 2;
52 std::array<float, kHalfSize> half_window{};
53 for (size_t i = 0; i < kHalfSize; ++i) {
54 half_window[i] =
55 scaling *
56 std::sin(0.5 * kPi * std::sin(0.5 * kPi * (i + 0.5) / kHalfSize) *
57 std::sin(0.5 * kPi * (i + 0.5) / kHalfSize));
58 }
59 return half_window;
60 }
61
62 // Computes the forward FFT on a 20 ms frame to which a given window function is
63 // applied. The Fourier coefficient corresponding to the Nyquist frequency is
64 // set to zero (it is never used and this allows to simplify the code).
ComputeWindowedForwardFft(rtc::ArrayView<const float,kFrameSize20ms24kHz> frame,const std::array<float,kFrameSize20ms24kHz/2> & half_window,Pffft::FloatBuffer * fft_input_buffer,Pffft::FloatBuffer * fft_output_buffer,Pffft * fft)65 void ComputeWindowedForwardFft(
66 rtc::ArrayView<const float, kFrameSize20ms24kHz> frame,
67 const std::array<float, kFrameSize20ms24kHz / 2>& half_window,
68 Pffft::FloatBuffer* fft_input_buffer,
69 Pffft::FloatBuffer* fft_output_buffer,
70 Pffft* fft) {
71 RTC_DCHECK_EQ(frame.size(), 2 * half_window.size());
72 // Apply windowing.
73 auto in = fft_input_buffer->GetView();
74 for (size_t i = 0, j = kFrameSize20ms24kHz - 1; i < half_window.size();
75 ++i, --j) {
76 in[i] = frame[i] * half_window[i];
77 in[j] = frame[j] * half_window[i];
78 }
79 fft->ForwardTransform(*fft_input_buffer, fft_output_buffer, /*ordered=*/true);
80 // Set the Nyquist frequency coefficient to zero.
81 auto out = fft_output_buffer->GetView();
82 out[1] = 0.f;
83 }
84
85 } // namespace
86
SpectralFeaturesExtractor()87 SpectralFeaturesExtractor::SpectralFeaturesExtractor()
88 : half_window_(ComputeScaledHalfVorbisWindow(
89 1.f / static_cast<float>(kFrameSize20ms24kHz))),
90 fft_(kFrameSize20ms24kHz, Pffft::FftType::kReal),
91 fft_buffer_(fft_.CreateBuffer()),
92 reference_frame_fft_(fft_.CreateBuffer()),
93 lagged_frame_fft_(fft_.CreateBuffer()),
94 dct_table_(ComputeDctTable()) {}
95
96 SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default;
97
Reset()98 void SpectralFeaturesExtractor::Reset() {
99 cepstral_coeffs_ring_buf_.Reset();
100 cepstral_diffs_buf_.Reset();
101 }
102
CheckSilenceComputeFeatures(rtc::ArrayView<const float,kFrameSize20ms24kHz> reference_frame,rtc::ArrayView<const float,kFrameSize20ms24kHz> lagged_frame,rtc::ArrayView<float,kNumBands-kNumLowerBands> higher_bands_cepstrum,rtc::ArrayView<float,kNumLowerBands> average,rtc::ArrayView<float,kNumLowerBands> first_derivative,rtc::ArrayView<float,kNumLowerBands> second_derivative,rtc::ArrayView<float,kNumLowerBands> bands_cross_corr,float * variability)103 bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures(
104 rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
105 rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
106 rtc::ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum,
107 rtc::ArrayView<float, kNumLowerBands> average,
108 rtc::ArrayView<float, kNumLowerBands> first_derivative,
109 rtc::ArrayView<float, kNumLowerBands> second_derivative,
110 rtc::ArrayView<float, kNumLowerBands> bands_cross_corr,
111 float* variability) {
112 // Compute the Opus band energies for the reference frame.
113 ComputeWindowedForwardFft(reference_frame, half_window_, fft_buffer_.get(),
114 reference_frame_fft_.get(), &fft_);
115 spectral_correlator_.ComputeAutoCorrelation(
116 reference_frame_fft_->GetConstView(), reference_frame_bands_energy_);
117 // Check if the reference frame has silence.
118 const float tot_energy =
119 std::accumulate(reference_frame_bands_energy_.begin(),
120 reference_frame_bands_energy_.end(), 0.f);
121 if (tot_energy < kSilenceThreshold) {
122 return true;
123 }
124 // Compute the Opus band energies for the lagged frame.
125 ComputeWindowedForwardFft(lagged_frame, half_window_, fft_buffer_.get(),
126 lagged_frame_fft_.get(), &fft_);
127 spectral_correlator_.ComputeAutoCorrelation(lagged_frame_fft_->GetConstView(),
128 lagged_frame_bands_energy_);
129 // Log of the band energies for the reference frame.
130 std::array<float, kNumBands> log_bands_energy;
131 ComputeSmoothedLogMagnitudeSpectrum(reference_frame_bands_energy_,
132 log_bands_energy);
133 // Reference frame cepstrum.
134 std::array<float, kNumBands> cepstrum;
135 ComputeDct(log_bands_energy, dct_table_, cepstrum);
136 // Ad-hoc correction terms for the first two cepstral coefficients.
137 cepstrum[0] -= 12.f;
138 cepstrum[1] -= 4.f;
139 // Update the ring buffer and the cepstral difference stats.
140 cepstral_coeffs_ring_buf_.Push(cepstrum);
141 UpdateCepstralDifferenceStats(cepstrum, cepstral_coeffs_ring_buf_,
142 &cepstral_diffs_buf_);
143 // Write the higher bands cepstral coefficients.
144 RTC_DCHECK_EQ(cepstrum.size() - kNumLowerBands, higher_bands_cepstrum.size());
145 std::copy(cepstrum.begin() + kNumLowerBands, cepstrum.end(),
146 higher_bands_cepstrum.begin());
147 // Compute and write remaining features.
148 ComputeAvgAndDerivatives(average, first_derivative, second_derivative);
149 ComputeNormalizedCepstralCorrelation(bands_cross_corr);
150 RTC_DCHECK(variability);
151 *variability = ComputeVariability();
152 return false;
153 }
154
ComputeAvgAndDerivatives(rtc::ArrayView<float,kNumLowerBands> average,rtc::ArrayView<float,kNumLowerBands> first_derivative,rtc::ArrayView<float,kNumLowerBands> second_derivative) const155 void SpectralFeaturesExtractor::ComputeAvgAndDerivatives(
156 rtc::ArrayView<float, kNumLowerBands> average,
157 rtc::ArrayView<float, kNumLowerBands> first_derivative,
158 rtc::ArrayView<float, kNumLowerBands> second_derivative) const {
159 auto curr = cepstral_coeffs_ring_buf_.GetArrayView(0);
160 auto prev1 = cepstral_coeffs_ring_buf_.GetArrayView(1);
161 auto prev2 = cepstral_coeffs_ring_buf_.GetArrayView(2);
162 RTC_DCHECK_EQ(average.size(), first_derivative.size());
163 RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size());
164 RTC_DCHECK_LE(average.size(), curr.size());
165 for (size_t i = 0; i < average.size(); ++i) {
166 // Average, kernel: [1, 1, 1].
167 average[i] = curr[i] + prev1[i] + prev2[i];
168 // First derivative, kernel: [1, 0, - 1].
169 first_derivative[i] = curr[i] - prev2[i];
170 // Second derivative, Laplacian kernel: [1, -2, 1].
171 second_derivative[i] = curr[i] - 2 * prev1[i] + prev2[i];
172 }
173 }
174
ComputeNormalizedCepstralCorrelation(rtc::ArrayView<float,kNumLowerBands> bands_cross_corr)175 void SpectralFeaturesExtractor::ComputeNormalizedCepstralCorrelation(
176 rtc::ArrayView<float, kNumLowerBands> bands_cross_corr) {
177 spectral_correlator_.ComputeCrossCorrelation(
178 reference_frame_fft_->GetConstView(), lagged_frame_fft_->GetConstView(),
179 bands_cross_corr_);
180 // Normalize.
181 for (size_t i = 0; i < bands_cross_corr_.size(); ++i) {
182 bands_cross_corr_[i] =
183 bands_cross_corr_[i] /
184 std::sqrt(0.001f + reference_frame_bands_energy_[i] *
185 lagged_frame_bands_energy_[i]);
186 }
187 // Cepstrum.
188 ComputeDct(bands_cross_corr_, dct_table_, bands_cross_corr);
189 // Ad-hoc correction terms for the first two cepstral coefficients.
190 bands_cross_corr[0] -= 1.3f;
191 bands_cross_corr[1] -= 0.9f;
192 }
193
ComputeVariability() const194 float SpectralFeaturesExtractor::ComputeVariability() const {
195 // Compute cepstral variability score.
196 float variability = 0.f;
197 for (size_t delay1 = 0; delay1 < kCepstralCoeffsHistorySize; ++delay1) {
198 float min_dist = std::numeric_limits<float>::max();
199 for (size_t delay2 = 0; delay2 < kCepstralCoeffsHistorySize; ++delay2) {
200 if (delay1 == delay2) // The distance would be 0.
201 continue;
202 min_dist =
203 std::min(min_dist, cepstral_diffs_buf_.GetValue(delay1, delay2));
204 }
205 variability += min_dist;
206 }
207 // Normalize (based on training set stats).
208 // TODO(bugs.webrtc.org/10480): Isolate normalization from feature extraction.
209 return variability / kCepstralCoeffsHistorySize - 2.1f;
210 }
211
212 } // namespace rnn_vad
213 } // namespace webrtc
214