1 /*
2  *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_processing/agc2/rnn_vad/rnn.h"
12 
13 // Defines WEBRTC_ARCH_X86_FAMILY, used below.
14 #include "rtc_base/system/arch.h"
15 
16 #if defined(WEBRTC_HAS_NEON)
17 #include <arm_neon.h>
18 #endif
19 #if defined(WEBRTC_ARCH_X86_FAMILY)
20 #include <emmintrin.h>
21 #endif
22 #include <algorithm>
23 #include <array>
24 #include <cmath>
25 #include <numeric>
26 
27 #include "rtc_base/checks.h"
28 #include "rtc_base/logging.h"
29 #include "third_party/rnnoise/src/rnn_activations.h"
30 #include "third_party/rnnoise/src/rnn_vad_weights.h"
31 
32 namespace webrtc {
33 namespace rnn_vad {
34 namespace {
35 
using rnnoise::kWeightsScale;

// Input (dense) layer parameters; sizes checked against the local constants.
using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseBias;
using rnnoise::kInputDenseWeights;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

// Hidden (GRU) layer parameters; sizes checked against the local constants.
using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
              "Increase kRecurrentLayersMaxUnits.");

// Output (dense) layer parameters; sizes checked against the local constants.
using rnnoise::kOutputDenseBias;
using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

// Approximated activation functions provided by the rnnoise library.
using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;
61 
// Rectified linear unit (ReLU) activation: negative inputs map to zero,
// non-negative inputs pass through unchanged.
inline float RectifiedLinearUnit(float x) {
  return std::max(x, 0.f);
}
65 
GetScaledParams(rtc::ArrayView<const int8_t> params)66 std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
67   std::vector<float> scaled_params(params.size());
68   std::transform(params.begin(), params.end(), scaled_params.begin(),
69                  [](int8_t x) -> float {
70                    return rnnoise::kWeightsScale * static_cast<float>(x);
71                  });
72   return scaled_params;
73 }
74 
75 // TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
76 // function to improve setup time.
77 // Casts and scales |weights| and re-arranges the layout.
GetPreprocessedFcWeights(rtc::ArrayView<const int8_t> weights,size_t output_size)78 std::vector<float> GetPreprocessedFcWeights(
79     rtc::ArrayView<const int8_t> weights,
80     size_t output_size) {
81   if (output_size == 1) {
82     return GetScaledParams(weights);
83   }
84   // Transpose, scale and cast.
85   const size_t input_size = rtc::CheckedDivExact(weights.size(), output_size);
86   std::vector<float> w(weights.size());
87   for (size_t o = 0; o < output_size; ++o) {
88     for (size_t i = 0; i < input_size; ++i) {
89       w[o * input_size + i] = rnnoise::kWeightsScale *
90                               static_cast<float>(weights[i * output_size + o]);
91     }
92   }
93   return w;
94 }
95 
// Number of gates in a GRU cell.
constexpr size_t kNumGruGates = 3;  // Update, reset, output.
97 
98 // TODO(bugs.chromium.org/10480): Hard-coded optimized layout and remove this
99 // function to improve setup time.
100 // Casts and scales |tensor_src| for a GRU layer and re-arranges the layout.
101 // It works both for weights, recurrent weights and bias.
GetPreprocessedGruTensor(rtc::ArrayView<const int8_t> tensor_src,size_t output_size)102 std::vector<float> GetPreprocessedGruTensor(
103     rtc::ArrayView<const int8_t> tensor_src,
104     size_t output_size) {
105   // Transpose, cast and scale.
106   // |n| is the size of the first dimension of the 3-dim tensor |weights|.
107   const size_t n =
108       rtc::CheckedDivExact(tensor_src.size(), output_size * kNumGruGates);
109   const size_t stride_src = kNumGruGates * output_size;
110   const size_t stride_dst = n * output_size;
111   std::vector<float> tensor_dst(tensor_src.size());
112   for (size_t g = 0; g < kNumGruGates; ++g) {
113     for (size_t o = 0; o < output_size; ++o) {
114       for (size_t i = 0; i < n; ++i) {
115         tensor_dst[g * stride_dst + o * n + i] =
116             rnnoise::kWeightsScale *
117             static_cast<float>(
118                 tensor_src[i * stride_src + g * output_size + o]);
119       }
120     }
121   }
122   return tensor_dst;
123 }
124 
ComputeGruUpdateResetGates(size_t input_size,size_t output_size,rtc::ArrayView<const float> weights,rtc::ArrayView<const float> recurrent_weights,rtc::ArrayView<const float> bias,rtc::ArrayView<const float> input,rtc::ArrayView<const float> state,rtc::ArrayView<float> gate)125 void ComputeGruUpdateResetGates(size_t input_size,
126                                 size_t output_size,
127                                 rtc::ArrayView<const float> weights,
128                                 rtc::ArrayView<const float> recurrent_weights,
129                                 rtc::ArrayView<const float> bias,
130                                 rtc::ArrayView<const float> input,
131                                 rtc::ArrayView<const float> state,
132                                 rtc::ArrayView<float> gate) {
133   for (size_t o = 0; o < output_size; ++o) {
134     gate[o] = bias[o];
135     for (size_t i = 0; i < input_size; ++i) {
136       gate[o] += input[i] * weights[o * input_size + i];
137     }
138     for (size_t s = 0; s < output_size; ++s) {
139       gate[o] += state[s] * recurrent_weights[o * output_size + s];
140     }
141     gate[o] = SigmoidApproximated(gate[o]);
142   }
143 }
144 
ComputeGruOutputGate(size_t input_size,size_t output_size,rtc::ArrayView<const float> weights,rtc::ArrayView<const float> recurrent_weights,rtc::ArrayView<const float> bias,rtc::ArrayView<const float> input,rtc::ArrayView<const float> state,rtc::ArrayView<const float> reset,rtc::ArrayView<float> gate)145 void ComputeGruOutputGate(size_t input_size,
146                           size_t output_size,
147                           rtc::ArrayView<const float> weights,
148                           rtc::ArrayView<const float> recurrent_weights,
149                           rtc::ArrayView<const float> bias,
150                           rtc::ArrayView<const float> input,
151                           rtc::ArrayView<const float> state,
152                           rtc::ArrayView<const float> reset,
153                           rtc::ArrayView<float> gate) {
154   for (size_t o = 0; o < output_size; ++o) {
155     gate[o] = bias[o];
156     for (size_t i = 0; i < input_size; ++i) {
157       gate[o] += input[i] * weights[o * input_size + i];
158     }
159     for (size_t s = 0; s < output_size; ++s) {
160       gate[o] += state[s] * recurrent_weights[o * output_size + s] * reset[s];
161     }
162     gate[o] = RectifiedLinearUnit(gate[o]);
163   }
164 }
165 
166 // Gated recurrent unit (GRU) layer un-optimized implementation.
ComputeGruLayerOutput(size_t input_size,size_t output_size,rtc::ArrayView<const float> input,rtc::ArrayView<const float> weights,rtc::ArrayView<const float> recurrent_weights,rtc::ArrayView<const float> bias,rtc::ArrayView<float> state)167 void ComputeGruLayerOutput(size_t input_size,
168                            size_t output_size,
169                            rtc::ArrayView<const float> input,
170                            rtc::ArrayView<const float> weights,
171                            rtc::ArrayView<const float> recurrent_weights,
172                            rtc::ArrayView<const float> bias,
173                            rtc::ArrayView<float> state) {
174   RTC_DCHECK_EQ(input_size, input.size());
175   // Stride and offset used to read parameter arrays.
176   const size_t stride_in = input_size * output_size;
177   const size_t stride_out = output_size * output_size;
178 
179   // Update gate.
180   std::array<float, kRecurrentLayersMaxUnits> update;
181   ComputeGruUpdateResetGates(
182       input_size, output_size, weights.subview(0, stride_in),
183       recurrent_weights.subview(0, stride_out), bias.subview(0, output_size),
184       input, state, update);
185 
186   // Reset gate.
187   std::array<float, kRecurrentLayersMaxUnits> reset;
188   ComputeGruUpdateResetGates(
189       input_size, output_size, weights.subview(stride_in, stride_in),
190       recurrent_weights.subview(stride_out, stride_out),
191       bias.subview(output_size, output_size), input, state, reset);
192 
193   // Output gate.
194   std::array<float, kRecurrentLayersMaxUnits> output;
195   ComputeGruOutputGate(
196       input_size, output_size, weights.subview(2 * stride_in, stride_in),
197       recurrent_weights.subview(2 * stride_out, stride_out),
198       bias.subview(2 * output_size, output_size), input, state, reset, output);
199 
200   // Update output through the update gates and update the state.
201   for (size_t o = 0; o < output_size; ++o) {
202     output[o] = update[o] * state[o] + (1.f - update[o]) * output[o];
203     state[o] = output[o];
204   }
205 }
206 
207 // Fully connected layer un-optimized implementation.
ComputeFullyConnectedLayerOutput(size_t input_size,size_t output_size,rtc::ArrayView<const float> input,rtc::ArrayView<const float> bias,rtc::ArrayView<const float> weights,rtc::FunctionView<float (float)> activation_function,rtc::ArrayView<float> output)208 void ComputeFullyConnectedLayerOutput(
209     size_t input_size,
210     size_t output_size,
211     rtc::ArrayView<const float> input,
212     rtc::ArrayView<const float> bias,
213     rtc::ArrayView<const float> weights,
214     rtc::FunctionView<float(float)> activation_function,
215     rtc::ArrayView<float> output) {
216   RTC_DCHECK_EQ(input.size(), input_size);
217   RTC_DCHECK_EQ(bias.size(), output_size);
218   RTC_DCHECK_EQ(weights.size(), input_size * output_size);
219   for (size_t o = 0; o < output_size; ++o) {
220     output[o] = bias[o];
221     // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
222     // |weights_| change the performance across different platforms.
223     for (size_t i = 0; i < input_size; ++i) {
224       output[o] += input[i] * weights[o * input_size + i];
225     }
226     output[o] = activation_function(output[o]);
227   }
228 }
229 
230 #if defined(WEBRTC_ARCH_X86_FAMILY)
231 // Fully connected layer SSE2 implementation.
ComputeFullyConnectedLayerOutputSse2(size_t input_size,size_t output_size,rtc::ArrayView<const float> input,rtc::ArrayView<const float> bias,rtc::ArrayView<const float> weights,rtc::FunctionView<float (float)> activation_function,rtc::ArrayView<float> output)232 void ComputeFullyConnectedLayerOutputSse2(
233     size_t input_size,
234     size_t output_size,
235     rtc::ArrayView<const float> input,
236     rtc::ArrayView<const float> bias,
237     rtc::ArrayView<const float> weights,
238     rtc::FunctionView<float(float)> activation_function,
239     rtc::ArrayView<float> output) {
240   RTC_DCHECK_EQ(input.size(), input_size);
241   RTC_DCHECK_EQ(bias.size(), output_size);
242   RTC_DCHECK_EQ(weights.size(), input_size * output_size);
243   const size_t input_size_by_4 = input_size >> 2;
244   const size_t offset = input_size & ~3;
245   __m128 sum_wx_128;
246   const float* v = reinterpret_cast<const float*>(&sum_wx_128);
247   for (size_t o = 0; o < output_size; ++o) {
248     // Perform 128 bit vector operations.
249     sum_wx_128 = _mm_set1_ps(0);
250     const float* x_p = input.data();
251     const float* w_p = weights.data() + o * input_size;
252     for (size_t i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
253       sum_wx_128 = _mm_add_ps(sum_wx_128,
254                               _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
255     }
256     // Perform non-vector operations for any remaining items, sum up bias term
257     // and results from the vectorized code, and apply the activation function.
258     output[o] = activation_function(
259         std::inner_product(input.begin() + offset, input.end(),
260                            weights.begin() + o * input_size + offset,
261                            bias[o] + v[0] + v[1] + v[2] + v[3]));
262   }
263 }
264 #endif
265 
266 }  // namespace
267 
FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    rtc::FunctionView<float(float)> activation_function,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      // Quantized parameters are scaled to float once at construction time.
      bias_(GetScaledParams(bias)),
      // Weights are additionally re-laid out (one contiguous row per output
      // unit) for the compute kernels.
      weights_(GetPreprocessedFcWeights(weights, output_size)),
      activation_function_(activation_function),
      optimization_(optimization) {
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}
289 
// Defaulted out-of-line; no special cleanup is required.
FullyConnectedLayer::~FullyConnectedLayer() = default;
291 
GetOutput() const292 rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
293   return rtc::ArrayView<const float>(output_.data(), output_size_);
294 }
295 
// Computes the layer activations for |input|, dispatching to the best
// implementation available for the configured optimization; the plain C++
// kernel is the fallback.
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
                                           bias_, weights_,
                                           activation_function_, output_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    case Optimization::kNeon:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
      break;
#endif
    default:
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
  }
}
317 
GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      // All quantized tensors are scaled to float and re-laid out (one
      // contiguous section per GRU gate) once at construction time.
      bias_(GetPreprocessedGruTensor(bias, output_size)),
      weights_(GetPreprocessedGruTensor(weights, output_size)),
      recurrent_weights_(
          GetPreprocessedGruTensor(recurrent_weights, output_size)),
      optimization_(optimization) {
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
         "sufficient.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(kNumGruGates * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_ * output_size_,
                recurrent_weights_.size())
      << "Mismatching input-output size and recurrent weight coefficients array"
         " size.";
  // Start from a zeroed recurrent state.
  Reset();
}
345 
// Defaulted out-of-line; no special cleanup is required.
GatedRecurrentLayer::~GatedRecurrentLayer() = default;
347 
GetOutput() const348 rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
349   return rtc::ArrayView<const float>(state_.data(), output_size_);
350 }
351 
Reset()352 void GatedRecurrentLayer::Reset() {
353   state_.fill(0.f);
354 }
355 
// Advances the recurrent state by one step using |input|. All cases currently
// fall back to the plain C++ GRU kernel since no SSE2/NEON implementation
// exists yet.
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    case Optimization::kNeon:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
#endif
    default:
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
  }
}
377 
RnnBasedVad::RnnBasedVad()
    // Dense input layer with the approximated tansig activation.
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated,
                   DetectOptimization()),
      // Recurrent (GRU) hidden layer.
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    DetectOptimization()),
      // Dense output layer with the approximated sigmoid activation; produces
      // the VAD probability.
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated,
                    DetectOptimization()) {
  // Input-output chaining size checks.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}
403 
// Defaulted out-of-line; no special cleanup is required.
RnnBasedVad::~RnnBasedVad() = default;
405 
// Resets the network. Only the GRU layer carries state across calls; the
// dense layers' outputs are fully overwritten on each ComputeOutput() call.
void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}
409 
ComputeVadProbability(rtc::ArrayView<const float,kFeatureVectorSize> feature_vector,bool is_silence)410 float RnnBasedVad::ComputeVadProbability(
411     rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
412     bool is_silence) {
413   if (is_silence) {
414     Reset();
415     return 0.f;
416   }
417   input_layer_.ComputeOutput(feature_vector);
418   hidden_layer_.ComputeOutput(input_layer_.GetOutput());
419   output_layer_.ComputeOutput(hidden_layer_.GetOutput());
420   const auto vad_output = output_layer_.GetOutput();
421   return vad_output[0];
422 }
423 
424 }  // namespace rnn_vad
425 }  // namespace webrtc
426