1 /*
2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_coding/neteq/time_stretch.h"
12 
13 #include <algorithm>  // min, max
14 #include <memory>
15 
16 #include "common_audio/signal_processing/include/signal_processing_library.h"
17 #include "modules/audio_coding/neteq/background_noise.h"
18 #include "modules/audio_coding/neteq/cross_correlation.h"
19 #include "modules/audio_coding/neteq/dsp_helper.h"
20 #include "rtc_base/numerics/safe_conversions.h"
21 
22 namespace webrtc {
23 
Process(const int16_t * input,size_t input_len,bool fast_mode,AudioMultiVector * output,size_t * length_change_samples)24 TimeStretch::ReturnCodes TimeStretch::Process(const int16_t* input,
25                                               size_t input_len,
26                                               bool fast_mode,
27                                               AudioMultiVector* output,
28                                               size_t* length_change_samples) {
29   // Pre-calculate common multiplication with |fs_mult_|.
30   size_t fs_mult_120 =
31       static_cast<size_t>(fs_mult_ * 120);  // Corresponds to 15 ms.
32 
33   const int16_t* signal;
34   std::unique_ptr<int16_t[]> signal_array;
35   size_t signal_len;
36   if (num_channels_ == 1) {
37     signal = input;
38     signal_len = input_len;
39   } else {
40     // We want |signal| to be only the first channel of |input|, which is
41     // interleaved. Thus, we take the first sample, skip forward |num_channels|
42     // samples, and continue like that.
43     signal_len = input_len / num_channels_;
44     signal_array.reset(new int16_t[signal_len]);
45     signal = signal_array.get();
46     size_t j = kRefChannel;
47     for (size_t i = 0; i < signal_len; ++i) {
48       signal_array[i] = input[j];
49       j += num_channels_;
50     }
51   }
52 
53   // Find maximum absolute value of input signal.
54   max_input_value_ = WebRtcSpl_MaxAbsValueW16(signal, signal_len);
55 
56   // Downsample to 4 kHz sample rate and calculate auto-correlation.
57   DspHelper::DownsampleTo4kHz(signal, signal_len, kDownsampledLen,
58                               sample_rate_hz_, true /* compensate delay*/,
59                               downsampled_input_);
60   AutoCorrelation();
61 
62   // Find the strongest correlation peak.
63   static const size_t kNumPeaks = 1;
64   size_t peak_index;
65   int16_t peak_value;
66   DspHelper::PeakDetection(auto_correlation_, kCorrelationLen, kNumPeaks,
67                            fs_mult_, &peak_index, &peak_value);
68   // Assert that |peak_index| stays within boundaries.
69   assert(peak_index <= (2 * kCorrelationLen - 1) * fs_mult_);
70 
71   // Compensate peak_index for displaced starting position. The displacement
72   // happens in AutoCorrelation(). Here, |kMinLag| is in the down-sampled 4 kHz
73   // domain, while the |peak_index| is in the original sample rate; hence, the
74   // multiplication by fs_mult_ * 2.
75   peak_index += kMinLag * fs_mult_ * 2;
76   // Assert that |peak_index| stays within boundaries.
77   assert(peak_index >= static_cast<size_t>(20 * fs_mult_));
78   assert(peak_index <= 20 * fs_mult_ + (2 * kCorrelationLen - 1) * fs_mult_);
79 
80   // Calculate scaling to ensure that |peak_index| samples can be square-summed
81   // without overflowing.
82   int scaling = 31 - WebRtcSpl_NormW32(max_input_value_ * max_input_value_) -
83                 WebRtcSpl_NormW32(static_cast<int32_t>(peak_index));
84   scaling = std::max(0, scaling);
85 
86   // |vec1| starts at 15 ms minus one pitch period.
87   const int16_t* vec1 = &signal[fs_mult_120 - peak_index];
88   // |vec2| start at 15 ms.
89   const int16_t* vec2 = &signal[fs_mult_120];
90   // Calculate energies for |vec1| and |vec2|, assuming they both contain
91   // |peak_index| samples.
92   int32_t vec1_energy =
93       WebRtcSpl_DotProductWithScale(vec1, vec1, peak_index, scaling);
94   int32_t vec2_energy =
95       WebRtcSpl_DotProductWithScale(vec2, vec2, peak_index, scaling);
96 
97   // Calculate cross-correlation between |vec1| and |vec2|.
98   int32_t cross_corr =
99       WebRtcSpl_DotProductWithScale(vec1, vec2, peak_index, scaling);
100 
101   // Check if the signal seems to be active speech or not (simple VAD).
102   bool active_speech =
103       SpeechDetection(vec1_energy, vec2_energy, peak_index, scaling);
104 
105   int16_t best_correlation;
106   if (!active_speech) {
107     SetParametersForPassiveSpeech(signal_len, &best_correlation, &peak_index);
108   } else {
109     // Calculate correlation:
110     // cross_corr / sqrt(vec1_energy * vec2_energy).
111 
112     // Start with calculating scale values.
113     int energy1_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec1_energy));
114     int energy2_scale = std::max(0, 16 - WebRtcSpl_NormW32(vec2_energy));
115 
116     // Make sure total scaling is even (to simplify scale factor after sqrt).
117     if ((energy1_scale + energy2_scale) & 1) {
118       // The sum is odd.
119       energy1_scale += 1;
120     }
121 
122     // Scale energies to int16_t.
123     int16_t vec1_energy_int16 =
124         static_cast<int16_t>(vec1_energy >> energy1_scale);
125     int16_t vec2_energy_int16 =
126         static_cast<int16_t>(vec2_energy >> energy2_scale);
127 
128     // Calculate square-root of energy product.
129     int16_t sqrt_energy_prod =
130         WebRtcSpl_SqrtFloor(vec1_energy_int16 * vec2_energy_int16);
131 
132     // Calculate cross_corr / sqrt(en1*en2) in Q14.
133     int temp_scale = 14 - (energy1_scale + energy2_scale) / 2;
134     cross_corr = WEBRTC_SPL_SHIFT_W32(cross_corr, temp_scale);
135     cross_corr = std::max(0, cross_corr);  // Don't use if negative.
136     best_correlation = WebRtcSpl_DivW32W16(cross_corr, sqrt_energy_prod);
137     // Make sure |best_correlation| is no larger than 1 in Q14.
138     best_correlation = std::min(static_cast<int16_t>(16384), best_correlation);
139   }
140 
141   // Check accelerate criteria and stretch the signal.
142   ReturnCodes return_value =
143       CheckCriteriaAndStretch(input, input_len, peak_index, best_correlation,
144                               active_speech, fast_mode, output);
145   switch (return_value) {
146     case kSuccess:
147       *length_change_samples = peak_index;
148       break;
149     case kSuccessLowEnergy:
150       *length_change_samples = peak_index;
151       break;
152     case kNoStretch:
153     case kError:
154       *length_change_samples = 0;
155       break;
156   }
157   return return_value;
158 }
159 
AutoCorrelation()160 void TimeStretch::AutoCorrelation() {
161   // Calculate correlation from lag kMinLag to lag kMaxLag in 4 kHz domain.
162   int32_t auto_corr[kCorrelationLen];
163   CrossCorrelationWithAutoShift(
164       &downsampled_input_[kMaxLag], &downsampled_input_[kMaxLag - kMinLag],
165       kCorrelationLen, kMaxLag - kMinLag, -1, auto_corr);
166 
167   // Normalize correlation to 14 bits and write to |auto_correlation_|.
168   int32_t max_corr = WebRtcSpl_MaxAbsValueW32(auto_corr, kCorrelationLen);
169   int scaling = std::max(0, 17 - WebRtcSpl_NormW32(max_corr));
170   WebRtcSpl_VectorBitShiftW32ToW16(auto_correlation_, kCorrelationLen,
171                                    auto_corr, scaling);
172 }
173 
SpeechDetection(int32_t vec1_energy,int32_t vec2_energy,size_t peak_index,int scaling) const174 bool TimeStretch::SpeechDetection(int32_t vec1_energy,
175                                   int32_t vec2_energy,
176                                   size_t peak_index,
177                                   int scaling) const {
178   // Check if the signal seems to be active speech or not (simple VAD).
179   // If (vec1_energy + vec2_energy) / (2 * peak_index) <=
180   // 8 * background_noise_energy, then we say that the signal contains no
181   // active speech.
182   // Rewrite the inequality as:
183   // (vec1_energy + vec2_energy) / 16 <= peak_index * background_noise_energy.
184   // The two sides of the inequality will be denoted |left_side| and
185   // |right_side|.
186   int32_t left_side = rtc::saturated_cast<int32_t>(
187       (static_cast<int64_t>(vec1_energy) + vec2_energy) / 16);
188   int32_t right_side;
189   if (background_noise_.initialized()) {
190     right_side = background_noise_.Energy(kRefChannel);
191   } else {
192     // If noise parameters have not been estimated, use a fixed threshold.
193     right_side = 75000;
194   }
195   int right_scale = 16 - WebRtcSpl_NormW32(right_side);
196   right_scale = std::max(0, right_scale);
197   left_side = left_side >> right_scale;
198   right_side =
199       rtc::dchecked_cast<int32_t>(peak_index) * (right_side >> right_scale);
200 
201   // Scale |left_side| properly before comparing with |right_side|.
202   // (|scaling| is the scale factor before energy calculation, thus the scale
203   // factor for the energy is 2 * scaling.)
204   if (WebRtcSpl_NormW32(left_side) < 2 * scaling) {
205     // Cannot scale only |left_side|, must scale |right_side| too.
206     int temp_scale = WebRtcSpl_NormW32(left_side);
207     left_side = left_side << temp_scale;
208     right_side = right_side >> (2 * scaling - temp_scale);
209   } else {
210     left_side = left_side << 2 * scaling;
211   }
212   return left_side > right_side;
213 }
214 
215 }  // namespace webrtc
216