1 /*
2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
#include "modules/audio_processing/test/conversational_speech/multiend_call.h"

#include <algorithm>
#include <cstdint>
#include <iterator>

#include "rtc_base/logging.h"
#include "test/testsupport/file_utils.h"
18
19 namespace webrtc {
20 namespace test {
21 namespace conversational_speech {
22
MultiEndCall(rtc::ArrayView<const Turn> timing,const std::string & audiotracks_path,std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)23 MultiEndCall::MultiEndCall(
24 rtc::ArrayView<const Turn> timing,
25 const std::string& audiotracks_path,
26 std::unique_ptr<WavReaderAbstractFactory> wavreader_abstract_factory)
27 : timing_(timing),
28 audiotracks_path_(audiotracks_path),
29 wavreader_abstract_factory_(std::move(wavreader_abstract_factory)),
30 valid_(false) {
31 FindSpeakerNames();
32 if (CreateAudioTrackReaders())
33 valid_ = CheckTiming();
34 }
35
// Defaulted destructor, defined out of line in this translation unit.
// NOTE(review): presumably kept out of the header so that the types held via
// std::unique_ptr only need to be complete here - confirm against the header.
MultiEndCall::~MultiEndCall() = default;
37
FindSpeakerNames()38 void MultiEndCall::FindSpeakerNames() {
39 RTC_DCHECK(speaker_names_.empty());
40 for (const Turn& turn : timing_) {
41 speaker_names_.emplace(turn.speaker_name);
42 }
43 }
44
CreateAudioTrackReaders()45 bool MultiEndCall::CreateAudioTrackReaders() {
46 RTC_DCHECK(audiotrack_readers_.empty());
47 sample_rate_hz_ = 0; // Sample rate will be set when reading the first track.
48 for (const Turn& turn : timing_) {
49 auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
50 if (it != audiotrack_readers_.end())
51 continue;
52
53 const std::string audiotrack_file_path =
54 test::JoinFilename(audiotracks_path_, turn.audiotrack_file_name);
55
56 // Map the audiotrack file name to a new instance of WavReaderInterface.
57 std::unique_ptr<WavReaderInterface> wavreader =
58 wavreader_abstract_factory_->Create(
59 test::JoinFilename(audiotracks_path_, turn.audiotrack_file_name));
60
61 if (sample_rate_hz_ == 0) {
62 sample_rate_hz_ = wavreader->SampleRate();
63 } else if (sample_rate_hz_ != wavreader->SampleRate()) {
64 RTC_LOG(LS_ERROR)
65 << "All the audio tracks should have the same sample rate.";
66 return false;
67 }
68
69 if (wavreader->NumChannels() != 1) {
70 RTC_LOG(LS_ERROR) << "Only mono audio tracks supported.";
71 return false;
72 }
73
74 audiotrack_readers_.emplace(turn.audiotrack_file_name,
75 std::move(wavreader));
76 }
77
78 return true;
79 }
80
CheckTiming()81 bool MultiEndCall::CheckTiming() {
82 struct Interval {
83 size_t begin;
84 size_t end;
85 };
86 size_t number_of_turns = timing_.size();
87 auto millisecond_to_samples = [](int ms, int sr) -> int {
88 // Truncation may happen if the sampling rate is not an integer multiple
89 // of 1000 (e.g., 44100).
90 return ms * sr / 1000;
91 };
92 auto in_interval = [](size_t value, const Interval& interval) {
93 return interval.begin <= value && value < interval.end;
94 };
95 total_duration_samples_ = 0;
96 speaking_turns_.clear();
97
98 // Begin and end timestamps for the last two turns (unit: number of samples).
99 Interval second_last_turn = {0, 0};
100 Interval last_turn = {0, 0};
101
102 // Initialize map to store speaking turn indices of each speaker (used to
103 // detect self cross-talk).
104 std::map<std::string, std::vector<size_t>> speaking_turn_indices;
105 for (const std::string& speaker_name : speaker_names_) {
106 speaking_turn_indices.emplace(std::piecewise_construct,
107 std::forward_as_tuple(speaker_name),
108 std::forward_as_tuple());
109 }
110
111 // Parse turns.
112 for (size_t turn_index = 0; turn_index < number_of_turns; ++turn_index) {
113 const Turn& turn = timing_[turn_index];
114 auto it = audiotrack_readers_.find(turn.audiotrack_file_name);
115 RTC_CHECK(it != audiotrack_readers_.end())
116 << "Audio track reader not created";
117
118 // Begin and end timestamps for the current turn.
119 int offset_samples =
120 millisecond_to_samples(turn.offset, it->second->SampleRate());
121 std::size_t begin_timestamp = last_turn.end + offset_samples;
122 std::size_t end_timestamp = begin_timestamp + it->second->NumSamples();
123 RTC_LOG(LS_INFO) << "turn #" << turn_index << " " << begin_timestamp << "-"
124 << end_timestamp << " ms";
125
126 // The order is invalid if the offset is negative and its absolute value is
127 // larger then the duration of the previous turn.
128 if (offset_samples < 0 &&
129 -offset_samples > static_cast<int>(last_turn.end - last_turn.begin)) {
130 RTC_LOG(LS_ERROR) << "invalid order";
131 return false;
132 }
133
134 // Cross-talk with 3 or more speakers occurs when the beginning of the
135 // current interval falls in the last two turns.
136 if (turn_index > 1 && in_interval(begin_timestamp, last_turn) &&
137 in_interval(begin_timestamp, second_last_turn)) {
138 RTC_LOG(LS_ERROR) << "cross-talk with 3+ speakers";
139 return false;
140 }
141
142 // Append turn.
143 speaking_turns_.emplace_back(turn.speaker_name, turn.audiotrack_file_name,
144 begin_timestamp, end_timestamp, turn.gain);
145
146 // Save speaking turn index for self cross-talk detection.
147 RTC_DCHECK_EQ(speaking_turns_.size(), turn_index + 1);
148 speaking_turn_indices[turn.speaker_name].push_back(turn_index);
149
150 // Update total duration of the consversational speech.
151 if (total_duration_samples_ < end_timestamp)
152 total_duration_samples_ = end_timestamp;
153
154 // Update and continue with next turn.
155 second_last_turn = last_turn;
156 last_turn.begin = begin_timestamp;
157 last_turn.end = end_timestamp;
158 }
159
160 // Detect self cross-talk.
161 for (const std::string& speaker_name : speaker_names_) {
162 RTC_LOG(LS_INFO) << "checking self cross-talk for <" << speaker_name << ">";
163
164 // Copy all turns for this speaker to new vector.
165 std::vector<SpeakingTurn> speaking_turns_for_name;
166 std::copy_if(speaking_turns_.begin(), speaking_turns_.end(),
167 std::back_inserter(speaking_turns_for_name),
168 [&speaker_name](const SpeakingTurn& st) {
169 return st.speaker_name == speaker_name;
170 });
171
172 // Check for overlap between adjacent elements.
173 // This is a sufficient condition for self cross-talk since the intervals
174 // are sorted by begin timestamp.
175 auto overlap = std::adjacent_find(
176 speaking_turns_for_name.begin(), speaking_turns_for_name.end(),
177 [](const SpeakingTurn& a, const SpeakingTurn& b) {
178 return a.end > b.begin;
179 });
180
181 if (overlap != speaking_turns_for_name.end()) {
182 RTC_LOG(LS_ERROR) << "Self cross-talk detected";
183 return false;
184 }
185 }
186
187 return true;
188 }
189
190 } // namespace conversational_speech
191 } // namespace test
192 } // namespace webrtc
193