1 /*
2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
// This file consists of unit tests for webrtc::test::conversational_speech
// members. Some of them focus on accepting or rejecting different
// conversational speech setups. A setup is defined by a set of audio tracks and
// timing information.
15 // The docstring at the beginning of each TEST(ConversationalSpeechTest,
16 // MultiEndCallSetup*) function looks like the drawing below and indicates which
17 // setup is tested.
18 //
19 // Accept:
20 // A 0****.....
21 // B .....1****
22 //
23 // The drawing indicates the following:
24 // - the illustrated setup should be accepted,
25 // - there are two speakers (namely, A and B),
// - A speaks first, B speaks second,
27 // - each character after the speaker's letter indicates a time unit (e.g., 100
28 // ms),
29 // - "*" indicates speaking, "." listening,
30 // - numbers indicate the turn index in std::vector<Turn>.
31 //
32 // Note that the same speaker can appear in multiple lines in order to depict
33 // cases in which there are wrong offsets leading to self cross-talk (which is
34 // rejected).
35
36 // MSVC++ requires this to be set before any other includes to get M_PI.
37 #define _USE_MATH_DEFINES
38
39 #include <stdio.h>
40
41 #include <cmath>
42 #include <map>
43 #include <memory>
44 #include <vector>
45
46 #include "absl/types/optional.h"
47 #include "common_audio/wav_file.h"
48 #include "modules/audio_processing/test/conversational_speech/config.h"
49 #include "modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
50 #include "modules/audio_processing/test/conversational_speech/multiend_call.h"
51 #include "modules/audio_processing/test/conversational_speech/simulator.h"
52 #include "modules/audio_processing/test/conversational_speech/timing.h"
53 #include "modules/audio_processing/test/conversational_speech/wavreader_factory.h"
54 #include "rtc_base/logging.h"
55 #include "test/gmock.h"
56 #include "test/gtest.h"
57 #include "test/testsupport/file_utils.h"
58
59 namespace webrtc {
60 namespace test {
61 namespace {
62
63 using conversational_speech::LoadTiming;
64 using conversational_speech::MockWavReaderFactory;
65 using conversational_speech::MultiEndCall;
66 using conversational_speech::SaveTiming;
67 using conversational_speech::Turn;
68 using conversational_speech::WavReaderFactory;
69
// Placeholder paths shared by the tests below; they are handed to Config and
// MultiEndCall as plain strings.
const char* const audiotracks_path = "/path/to/audiotracks";
const char* const timing_filepath = "/path/to/timing_file.txt";
const char* const output_path = "/path/to/output_dir";
73
74 const std::vector<Turn> expected_timing = {
75 {"A", "a1", 0, 0}, {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
76 {"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
77 };
78 const std::size_t kNumberOfTurns = expected_timing.size();
79
80 // Default arguments for MockWavReaderFactory ctor.
81 // Fake audio track parameters.
82 constexpr int kDefaultSampleRate = 48000;
83 const std::map<std::string, const MockWavReaderFactory::Params>
84 kDefaultMockWavReaderFactoryParamsMap = {
85 {"t300", {kDefaultSampleRate, 1u, 14400u}}, // Mono, 0.3 seconds.
86 {"t500", {kDefaultSampleRate, 1u, 24000u}}, // Mono, 0.5 seconds.
87 {"t1000", {kDefaultSampleRate, 1u, 48000u}}, // Mono, 1.0 seconds.
88 {"sr8000", {8000, 1u, 8000u}}, // 8kHz sample rate, mono, 1 second.
89 {"sr16000", {16000, 1u, 16000u}}, // 16kHz sample rate, mono, 1 second.
90 {"sr16000_stereo", {16000, 2u, 16000u}}, // Like sr16000, but stereo.
91 };
92 const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
93 kDefaultMockWavReaderFactoryParamsMap.at("t500");
94
CreateMockWavReaderFactory()95 std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
96 return std::unique_ptr<MockWavReaderFactory>(
97 new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
98 kDefaultMockWavReaderFactoryParamsMap));
99 }
100
CreateSineWavFile(const std::string & filepath,const MockWavReaderFactory::Params & params,float frequency=440.0f)101 void CreateSineWavFile(const std::string& filepath,
102 const MockWavReaderFactory::Params& params,
103 float frequency = 440.0f) {
104 // Create samples.
105 constexpr double two_pi = 2.0 * M_PI;
106 std::vector<int16_t> samples(params.num_samples);
107 for (std::size_t i = 0; i < params.num_samples; ++i) {
108 // TODO(alessiob): the produced tone is not pure, improve.
109 samples[i] = std::lround(
110 32767.0f * std::sin(two_pi * i * frequency / params.sample_rate));
111 }
112
113 // Write samples.
114 WavWriter wav_writer(filepath, params.sample_rate, params.num_channels);
115 wav_writer.WriteSamples(samples.data(), params.num_samples);
116 }
117
118 // Parameters to generate audio tracks with CreateSineWavFile.
119 struct SineAudioTrackParams {
120 MockWavReaderFactory::Params params;
121 float frequency;
122 };
123
124 // Creates a temporary directory in which sine audio tracks are written.
CreateTemporarySineAudioTracks(const std::map<std::string,SineAudioTrackParams> & sine_tracks_params)125 std::string CreateTemporarySineAudioTracks(
126 const std::map<std::string, SineAudioTrackParams>& sine_tracks_params) {
127 // Create temporary directory.
128 std::string temp_directory =
129 OutputPath() + "TempConversationalSpeechAudioTracks";
130 CreateDir(temp_directory);
131
132 // Create sine tracks.
133 for (const auto& it : sine_tracks_params) {
134 const std::string temp_filepath = JoinFilename(temp_directory, it.first);
135 CreateSineWavFile(temp_filepath, it.second.params, it.second.frequency);
136 }
137
138 return temp_directory;
139 }
140
CheckAudioTrackParams(const WavReaderFactory & wav_reader_factory,const std::string & filepath,const MockWavReaderFactory::Params & expeted_params)141 void CheckAudioTrackParams(const WavReaderFactory& wav_reader_factory,
142 const std::string& filepath,
143 const MockWavReaderFactory::Params& expeted_params) {
144 auto wav_reader = wav_reader_factory.Create(filepath);
145 EXPECT_EQ(expeted_params.sample_rate, wav_reader->SampleRate());
146 EXPECT_EQ(expeted_params.num_channels, wav_reader->NumChannels());
147 EXPECT_EQ(expeted_params.num_samples, wav_reader->NumSamples());
148 }
149
DeleteFolderAndContents(const std::string & dir)150 void DeleteFolderAndContents(const std::string& dir) {
151 if (!DirExists(dir)) {
152 return;
153 }
154 absl::optional<std::vector<std::string>> dir_content = ReadDirectory(dir);
155 EXPECT_TRUE(dir_content);
156 for (const auto& path : *dir_content) {
157 if (DirExists(path)) {
158 DeleteFolderAndContents(path);
159 } else if (FileExists(path)) {
160 // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
161 RemoveFile(path);
162 } else {
163 FAIL();
164 }
165 }
166 // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
167 RemoveDir(dir);
168 }
169
170 } // namespace
171
172 using ::testing::_;
173
// Checks that the Config getters return the values given to the ctor.
TEST(ConversationalSpeechTest, Settings) {
  const conversational_speech::Config settings(audiotracks_path,
                                               timing_filepath, output_path);

  // Each getter must return the corresponding ctor argument.
  EXPECT_EQ(audiotracks_path, settings.audiotracks_path());
  EXPECT_EQ(timing_filepath, settings.timing_filepath());
  EXPECT_EQ(output_path, settings.output_path());
}
183
// Saves |expected_timing| to a temporary file, loads it back and checks that
// the loaded turns are equal to the saved ones.
TEST(ConversationalSpeechTest, TimingSaveLoad) {
  // Save test timing.
  const std::string temporary_filepath =
      TempFilename(OutputPath(), "TempTimingTestFile");
  SaveTiming(temporary_filepath, expected_timing);

  // Create a std::vector<Turn> instance by loading from file.
  std::vector<Turn> actual_timing = LoadTiming(temporary_filepath);
  RemoveFile(temporary_filepath);

  // Check size. Fatal on mismatch: the loop below indexes |actual_timing| with
  // indices valid for |expected_timing|, which would read out of bounds if
  // fewer turns were loaded.
  ASSERT_EQ(expected_timing.size(), actual_timing.size());

  // Check Turn instances.
  for (size_t index = 0; index < expected_timing.size(); ++index) {
    EXPECT_EQ(expected_timing[index], actual_timing[index])
        << "turn #" << index << " not matching";
  }
}
203
// Builds a MultiEndCall from |expected_timing| with a mock wav reader factory
// and checks the number of speakers, audio track readers and speaking turns.
TEST(ConversationalSpeechTest, MultiEndCallCreate) {
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are 5 unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(5);

  // Inject the mock wav reader factory.
  MultiEndCall multiend_call(expected_timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(2u, multiend_call.speaker_names().size());
  EXPECT_EQ(5u, multiend_call.audiotrack_readers().size());
  // One speaking turn is expected for each turn listed in |expected_timing|.
  EXPECT_EQ(kNumberOfTurns, multiend_call.speaking_turns().size());
}
220
// Reject: the two speakers use audio tracks with different sample rates (8 kHz
// and 16 kHz).
TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  const std::vector<Turn> turns = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000", 0, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Two unique audio tracks, hence two reads are expected.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
235
// Reject: both speakers use a stereo audio track.
TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  const std::vector<Turn> turns = {
      {"A", "sr16000_stereo", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Only one unique audio track, hence a single read is expected.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
250
// Reject: the audio tracks differ both in sample rate (8 kHz vs 16 kHz) and in
// number of channels (mono vs stereo).
TEST(ConversationalSpeechTest,
     MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
  const std::vector<Turn> turns = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Two unique audio tracks, hence two reads are expected.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
266
// Reject: the first turn has a negative offset.
TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  const std::vector<Turn> turns = {
      {"A", "t500", -100, 0},
      {"B", "t500", 0, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Only one unique audio track, hence a single read is expected.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
281
TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  // Accept:
  // A 0****.....
  // B .....1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", 0, 0},
  };
  // Expected call duration, in samples.
  constexpr std::size_t expected_duration = kDefaultSampleRate;
  auto factory = CreateMockWavReaderFactory();

  // Only one unique audio track, hence a single read is expected.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check speakers, audio track readers, turns and total duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(expected_duration, call.total_duration_samples());
}
306
TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
  // Accept:
  // A 0****.......
  // B .......1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", 200, 0},
  };
  // Expected call duration, in samples.
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  auto factory = CreateMockWavReaderFactory();

  // Only one unique audio track, hence a single read is expected.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check speakers, audio track readers, turns and total duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(expected_duration, call.total_duration_samples());
}
331
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  // Accept:
  // A 0****....
  // B ....1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", -100, 0},
  };
  // Expected call duration, in samples.
  constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9;
  auto factory = CreateMockWavReaderFactory();

  // Only one unique audio track, hence a single read is expected.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check speakers, audio track readers, turns and total duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(expected_duration, call.total_duration_samples());
}
356
TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  // Reject:
  // A ..0****
  // B .1****.  The n-th turn cannot start before the (n-1)-th one.
  const std::vector<Turn> turns = {
      {"A", "t500", 200, 0},
      {"B", "t500", -600, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Only one unique audio track, hence a single read is expected.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
374
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  // Accept:
  // A 0****2****...
  // B ...1*********
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t1000", -200, 0},
      {"A", "t500", -800, 0},
  };
  // Expected call duration, in samples.
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3;
  auto factory = CreateMockWavReaderFactory();

  // Two unique audio tracks, hence two reads are expected.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check speakers, audio track readers, turns and total duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(expected_duration, call.total_duration_samples());
}
400
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  // Reject:
  // A 0****......
  // A ...1****...
  // B ......2****
  //      ^ Turn #1 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"A", "t500", -200, 0},
      {"B", "t500", -200, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Only one unique audio track, hence a single read is expected.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
421
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  // Reject:
  // A 0*********
  // B 1**.......
  // C ...2**....
  // A ......3**.
  //         ^ Turn #3 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t300", -1000, 0},
      {"C", "t300", 0, 0},
      {"A", "t300", 0, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Two unique audio tracks, hence two reads are expected.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
444
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  // Accept:
  // A 0*********..
  // B ..1****.....
  // C .......2****
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", 0, 0},
  };
  // Expected call duration, in samples.
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  auto factory = CreateMockWavReaderFactory();

  // Two unique audio tracks, hence two reads are expected.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check speakers, audio track readers, turns and total duration.
  EXPECT_EQ(3u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(expected_duration, call.total_duration_samples());
}
471
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  // Reject:
  // A 0*********
  // B ..1****...
  // C ....2****.
  //       ^ Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
  //         not permitted).
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", -300, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Two unique audio tracks, hence two reads are expected.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
493
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  // Accept (numbers are the turn indices in the vector below):
  // A 0*********..
  // B .1****......
  // C .......2****
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -900, 0},
      {"C", "t500", 100, 0},
  };
  // Expected call duration, in samples.
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  auto factory = CreateMockWavReaderFactory();

  // Two unique audio tracks, hence two reads are expected.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check speakers, audio track readers, turns and total duration.
  EXPECT_EQ(3u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(expected_duration, call.total_duration_samples());
}
520
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  // Accept:
  // A 0****
  // B 1****
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", -500, 0},
  };
  auto factory = CreateMockWavReaderFactory();

  // Only one unique audio track, hence a single read is expected.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check speakers, audio track readers and turns.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
}
543
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  // Accept:
  // A 0****....3****.5**.
  // B .....1****...4**...
  // C ......2**.......6**..
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},
      {"B", "t500", 0, 0},
      {"C", "t300", -400, 0},
      {"A", "t500", 0, 0},
      {"B", "t300", -100, 0},
      {"A", "t300", -100, 0},
      {"C", "t300", -200, 0},
  };
  // Use the shared helper, as the other tests do, instead of duplicating the
  // factory construction.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(7u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}
572
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  // Reject (numbers are the turn indices in the vector below):
  // A 0****....3****5**.
  // B .....1****...4**..
  // C ......2**......6**
  //                  ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
  //                    speakers not permitted).
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},
      {"B", "t500", 0, 0},
      {"C", "t300", -400, 0},
      {"A", "t500", 0, 0},
      {"B", "t300", -100, 0},
      {"A", "t300", -200, 0},
      {"C", "t300", -200, 0},
  };
  // Use the shared helper, as the other tests do, instead of duplicating the
  // factory construction.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  MultiEndCall multiend_call(timing, audiotracks_path,
                             std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}
596
// For a set of sample rates, writes a mono sine wav file and checks that a
// reader created by WavReaderFactory reports the parameters used to write it.
TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
  // Parameters with which wav files are created.
  constexpr int duration_seconds = 5;
  const int sample_rates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000};

  for (int rate : sample_rates) {
    const std::string wav_filepath =
        OutputPath() + "TempSineWavFile_" + std::to_string(rate) + ".wav";

    // Generate the file.
    const std::size_t total_samples = duration_seconds * rate;
    MockWavReaderFactory::Params written_params = {rate, 1u, total_samples};
    CreateSineWavFile(wav_filepath, written_params);

    // Read the file back and compare against the parameters used above.
    WavReaderFactory wav_reader_factory;
    MockWavReaderFactory::Params read_params = {rate, 1u, total_samples};
    CheckAudioTrackParams(wav_reader_factory, wav_filepath, read_params);

    // Remove the temporary file.
    RemoveFile(wav_filepath);
  }
}
621
// Simulates a two-speaker call using real (generated) wav files and checks
// the parameters of the near-end and far-end files produced for each speaker.
TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
  // Simulated call (one character corresponding to 500 ms):
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
  const std::vector<Turn> timing = {
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", 500, 0},
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", -2500, 0},
  };
  const std::size_t expected_duration_seconds = 18;

  // Generate the two sine audio tracks in a temporary directory.
  const int sample_rate = 16000;
  const std::map<std::string, SineAudioTrackParams> sine_tracks_params = {
      {"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}},
      {"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}},
  };
  const std::string tracks_dir =
      CreateTemporarySineAudioTracks(sine_tracks_params);

  // Build the multi-end call on top of a real wav reader factory.
  std::unique_ptr<WavReaderFactory> real_factory(new WavReaderFactory());
  MultiEndCall call(timing, tracks_dir, std::move(real_factory));

  // Run the simulation, writing into a sub-directory of |tracks_dir|.
  std::string simulator_output_dir = JoinFilename(tracks_dir, "output");
  CreateDir(simulator_output_dir);
  RTC_LOG(LS_VERBOSE) << "simulator output path: " << simulator_output_dir;
  auto generated_pairs =
      conversational_speech::Simulate(call, simulator_output_dir);
  EXPECT_EQ(2u, generated_pairs->size());

  // Every generated file must be mono, sampled at |sample_rate| and as long
  // as the whole call.
  WavReaderFactory wav_reader_factory;
  const MockWavReaderFactory::Params expected_params = {
      sample_rate, 1u, sample_rate * expected_duration_seconds};
  for (const auto& speaker_output : *generated_pairs) {
    RTC_LOG(LS_VERBOSE) << "checking far/near-end for <"
                        << speaker_output.first << ">";
    CheckAudioTrackParams(wav_reader_factory, speaker_output.second.near_end,
                          expected_params);
    CheckAudioTrackParams(wav_reader_factory, speaker_output.second.far_end,
                          expected_params);
  }

  // Remove the temporary directory and everything created in it.
  EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(tracks_dir));
}
672
673 } // namespace test
674 } // namespace webrtc
675