1 /*
2  *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
// This file consists of unit tests for webrtc::test::conversational_speech
// members. Part of them focus on accepting or rejecting different
// conversational speech setups. A setup is defined by a set of audio tracks and
// timing information.
15 // The docstring at the beginning of each TEST(ConversationalSpeechTest,
16 // MultiEndCallSetup*) function looks like the drawing below and indicates which
17 // setup is tested.
18 //
19 //    Accept:
20 //    A 0****.....
21 //    B .....1****
22 //
23 // The drawing indicates the following:
24 // - the illustrated setup should be accepted,
25 // - there are two speakers (namely, A and B),
// - A speaks first, B speaks second,
27 // - each character after the speaker's letter indicates a time unit (e.g., 100
28 //   ms),
29 // - "*" indicates speaking, "." listening,
30 // - numbers indicate the turn index in std::vector<Turn>.
31 //
32 // Note that the same speaker can appear in multiple lines in order to depict
33 // cases in which there are wrong offsets leading to self cross-talk (which is
34 // rejected).
35 
36 // MSVC++ requires this to be set before any other includes to get M_PI.
37 #define _USE_MATH_DEFINES
38 
39 #include <stdio.h>
40 
41 #include <cmath>
42 #include <map>
43 #include <memory>
44 #include <vector>
45 
46 #include "absl/types/optional.h"
47 #include "common_audio/wav_file.h"
48 #include "modules/audio_processing/test/conversational_speech/config.h"
49 #include "modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
50 #include "modules/audio_processing/test/conversational_speech/multiend_call.h"
51 #include "modules/audio_processing/test/conversational_speech/simulator.h"
52 #include "modules/audio_processing/test/conversational_speech/timing.h"
53 #include "modules/audio_processing/test/conversational_speech/wavreader_factory.h"
54 #include "rtc_base/logging.h"
55 #include "test/gmock.h"
56 #include "test/gtest.h"
57 #include "test/testsupport/file_utils.h"
58 
59 namespace webrtc {
60 namespace test {
61 namespace {
62 
63 using conversational_speech::LoadTiming;
64 using conversational_speech::MockWavReaderFactory;
65 using conversational_speech::MultiEndCall;
66 using conversational_speech::SaveTiming;
67 using conversational_speech::Turn;
68 using conversational_speech::WavReaderFactory;
69 
// Placeholder locations used by the tests below. They are only passed around
// as strings (to Config and to MultiEndCall together with a mock factory).
constexpr const char* audiotracks_path = "/path/to/audiotracks";
constexpr const char* timing_filepath = "/path/to/timing_file.txt";
constexpr const char* output_path = "/path/to/output_dir";
73 
74 const std::vector<Turn> expected_timing = {
75     {"A", "a1", 0, 0},    {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
76     {"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
77 };
78 const std::size_t kNumberOfTurns = expected_timing.size();
79 
80 // Default arguments for MockWavReaderFactory ctor.
81 // Fake audio track parameters.
82 constexpr int kDefaultSampleRate = 48000;
83 const std::map<std::string, const MockWavReaderFactory::Params>
84     kDefaultMockWavReaderFactoryParamsMap = {
85         {"t300", {kDefaultSampleRate, 1u, 14400u}},   // Mono, 0.3 seconds.
86         {"t500", {kDefaultSampleRate, 1u, 24000u}},   // Mono, 0.5 seconds.
87         {"t1000", {kDefaultSampleRate, 1u, 48000u}},  // Mono, 1.0 seconds.
88         {"sr8000", {8000, 1u, 8000u}},     // 8kHz sample rate, mono, 1 second.
89         {"sr16000", {16000, 1u, 16000u}},  // 16kHz sample rate, mono, 1 second.
90         {"sr16000_stereo", {16000, 2u, 16000u}},  // Like sr16000, but stereo.
91 };
92 const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
93     kDefaultMockWavReaderFactoryParamsMap.at("t500");
94 
CreateMockWavReaderFactory()95 std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
96   return std::unique_ptr<MockWavReaderFactory>(
97       new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
98                                kDefaultMockWavReaderFactoryParamsMap));
99 }
100 
CreateSineWavFile(const std::string & filepath,const MockWavReaderFactory::Params & params,float frequency=440.0f)101 void CreateSineWavFile(const std::string& filepath,
102                        const MockWavReaderFactory::Params& params,
103                        float frequency = 440.0f) {
104   // Create samples.
105   constexpr double two_pi = 2.0 * M_PI;
106   std::vector<int16_t> samples(params.num_samples);
107   for (std::size_t i = 0; i < params.num_samples; ++i) {
108     // TODO(alessiob): the produced tone is not pure, improve.
109     samples[i] = std::lround(
110         32767.0f * std::sin(two_pi * i * frequency / params.sample_rate));
111   }
112 
113   // Write samples.
114   WavWriter wav_writer(filepath, params.sample_rate, params.num_channels);
115   wav_writer.WriteSamples(samples.data(), params.num_samples);
116 }
117 
// Parameters to generate audio tracks with CreateSineWavFile.
struct SineAudioTrackParams {
  // Track parameters forwarded to CreateSineWavFile (sample rate, number of
  // channels, number of samples).
  MockWavReaderFactory::Params params;
  // Tone frequency forwarded to CreateSineWavFile (presumably in Hz).
  float frequency;
};
123 
124 // Creates a temporary directory in which sine audio tracks are written.
CreateTemporarySineAudioTracks(const std::map<std::string,SineAudioTrackParams> & sine_tracks_params)125 std::string CreateTemporarySineAudioTracks(
126     const std::map<std::string, SineAudioTrackParams>& sine_tracks_params) {
127   // Create temporary directory.
128   std::string temp_directory =
129       OutputPath() + "TempConversationalSpeechAudioTracks";
130   CreateDir(temp_directory);
131 
132   // Create sine tracks.
133   for (const auto& it : sine_tracks_params) {
134     const std::string temp_filepath = JoinFilename(temp_directory, it.first);
135     CreateSineWavFile(temp_filepath, it.second.params, it.second.frequency);
136   }
137 
138   return temp_directory;
139 }
140 
CheckAudioTrackParams(const WavReaderFactory & wav_reader_factory,const std::string & filepath,const MockWavReaderFactory::Params & expeted_params)141 void CheckAudioTrackParams(const WavReaderFactory& wav_reader_factory,
142                            const std::string& filepath,
143                            const MockWavReaderFactory::Params& expeted_params) {
144   auto wav_reader = wav_reader_factory.Create(filepath);
145   EXPECT_EQ(expeted_params.sample_rate, wav_reader->SampleRate());
146   EXPECT_EQ(expeted_params.num_channels, wav_reader->NumChannels());
147   EXPECT_EQ(expeted_params.num_samples, wav_reader->NumSamples());
148 }
149 
DeleteFolderAndContents(const std::string & dir)150 void DeleteFolderAndContents(const std::string& dir) {
151   if (!DirExists(dir)) {
152     return;
153   }
154   absl::optional<std::vector<std::string>> dir_content = ReadDirectory(dir);
155   EXPECT_TRUE(dir_content);
156   for (const auto& path : *dir_content) {
157     if (DirExists(path)) {
158       DeleteFolderAndContents(path);
159     } else if (FileExists(path)) {
160       // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
161       RemoveFile(path);
162     } else {
163       FAIL();
164     }
165   }
166   // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
167   RemoveDir(dir);
168 }
169 
170 }  // namespace
171 
172 using ::testing::_;
173 
TEST(ConversationalSpeechTest, Settings) {
  // Build a configuration from the three placeholder paths.
  const conversational_speech::Config settings(audiotracks_path,
                                               timing_filepath, output_path);

  // Every getter is expected to return the value given at construction.
  EXPECT_EQ(audiotracks_path, settings.audiotracks_path());
  EXPECT_EQ(timing_filepath, settings.timing_filepath());
  EXPECT_EQ(output_path, settings.output_path());
}
183 
TEST(ConversationalSpeechTest, TimingSaveLoad) {
  // Save test timing.
  const std::string temporary_filepath =
      TempFilename(OutputPath(), "TempTimingTestFile");
  SaveTiming(temporary_filepath, expected_timing);

  // Create a std::vector<Turn> instance by loading from file. The temporary
  // file is removed right away so that it does not outlive a failing check.
  std::vector<Turn> actual_timing = LoadTiming(temporary_filepath);
  RemoveFile(temporary_filepath);

  // Check size. This must be fatal: the loop below indexes `actual_timing`
  // with indices of `expected_timing`, which would read out of bounds if fewer
  // turns were loaded.
  ASSERT_EQ(expected_timing.size(), actual_timing.size());

  // Check Turn instances.
  for (size_t index = 0; index < expected_timing.size(); ++index) {
    EXPECT_EQ(expected_timing[index], actual_timing[index])
        << "turn #" << index << " not matching";
  }
}
203 
TEST(ConversationalSpeechTest, MultiEndCallCreate) {
  // `expected_timing` references 5 distinct audio tracks, so the factory is
  // expected to be asked for 5 readers.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(5);

  // Hand the mock factory over to the call under test.
  MultiEndCall call(expected_timing, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Two speakers, five tracks and six turns are expected.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(5u, call.audiotrack_readers().size());
  EXPECT_EQ(6u, call.speaking_turns().size());
}
220 
TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  // The two turns use tracks with different sample rates ("sr8000" and
  // "sr16000"); the setup is expected to be rejected.
  const std::vector<Turn> turns{
      {"A", "sr8000", 0, 0},
      {"B", "sr16000", 0, 0},
  };

  // Two distinct tracks are referenced, hence two readers are requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  conversational_speech::MultiEndCall call(turns, audiotracks_path,
                                           std::move(factory));
  EXPECT_FALSE(call.valid());
}
235 
TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  // Both turns use the stereo track "sr16000_stereo"; the setup is expected to
  // be rejected.
  const std::vector<Turn> turns{
      {"A", "sr16000_stereo", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };

  // A single distinct track is referenced, hence one reader is requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(1);

  conversational_speech::MultiEndCall call(turns, audiotracks_path,
                                           std::move(factory));
  EXPECT_FALSE(call.valid());
}
250 
TEST(ConversationalSpeechTest,
     MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
  // The tracks differ both in sample rate and in number of channels ("sr8000"
  // is mono, "sr16000_stereo" is stereo); the setup is expected to be rejected.
  const std::vector<Turn> turns{
      {"A", "sr8000", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };

  // Two distinct tracks are referenced, hence two readers are requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(_)).Times(2);

  conversational_speech::MultiEndCall call(turns, audiotracks_path,
                                           std::move(factory));
  EXPECT_FALSE(call.valid());
}
266 
TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  // The very first turn has a negative offset (-100); the setup is expected to
  // be rejected.
  const std::vector<Turn> turns{
      {"A", "t500", -100, 0},
      {"B", "t500", 0, 0},
  };

  // A single distinct track is referenced, hence one reader is requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
281 
TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  // Accept:
  // A 0****.....
  // B .....1****
  const std::vector<Turn> turns{
      {"A", "t500", 0, 0},
      {"B", "t500", 0, 0},
  };
  // Two back-to-back 0.5 s turns: one second worth of samples.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate;

  // A single distinct track is referenced, hence one reader is requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Speakers, readers, turns and total duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
306 
TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
  // Accept:
  // A 0****.......
  // B .......1****
  const std::vector<Turn> turns{
      {"A", "t500", 0, 0},
      {"B", "t500", 200, 0},
  };
  // Two 0.5 s turns separated by a 0.2 s pause.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;

  // A single distinct track is referenced, hence one reader is requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Speakers, readers, turns and total duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
331 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  // Accept:
  // A 0****....
  // B ....1****
  const std::vector<Turn> turns{
      {"A", "t500", 0, 0},
      {"B", "t500", -100, 0},
  };
  // Two 0.5 s turns overlapping by 0.1 s.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 0.9;

  // A single distinct track is referenced, hence one reader is requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Speakers, readers, turns and total duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
356 
TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  // Reject:
  // A ..0****
  // B .1****.  The n-th turn cannot start before the (n-1)-th one.
  const std::vector<Turn> turns{
      {"A", "t500", 200, 0},
      {"B", "t500", -600, 0},
  };

  // A single distinct track is referenced, hence one reader is requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
374 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  // Accept:
  // A 0****2****...
  // B ...1*********
  const std::vector<Turn> turns{
      {"A", "t500", 0, 0},
      {"B", "t1000", -200, 0},
      {"A", "t500", -800, 0},
  };
  // The call ends with B's one-second turn, which starts at 0.3 s.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.3;

  // Two distinct tracks are referenced, hence two readers are requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Speakers, readers, turns and total duration.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
400 
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  // Reject:
  // A 0****......
  // A ...1****...
  // B ......2****
  //      ^  Turn #1 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> turns{
      {"A", "t500", 0, 0},
      {"A", "t500", -200, 0},
      {"B", "t500", -200, 0},
  };

  // A single distinct track is referenced, hence one reader is requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
421 
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  // Reject:
  // A 0*********
  // B 1**.......
  // C ...2**....
  // A ......3**.
  //         ^  Turn #3 overlaps with #0 which is from the same speaker.
  const std::vector<Turn> turns{
      {"A", "t1000", 0, 0},
      {"B", "t300", -1000, 0},
      {"C", "t300", 0, 0},
      {"A", "t300", 0, 0},
  };

  // Two distinct tracks are referenced, hence two readers are requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
444 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  // Accept:
  // A 0*********..
  // B ..1****.....
  // C .......2****
  const std::vector<Turn> turns{
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", 0, 0},
  };
  // The call ends with C's 0.5 s turn, which starts at 0.7 s.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;

  // Two distinct tracks are referenced, hence two readers are requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Speakers, readers, turns and total duration.
  EXPECT_EQ(3u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
471 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  // Reject:
  // A 0*********
  // B ..1****...
  // C ....2****.
  //       ^  Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
  //          not permitted).
  const std::vector<Turn> turns{
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", -300, 0},
  };

  // Two distinct tracks are referenced, hence two readers are requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
493 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  // Accept (digits are the turn indices in `timing`):
  // A 0*********..
  // B .1****......
  // C .......2****
  // B cross-talks in the middle of A's turn; C starts 100 ms after B's turn
  // has ended.
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -900, 0},
      {"C", "t500", 100, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  conversational_speech::MultiEndCall multiend_call(
      timing, audiotracks_path, std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}
520 
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  // Accept:
  // A 0****
  // B 1****
  const std::vector<Turn> turns{
      {"A", "t500", 0, 0},
      {"B", "t500", -500, 0},
  };

  // A single distinct track is referenced, hence one reader is requested.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  EXPECT_CALL(*factory, Create(::testing::_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Speakers, readers and turns.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
}
543 
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  // Accept:
  // A 0****....3****.5**.
  // B .....1****...4**...
  // C ......2**.......6**
  // At most two speakers are ever active at the same time.
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
      {"C", "t300", -200, 0},
  };
  // Same mock factory as in the other tests of this file.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  conversational_speech::MultiEndCall multiend_call(
      timing, audiotracks_path, std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(7u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}
572 
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  // Reject (digits are the turn indices in `timing`; each offset is taken
  // relative to the end of the previous turn, as in the accepted setups):
  // A 0****....3****5**.
  // B .....1****...4**..
  // C ......2**......6**
  //                  ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+
  //                    speakers not permitted).
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0},    {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
      {"A", "t500", 0, 0},    {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
      {"C", "t300", -200, 0},
  };
  // Same mock factory as in the other tests of this file.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  conversational_speech::MultiEndCall multiend_call(
      timing, audiotracks_path, std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}
596 
TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
  // For each sample rate: write a 5-second mono sine wav file, read it back
  // through WavReaderFactory and compare the reported parameters.
  constexpr int kDurationSeconds = 5;
  const int kSampleRates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000};

  for (const int rate : kSampleRates) {
    const std::string wav_path =
        OutputPath() + "TempSineWavFile_" + std::to_string(rate) + ".wav";

    // Parameters used for writing; the same values are expected when reading.
    const std::size_t sample_count = kDurationSeconds * rate;
    const MockWavReaderFactory::Params written_params = {rate, 1u,
                                                         sample_count};
    CreateSineWavFile(wav_path, written_params);

    // Read the file back and verify.
    WavReaderFactory reader_factory;
    CheckAudioTrackParams(reader_factory, wav_path, written_params);

    // Remove the temporary file.
    RemoveFile(wav_path);
  }
}
621 
TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
  // Simulated call (one character corresponding to 500 ms):
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
  // Local names are chosen so that they do not shadow the file-level
  // `expected_timing`, `audiotracks_path` and `output_path`.
  const std::vector<Turn> timing = {
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", 500, 0},
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", -2500, 0},
  };
  const std::size_t expected_duration_seconds = 18;

  // Create temporary audio track files (two 5-second mono sine tracks).
  const int sample_rate = 16000;
  const std::map<std::string, SineAudioTrackParams> sine_tracks_params = {
      {"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}},
      {"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}},
  };
  const std::string temp_audiotracks_path =
      CreateTemporarySineAudioTracks(sine_tracks_params);

  // Set up the multi-end call.
  auto wavreader_factory =
      std::unique_ptr<WavReaderFactory>(new WavReaderFactory());
  MultiEndCall multiend_call(timing, temp_audiotracks_path,
                             std::move(wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Simulate the call.
  const std::string simulator_output_path =
      JoinFilename(temp_audiotracks_path, "output");
  CreateDir(simulator_output_path);
  RTC_LOG(LS_VERBOSE) << "simulator output path: " << simulator_output_path;
  auto generated_audiotrak_pairs =
      conversational_speech::Simulate(multiend_call, simulator_output_path);

  // Check the output. The result is a pointer: report a failure instead of
  // dereferencing it when it is null, and keep going so that the temporary
  // files are still cleaned up below.
  EXPECT_TRUE(generated_audiotrak_pairs != nullptr);
  if (generated_audiotrak_pairs != nullptr) {
    EXPECT_EQ(2u, generated_audiotrak_pairs->size());

    WavReaderFactory wav_reader_factory;
    const MockWavReaderFactory::Params expeted_params = {
        sample_rate, 1u, sample_rate * expected_duration_seconds};
    for (const auto& it : *generated_audiotrak_pairs) {
      RTC_LOG(LS_VERBOSE) << "checking far/near-end for <" << it.first << ">";
      CheckAudioTrackParams(wav_reader_factory, it.second.near_end,
                            expeted_params);
      CheckAudioTrackParams(wav_reader_factory, it.second.far_end,
                            expeted_params);
    }
  }

  // Clean.
  EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(temp_audiotracks_path));
}
672 
673 }  // namespace test
674 }  // namespace webrtc
675