1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
17 #define TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
18 
19 #include <deque>
20 #include <unordered_set>
21 #include <vector>
22 
23 #include "tensorflow/core/framework/tensor.h"
24 #include "tensorflow/core/platform/types.h"
25 
26 namespace tensorflow {
27 
28 // This class is designed to apply a very primitive decoding model on top of the
29 // instantaneous results from running an audio recognition model on a single
30 // window of samples. It applies smoothing over time so that noisy individual
31 // label scores are averaged, increasing the confidence that apparent matches
32 // are real.
33 // To use it, you should create a class object with the configuration you
34 // want, and then feed results from running a TensorFlow model into the
35 // processing method. The timestamp for each subsequent call should be
36 // increasing from the previous, since the class is designed to process a stream
37 // of data over time.
38 class RecognizeCommands {
39  public:
40   // labels should be a list of the strings associated with each one-hot score.
41   // The window duration controls the smoothing. Longer durations will give a
42   // higher confidence that the results are correct, but may miss some commands.
43   // The detection threshold has a similar effect, with high values increasing
44   // the precision at the cost of recall. The minimum count controls how many
45   // results need to be in the averaging window before it's seen as a reliable
46   // average. This prevents erroneous results when the averaging window is
47   // initially being populated for example. The suppression argument disables
48   // further recognitions for a set time after one has been triggered, which can
49   // help reduce spurious recognitions.
50   explicit RecognizeCommands(const std::vector<string>& labels,
51                              int32 average_window_duration_ms = 1000,
52                              float detection_threshold = 0.2,
53                              int32 suppression_ms = 500,
54                              int32 minimum_count = 3);
55 
56   // Call this with the results of running a model on sample data.
57   Status ProcessLatestResults(const Tensor& latest_results,
58                               const int64 current_time_ms,
59                               string* found_command, float* score,
60                               bool* is_new_command);
61 
62  private:
63   // Configuration
64   std::vector<string> labels_;
65   int32 average_window_duration_ms_;
66   float detection_threshold_;
67   int32 suppression_ms_;
68   int32 minimum_count_;
69 
70   // Working variables
71   std::deque<std::pair<int64, Tensor>> previous_results_;
72   string previous_top_label_;
73   int64 labels_count_;
74   int64 previous_top_label_time_;
75 };
76 
77 }  // namespace tensorflow
78 
79 #endif  // TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
80