1# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Stream accuracy recognize commands.""" 16from __future__ import absolute_import 17from __future__ import division 18from __future__ import print_function 19 20import collections 21 22import numpy as np 23 24 25class RecognizeResult(object): 26 """Save recognition result temporarily. 27 28 Attributes: 29 founded_command: A string indicating the word just founded. Default value 30 is '_silence_' 31 score: An float representing the confidence of founded word. Default 32 value is zero. 33 is_new_command: A boolean indicating if the founded command is a new one 34 against the last one. Default value is False. 35 """ 36 37 def __init__(self): 38 self._founded_command = "_silence_" 39 self._score = 0 40 self._is_new_command = False 41 42 @property 43 def founded_command(self): 44 return self._founded_command 45 46 @founded_command.setter 47 def founded_command(self, value): 48 self._founded_command = value 49 50 @property 51 def score(self): 52 return self._score 53 54 @score.setter 55 def score(self, value): 56 self._score = value 57 58 @property 59 def is_new_command(self): 60 return self._is_new_command 61 62 @is_new_command.setter 63 def is_new_command(self, value): 64 self._is_new_command = value 65 66 67class RecognizeCommands(object): 68 """Smooth the inference results by using average window. 69 70 Maintain a slide window over the audio stream, which adds new result(a pair of 71 the 1.confidences of all classes and 2.the start timestamp of input audio 72 clip) directly the inference produces one and removes the most previous one 73 and other abnormal values. Then it smooth the results in the window to get 74 the most reliable command in this period. 75 76 Attributes: 77 _label: A list containing commands at corresponding lines. 78 _average_window_duration: The length of average window. 79 _detection_threshold: A confidence threshold for filtering out unreliable 80 command. 81 _suppression_ms: Milliseconds every two reliable founded commands should 82 apart. 83 _minimum_count: An integer count indicating the minimum results the average 84 window should cover. 85 _previous_results: A deque to store previous results. 86 _label_count: The length of label list. 87 _previous_top_label: Last founded command. Initial value is '_silence_'. 88 _previous_top_time: The timestamp of _previous results. Default is -np.inf. 89 """ 90 91 def __init__(self, labels, average_window_duration_ms, detection_threshold, 92 suppression_ms, minimum_count): 93 """Init the RecognizeCommands with parameters used for smoothing.""" 94 # Configuration 95 self._labels = labels 96 self._average_window_duration_ms = average_window_duration_ms 97 self._detection_threshold = detection_threshold 98 self._suppression_ms = suppression_ms 99 self._minimum_count = minimum_count 100 # Working Variable 101 self._previous_results = collections.deque() 102 self._label_count = len(labels) 103 self._previous_top_label = "_silence_" 104 self._previous_top_time = -np.inf 105 106 def process_latest_result(self, latest_results, current_time_ms, 107 recognize_element): 108 """Smoothing the results in average window when a new result is added in. 109 110 Receive a new result from inference and put the founded command into 111 a RecognizeResult instance after the smoothing procedure. 112 113 Args: 114 latest_results: A list containing the confidences of all labels. 115 current_time_ms: The start timestamp of the input audio clip. 116 recognize_element: An instance of RecognizeResult to store founded 117 command, its scores and if it is a new command. 118 119 Raises: 120 ValueError: The length of this result from inference doesn't match 121 label count. 122 ValueError: The timestamp of this result is earlier than the most 123 previous one in the average window 124 """ 125 if latest_results.shape[0] != self._label_count: 126 raise ValueError("The results for recognition should contain {} " 127 "elements, but there are {} produced".format( 128 self._label_count, latest_results.shape[0])) 129 if (self._previous_results.__len__() != 0 and 130 current_time_ms < self._previous_results[0][0]): 131 raise ValueError("Results must be fed in increasing time order, " 132 "but receive a timestamp of {}, which was earlier " 133 "than the previous one of {}".format( 134 current_time_ms, self._previous_results[0][0])) 135 136 # Add the latest result to the head of the deque. 137 self._previous_results.append([current_time_ms, latest_results]) 138 139 # Prune any earlier results that are too old for the averaging window. 140 time_limit = current_time_ms - self._average_window_duration_ms 141 while time_limit > self._previous_results[0][0]: 142 self._previous_results.popleft() 143 144 # If there are too few results, the result will be unreliable and bail. 145 how_many_results = self._previous_results.__len__() 146 earliest_time = self._previous_results[0][0] 147 sample_duration = current_time_ms - earliest_time 148 if (how_many_results < self._minimum_count or 149 sample_duration < self._average_window_duration_ms / 4): 150 recognize_element.founded_command = self._previous_top_label 151 recognize_element.score = 0.0 152 recognize_element.is_new_command = False 153 return 154 155 # Calculate the average score across all the results in the window. 156 average_scores = np.zeros(self._label_count) 157 for item in self._previous_results: 158 score = item[1] 159 for i in range(score.size): 160 average_scores[i] += score[i] / how_many_results 161 162 # Sort the averaged results in descending score order. 163 sorted_averaged_index_score = [] 164 for i in range(self._label_count): 165 sorted_averaged_index_score.append([i, average_scores[i]]) 166 sorted_averaged_index_score = sorted( 167 sorted_averaged_index_score, key=lambda p: p[1], reverse=True) 168 169 # Use the information of previous result to get current result 170 current_top_index = sorted_averaged_index_score[0][0] 171 current_top_label = self._labels[current_top_index] 172 current_top_score = sorted_averaged_index_score[0][1] 173 time_since_last_top = 0 174 if (self._previous_top_label == "_silence_" or 175 self._previous_top_time == -np.inf): 176 time_since_last_top = np.inf 177 else: 178 time_since_last_top = current_time_ms - self._previous_top_time 179 if (current_top_score > self._detection_threshold and 180 current_top_label != self._previous_top_label and 181 time_since_last_top > self._suppression_ms): 182 self._previous_top_label = current_top_label 183 self._previous_top_time = current_time_ms 184 recognize_element.is_new_command = True 185 else: 186 recognize_element.is_new_command = False 187 recognize_element.founded_command = current_top_label 188 recognize_element.score = current_top_score 189