1# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Stream accuracy recognize commands."""
16from __future__ import absolute_import
17from __future__ import division
18from __future__ import print_function
19
20import collections
21
22import numpy as np
23
24
class RecognizeResult(object):
  """Mutable holder for the outcome of one recognition step.

  Attributes:
    founded_command: The command string that was just found. Starts out as
      '_silence_'.
    score: The confidence attached to the found command. Starts out as zero.
    is_new_command: Whether the found command is a new one compared with the
      last one. Starts out as False.
  """

  def __init__(self):
    # Begin in the "nothing recognized yet" state.
    self._founded_command, self._score, self._is_new_command = (
        "_silence_", 0, False)

  @property
  def founded_command(self):
    """The command string currently stored in this result."""
    return self._founded_command

  @founded_command.setter
  def founded_command(self, command):
    self._founded_command = command

  @property
  def score(self):
    """The confidence currently stored in this result."""
    return self._score

  @score.setter
  def score(self, confidence):
    self._score = confidence

  @property
  def is_new_command(self):
    """The "new command" flag currently stored in this result."""
    return self._is_new_command

  @is_new_command.setter
  def is_new_command(self, flag):
    self._is_new_command = flag
65
66
class RecognizeCommands(object):
  """Smooth streaming inference results with a sliding average window.

  Every call to `process_latest_result` appends one result (the confidences of
  all labels plus the start timestamp of the audio clip that produced them) to
  a window, drops the results that have fallen out of the averaging duration,
  and averages what is left to pick the most reliable command for the period.

  Attributes:
    _labels: A list of command names; entry i names the i-th score.
    _average_window_duration_ms: The length of the averaging window, in
      milliseconds.
    _detection_threshold: The averaged confidence a command must exceed to be
      reported as a new command.
    _suppression_ms: The number of milliseconds that must separate two newly
      detected commands.
    _minimum_count: The minimum number of results the window must hold before
      an averaged result is produced.
    _previous_results: A deque of [timestamp_ms, scores] entries, oldest
      first.
    _label_count: The length of the label list.
    _previous_top_label: The last newly detected command. Initial value is
      '_silence_'.
    _previous_top_time: The timestamp at which `_previous_top_label` was
      detected. Initial value is -np.inf.
  """

  def __init__(self, labels, average_window_duration_ms, detection_threshold,
               suppression_ms, minimum_count):
    """Init the RecognizeCommands with parameters used for smoothing.

    Args:
      labels: A list of command names, indexed like the score vectors.
      average_window_duration_ms: Length of the averaging window in
        milliseconds.
      detection_threshold: Averaged score a command must exceed to count as
        newly detected.
      suppression_ms: Milliseconds that must pass after a detection before
        another command can be reported as new.
      minimum_count: Minimum number of results required in the window.
    """
    # Configuration
    self._labels = labels
    self._average_window_duration_ms = average_window_duration_ms
    self._detection_threshold = detection_threshold
    self._suppression_ms = suppression_ms
    self._minimum_count = minimum_count
    # Working Variable
    self._previous_results = collections.deque()
    self._label_count = len(labels)
    self._previous_top_label = "_silence_"
    self._previous_top_time = -np.inf

  def process_latest_result(self, latest_results, current_time_ms,
                            recognize_element):
    """Smooth the results in the average window when a new result is added.

    Receives a new result from inference and writes the found command into
    `recognize_element` after the smoothing procedure. Nothing is returned.

    Args:
      latest_results: The confidences of all labels, as a list or a numpy
        array with one entry per label.
      current_time_ms: The start timestamp of the input audio clip.
      recognize_element: An instance of RecognizeResult (or any object with
        assignable `founded_command`, `score` and `is_new_command`
        attributes) that receives the found command, its score and whether
        it is a new command.

    Raises:
      ValueError: The length of this result from inference doesn't match
        label count.
      ValueError: The timestamp of this result is earlier than the oldest
        result still held in the average window.
    """
    # The docstring promises that plain lists work, so coerce before touching
    # array-only attributes such as `.shape`.
    latest_results = np.asarray(latest_results)
    if latest_results.shape[0] != self._label_count:
      raise ValueError("The results for recognition should contain {} "
                       "elements, but there are {} produced".format(
                           self._label_count, latest_results.shape[0]))
    # The check is made against the oldest entry of the window (index 0).
    if (self._previous_results and
        current_time_ms < self._previous_results[0][0]):
      raise ValueError("Results must be fed in increasing time order, "
                       "but receive a timestamp of {}, which was earlier "
                       "than the previous one of {}".format(
                           current_time_ms, self._previous_results[0][0]))

    # Add the latest result to the tail of the deque (newest entries last).
    self._previous_results.append([current_time_ms, latest_results])

    # Prune any earlier results that are too old for the averaging window.
    # The entry just appended can never be pruned, so the deque stays
    # non-empty and indexing [0] is safe.
    time_limit = current_time_ms - self._average_window_duration_ms
    while time_limit > self._previous_results[0][0]:
      self._previous_results.popleft()

    # If there are too few results, the result will be unreliable and bail.
    how_many_results = len(self._previous_results)
    earliest_time = self._previous_results[0][0]
    sample_duration = current_time_ms - earliest_time
    if (how_many_results < self._minimum_count or
        sample_duration < self._average_window_duration_ms / 4):
      recognize_element.founded_command = self._previous_top_label
      recognize_element.score = 0.0
      recognize_element.is_new_command = False
      return

    # Calculate the average score across all the results in the window. Each
    # result contributes score / count, accumulated oldest first.
    average_scores = np.zeros(self._label_count)
    for _, scores in self._previous_results:
      average_scores += scores / how_many_results

    # Only the best label is needed; argmax returns the first maximum, which
    # is the same entry a stable descending sort would put first.
    current_top_index = int(np.argmax(average_scores))
    current_top_label = self._labels[current_top_index]
    current_top_score = average_scores[current_top_index]

    # Use the information of previous result to get current result. With no
    # earlier detection the suppression interval is treated as elapsed.
    if (self._previous_top_label == "_silence_" or
        self._previous_top_time == -np.inf):
      time_since_last_top = np.inf
    else:
      time_since_last_top = current_time_ms - self._previous_top_time
    if (current_top_score > self._detection_threshold and
        current_top_label != self._previous_top_label and
        time_since_last_top > self._suppression_ms):
      self._previous_top_label = current_top_label
      self._previous_top_time = current_time_ms
      recognize_element.is_new_command = True
    else:
      recognize_element.is_new_command = False
    recognize_element.founded_command = current_top_label
    recognize_element.score = current_top_score
189