1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "annotator/duration/duration.h"
18 
19 #include <climits>
20 #include <cstdlib>
21 
22 #include "annotator/collections.h"
23 #include "annotator/types.h"
24 #include "utils/base/logging.h"
25 #include "utils/strings/numbers.h"
26 
27 namespace libtextclassifier3 {
28 
29 using DurationUnit = internal::DurationUnit;
30 
31 namespace internal {
32 
33 namespace {
FillDurationUnitMap(const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> * expressions,DurationUnit duration_unit,std::unordered_map<std::string,DurationUnit> * target_map)34 void FillDurationUnitMap(
35     const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
36         expressions,
37     DurationUnit duration_unit,
38     std::unordered_map<std::string, DurationUnit>* target_map) {
39   if (expressions == nullptr) {
40     return;
41   }
42 
43   for (const flatbuffers::String* expression_string : *expressions) {
44     (*target_map)[expression_string->c_str()] = duration_unit;
45   }
46 }
47 }  // namespace
48 
BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions * options)49 std::unordered_map<std::string, DurationUnit> BuildTokenToDurationUnitMapping(
50     const DurationAnnotatorOptions* options) {
51   std::unordered_map<std::string, DurationUnit> mapping;
52   FillDurationUnitMap(options->week_expressions(), DurationUnit::WEEK,
53                       &mapping);
54   FillDurationUnitMap(options->day_expressions(), DurationUnit::DAY, &mapping);
55   FillDurationUnitMap(options->hour_expressions(), DurationUnit::HOUR,
56                       &mapping);
57   FillDurationUnitMap(options->minute_expressions(), DurationUnit::MINUTE,
58                       &mapping);
59   FillDurationUnitMap(options->second_expressions(), DurationUnit::SECOND,
60                       &mapping);
61   return mapping;
62 }
63 
BuildStringSet(const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> * strings)64 std::unordered_set<std::string> BuildStringSet(
65     const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
66         strings) {
67   std::unordered_set<std::string> result;
68   if (strings == nullptr) {
69     return result;
70   }
71 
72   for (const flatbuffers::String* string_value : *strings) {
73     result.insert(string_value->c_str());
74   }
75 
76   return result;
77 }
78 
79 }  // namespace internal
80 
ClassifyText(const UnicodeText & context,CodepointSpan selection_indices,AnnotationUsecase annotation_usecase,ClassificationResult * classification_result) const81 bool DurationAnnotator::ClassifyText(
82     const UnicodeText& context, CodepointSpan selection_indices,
83     AnnotationUsecase annotation_usecase,
84     ClassificationResult* classification_result) const {
85   if (!options_->enabled() || ((options_->enabled_annotation_usecases() &
86                                 (1 << annotation_usecase))) == 0) {
87     return false;
88   }
89 
90   const UnicodeText selection =
91       UnicodeText::Substring(context, selection_indices.first,
92                              selection_indices.second, /*do_copy=*/false);
93   const std::vector<Token> tokens = feature_processor_->Tokenize(selection);
94 
95   AnnotatedSpan annotated_span;
96   if (FindDurationStartingAt(context, tokens, 0, &annotated_span) !=
97       tokens.size()) {
98     return false;
99   }
100 
101   TC3_CHECK(!annotated_span.classification.empty());
102 
103   *classification_result = annotated_span.classification[0];
104   return true;
105 }
106 
FindAll(const UnicodeText & context,const std::vector<Token> & tokens,AnnotationUsecase annotation_usecase,std::vector<AnnotatedSpan> * results) const107 bool DurationAnnotator::FindAll(const UnicodeText& context,
108                                 const std::vector<Token>& tokens,
109                                 AnnotationUsecase annotation_usecase,
110                                 std::vector<AnnotatedSpan>* results) const {
111   if (!options_->enabled() || ((options_->enabled_annotation_usecases() &
112                                 (1 << annotation_usecase))) == 0) {
113     return true;
114   }
115 
116   for (int i = 0; i < tokens.size();) {
117     AnnotatedSpan span;
118     const int next_i = FindDurationStartingAt(context, tokens, i, &span);
119     if (next_i != i) {
120       results->push_back(span);
121       i = next_i;
122     } else {
123       i++;
124     }
125   }
126   return true;
127 }
128 
FindDurationStartingAt(const UnicodeText & context,const std::vector<Token> & tokens,int start_token_index,AnnotatedSpan * result) const129 int DurationAnnotator::FindDurationStartingAt(const UnicodeText& context,
130                                               const std::vector<Token>& tokens,
131                                               int start_token_index,
132                                               AnnotatedSpan* result) const {
133   CodepointIndex start_index = kInvalidIndex;
134   CodepointIndex end_index = kInvalidIndex;
135 
136   bool has_quantity = false;
137   ParsedDurationAtom parsed_duration;
138 
139   std::vector<ParsedDurationAtom> parsed_duration_atoms;
140 
141   // This is the core algorithm for finding the duration expressions. It
142   // basically iterates over tokens and changes the state variables above as it
143   // goes.
144   int token_index;
145   for (token_index = start_token_index; token_index < tokens.size();
146        token_index++) {
147     const Token& token = tokens[token_index];
148 
149     if (ParseQuantityToken(token, &parsed_duration)) {
150       has_quantity = true;
151       if (start_index == kInvalidIndex) {
152         start_index = token.start;
153       }
154       end_index = token.end;
155     } else if (ParseDurationUnitToken(token, &parsed_duration.unit)) {
156       if (start_index == kInvalidIndex) {
157         start_index = token.start;
158       }
159       end_index = token.end;
160       parsed_duration_atoms.push_back(parsed_duration);
161       has_quantity = false;
162       parsed_duration = ParsedDurationAtom();
163     } else if (ParseFillerToken(token)) {
164     } else {
165       break;
166     }
167   }
168 
169   if (parsed_duration_atoms.empty()) {
170     return start_token_index;
171   }
172 
173   const bool parse_ended_without_unit_for_last_mentioned_quantity =
174       has_quantity;
175 
176   ClassificationResult classification{Collections::Duration(),
177                                       options_->score()};
178   classification.priority_score = options_->priority_score();
179   classification.duration_ms =
180       ParsedDurationAtomsToMillis(parsed_duration_atoms);
181 
182   // Process suffix expressions like "and half" that don't have the
183   // duration_unit explicitly mentioned.
184   if (parse_ended_without_unit_for_last_mentioned_quantity &&
185       parsed_duration.plus_half) {
186     ParsedDurationAtom atom = ParsedDurationAtom::Half();
187     atom.unit = parsed_duration_atoms.rbegin()->unit;
188     classification.duration_ms += ParsedDurationAtomsToMillis({atom});
189   }
190 
191   result->span = feature_processor_->StripBoundaryCodepoints(
192       context, {start_index, end_index});
193   result->classification.push_back(classification);
194   result->source = AnnotatedSpan::Source::DURATION;
195 
196   return token_index;
197 }
198 
ParsedDurationAtomsToMillis(const std::vector<ParsedDurationAtom> & atoms) const199 int64 DurationAnnotator::ParsedDurationAtomsToMillis(
200     const std::vector<ParsedDurationAtom>& atoms) const {
201   int64 result = 0;
202   for (auto atom : atoms) {
203     int multiplier;
204     switch (atom.unit) {
205       case DurationUnit::WEEK:
206         multiplier = 7 * 24 * 60 * 60 * 1000;
207         break;
208       case DurationUnit::DAY:
209         multiplier = 24 * 60 * 60 * 1000;
210         break;
211       case DurationUnit::HOUR:
212         multiplier = 60 * 60 * 1000;
213         break;
214       case DurationUnit::MINUTE:
215         multiplier = 60 * 1000;
216         break;
217       case DurationUnit::SECOND:
218         multiplier = 1000;
219         break;
220       case DurationUnit::UNKNOWN:
221         TC3_LOG(ERROR) << "Requesting parse of UNKNOWN duration duration_unit.";
222         return -1;
223         break;
224     }
225 
226     int value = atom.value;
227     // This condition handles expressions like "an hour", where the quantity is
228     // not specified. In this case we assume quantity 1. Except for cases like
229     // "half hour".
230     if (value == 0 && !atom.plus_half) {
231       value = 1;
232     }
233     result += value * multiplier;
234     result += atom.plus_half * multiplier / 2;
235   }
236   return result;
237 }
238 
ParseQuantityToken(const Token & token,ParsedDurationAtom * value) const239 bool DurationAnnotator::ParseQuantityToken(const Token& token,
240                                            ParsedDurationAtom* value) const {
241   if (token.value.empty()) {
242     return false;
243   }
244 
245   std::string token_value_buffer;
246   const std::string& token_value = feature_processor_->StripBoundaryCodepoints(
247       token.value, &token_value_buffer);
248 
249   if (half_expressions_.find(token_value) != half_expressions_.end()) {
250     value->plus_half = true;
251     return true;
252   }
253 
254   int32 parsed_value;
255   if (ParseInt32(token_value.c_str(), &parsed_value)) {
256     value->value = parsed_value;
257     return true;
258   }
259 
260   return false;
261 }
262 
ParseDurationUnitToken(const Token & token,DurationUnit * duration_unit) const263 bool DurationAnnotator::ParseDurationUnitToken(
264     const Token& token, DurationUnit* duration_unit) const {
265   std::string token_value_buffer;
266   const std::string& token_value = feature_processor_->StripBoundaryCodepoints(
267       token.value, &token_value_buffer);
268 
269   const auto it = token_value_to_duration_unit_.find(token_value);
270   if (it == token_value_to_duration_unit_.end()) {
271     return false;
272   }
273 
274   *duration_unit = it->second;
275   return true;
276 }
277 
ParseFillerToken(const Token & token) const278 bool DurationAnnotator::ParseFillerToken(const Token& token) const {
279   std::string token_value_buffer;
280   const std::string& token_value = feature_processor_->StripBoundaryCodepoints(
281       token.value, &token_value_buffer);
282 
283   if (filler_expressions_.find(token_value) == filler_expressions_.end()) {
284     return false;
285   }
286 
287   return true;
288 }
289 
290 }  // namespace libtextclassifier3
291