1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "annotator/datetime/extractor.h"
18
19 #include "utils/base/logging.h"
20
21 namespace libtextclassifier3 {
22
Extract(DateParseData * result,CodepointSpan * result_span) const23 bool DatetimeExtractor::Extract(DateParseData* result,
24 CodepointSpan* result_span) const {
25 result->field_set_mask = 0;
26 *result_span = {kInvalidIndex, kInvalidIndex};
27
28 if (rule_.regex->groups() == nullptr) {
29 return false;
30 }
31
32 for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
33 UnicodeText group_text;
34 const int group_type = rule_.regex->groups()->Get(group_id);
35 if (group_type == DatetimeGroupType_GROUP_UNUSED) {
36 continue;
37 }
38 if (!GroupTextFromMatch(group_id, &group_text)) {
39 TC3_LOG(ERROR) << "Couldn't retrieve group.";
40 return false;
41 }
42 // The pattern can have a group defined in a part that was not matched,
43 // e.g. an optional part. In this case we'll get an empty content here.
44 if (group_text.empty()) {
45 continue;
46 }
47 switch (group_type) {
48 case DatetimeGroupType_GROUP_YEAR: {
49 if (!ParseYear(group_text, &(result->year))) {
50 TC3_LOG(ERROR) << "Couldn't extract YEAR.";
51 return false;
52 }
53 result->field_set_mask |= DateParseData::YEAR_FIELD;
54 break;
55 }
56 case DatetimeGroupType_GROUP_MONTH: {
57 if (!ParseMonth(group_text, &(result->month))) {
58 TC3_LOG(ERROR) << "Couldn't extract MONTH.";
59 return false;
60 }
61 result->field_set_mask |= DateParseData::MONTH_FIELD;
62 break;
63 }
64 case DatetimeGroupType_GROUP_DAY: {
65 if (!ParseDigits(group_text, &(result->day_of_month))) {
66 TC3_LOG(ERROR) << "Couldn't extract DAY.";
67 return false;
68 }
69 result->field_set_mask |= DateParseData::DAY_FIELD;
70 break;
71 }
72 case DatetimeGroupType_GROUP_HOUR: {
73 if (!ParseDigits(group_text, &(result->hour))) {
74 TC3_LOG(ERROR) << "Couldn't extract HOUR.";
75 return false;
76 }
77 result->field_set_mask |= DateParseData::HOUR_FIELD;
78 break;
79 }
80 case DatetimeGroupType_GROUP_MINUTE: {
81 if (!ParseDigits(group_text, &(result->minute))) {
82 TC3_LOG(ERROR) << "Couldn't extract MINUTE.";
83 return false;
84 }
85 result->field_set_mask |= DateParseData::MINUTE_FIELD;
86 break;
87 }
88 case DatetimeGroupType_GROUP_SECOND: {
89 if (!ParseDigits(group_text, &(result->second))) {
90 TC3_LOG(ERROR) << "Couldn't extract SECOND.";
91 return false;
92 }
93 result->field_set_mask |= DateParseData::SECOND_FIELD;
94 break;
95 }
96 case DatetimeGroupType_GROUP_AMPM: {
97 if (!ParseAMPM(group_text, &(result->ampm))) {
98 TC3_LOG(ERROR) << "Couldn't extract AMPM.";
99 return false;
100 }
101 result->field_set_mask |= DateParseData::AMPM_FIELD;
102 break;
103 }
104 case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
105 if (!ParseRelationDistance(group_text, &(result->relation_distance))) {
106 TC3_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
107 return false;
108 }
109 result->field_set_mask |= DateParseData::RELATION_DISTANCE_FIELD;
110 break;
111 }
112 case DatetimeGroupType_GROUP_RELATION: {
113 if (!ParseRelation(group_text, &(result->relation))) {
114 TC3_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
115 return false;
116 }
117 result->field_set_mask |= DateParseData::RELATION_FIELD;
118 break;
119 }
120 case DatetimeGroupType_GROUP_RELATIONTYPE: {
121 if (!ParseRelationType(group_text, &(result->relation_type))) {
122 TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
123 return false;
124 }
125 result->field_set_mask |= DateParseData::RELATION_TYPE_FIELD;
126 break;
127 }
128 case DatetimeGroupType_GROUP_DUMMY1:
129 case DatetimeGroupType_GROUP_DUMMY2:
130 break;
131 default:
132 TC3_LOG(INFO) << "Unknown group type.";
133 continue;
134 }
135 if (!UpdateMatchSpan(group_id, result_span)) {
136 TC3_LOG(ERROR) << "Couldn't update span.";
137 return false;
138 }
139 }
140
141 if (result_span->first == kInvalidIndex ||
142 result_span->second == kInvalidIndex) {
143 *result_span = {kInvalidIndex, kInvalidIndex};
144 }
145
146 return true;
147 }
148
RuleIdForType(DatetimeExtractorType type,int * rule_id) const149 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
150 int* rule_id) const {
151 auto type_it = type_and_locale_to_rule_.find(type);
152 if (type_it == type_and_locale_to_rule_.end()) {
153 return false;
154 }
155
156 auto locale_it = type_it->second.find(locale_id_);
157 if (locale_it == type_it->second.end()) {
158 return false;
159 }
160 *rule_id = locale_it->second;
161 return true;
162 }
163
ExtractType(const UnicodeText & input,DatetimeExtractorType extractor_type,UnicodeText * match_result) const164 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
165 DatetimeExtractorType extractor_type,
166 UnicodeText* match_result) const {
167 int rule_id;
168 if (!RuleIdForType(extractor_type, &rule_id)) {
169 return false;
170 }
171
172 std::unique_ptr<UniLib::RegexMatcher> matcher =
173 rules_[rule_id]->Matcher(input);
174 if (!matcher) {
175 return false;
176 }
177
178 int status;
179 if (!matcher->Find(&status)) {
180 return false;
181 }
182
183 if (match_result != nullptr) {
184 *match_result = matcher->Group(&status);
185 if (status != UniLib::RegexMatcher::kNoError) {
186 return false;
187 }
188 }
189 return true;
190 }
191
GroupTextFromMatch(int group_id,UnicodeText * result) const192 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
193 UnicodeText* result) const {
194 int status;
195 *result = matcher_.Group(group_id, &status);
196 if (status != UniLib::RegexMatcher::kNoError) {
197 return false;
198 }
199 return true;
200 }
201
UpdateMatchSpan(int group_id,CodepointSpan * span) const202 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
203 CodepointSpan* span) const {
204 int status;
205 const int match_start = matcher_.Start(group_id, &status);
206 if (status != UniLib::RegexMatcher::kNoError) {
207 return false;
208 }
209 const int match_end = matcher_.End(group_id, &status);
210 if (status != UniLib::RegexMatcher::kNoError) {
211 return false;
212 }
213 if (span->first == kInvalidIndex || span->first > match_start) {
214 span->first = match_start;
215 }
216 if (span->second == kInvalidIndex || span->second < match_end) {
217 span->second = match_end;
218 }
219
220 return true;
221 }
222
223 template <typename T>
MapInput(const UnicodeText & input,const std::vector<std::pair<DatetimeExtractorType,T>> & mapping,T * result) const224 bool DatetimeExtractor::MapInput(
225 const UnicodeText& input,
226 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
227 T* result) const {
228 for (const auto& type_value_pair : mapping) {
229 if (ExtractType(input, type_value_pair.first)) {
230 *result = type_value_pair.second;
231 return true;
232 }
233 }
234 return false;
235 }
236
ParseWrittenNumber(const UnicodeText & input,int * parsed_number) const237 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
238 int* parsed_number) const {
239 std::vector<std::pair<int, int>> found_numbers;
240 for (const auto& type_value_pair :
241 std::vector<std::pair<DatetimeExtractorType, int>>{
242 {DatetimeExtractorType_ZERO, 0},
243 {DatetimeExtractorType_ONE, 1},
244 {DatetimeExtractorType_TWO, 2},
245 {DatetimeExtractorType_THREE, 3},
246 {DatetimeExtractorType_FOUR, 4},
247 {DatetimeExtractorType_FIVE, 5},
248 {DatetimeExtractorType_SIX, 6},
249 {DatetimeExtractorType_SEVEN, 7},
250 {DatetimeExtractorType_EIGHT, 8},
251 {DatetimeExtractorType_NINE, 9},
252 {DatetimeExtractorType_TEN, 10},
253 {DatetimeExtractorType_ELEVEN, 11},
254 {DatetimeExtractorType_TWELVE, 12},
255 {DatetimeExtractorType_THIRTEEN, 13},
256 {DatetimeExtractorType_FOURTEEN, 14},
257 {DatetimeExtractorType_FIFTEEN, 15},
258 {DatetimeExtractorType_SIXTEEN, 16},
259 {DatetimeExtractorType_SEVENTEEN, 17},
260 {DatetimeExtractorType_EIGHTEEN, 18},
261 {DatetimeExtractorType_NINETEEN, 19},
262 {DatetimeExtractorType_TWENTY, 20},
263 {DatetimeExtractorType_THIRTY, 30},
264 {DatetimeExtractorType_FORTY, 40},
265 {DatetimeExtractorType_FIFTY, 50},
266 {DatetimeExtractorType_SIXTY, 60},
267 {DatetimeExtractorType_SEVENTY, 70},
268 {DatetimeExtractorType_EIGHTY, 80},
269 {DatetimeExtractorType_NINETY, 90},
270 {DatetimeExtractorType_HUNDRED, 100},
271 {DatetimeExtractorType_THOUSAND, 1000},
272 }) {
273 int rule_id;
274 if (!RuleIdForType(type_value_pair.first, &rule_id)) {
275 return false;
276 }
277
278 std::unique_ptr<UniLib::RegexMatcher> matcher =
279 rules_[rule_id]->Matcher(input);
280 if (!matcher) {
281 return false;
282 }
283
284 int status;
285 while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
286 int span_start = matcher->Start(&status);
287 if (status != UniLib::RegexMatcher::kNoError) {
288 return false;
289 }
290 found_numbers.push_back({span_start, type_value_pair.second});
291 }
292 }
293
294 std::sort(found_numbers.begin(), found_numbers.end(),
295 [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
296 return a.first < b.first;
297 });
298
299 int sum = 0;
300 int running_value = -1;
301 // Simple math to make sure we handle written numerical modifiers correctly
302 // so that :="fifty one thousand and one" maps to 51001 and not 50 1 1000 1.
303 for (const std::pair<int, int> position_number_pair : found_numbers) {
304 if (running_value >= 0) {
305 if (running_value > position_number_pair.second) {
306 sum += running_value;
307 running_value = position_number_pair.second;
308 } else {
309 running_value *= position_number_pair.second;
310 }
311 } else {
312 running_value = position_number_pair.second;
313 }
314 }
315 sum += running_value;
316 *parsed_number = sum;
317 return true;
318 }
319
ParseDigits(const UnicodeText & input,int * parsed_digits) const320 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
321 int* parsed_digits) const {
322 UnicodeText digit;
323 if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
324 return false;
325 }
326
327 if (!unilib_.ParseInt32(digit, parsed_digits)) {
328 return false;
329 }
330 return true;
331 }
332
ParseYear(const UnicodeText & input,int * parsed_year) const333 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
334 int* parsed_year) const {
335 if (!ParseDigits(input, parsed_year)) {
336 return false;
337 }
338
339 if (*parsed_year < 100) {
340 if (*parsed_year < 50) {
341 *parsed_year += 2000;
342 } else {
343 *parsed_year += 1900;
344 }
345 }
346
347 return true;
348 }
349
ParseMonth(const UnicodeText & input,int * parsed_month) const350 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
351 int* parsed_month) const {
352 if (ParseDigits(input, parsed_month)) {
353 return true;
354 }
355
356 if (MapInput(input,
357 {
358 {DatetimeExtractorType_JANUARY, 1},
359 {DatetimeExtractorType_FEBRUARY, 2},
360 {DatetimeExtractorType_MARCH, 3},
361 {DatetimeExtractorType_APRIL, 4},
362 {DatetimeExtractorType_MAY, 5},
363 {DatetimeExtractorType_JUNE, 6},
364 {DatetimeExtractorType_JULY, 7},
365 {DatetimeExtractorType_AUGUST, 8},
366 {DatetimeExtractorType_SEPTEMBER, 9},
367 {DatetimeExtractorType_OCTOBER, 10},
368 {DatetimeExtractorType_NOVEMBER, 11},
369 {DatetimeExtractorType_DECEMBER, 12},
370 },
371 parsed_month)) {
372 return true;
373 }
374
375 return false;
376 }
377
ParseAMPM(const UnicodeText & input,DateParseData::AMPM * parsed_ampm) const378 bool DatetimeExtractor::ParseAMPM(const UnicodeText& input,
379 DateParseData::AMPM* parsed_ampm) const {
380 return MapInput(input,
381 {
382 {DatetimeExtractorType_AM, DateParseData::AMPM::AM},
383 {DatetimeExtractorType_PM, DateParseData::AMPM::PM},
384 },
385 parsed_ampm);
386 }
387
ParseRelationDistance(const UnicodeText & input,int * parsed_distance) const388 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
389 int* parsed_distance) const {
390 if (ParseDigits(input, parsed_distance)) {
391 return true;
392 }
393 if (ParseWrittenNumber(input, parsed_distance)) {
394 return true;
395 }
396 return false;
397 }
398
ParseRelation(const UnicodeText & input,DateParseData::Relation * parsed_relation) const399 bool DatetimeExtractor::ParseRelation(
400 const UnicodeText& input, DateParseData::Relation* parsed_relation) const {
401 return MapInput(
402 input,
403 {
404 {DatetimeExtractorType_NOW, DateParseData::Relation::NOW},
405 {DatetimeExtractorType_YESTERDAY, DateParseData::Relation::YESTERDAY},
406 {DatetimeExtractorType_TOMORROW, DateParseData::Relation::TOMORROW},
407 {DatetimeExtractorType_NEXT, DateParseData::Relation::NEXT},
408 {DatetimeExtractorType_NEXT_OR_SAME,
409 DateParseData::Relation::NEXT_OR_SAME},
410 {DatetimeExtractorType_LAST, DateParseData::Relation::LAST},
411 {DatetimeExtractorType_PAST, DateParseData::Relation::PAST},
412 {DatetimeExtractorType_FUTURE, DateParseData::Relation::FUTURE},
413 },
414 parsed_relation);
415 }
416
ParseRelationType(const UnicodeText & input,DateParseData::RelationType * parsed_relation_type) const417 bool DatetimeExtractor::ParseRelationType(
418 const UnicodeText& input,
419 DateParseData::RelationType* parsed_relation_type) const {
420 return MapInput(
421 input,
422 {
423 {DatetimeExtractorType_MONDAY, DateParseData::RelationType::MONDAY},
424 {DatetimeExtractorType_TUESDAY, DateParseData::RelationType::TUESDAY},
425 {DatetimeExtractorType_WEDNESDAY,
426 DateParseData::RelationType::WEDNESDAY},
427 {DatetimeExtractorType_THURSDAY,
428 DateParseData::RelationType::THURSDAY},
429 {DatetimeExtractorType_FRIDAY, DateParseData::RelationType::FRIDAY},
430 {DatetimeExtractorType_SATURDAY,
431 DateParseData::RelationType::SATURDAY},
432 {DatetimeExtractorType_SUNDAY, DateParseData::RelationType::SUNDAY},
433 {DatetimeExtractorType_SECONDS, DateParseData::RelationType::SECOND},
434 {DatetimeExtractorType_MINUTES, DateParseData::RelationType::MINUTE},
435 {DatetimeExtractorType_HOURS, DateParseData::RelationType::HOUR},
436 {DatetimeExtractorType_DAY, DateParseData::RelationType::DAY},
437 {DatetimeExtractorType_WEEK, DateParseData::RelationType::WEEK},
438 {DatetimeExtractorType_MONTH, DateParseData::RelationType::MONTH},
439 {DatetimeExtractorType_YEAR, DateParseData::RelationType::YEAR},
440 },
441 parsed_relation_type);
442 }
443
444 } // namespace libtextclassifier3
445