1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "annotator/datetime/extractor.h"
18 
19 #include "annotator/datetime/utils.h"
20 #include "annotator/model_generated.h"
21 #include "annotator/types.h"
22 #include "utils/base/logging.h"
23 
24 namespace libtextclassifier3 {
25 
Extract(DatetimeParsedData * result,CodepointSpan * result_span) const26 bool DatetimeExtractor::Extract(DatetimeParsedData* result,
27                                 CodepointSpan* result_span) const {
28   *result_span = {kInvalidIndex, kInvalidIndex};
29 
30   if (rule_.regex->groups() == nullptr) {
31     return false;
32   }
33 
34   // In the current implementation of extractor, the assumption is that there
35   // can only be one relative field.
36   DatetimeComponent::ComponentType component_type;
37   DatetimeComponent::RelativeQualifier relative_qualifier =
38       DatetimeComponent::RelativeQualifier::UNSPECIFIED;
39   int relative_count = 0;
40 
41   for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
42     UnicodeText group_text;
43     const int group_type = rule_.regex->groups()->Get(group_id);
44     if (group_type == DatetimeGroupType_GROUP_UNUSED) {
45       continue;
46     }
47     if (!GroupTextFromMatch(group_id, &group_text)) {
48       TC3_LOG(ERROR) << "Couldn't retrieve group.";
49       return false;
50     }
51     // The pattern can have a group defined in a part that was not matched,
52     // e.g. an optional part. In this case we'll get an empty content here.
53     if (group_text.empty()) {
54       continue;
55     }
56 
57     switch (group_type) {
58       case DatetimeGroupType_GROUP_YEAR: {
59         int year;
60         if (!ParseYear(group_text, &(year))) {
61           TC3_LOG(ERROR) << "Couldn't extract YEAR.";
62           return false;
63         }
64         result->SetAbsoluteValue(DatetimeComponent::ComponentType::YEAR, year);
65         break;
66       }
67       case DatetimeGroupType_GROUP_MONTH: {
68         int month;
69         if (!ParseMonth(group_text, &(month))) {
70           TC3_LOG(ERROR) << "Couldn't extract MONTH.";
71           return false;
72         }
73         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MONTH,
74                                  month);
75         break;
76       }
77       case DatetimeGroupType_GROUP_DAY: {
78         int day_of_month;
79         if (!ParseDigits(group_text, &(day_of_month))) {
80           TC3_LOG(ERROR) << "Couldn't extract DAY.";
81           return false;
82         }
83         result->SetAbsoluteValue(DatetimeComponent::ComponentType::DAY_OF_MONTH,
84                                  day_of_month);
85         break;
86       }
87       case DatetimeGroupType_GROUP_HOUR: {
88         int hour;
89         if (!ParseDigits(group_text, &(hour))) {
90           TC3_LOG(ERROR) << "Couldn't extract HOUR.";
91           return false;
92         }
93         result->SetAbsoluteValue(DatetimeComponent::ComponentType::HOUR, hour);
94         break;
95       }
96       case DatetimeGroupType_GROUP_MINUTE: {
97         int minute;
98         if (!ParseDigits(group_text, &(minute)) &&
99             !ParseWrittenNumber(group_text, &(minute))) {
100           TC3_LOG(ERROR) << "Couldn't extract MINUTE.";
101           return false;
102         }
103         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MINUTE,
104                                  minute);
105         break;
106       }
107       case DatetimeGroupType_GROUP_SECOND: {
108         int second;
109         if (!ParseDigits(group_text, &(second))) {
110           TC3_LOG(ERROR) << "Couldn't extract SECOND.";
111           return false;
112         }
113         result->SetAbsoluteValue(DatetimeComponent::ComponentType::SECOND,
114                                  second);
115         break;
116       }
117       case DatetimeGroupType_GROUP_AMPM: {
118         int meridiem;
119         if (!ParseMeridiem(group_text, &(meridiem))) {
120           TC3_LOG(ERROR) << "Couldn't extract AMPM.";
121           return false;
122         }
123         result->SetAbsoluteValue(DatetimeComponent::ComponentType::MERIDIEM,
124                                  meridiem);
125         break;
126       }
127       case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
128         relative_count = 0;
129         if (!ParseRelationDistance(group_text, &(relative_count))) {
130           TC3_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
131           return false;
132         }
133         break;
134       }
135       case DatetimeGroupType_GROUP_RELATION: {
136         if (!ParseRelativeValue(group_text, &relative_qualifier)) {
137           TC3_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
138           return false;
139         }
140         ParseRelationAndConvertToRelativeCount(group_text, &relative_count);
141         if (relative_qualifier ==
142                 DatetimeComponent::RelativeQualifier::TOMORROW ||
143             relative_qualifier == DatetimeComponent::RelativeQualifier::NOW ||
144             relative_qualifier ==
145                 DatetimeComponent::RelativeQualifier::YESTERDAY) {
146           if (!ParseFieldType(group_text, &component_type)) {
147             TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
148             return false;
149           }
150         }
151         break;
152       }
153       case DatetimeGroupType_GROUP_RELATIONTYPE: {
154         if (!ParseFieldType(group_text, &component_type)) {
155           TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
156           return false;
157         }
158         if (component_type == DatetimeComponent::ComponentType::DAY_OF_WEEK) {
159           int day_of_week;
160           if (!ParseDayOfWeek(group_text, &day_of_week)) {
161             TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
162             return false;
163           }
164           result->SetAbsoluteValue(component_type, day_of_week);
165         }
166         break;
167       }
168       case DatetimeGroupType_GROUP_ABSOLUTETIME: {
169         std::unordered_map<DatetimeComponent::ComponentType, int> values;
170         if (!ParseAbsoluteDateValues(group_text, &values)) {
171           TC3_LOG(ERROR) << "Couldn't extract Component values.";
172           return false;
173         }
174         for (const std::pair<const DatetimeComponent::ComponentType, int>&
175                  date_time_pair : values) {
176           result->SetAbsoluteValue(date_time_pair.first, date_time_pair.second);
177         }
178         break;
179       }
180       case DatetimeGroupType_GROUP_DUMMY1:
181       case DatetimeGroupType_GROUP_DUMMY2:
182         break;
183       default:
184         TC3_LOG(INFO) << "Unknown group type.";
185         continue;
186     }
187     if (!UpdateMatchSpan(group_id, result_span)) {
188       TC3_LOG(ERROR) << "Couldn't update span.";
189       return false;
190     }
191   }
192 
193   if (relative_qualifier != DatetimeComponent::RelativeQualifier::UNSPECIFIED) {
194     result->SetRelativeValue(component_type, relative_qualifier);
195     result->SetRelativeCount(component_type, relative_count);
196   }
197 
198   if (result_span->first == kInvalidIndex ||
199       result_span->second == kInvalidIndex) {
200     *result_span = {kInvalidIndex, kInvalidIndex};
201   }
202 
203   return true;
204 }
205 
RuleIdForType(DatetimeExtractorType type,int * rule_id) const206 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
207                                       int* rule_id) const {
208   auto type_it = type_and_locale_to_rule_.find(type);
209   if (type_it == type_and_locale_to_rule_.end()) {
210     return false;
211   }
212 
213   auto locale_it = type_it->second.find(locale_id_);
214   if (locale_it == type_it->second.end()) {
215     return false;
216   }
217   *rule_id = locale_it->second;
218   return true;
219 }
220 
ExtractType(const UnicodeText & input,DatetimeExtractorType extractor_type,UnicodeText * match_result) const221 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
222                                     DatetimeExtractorType extractor_type,
223                                     UnicodeText* match_result) const {
224   int rule_id;
225   if (!RuleIdForType(extractor_type, &rule_id)) {
226     return false;
227   }
228 
229   std::unique_ptr<UniLib::RegexMatcher> matcher =
230       rules_[rule_id]->Matcher(input);
231   if (!matcher) {
232     return false;
233   }
234 
235   int status;
236   if (!matcher->Find(&status)) {
237     return false;
238   }
239 
240   if (match_result != nullptr) {
241     *match_result = matcher->Group(&status);
242     if (status != UniLib::RegexMatcher::kNoError) {
243       return false;
244     }
245   }
246   return true;
247 }
248 
GroupTextFromMatch(int group_id,UnicodeText * result) const249 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
250                                            UnicodeText* result) const {
251   int status;
252   *result = matcher_.Group(group_id, &status);
253   if (status != UniLib::RegexMatcher::kNoError) {
254     return false;
255   }
256   return true;
257 }
258 
UpdateMatchSpan(int group_id,CodepointSpan * span) const259 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
260                                         CodepointSpan* span) const {
261   int status;
262   const int match_start = matcher_.Start(group_id, &status);
263   if (status != UniLib::RegexMatcher::kNoError) {
264     return false;
265   }
266   const int match_end = matcher_.End(group_id, &status);
267   if (status != UniLib::RegexMatcher::kNoError) {
268     return false;
269   }
270   if (span->first == kInvalidIndex || span->first > match_start) {
271     span->first = match_start;
272   }
273   if (span->second == kInvalidIndex || span->second < match_end) {
274     span->second = match_end;
275   }
276 
277   return true;
278 }
279 
280 template <typename T>
MapInput(const UnicodeText & input,const std::vector<std::pair<DatetimeExtractorType,T>> & mapping,T * result) const281 bool DatetimeExtractor::MapInput(
282     const UnicodeText& input,
283     const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
284     T* result) const {
285   for (const auto& type_value_pair : mapping) {
286     if (ExtractType(input, type_value_pair.first)) {
287       *result = type_value_pair.second;
288       return true;
289     }
290   }
291   return false;
292 }
293 
ParseWrittenNumber(const UnicodeText & input,int * parsed_number) const294 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
295                                            int* parsed_number) const {
296   std::vector<std::pair<int, int>> found_numbers;
297   for (const auto& type_value_pair :
298        std::vector<std::pair<DatetimeExtractorType, int>>{
299            {DatetimeExtractorType_ZERO, 0},
300            {DatetimeExtractorType_ONE, 1},
301            {DatetimeExtractorType_TWO, 2},
302            {DatetimeExtractorType_THREE, 3},
303            {DatetimeExtractorType_FOUR, 4},
304            {DatetimeExtractorType_FIVE, 5},
305            {DatetimeExtractorType_SIX, 6},
306            {DatetimeExtractorType_SEVEN, 7},
307            {DatetimeExtractorType_EIGHT, 8},
308            {DatetimeExtractorType_NINE, 9},
309            {DatetimeExtractorType_TEN, 10},
310            {DatetimeExtractorType_ELEVEN, 11},
311            {DatetimeExtractorType_TWELVE, 12},
312            {DatetimeExtractorType_THIRTEEN, 13},
313            {DatetimeExtractorType_FOURTEEN, 14},
314            {DatetimeExtractorType_FIFTEEN, 15},
315            {DatetimeExtractorType_SIXTEEN, 16},
316            {DatetimeExtractorType_SEVENTEEN, 17},
317            {DatetimeExtractorType_EIGHTEEN, 18},
318            {DatetimeExtractorType_NINETEEN, 19},
319            {DatetimeExtractorType_TWENTY, 20},
320            {DatetimeExtractorType_THIRTY, 30},
321            {DatetimeExtractorType_FORTY, 40},
322            {DatetimeExtractorType_FIFTY, 50},
323            {DatetimeExtractorType_SIXTY, 60},
324            {DatetimeExtractorType_SEVENTY, 70},
325            {DatetimeExtractorType_EIGHTY, 80},
326            {DatetimeExtractorType_NINETY, 90},
327            {DatetimeExtractorType_HUNDRED, 100},
328            {DatetimeExtractorType_THOUSAND, 1000},
329        }) {
330     int rule_id;
331     if (!RuleIdForType(type_value_pair.first, &rule_id)) {
332       return false;
333     }
334 
335     std::unique_ptr<UniLib::RegexMatcher> matcher =
336         rules_[rule_id]->Matcher(input);
337     if (!matcher) {
338       return false;
339     }
340     int status;
341     while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
342       int span_start = matcher->Start(&status);
343       if (status != UniLib::RegexMatcher::kNoError) {
344         return false;
345       }
346       found_numbers.push_back({span_start, type_value_pair.second});
347     }
348   }
349 
350   std::sort(found_numbers.begin(), found_numbers.end(),
351             [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
352               return a.first < b.first;
353             });
354 
355   int sum = 0;
356   int running_value = -1;
357   // Simple math to make sure we handle written numerical modifiers correctly
358   // so that :="fifty one  thousand and one" maps to 51001 and not 50 1 1000 1.
359   for (const std::pair<int, int>& position_number_pair : found_numbers) {
360     if (running_value >= 0) {
361       if (running_value > position_number_pair.second) {
362         sum += running_value;
363         running_value = position_number_pair.second;
364       } else {
365         running_value *= position_number_pair.second;
366       }
367     } else {
368       running_value = position_number_pair.second;
369     }
370   }
371   sum += running_value;
372   *parsed_number = sum;
373   return true;
374 }
375 
ParseDigits(const UnicodeText & input,int * parsed_digits) const376 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
377                                     int* parsed_digits) const {
378   UnicodeText digit;
379   if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
380     return false;
381   }
382 
383   if (!unilib_.ParseInt32(digit, parsed_digits)) {
384     return false;
385   }
386   return true;
387 }
388 
ParseYear(const UnicodeText & input,int * parsed_year) const389 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
390                                   int* parsed_year) const {
391   if (!ParseDigits(input, parsed_year)) {
392     return false;
393   }
394   *parsed_year = GetAdjustedYear(*parsed_year);
395 
396   return true;
397 }
398 
ParseMonth(const UnicodeText & input,int * parsed_month) const399 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
400                                    int* parsed_month) const {
401   if (ParseDigits(input, parsed_month)) {
402     return true;
403   }
404 
405   if (MapInput(input,
406                {
407                    {DatetimeExtractorType_JANUARY, 1},
408                    {DatetimeExtractorType_FEBRUARY, 2},
409                    {DatetimeExtractorType_MARCH, 3},
410                    {DatetimeExtractorType_APRIL, 4},
411                    {DatetimeExtractorType_MAY, 5},
412                    {DatetimeExtractorType_JUNE, 6},
413                    {DatetimeExtractorType_JULY, 7},
414                    {DatetimeExtractorType_AUGUST, 8},
415                    {DatetimeExtractorType_SEPTEMBER, 9},
416                    {DatetimeExtractorType_OCTOBER, 10},
417                    {DatetimeExtractorType_NOVEMBER, 11},
418                    {DatetimeExtractorType_DECEMBER, 12},
419                },
420                parsed_month)) {
421     return true;
422   }
423 
424   return false;
425 }
426 
ParseAbsoluteDateValues(const UnicodeText & input,std::unordered_map<DatetimeComponent::ComponentType,int> * values) const427 bool DatetimeExtractor::ParseAbsoluteDateValues(
428     const UnicodeText& input,
429     std::unordered_map<DatetimeComponent::ComponentType, int>* values) const {
430   if (MapInput(input,
431                {
432                    {DatetimeExtractorType_NOON,
433                     {{DatetimeComponent::ComponentType::MERIDIEM, 1},
434                      {DatetimeComponent::ComponentType::MINUTE, 0},
435                      {DatetimeComponent::ComponentType::HOUR, 12}}},
436                    {DatetimeExtractorType_MIDNIGHT,
437                     {{DatetimeComponent::ComponentType::MERIDIEM, 0},
438                      {DatetimeComponent::ComponentType::MINUTE, 0},
439                      {DatetimeComponent::ComponentType::HOUR, 0}}},
440                },
441                values)) {
442     return true;
443   }
444   return false;
445 }
446 
ParseMeridiem(const UnicodeText & input,int * parsed_meridiem) const447 bool DatetimeExtractor::ParseMeridiem(const UnicodeText& input,
448                                       int* parsed_meridiem) const {
449   return MapInput(input,
450                   {
451                       {DatetimeExtractorType_AM, 0 /* AM */},
452                       {DatetimeExtractorType_PM, 1 /* PM */},
453                   },
454                   parsed_meridiem);
455 }
456 
ParseRelationDistance(const UnicodeText & input,int * parsed_distance) const457 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
458                                               int* parsed_distance) const {
459   if (ParseDigits(input, parsed_distance)) {
460     return true;
461   }
462   if (ParseWrittenNumber(input, parsed_distance)) {
463     return true;
464   }
465   return false;
466 }
467 
ParseRelativeValue(const UnicodeText & input,DatetimeComponent::RelativeQualifier * parsed_relative_value) const468 bool DatetimeExtractor::ParseRelativeValue(
469     const UnicodeText& input,
470     DatetimeComponent::RelativeQualifier* parsed_relative_value) const {
471   return MapInput(input,
472                   {
473                       {DatetimeExtractorType_NOW,
474                        DatetimeComponent::RelativeQualifier::NOW},
475                       {DatetimeExtractorType_YESTERDAY,
476                        DatetimeComponent::RelativeQualifier::YESTERDAY},
477                       {DatetimeExtractorType_TOMORROW,
478                        DatetimeComponent::RelativeQualifier::TOMORROW},
479                       {DatetimeExtractorType_NEXT,
480                        DatetimeComponent::RelativeQualifier::NEXT},
481                       {DatetimeExtractorType_NEXT_OR_SAME,
482                        DatetimeComponent::RelativeQualifier::THIS},
483                       {DatetimeExtractorType_LAST,
484                        DatetimeComponent::RelativeQualifier::LAST},
485                       {DatetimeExtractorType_PAST,
486                        DatetimeComponent::RelativeQualifier::PAST},
487                       {DatetimeExtractorType_FUTURE,
488                        DatetimeComponent::RelativeQualifier::FUTURE},
489                   },
490                   parsed_relative_value);
491 }
492 
ParseRelationAndConvertToRelativeCount(const UnicodeText & input,int * relative_count) const493 bool DatetimeExtractor::ParseRelationAndConvertToRelativeCount(
494     const UnicodeText& input, int* relative_count) const {
495   return MapInput(input,
496                   {
497                       {DatetimeExtractorType_NOW, 0},
498                       {DatetimeExtractorType_YESTERDAY, -1},
499                       {DatetimeExtractorType_TOMORROW, 1},
500                       {DatetimeExtractorType_NEXT, 1},
501                       {DatetimeExtractorType_NEXT_OR_SAME, 1},
502                       {DatetimeExtractorType_LAST, -1},
503                       {DatetimeExtractorType_PAST, -1},
504                   },
505                   relative_count);
506 }
507 
ParseDayOfWeek(const UnicodeText & input,int * parsed_day_of_week) const508 bool DatetimeExtractor::ParseDayOfWeek(const UnicodeText& input,
509                                        int* parsed_day_of_week) const {
510   return MapInput(input,
511                   {
512                       {DatetimeExtractorType_SUNDAY, kSunday},
513                       {DatetimeExtractorType_MONDAY, kMonday},
514                       {DatetimeExtractorType_TUESDAY, kTuesday},
515                       {DatetimeExtractorType_WEDNESDAY, kWednesday},
516                       {DatetimeExtractorType_THURSDAY, kThursday},
517                       {DatetimeExtractorType_FRIDAY, kFriday},
518                       {DatetimeExtractorType_SATURDAY, kSaturday},
519                   },
520                   parsed_day_of_week);
521 }
522 
ParseFieldType(const UnicodeText & input,DatetimeComponent::ComponentType * parsed_field_type) const523 bool DatetimeExtractor::ParseFieldType(
524     const UnicodeText& input,
525     DatetimeComponent::ComponentType* parsed_field_type) const {
526   return MapInput(
527       input,
528       {
529           {DatetimeExtractorType_MONDAY,
530            DatetimeComponent::ComponentType::DAY_OF_WEEK},
531           {DatetimeExtractorType_TUESDAY,
532            DatetimeComponent::ComponentType::DAY_OF_WEEK},
533           {DatetimeExtractorType_WEDNESDAY,
534            DatetimeComponent::ComponentType::DAY_OF_WEEK},
535           {DatetimeExtractorType_THURSDAY,
536            DatetimeComponent::ComponentType::DAY_OF_WEEK},
537           {DatetimeExtractorType_FRIDAY,
538            DatetimeComponent::ComponentType::DAY_OF_WEEK},
539           {DatetimeExtractorType_SATURDAY,
540            DatetimeComponent::ComponentType::DAY_OF_WEEK},
541           {DatetimeExtractorType_SUNDAY,
542            DatetimeComponent::ComponentType::DAY_OF_WEEK},
543           {DatetimeExtractorType_SECONDS,
544            DatetimeComponent::ComponentType::SECOND},
545           {DatetimeExtractorType_MINUTES,
546            DatetimeComponent::ComponentType::MINUTE},
547           {DatetimeExtractorType_NOW,
548            DatetimeComponent::ComponentType::DAY_OF_MONTH},
549           {DatetimeExtractorType_HOURS, DatetimeComponent::ComponentType::HOUR},
550           {DatetimeExtractorType_DAY,
551            DatetimeComponent::ComponentType::DAY_OF_MONTH},
552           {DatetimeExtractorType_TOMORROW,
553            DatetimeComponent::ComponentType::DAY_OF_MONTH},
554           {DatetimeExtractorType_YESTERDAY,
555            DatetimeComponent::ComponentType::DAY_OF_MONTH},
556           {DatetimeExtractorType_WEEK, DatetimeComponent::ComponentType::WEEK},
557           {DatetimeExtractorType_MONTH,
558            DatetimeComponent::ComponentType::MONTH},
559           {DatetimeExtractorType_YEAR, DatetimeComponent::ComponentType::YEAR},
560       },
561       parsed_field_type);
562 }
563 
564 }  // namespace libtextclassifier3
565