1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "annotator/datetime/extractor.h"
18 
19 #include "utils/base/logging.h"
20 
21 namespace libtextclassifier3 {
22 
Extract(DateParseData * result,CodepointSpan * result_span) const23 bool DatetimeExtractor::Extract(DateParseData* result,
24                                 CodepointSpan* result_span) const {
25   result->field_set_mask = 0;
26   *result_span = {kInvalidIndex, kInvalidIndex};
27 
28   if (rule_.regex->groups() == nullptr) {
29     return false;
30   }
31 
32   for (int group_id = 0; group_id < rule_.regex->groups()->size(); group_id++) {
33     UnicodeText group_text;
34     const int group_type = rule_.regex->groups()->Get(group_id);
35     if (group_type == DatetimeGroupType_GROUP_UNUSED) {
36       continue;
37     }
38     if (!GroupTextFromMatch(group_id, &group_text)) {
39       TC3_LOG(ERROR) << "Couldn't retrieve group.";
40       return false;
41     }
42     // The pattern can have a group defined in a part that was not matched,
43     // e.g. an optional part. In this case we'll get an empty content here.
44     if (group_text.empty()) {
45       continue;
46     }
47     switch (group_type) {
48       case DatetimeGroupType_GROUP_YEAR: {
49         if (!ParseYear(group_text, &(result->year))) {
50           TC3_LOG(ERROR) << "Couldn't extract YEAR.";
51           return false;
52         }
53         result->field_set_mask |= DateParseData::YEAR_FIELD;
54         break;
55       }
56       case DatetimeGroupType_GROUP_MONTH: {
57         if (!ParseMonth(group_text, &(result->month))) {
58           TC3_LOG(ERROR) << "Couldn't extract MONTH.";
59           return false;
60         }
61         result->field_set_mask |= DateParseData::MONTH_FIELD;
62         break;
63       }
64       case DatetimeGroupType_GROUP_DAY: {
65         if (!ParseDigits(group_text, &(result->day_of_month))) {
66           TC3_LOG(ERROR) << "Couldn't extract DAY.";
67           return false;
68         }
69         result->field_set_mask |= DateParseData::DAY_FIELD;
70         break;
71       }
72       case DatetimeGroupType_GROUP_HOUR: {
73         if (!ParseDigits(group_text, &(result->hour))) {
74           TC3_LOG(ERROR) << "Couldn't extract HOUR.";
75           return false;
76         }
77         result->field_set_mask |= DateParseData::HOUR_FIELD;
78         break;
79       }
80       case DatetimeGroupType_GROUP_MINUTE: {
81         if (!ParseDigits(group_text, &(result->minute))) {
82           TC3_LOG(ERROR) << "Couldn't extract MINUTE.";
83           return false;
84         }
85         result->field_set_mask |= DateParseData::MINUTE_FIELD;
86         break;
87       }
88       case DatetimeGroupType_GROUP_SECOND: {
89         if (!ParseDigits(group_text, &(result->second))) {
90           TC3_LOG(ERROR) << "Couldn't extract SECOND.";
91           return false;
92         }
93         result->field_set_mask |= DateParseData::SECOND_FIELD;
94         break;
95       }
96       case DatetimeGroupType_GROUP_AMPM: {
97         if (!ParseAMPM(group_text, &(result->ampm))) {
98           TC3_LOG(ERROR) << "Couldn't extract AMPM.";
99           return false;
100         }
101         result->field_set_mask |= DateParseData::AMPM_FIELD;
102         break;
103       }
104       case DatetimeGroupType_GROUP_RELATIONDISTANCE: {
105         if (!ParseRelationDistance(group_text, &(result->relation_distance))) {
106           TC3_LOG(ERROR) << "Couldn't extract RELATION_DISTANCE_FIELD.";
107           return false;
108         }
109         result->field_set_mask |= DateParseData::RELATION_DISTANCE_FIELD;
110         break;
111       }
112       case DatetimeGroupType_GROUP_RELATION: {
113         if (!ParseRelation(group_text, &(result->relation))) {
114           TC3_LOG(ERROR) << "Couldn't extract RELATION_FIELD.";
115           return false;
116         }
117         result->field_set_mask |= DateParseData::RELATION_FIELD;
118         break;
119       }
120       case DatetimeGroupType_GROUP_RELATIONTYPE: {
121         if (!ParseRelationType(group_text, &(result->relation_type))) {
122           TC3_LOG(ERROR) << "Couldn't extract RELATION_TYPE_FIELD.";
123           return false;
124         }
125         result->field_set_mask |= DateParseData::RELATION_TYPE_FIELD;
126         break;
127       }
128       case DatetimeGroupType_GROUP_DUMMY1:
129       case DatetimeGroupType_GROUP_DUMMY2:
130         break;
131       default:
132         TC3_LOG(INFO) << "Unknown group type.";
133         continue;
134     }
135     if (!UpdateMatchSpan(group_id, result_span)) {
136       TC3_LOG(ERROR) << "Couldn't update span.";
137       return false;
138     }
139   }
140 
141   if (result_span->first == kInvalidIndex ||
142       result_span->second == kInvalidIndex) {
143     *result_span = {kInvalidIndex, kInvalidIndex};
144   }
145 
146   return true;
147 }
148 
RuleIdForType(DatetimeExtractorType type,int * rule_id) const149 bool DatetimeExtractor::RuleIdForType(DatetimeExtractorType type,
150                                       int* rule_id) const {
151   auto type_it = type_and_locale_to_rule_.find(type);
152   if (type_it == type_and_locale_to_rule_.end()) {
153     return false;
154   }
155 
156   auto locale_it = type_it->second.find(locale_id_);
157   if (locale_it == type_it->second.end()) {
158     return false;
159   }
160   *rule_id = locale_it->second;
161   return true;
162 }
163 
ExtractType(const UnicodeText & input,DatetimeExtractorType extractor_type,UnicodeText * match_result) const164 bool DatetimeExtractor::ExtractType(const UnicodeText& input,
165                                     DatetimeExtractorType extractor_type,
166                                     UnicodeText* match_result) const {
167   int rule_id;
168   if (!RuleIdForType(extractor_type, &rule_id)) {
169     return false;
170   }
171 
172   std::unique_ptr<UniLib::RegexMatcher> matcher =
173       rules_[rule_id]->Matcher(input);
174   if (!matcher) {
175     return false;
176   }
177 
178   int status;
179   if (!matcher->Find(&status)) {
180     return false;
181   }
182 
183   if (match_result != nullptr) {
184     *match_result = matcher->Group(&status);
185     if (status != UniLib::RegexMatcher::kNoError) {
186       return false;
187     }
188   }
189   return true;
190 }
191 
GroupTextFromMatch(int group_id,UnicodeText * result) const192 bool DatetimeExtractor::GroupTextFromMatch(int group_id,
193                                            UnicodeText* result) const {
194   int status;
195   *result = matcher_.Group(group_id, &status);
196   if (status != UniLib::RegexMatcher::kNoError) {
197     return false;
198   }
199   return true;
200 }
201 
UpdateMatchSpan(int group_id,CodepointSpan * span) const202 bool DatetimeExtractor::UpdateMatchSpan(int group_id,
203                                         CodepointSpan* span) const {
204   int status;
205   const int match_start = matcher_.Start(group_id, &status);
206   if (status != UniLib::RegexMatcher::kNoError) {
207     return false;
208   }
209   const int match_end = matcher_.End(group_id, &status);
210   if (status != UniLib::RegexMatcher::kNoError) {
211     return false;
212   }
213   if (span->first == kInvalidIndex || span->first > match_start) {
214     span->first = match_start;
215   }
216   if (span->second == kInvalidIndex || span->second < match_end) {
217     span->second = match_end;
218   }
219 
220   return true;
221 }
222 
223 template <typename T>
MapInput(const UnicodeText & input,const std::vector<std::pair<DatetimeExtractorType,T>> & mapping,T * result) const224 bool DatetimeExtractor::MapInput(
225     const UnicodeText& input,
226     const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
227     T* result) const {
228   for (const auto& type_value_pair : mapping) {
229     if (ExtractType(input, type_value_pair.first)) {
230       *result = type_value_pair.second;
231       return true;
232     }
233   }
234   return false;
235 }
236 
ParseWrittenNumber(const UnicodeText & input,int * parsed_number) const237 bool DatetimeExtractor::ParseWrittenNumber(const UnicodeText& input,
238                                            int* parsed_number) const {
239   std::vector<std::pair<int, int>> found_numbers;
240   for (const auto& type_value_pair :
241        std::vector<std::pair<DatetimeExtractorType, int>>{
242            {DatetimeExtractorType_ZERO, 0},
243            {DatetimeExtractorType_ONE, 1},
244            {DatetimeExtractorType_TWO, 2},
245            {DatetimeExtractorType_THREE, 3},
246            {DatetimeExtractorType_FOUR, 4},
247            {DatetimeExtractorType_FIVE, 5},
248            {DatetimeExtractorType_SIX, 6},
249            {DatetimeExtractorType_SEVEN, 7},
250            {DatetimeExtractorType_EIGHT, 8},
251            {DatetimeExtractorType_NINE, 9},
252            {DatetimeExtractorType_TEN, 10},
253            {DatetimeExtractorType_ELEVEN, 11},
254            {DatetimeExtractorType_TWELVE, 12},
255            {DatetimeExtractorType_THIRTEEN, 13},
256            {DatetimeExtractorType_FOURTEEN, 14},
257            {DatetimeExtractorType_FIFTEEN, 15},
258            {DatetimeExtractorType_SIXTEEN, 16},
259            {DatetimeExtractorType_SEVENTEEN, 17},
260            {DatetimeExtractorType_EIGHTEEN, 18},
261            {DatetimeExtractorType_NINETEEN, 19},
262            {DatetimeExtractorType_TWENTY, 20},
263            {DatetimeExtractorType_THIRTY, 30},
264            {DatetimeExtractorType_FORTY, 40},
265            {DatetimeExtractorType_FIFTY, 50},
266            {DatetimeExtractorType_SIXTY, 60},
267            {DatetimeExtractorType_SEVENTY, 70},
268            {DatetimeExtractorType_EIGHTY, 80},
269            {DatetimeExtractorType_NINETY, 90},
270            {DatetimeExtractorType_HUNDRED, 100},
271            {DatetimeExtractorType_THOUSAND, 1000},
272        }) {
273     int rule_id;
274     if (!RuleIdForType(type_value_pair.first, &rule_id)) {
275       return false;
276     }
277 
278     std::unique_ptr<UniLib::RegexMatcher> matcher =
279         rules_[rule_id]->Matcher(input);
280     if (!matcher) {
281       return false;
282     }
283 
284     int status;
285     while (matcher->Find(&status) && status == UniLib::RegexMatcher::kNoError) {
286       int span_start = matcher->Start(&status);
287       if (status != UniLib::RegexMatcher::kNoError) {
288         return false;
289       }
290       found_numbers.push_back({span_start, type_value_pair.second});
291     }
292   }
293 
294   std::sort(found_numbers.begin(), found_numbers.end(),
295             [](const std::pair<int, int>& a, const std::pair<int, int>& b) {
296               return a.first < b.first;
297             });
298 
299   int sum = 0;
300   int running_value = -1;
301   // Simple math to make sure we handle written numerical modifiers correctly
302   // so that :="fifty one  thousand and one" maps to 51001 and not 50 1 1000 1.
303   for (const std::pair<int, int> position_number_pair : found_numbers) {
304     if (running_value >= 0) {
305       if (running_value > position_number_pair.second) {
306         sum += running_value;
307         running_value = position_number_pair.second;
308       } else {
309         running_value *= position_number_pair.second;
310       }
311     } else {
312       running_value = position_number_pair.second;
313     }
314   }
315   sum += running_value;
316   *parsed_number = sum;
317   return true;
318 }
319 
ParseDigits(const UnicodeText & input,int * parsed_digits) const320 bool DatetimeExtractor::ParseDigits(const UnicodeText& input,
321                                     int* parsed_digits) const {
322   UnicodeText digit;
323   if (!ExtractType(input, DatetimeExtractorType_DIGITS, &digit)) {
324     return false;
325   }
326 
327   if (!unilib_.ParseInt32(digit, parsed_digits)) {
328     return false;
329   }
330   return true;
331 }
332 
ParseYear(const UnicodeText & input,int * parsed_year) const333 bool DatetimeExtractor::ParseYear(const UnicodeText& input,
334                                   int* parsed_year) const {
335   if (!ParseDigits(input, parsed_year)) {
336     return false;
337   }
338 
339   if (*parsed_year < 100) {
340     if (*parsed_year < 50) {
341       *parsed_year += 2000;
342     } else {
343       *parsed_year += 1900;
344     }
345   }
346 
347   return true;
348 }
349 
ParseMonth(const UnicodeText & input,int * parsed_month) const350 bool DatetimeExtractor::ParseMonth(const UnicodeText& input,
351                                    int* parsed_month) const {
352   if (ParseDigits(input, parsed_month)) {
353     return true;
354   }
355 
356   if (MapInput(input,
357                {
358                    {DatetimeExtractorType_JANUARY, 1},
359                    {DatetimeExtractorType_FEBRUARY, 2},
360                    {DatetimeExtractorType_MARCH, 3},
361                    {DatetimeExtractorType_APRIL, 4},
362                    {DatetimeExtractorType_MAY, 5},
363                    {DatetimeExtractorType_JUNE, 6},
364                    {DatetimeExtractorType_JULY, 7},
365                    {DatetimeExtractorType_AUGUST, 8},
366                    {DatetimeExtractorType_SEPTEMBER, 9},
367                    {DatetimeExtractorType_OCTOBER, 10},
368                    {DatetimeExtractorType_NOVEMBER, 11},
369                    {DatetimeExtractorType_DECEMBER, 12},
370                },
371                parsed_month)) {
372     return true;
373   }
374 
375   return false;
376 }
377 
ParseAMPM(const UnicodeText & input,DateParseData::AMPM * parsed_ampm) const378 bool DatetimeExtractor::ParseAMPM(const UnicodeText& input,
379                                   DateParseData::AMPM* parsed_ampm) const {
380   return MapInput(input,
381                   {
382                       {DatetimeExtractorType_AM, DateParseData::AMPM::AM},
383                       {DatetimeExtractorType_PM, DateParseData::AMPM::PM},
384                   },
385                   parsed_ampm);
386 }
387 
ParseRelationDistance(const UnicodeText & input,int * parsed_distance) const388 bool DatetimeExtractor::ParseRelationDistance(const UnicodeText& input,
389                                               int* parsed_distance) const {
390   if (ParseDigits(input, parsed_distance)) {
391     return true;
392   }
393   if (ParseWrittenNumber(input, parsed_distance)) {
394     return true;
395   }
396   return false;
397 }
398 
ParseRelation(const UnicodeText & input,DateParseData::Relation * parsed_relation) const399 bool DatetimeExtractor::ParseRelation(
400     const UnicodeText& input, DateParseData::Relation* parsed_relation) const {
401   return MapInput(
402       input,
403       {
404           {DatetimeExtractorType_NOW, DateParseData::Relation::NOW},
405           {DatetimeExtractorType_YESTERDAY, DateParseData::Relation::YESTERDAY},
406           {DatetimeExtractorType_TOMORROW, DateParseData::Relation::TOMORROW},
407           {DatetimeExtractorType_NEXT, DateParseData::Relation::NEXT},
408           {DatetimeExtractorType_NEXT_OR_SAME,
409            DateParseData::Relation::NEXT_OR_SAME},
410           {DatetimeExtractorType_LAST, DateParseData::Relation::LAST},
411           {DatetimeExtractorType_PAST, DateParseData::Relation::PAST},
412           {DatetimeExtractorType_FUTURE, DateParseData::Relation::FUTURE},
413       },
414       parsed_relation);
415 }
416 
ParseRelationType(const UnicodeText & input,DateParseData::RelationType * parsed_relation_type) const417 bool DatetimeExtractor::ParseRelationType(
418     const UnicodeText& input,
419     DateParseData::RelationType* parsed_relation_type) const {
420   return MapInput(
421       input,
422       {
423           {DatetimeExtractorType_MONDAY, DateParseData::RelationType::MONDAY},
424           {DatetimeExtractorType_TUESDAY, DateParseData::RelationType::TUESDAY},
425           {DatetimeExtractorType_WEDNESDAY,
426            DateParseData::RelationType::WEDNESDAY},
427           {DatetimeExtractorType_THURSDAY,
428            DateParseData::RelationType::THURSDAY},
429           {DatetimeExtractorType_FRIDAY, DateParseData::RelationType::FRIDAY},
430           {DatetimeExtractorType_SATURDAY,
431            DateParseData::RelationType::SATURDAY},
432           {DatetimeExtractorType_SUNDAY, DateParseData::RelationType::SUNDAY},
433           {DatetimeExtractorType_SECONDS, DateParseData::RelationType::SECOND},
434           {DatetimeExtractorType_MINUTES, DateParseData::RelationType::MINUTE},
435           {DatetimeExtractorType_HOURS, DateParseData::RelationType::HOUR},
436           {DatetimeExtractorType_DAY, DateParseData::RelationType::DAY},
437           {DatetimeExtractorType_WEEK, DateParseData::RelationType::WEEK},
438           {DatetimeExtractorType_MONTH, DateParseData::RelationType::MONTH},
439           {DatetimeExtractorType_YEAR, DateParseData::RelationType::YEAR},
440       },
441       parsed_relation_type);
442 }
443 
444 }  // namespace libtextclassifier3
445