1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_
18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_
19 
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "annotator/model_generated.h"
#include "annotator/types.h"
#include "utils/strings/stringpiece.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
29 
30 namespace libtextclassifier3 {
31 
32 struct CompiledRule {
33   // The compiled regular expression.
34   std::unique_ptr<const UniLib::RegexPattern> compiled_regex;
35 
36   // The uncompiled pattern and information about the pattern groups.
37   const DatetimeModelPattern_::Regex* regex;
38 
39   // DatetimeModelPattern which 'regex' is part of and comes from.
40   const DatetimeModelPattern* pattern;
41 };
42 
43 // A helper class for DatetimeParser that extracts structured data
44 // (DateParseDate) from the current match of the passed RegexMatcher.
45 class DatetimeExtractor {
46  public:
DatetimeExtractor(const CompiledRule & rule,const UniLib::RegexMatcher & matcher,int locale_id,const UniLib * unilib,const std::vector<std::unique_ptr<const UniLib::RegexPattern>> & extractor_rules,const std::unordered_map<DatetimeExtractorType,std::unordered_map<int,int>> & type_and_locale_to_extractor_rule)47   explicit DatetimeExtractor(
48       const CompiledRule& rule, const UniLib::RegexMatcher& matcher,
49       int locale_id, const UniLib* unilib,
50       const std::vector<std::unique_ptr<const UniLib::RegexPattern>>&
51           extractor_rules,
52       const std::unordered_map<DatetimeExtractorType,
53                                std::unordered_map<int, int>>&
54           type_and_locale_to_extractor_rule)
55       : rule_(rule),
56         matcher_(matcher),
57         locale_id_(locale_id),
58         unilib_(*unilib),
59         rules_(extractor_rules),
60         type_and_locale_to_rule_(type_and_locale_to_extractor_rule) {}
61   bool Extract(DatetimeParsedData* result, CodepointSpan* result_span) const;
62 
63  private:
64   bool RuleIdForType(DatetimeExtractorType type, int* rule_id) const;
65 
66   // Returns true if the rule for given extractor matched. If it matched,
67   // match_result will contain the first group of the rule (if match_result not
68   // nullptr).
69   bool ExtractType(const UnicodeText& input,
70                    DatetimeExtractorType extractor_type,
71                    UnicodeText* match_result = nullptr) const;
72 
73   bool GroupTextFromMatch(int group_id, UnicodeText* result) const;
74 
75   // Updates the span to include the current match for the given group.
76   bool UpdateMatchSpan(int group_id, CodepointSpan* span) const;
77 
78   // Returns true if any of the extractors from 'mapping' matched. If it did,
79   // will fill 'result' with the associated value from 'mapping'.
80   template <typename T>
81   bool MapInput(const UnicodeText& input,
82                 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
83                 T* result) const;
84 
85   bool ParseDigits(const UnicodeText& input, int* parsed_digits) const;
86   bool ParseWrittenNumber(const UnicodeText& input, int* parsed_number) const;
87   bool ParseYear(const UnicodeText& input, int* parsed_year) const;
88   bool ParseMonth(const UnicodeText& input, int* parsed_month) const;
89   bool ParseMeridiem(const UnicodeText& input, int* parsed_meridiem) const;
90   bool ParseRelativeValue(
91       const UnicodeText& input,
92       DatetimeComponent::RelativeQualifier* parsed_relative_value) const;
93   bool ParseRelationDistance(const UnicodeText& input,
94                              int* parsed_distance) const;
95   bool ParseFieldType(
96       const UnicodeText& input,
97       DatetimeComponent::ComponentType* parsed_field_type) const;
98   bool ParseDayOfWeek(const UnicodeText& input, int* parsed_day_of_week) const;
99 
100   bool ParseRelationAndConvertToRelativeCount(const UnicodeText& input,
101                                               int* relative_count) const;
102 
103   // There are some special words which represent multiple date time components
104   // e.g. if the text says “by noon” it clearly indicates that the hour is 12,
105   // minute is 0 and meridiam is PM.
106   // The method handles such tokens and translates them into multiple date time
107   // components.
108   bool ParseAbsoluteDateValues(
109       const UnicodeText& input,
110       std::unordered_map<DatetimeComponent::ComponentType, int>* values) const;
111 
112   const CompiledRule& rule_;
113   const UniLib::RegexMatcher& matcher_;
114   int locale_id_;
115   const UniLib& unilib_;
116   const std::vector<std::unique_ptr<const UniLib::RegexPattern>>& rules_;
117   const std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>>&
118       type_and_locale_to_rule_;
119 };
120 
121 }  // namespace libtextclassifier3
122 
123 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_
124