/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_
18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_
19 
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "annotator/model_generated.h"
#include "annotator/types.h"
#include "utils/strings/stringpiece.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib.h"
29 
30 namespace libtextclassifier3 {
31 
32 struct CompiledRule {
33   // The compiled regular expression.
34   std::unique_ptr<const UniLib::RegexPattern> compiled_regex;
35 
36   // The uncompiled pattern and information about the pattern groups.
37   const DatetimeModelPattern_::Regex* regex;
38 
39   // DatetimeModelPattern which 'regex' is part of and comes from.
40   const DatetimeModelPattern* pattern;
41 };
42 
43 // A helper class for DatetimeParser that extracts structured data
44 // (DateParseDate) from the current match of the passed RegexMatcher.
45 class DatetimeExtractor {
46  public:
DatetimeExtractor(const CompiledRule & rule,const UniLib::RegexMatcher & matcher,int locale_id,const UniLib & unilib,const std::vector<std::unique_ptr<const UniLib::RegexPattern>> & extractor_rules,const std::unordered_map<DatetimeExtractorType,std::unordered_map<int,int>> & type_and_locale_to_extractor_rule)47   DatetimeExtractor(
48       const CompiledRule& rule, const UniLib::RegexMatcher& matcher,
49       int locale_id, const UniLib& unilib,
50       const std::vector<std::unique_ptr<const UniLib::RegexPattern>>&
51           extractor_rules,
52       const std::unordered_map<DatetimeExtractorType,
53                                std::unordered_map<int, int>>&
54           type_and_locale_to_extractor_rule)
55       : rule_(rule),
56         matcher_(matcher),
57         locale_id_(locale_id),
58         unilib_(unilib),
59         rules_(extractor_rules),
60         type_and_locale_to_rule_(type_and_locale_to_extractor_rule) {}
61   bool Extract(DateParseData* result, CodepointSpan* result_span) const;
62 
63  private:
64   bool RuleIdForType(DatetimeExtractorType type, int* rule_id) const;
65 
66   // Returns true if the rule for given extractor matched. If it matched,
67   // match_result will contain the first group of the rule (if match_result not
68   // nullptr).
69   bool ExtractType(const UnicodeText& input,
70                    DatetimeExtractorType extractor_type,
71                    UnicodeText* match_result = nullptr) const;
72 
73   bool GroupTextFromMatch(int group_id, UnicodeText* result) const;
74 
75   // Updates the span to include the current match for the given group.
76   bool UpdateMatchSpan(int group_id, CodepointSpan* span) const;
77 
78   // Returns true if any of the extractors from 'mapping' matched. If it did,
79   // will fill 'result' with the associated value from 'mapping'.
80   template <typename T>
81   bool MapInput(const UnicodeText& input,
82                 const std::vector<std::pair<DatetimeExtractorType, T>>& mapping,
83                 T* result) const;
84 
85   bool ParseDigits(const UnicodeText& input, int* parsed_digits) const;
86   bool ParseWrittenNumber(const UnicodeText& input, int* parsed_number) const;
87   bool ParseYear(const UnicodeText& input, int* parsed_year) const;
88   bool ParseMonth(const UnicodeText& input, int* parsed_month) const;
89   bool ParseAMPM(const UnicodeText& input,
90                  DateParseData::AMPM* parsed_ampm) const;
91   bool ParseRelation(const UnicodeText& input,
92                      DateParseData::Relation* parsed_relation) const;
93   bool ParseRelationDistance(const UnicodeText& input,
94                              int* parsed_distance) const;
95   bool ParseTimeUnit(const UnicodeText& input,
96                      DateParseData::TimeUnit* parsed_time_unit) const;
97   bool ParseRelationType(
98       const UnicodeText& input,
99       DateParseData::RelationType* parsed_relation_type) const;
100   bool ParseWeekday(const UnicodeText& input,
101                     DateParseData::RelationType* parsed_weekday) const;
102 
103   const CompiledRule& rule_;
104   const UniLib::RegexMatcher& matcher_;
105   int locale_id_;
106   const UniLib& unilib_;
107   const std::vector<std::unique_ptr<const UniLib::RegexPattern>>& rules_;
108   const std::unordered_map<DatetimeExtractorType, std::unordered_map<int, int>>&
109       type_and_locale_to_rule_;
110 };
111 
112 }  // namespace libtextclassifier3
113 
114 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_DATETIME_EXTRACTOR_H_
115