1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
19 
20 #include <string>
21 #include <unordered_map>
22 #include <unordered_set>
23 #include <vector>
24 
25 #include "annotator/feature-processor.h"
26 #include "annotator/model_generated.h"
27 #include "annotator/types.h"
28 #include "utils/utf8/unicodetext.h"
29 #include "utils/utf8/unilib.h"
30 
31 namespace libtextclassifier3 {
32 
33 namespace internal {
// Time units that a parsed duration component can be expressed in.
// UNKNOWN (-1) is the only negative value; the concrete units are numbered
// from 0 (WEEK) to 4 (SECOND), ordered from the longest to the shortest.
//
// NOTE: MONTH and YEAR are deliberately not listed. Unlike
// week/day/hour/minute/second they do not correspond to a fixed number of
// milliseconds, so supporting them would require a different parsing format.
// Daylight saving time is ignored and a day is assumed to always be 24 hours.
enum class DurationUnit {
  UNKNOWN = -1,
  WEEK = 0,
  DAY = 1,
  HOUR = 2,
  MINUTE = 3,
  SECOND = 4,
};
47 
48 // Prepares the mapping between token values and duration unit types.
49 std::unordered_map<std::string, internal::DurationUnit>
50 BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options,
51                                 const UniLib* unilib);
52 
53 // Creates a set of strings from a flatbuffer string vector.
54 std::unordered_set<std::string> BuildStringSet(
55     const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*
56         strings,
57     const UniLib* unilib);
58 
59 // Creates a set of ints from a flatbuffer int vector.
60 std::unordered_set<int32> BuildInt32Set(const flatbuffers::Vector<int32>* ints);
61 
62 }  // namespace internal
63 
64 // Annotator of duration expressions like "3 minutes 30 seconds".
65 class DurationAnnotator {
66  public:
DurationAnnotator(const DurationAnnotatorOptions * options,const FeatureProcessor * feature_processor,const UniLib * unilib)67   explicit DurationAnnotator(const DurationAnnotatorOptions* options,
68                              const FeatureProcessor* feature_processor,
69                              const UniLib* unilib)
70       : options_(options),
71         feature_processor_(feature_processor),
72         unilib_(unilib),
73         token_value_to_duration_unit_(
74             internal::BuildTokenToDurationUnitMapping(options, unilib)),
75         filler_expressions_(
76             internal::BuildStringSet(options->filler_expressions(), unilib)),
77         half_expressions_(
78             internal::BuildStringSet(options->half_expressions(), unilib)),
79         sub_token_separator_codepoints_(internal::BuildInt32Set(
80             options->sub_token_separator_codepoints())) {}
81 
82   // Classifies given text, and if it is a duration, it passes the result in
83   // 'classification_result' and returns true, otherwise returns false.
84   bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
85                     AnnotationUsecase annotation_usecase,
86                     ClassificationResult* classification_result) const;
87 
88   // Finds all duration instances in the input text.
89   bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens,
90                AnnotationUsecase annotation_usecase,
91                std::vector<AnnotatedSpan>* results) const;
92 
93  private:
94   // Represents a component of duration parsed from text (e.g. "3 hours" from
95   // the expression "3 hours and 20 minutes").
96   struct ParsedDurationAtom {
97     // Unit of the duration.
98     internal::DurationUnit unit = internal::DurationUnit::UNKNOWN;
99 
100     // Quantity of the duration unit.
101     double value = 0;
102 
103     // True, if half an unit was specified (either in addition, or exclusively).
104     // E.g. "hour and a half".
105     // NOTE: Quarter, three-quarters etc. is not supported.
106     bool plus_half = false;
107 
HalfParsedDurationAtom108     static ParsedDurationAtom Half() {
109       ParsedDurationAtom result;
110       result.plus_half = true;
111       return result;
112     }
113   };
114 
115   // Starts consuming tokens and returns the index past the last consumed token.
116   int FindDurationStartingAt(const UnicodeText& context,
117                              const std::vector<Token>& tokens,
118                              int start_token_index,
119                              AnnotatedSpan* result) const;
120 
121   bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const;
122   bool ParseDurationUnitToken(const Token& token,
123                               internal::DurationUnit* duration_unit) const;
124   bool ParseQuantityDurationUnitToken(const Token& token,
125                                       ParsedDurationAtom* value) const;
126   bool ParseFillerToken(const Token& token) const;
127 
128   int64 ParsedDurationAtomsToMillis(
129       const std::vector<ParsedDurationAtom>& atoms) const;
130 
131   const DurationAnnotatorOptions* options_;
132   const FeatureProcessor* feature_processor_;
133   const UniLib* unilib_;
134   const std::unordered_map<std::string, internal::DurationUnit>
135       token_value_to_duration_unit_;
136   const std::unordered_set<std::string> filler_expressions_;
137   const std::unordered_set<std::string> half_expressions_;
138   const std::unordered_set<int32> sub_token_separator_codepoints_;
139 };
140 
141 }  // namespace libtextclassifier3
142 
143 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
144