1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
19 
20 #include <string>
21 #include <unordered_map>
22 #include <unordered_set>
23 #include <vector>
24 
25 #include "annotator/feature-processor.h"
26 #include "annotator/model_generated.h"
27 #include "annotator/types.h"
28 #include "utils/utf8/unicodetext.h"
29 
30 namespace libtextclassifier3 {
31 
32 namespace internal {
// Units in which a duration component can be expressed.
//
// Only units with a fixed length in milliseconds are listed. Daylight savings
// time is ignored and a day is assumed to always last 24 hours.
//
// NOTE: Adding MONTH and YEAR would require a different parsing format,
// because, unlike week/day/hour/minute/second, they do not have a fixed number
// of milliseconds.
enum class DurationUnit {
  // Default value, used when no unit has been recognized.
  UNKNOWN = -1,

  WEEK = 0,
  DAY = 1,
  HOUR = 2,
  MINUTE = 3,
  SECOND = 4,
};
46 
47 // Prepares the mapping between token values and duration unit types.
48 std::unordered_map<std::string, internal::DurationUnit>
49 BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options);
50 
51 // Creates a set of strings from a flatbuffer string vector.
52 std::unordered_set<std::string> BuildStringSet(
53     const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*);
54 
55 }  // namespace internal
56 
57 // Annotator of duration expressions like "3 minutes 30 seconds".
58 class DurationAnnotator {
59  public:
DurationAnnotator(const DurationAnnotatorOptions * options,const FeatureProcessor * feature_processor)60   explicit DurationAnnotator(const DurationAnnotatorOptions* options,
61                              const FeatureProcessor* feature_processor)
62       : options_(options),
63         feature_processor_(feature_processor),
64         token_value_to_duration_unit_(
65             internal::BuildTokenToDurationUnitMapping(options)),
66         filler_expressions_(
67             internal::BuildStringSet(options->filler_expressions())),
68         half_expressions_(
69             internal::BuildStringSet(options->half_expressions())) {}
70 
71   // Classifies given text, and if it is a duration, it passes the result in
72   // 'classification_result' and returns true, otherwise returns false.
73   bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
74                     AnnotationUsecase annotation_usecase,
75                     ClassificationResult* classification_result) const;
76 
77   // Finds all duration instances in the input text.
78   bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens,
79                AnnotationUsecase annotation_usecase,
80                std::vector<AnnotatedSpan>* results) const;
81 
82  private:
83   // Represents a component of duration parsed from text (e.g. "3 hours" from
84   // the expression "3 hours and 20 minutes").
85   struct ParsedDurationAtom {
86     // Unit of the duration.
87     internal::DurationUnit unit = internal::DurationUnit::UNKNOWN;
88 
89     // Quantity of the duration unit.
90     int value = 0;
91 
92     // True, if half an unit was specified (either in addition, or exclusively).
93     // E.g. "hour and a half".
94     // NOTE: Quarter, three-quarters etc. is not supported.
95     bool plus_half = false;
96 
HalfParsedDurationAtom97     static ParsedDurationAtom Half() {
98       ParsedDurationAtom result;
99       result.plus_half = true;
100       return result;
101     }
102   };
103 
104   // Starts consuming tokens and returns the index past the last consumed token.
105   int FindDurationStartingAt(const UnicodeText& context,
106                              const std::vector<Token>& tokens,
107                              int start_token_index,
108                              AnnotatedSpan* result) const;
109 
110   bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const;
111   bool ParseDurationUnitToken(const Token& token,
112                               internal::DurationUnit* duration_unit) const;
113   bool ParseFillerToken(const Token& token) const;
114 
115   int64 ParsedDurationAtomsToMillis(
116       const std::vector<ParsedDurationAtom>& atoms) const;
117 
118   const DurationAnnotatorOptions* options_;
119   const FeatureProcessor* feature_processor_;
120   const std::unordered_map<std::string, internal::DurationUnit>
121       token_value_to_duration_unit_;
122   const std::unordered_set<std::string> filler_expressions_;
123   const std::unordered_set<std::string> half_expressions_;
124 };
125 
126 }  // namespace libtextclassifier3
127 
128 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
129