1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 18 #define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 19 20 #include <string> 21 #include <unordered_map> 22 #include <unordered_set> 23 #include <vector> 24 25 #include "annotator/feature-processor.h" 26 #include "annotator/model_generated.h" 27 #include "annotator/types.h" 28 #include "utils/utf8/unicodetext.h" 29 30 namespace libtextclassifier3 { 31 32 namespace internal { 33 enum class DurationUnit { 34 UNKNOWN = -1, 35 WEEK = 0, 36 DAY = 1, 37 HOUR = 2, 38 MINUTE = 3, 39 SECOND = 4 40 41 // NOTE: If we want to add MONTH and YEAR we'll have to think of different 42 // parsing format, because MONTH and YEAR don't have a fixed number of 43 // milliseconds, unlike week/day/hour/minute/second. We ignore the daylight 44 // savings time and assume the day is always 24 hours. 45 }; 46 47 // Prepares the mapping between token values and duration unit types. 48 std::unordered_map<std::string, internal::DurationUnit> 49 BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options); 50 51 // Creates a set of strings from a flatbuffer string vector. 52 std::unordered_set<std::string> BuildStringSet( 53 const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*); 54 55 } // namespace internal 56 57 // Annotator of duration expressions like "3 minutes 30 seconds". 58 class DurationAnnotator { 59 public: DurationAnnotator(const DurationAnnotatorOptions * options,const FeatureProcessor * feature_processor)60 explicit DurationAnnotator(const DurationAnnotatorOptions* options, 61 const FeatureProcessor* feature_processor) 62 : options_(options), 63 feature_processor_(feature_processor), 64 token_value_to_duration_unit_( 65 internal::BuildTokenToDurationUnitMapping(options)), 66 filler_expressions_( 67 internal::BuildStringSet(options->filler_expressions())), 68 half_expressions_( 69 internal::BuildStringSet(options->half_expressions())) {} 70 71 // Classifies given text, and if it is a duration, it passes the result in 72 // 'classification_result' and returns true, otherwise returns false. 73 bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, 74 AnnotationUsecase annotation_usecase, 75 ClassificationResult* classification_result) const; 76 77 // Finds all duration instances in the input text. 78 bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens, 79 AnnotationUsecase annotation_usecase, 80 std::vector<AnnotatedSpan>* results) const; 81 82 private: 83 // Represents a component of duration parsed from text (e.g. "3 hours" from 84 // the expression "3 hours and 20 minutes"). 85 struct ParsedDurationAtom { 86 // Unit of the duration. 87 internal::DurationUnit unit = internal::DurationUnit::UNKNOWN; 88 89 // Quantity of the duration unit. 90 int value = 0; 91 92 // True, if half an unit was specified (either in addition, or exclusively). 93 // E.g. "hour and a half". 94 // NOTE: Quarter, three-quarters etc. is not supported. 95 bool plus_half = false; 96 HalfParsedDurationAtom97 static ParsedDurationAtom Half() { 98 ParsedDurationAtom result; 99 result.plus_half = true; 100 return result; 101 } 102 }; 103 104 // Starts consuming tokens and returns the index past the last consumed token. 105 int FindDurationStartingAt(const UnicodeText& context, 106 const std::vector<Token>& tokens, 107 int start_token_index, 108 AnnotatedSpan* result) const; 109 110 bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const; 111 bool ParseDurationUnitToken(const Token& token, 112 internal::DurationUnit* duration_unit) const; 113 bool ParseFillerToken(const Token& token) const; 114 115 int64 ParsedDurationAtomsToMillis( 116 const std::vector<ParsedDurationAtom>& atoms) const; 117 118 const DurationAnnotatorOptions* options_; 119 const FeatureProcessor* feature_processor_; 120 const std::unordered_map<std::string, internal::DurationUnit> 121 token_value_to_duration_unit_; 122 const std::unordered_set<std::string> filler_expressions_; 123 const std::unordered_set<std::string> half_expressions_; 124 }; 125 126 } // namespace libtextclassifier3 127 128 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ 129