1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
18 #define LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
19 
20 #include <string>
21 #include <unordered_set>
22 #include <vector>
23 
24 #include "annotator/feature-processor.h"
25 #include "annotator/model_generated.h"
26 #include "annotator/types.h"
27 #include "utils/utf8/unicodetext.h"
28 
29 namespace libtextclassifier3 {
30 
31 // Annotator of numbers in text.
32 //
33 // Only supports values in range [-999 999 999, 999 999 999] (inclusive).
34 //
35 // TODO(zilka): Add support for non-ASCII digits.
36 // TODO(zilka): Add support for written-out numbers.
37 class NumberAnnotator {
38  public:
NumberAnnotator(const NumberAnnotatorOptions * options,const FeatureProcessor * feature_processor)39   explicit NumberAnnotator(const NumberAnnotatorOptions* options,
40                            const FeatureProcessor* feature_processor)
41       : options_(options),
42         feature_processor_(feature_processor),
43         allowed_prefix_codepoints_(
44             FlatbuffersVectorToSet(options->allowed_prefix_codepoints())),
45         allowed_suffix_codepoints_(
46             FlatbuffersVectorToSet(options->allowed_suffix_codepoints())) {}
47 
48   // Classifies given text, and if it is a number, it passes the result in
49   // 'classification_result' and returns true, otherwise returns false.
50   bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
51                     AnnotationUsecase annotation_usecase,
52                     ClassificationResult* classification_result) const;
53 
54   // Finds all number instances in the input text.
55   bool FindAll(const UnicodeText& context_unicode,
56                AnnotationUsecase annotation_usecase,
57                std::vector<AnnotatedSpan>* result) const;
58 
59  private:
60   static std::unordered_set<int> FlatbuffersVectorToSet(
61       const flatbuffers::Vector<int32_t>* codepoints);
62 
63   // Parses the text to an int64 value and returns true if succeeded, otherwise
64   // false. Also returns the number of prefix/suffix codepoints that were
65   // stripped from the number.
66   bool ParseNumber(const UnicodeText& text, int64* result,
67                    int* num_prefix_codepoints,
68                    int* num_suffix_codepoints) const;
69 
70   const NumberAnnotatorOptions* options_;
71   const FeatureProcessor* feature_processor_;
72   const std::unordered_set<int> allowed_prefix_codepoints_;
73   const std::unordered_set<int> allowed_suffix_codepoints_;
74 };
75 
76 }  // namespace libtextclassifier3
77 
78 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_NUMBER_NUMBER_H_
79