1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_POD_NER_POD_NER_IMPL_H_
18 #define LIBTEXTCLASSIFIER_ANNOTATOR_POD_NER_POD_NER_IMPL_H_
19 
20 #include <memory>
21 
22 #include "annotator/model_generated.h"
23 #include "annotator/types.h"
24 #include "utils/bert_tokenizer.h"
25 #include "utils/utf8/unicodetext.h"
26 #include "utils/utf8/unilib.h"
27 #include "tensorflow/lite/context.h"
28 #include "tensorflow/lite/interpreter.h"
29 #include "tensorflow/lite/kernels/register.h"
30 #include "tensorflow/lite/string_util.h"
31 
32 namespace libtextclassifier3 {
33 
34 // Uses POD NER BERT-based model for annotating various types of entities.
35 class PodNerAnnotator {
36  public:
37   static std::unique_ptr<PodNerAnnotator> Create(const PodNerModel *model,
38                                                  const UniLib &unilib);
39 
40   bool Annotate(const UnicodeText &context,
41                 std::vector<AnnotatedSpan> *results) const;
42 
43   // Returns true if an entity was detected under 'click', and the selection
44   // indices expanded and assigned to 'result'. Otherwise returns false, and
45   // resets 'result'.
46   bool SuggestSelection(const UnicodeText &context, CodepointSpan click,
47                         AnnotatedSpan *result) const;
48 
49   bool ClassifyText(const UnicodeText &context, CodepointSpan click,
50                     ClassificationResult *result) const;
51 
52   std::vector<std::string> GetSupportedCollections() const;
53 
54  private:
PodNerAnnotator(const UniLib & unilib)55   explicit PodNerAnnotator(const UniLib &unilib) : unilib_(unilib) {}
56 
57   std::vector<PodNerModel_::LabelT> ReadResultsFromInterpreter(
58       tflite::Interpreter &interpreter) const;
59 
60   std::vector<PodNerModel_::LabelT> ExecuteModel(
61       const VectorSpan<int> &wordpiece_indices,
62       const VectorSpan<int32_t> &token_starts,
63       const VectorSpan<Token> &tokens) const;
64 
65   bool PrepareText(const UnicodeText &text_unicode,
66                    std::vector<int32_t> *wordpiece_indices,
67                    std::vector<int32_t> *token_starts,
68                    std::vector<Token> *tokens) const;
69 
70   bool AnnotateAroundSpanOfInterest(const UnicodeText &context,
71                                     const CodepointSpan &span_of_interest,
72                                     std::vector<AnnotatedSpan> *results) const;
73 
74   const UniLib &unilib_;
75   bool lowercase_input_;
76   int logits_index_in_output_tensor_;
77   bool append_final_period_;
78   int max_num_effective_wordpieces_;
79   int sliding_window_num_wordpieces_overlap_;
80   float max_ratio_unknown_wordpieces_;
81   int min_number_of_tokens_;
82   int min_number_of_wordpieces_;
83   int cls_wordpiece_id_;
84   int sep_wordpiece_id_;
85   int period_wordpiece_id_;
86   int unknown_wordpiece_id_;
87   std::vector<PodNerModel_::CollectionT> collections_;
88   std::vector<PodNerModel_::LabelT> labels_;
89   std::unique_ptr<BertTokenizer> tokenizer_;
90   const PodNerModel *model_;
91 };
92 
93 }  // namespace libtextclassifier3
94 
95 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_POD_NER_POD_NER_IMPL_H_
96