1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_
18 #define LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_
19 
20 #include "annotator/model_generated.h"
21 #include "annotator/types.h"
22 #include "utils/utf8/unicodetext.h"
23 #include "utils/utf8/unilib.h"
24 #include "lang_id/lang-id.h"
25 
26 namespace libtextclassifier3 {
27 
28 // Returns classification with "translate" when the input text is in a language
29 // not understood by the user.
30 class TranslateAnnotator {
31  public:
TranslateAnnotator(const TranslateAnnotatorOptions * options,const libtextclassifier3::mobile::lang_id::LangId * langid_model,const UniLib * unilib)32   TranslateAnnotator(const TranslateAnnotatorOptions* options,
33                      const libtextclassifier3::mobile::lang_id::LangId* langid_model,
34                      const UniLib* unilib)
35       : options_(options), langid_model_(langid_model), unilib_(unilib) {}
36 
37   // Returns true if a classification_result was filled with "translate"
38   // classification.
39   bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
40                     const std::string& user_familiar_language_tags,
41                     ClassificationResult* classification_result) const;
42 
43  protected:
44   struct LanguageConfidence {
45     std::string language;
46     float confidence = -1.0;
47   };
48 
49   // Detects language of the selection in given context using the "Backoff
50   // algorithm", sorted by the score descendingly. It is based on several
51   // heuristics, see the code. This is the same algorithm that TextClassifier
52   // uses in Android Q.
53   std::vector<LanguageConfidence> BackoffDetectLanguages(
54       const UnicodeText& context, CodepointSpan selection_indices) const;
55 
56   // Returns the iterator of the next whitespace/punctuation character in given
57   // text, starting from given position and going forward (iff direction == 1),
58   // and backward (iff direction == -1).
59   UnicodeText::const_iterator FindIndexOfNextWhitespaceOrPunctuation(
60       const UnicodeText& text, int start_index, int direction) const;
61 
62   // Returns substring from given text, centered around the specified indices,
63   // of certain minimum length. The substring is token aligned, so it is
64   // guaranteed that the words won't be broken down.
65   UnicodeText TokenAlignedSubstringAroundSpan(const UnicodeText& text,
66                                               CodepointSpan indices,
67                                               int minimum_length) const;
68 
69  private:
70   std::string CreateSerializedEntityData(
71       const std::vector<TranslateAnnotator::LanguageConfidence>& confidences)
72       const;
73 
74   const TranslateAnnotatorOptions* options_;
75   const libtextclassifier3::mobile::lang_id::LangId* langid_model_;
76   const UniLib* unilib_;
77 };
78 
79 }  // namespace libtextclassifier3
80 
81 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_TRANSLATE_TRANSLATE_H_
82