1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
19 
20 
21 #include <stddef.h>
22 
23 #include <memory>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 #include "lang_id/common/lite_base/macros.h"
29 #include "lang_id/model-provider.h"
30 
31 namespace libtextclassifier3 {
32 namespace mobile {
33 namespace lang_id {
34 
35 // Forward-declaration of the class that performs all underlying work.
36 class LangIdImpl;
37 
struct LangIdResult {
  // Candidate languages for one input, stored as (language code, probability)
  // pairs and ordered from the most probable to the least probable.  The top
  // guess is therefore always the first element.
  //
  // Once LangId::FindLanguages has filled in this struct, the vector is never
  // empty.
  //
  // When the model is unable to make a prediction, the vector holds exactly
  // one element: LangId::kUnknownLanguageCode paired with probability 1.
  std::vector<std::pair<std::string, float>> predictions;
};
50 
51 // Class for detecting the language of a document.
52 //
53 // Note: this class does not handle the details of loading the actual model.
54 // Those details have been "outsourced" to the ModelProvider class.
55 //
56 // This class is thread safe.
57 class LangId {
58  public:
59   // Standard BCP-47 language code for Unknown/Undetermined language.
60   static const char kUnknownLanguageCode[];
61 
62   // Constructs a LangId object, based on |model_provider|.
63   //
64   // Note: we don't crash if we detect a problem at construction time (e.g., the
65   // model provider can't read an underlying file).  Instead, we mark the
66   // newly-constructed object as invalid; clients can invoke FindLanguage() on
67   // an invalid object: nothing crashes, but accuracy will be bad.
68   explicit LangId(std::unique_ptr<ModelProvider> model_provider);
69 
70   virtual ~LangId();
71 
72   // Computes the n-best list of language codes and probabilities corresponding
73   // to the most likely languages the given input text is written in.  That list
74   // includes the most likely |max_results| languages and is sorted in
75   // descending order by language probability.
76   //
77   // The input text consists of the |num_bytes| bytes that starts at |data|.
78   //
79   // If max_results <= 0, we report probabilities for all languages known by
80   // this LangId object (as always, in decreasing order of their probabilities).
81   //
82   // Note: If this LangId object is not valid (see is_valid()) or if this LangId
83   // object can't make a prediction, this method sets the LangIdResult to
84   // contain a single entry with kUnknownLanguageCode with probability 1.
85   //
86   void FindLanguages(const char *data, size_t num_bytes, LangIdResult *result,
87                      int max_results = 0) const;
88 
89   // Convenience version of FindLanguages(const char *, size_t, LangIdResult *).
90   void FindLanguages(const std::string &text, LangIdResult *result,
91                      int max_results = 0) const {
92     FindLanguages(text.data(), text.size(), result, max_results);
93   }
94 
95   // Returns language code for the most likely language for a piece of text.
96   //
97   // The input text consists of the |num_bytes| bytes that start at |data|.
98   //
99   // Note: this method reports the most likely (1-best) language only if its
100   // probability is high enough; otherwise, it returns
101   // LangId::kUnknownLanguageCode.  The specific probability threshold is tuned
102   // to the needs of an early client.  If you need a different threshold, you
103   // can use FindLanguages (plural) to get the full LangIdResult, and apply your
104   // own threshold.
105   //
106   // Note: if this LangId object is not valid (see is_valid()) or if this LangId
107   // object can't make a prediction, then this method returns
108   // LangId::kUnknownLanguageCode.
109   //
110   std::string FindLanguage(const char *data, size_t num_bytes) const;
111 
112   // Convenience version of FindLanguage(const char *, size_t).
FindLanguage(const std::string & text)113   std::string FindLanguage(const std::string &text) const {
114     return FindLanguage(text.data(), text.size());
115   }
116 
117   // Returns true if this object has been correctly initialized and is ready to
118   // perform predictions.  For more info, see doc for LangId
119   // constructor above.
120   bool is_valid() const;
121 
122   // Returns the version of the model used by this LangId object.  On success,
123   // the returned version number is a strictly positive integer.  Returns 0 if
124   // the model version can not be determined (e.g., for old models that do not
125   // specify a version number).
126   int GetModelVersion() const;
127 
128   // Returns a typed property stored in the model file.
129   float GetFloatProperty(const std::string &property,
130                          float default_value) const;
131 
132  private:
133   // Pimpl ("pointer to implementation") pattern, to hide all internals from our
134   // clients.
135   std::unique_ptr<LangIdImpl> pimpl_;
136 
137   SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId);
138 };
139 
140 }  // namespace lang_id
141 }  // namespace mobile
}  // namespace libtextclassifier3
143 
144 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
145