1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
19 
20 
21 #include <stddef.h>
22 
23 #include <memory>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 #include "lang_id/common/lite_base/macros.h"
29 #include "lang_id/model-provider.h"
30 
31 namespace libtextclassifier3 {
32 namespace mobile {
33 namespace lang_id {
34 
35 // Forward-declaration of the class that performs all underlying work.
36 class LangIdImpl;
37 
38 struct LangIdResult {
39   // Output of LangId::FindLanguages: an n-best list of (language code,
40   // probability) pairs, sorted in descending order of probability.
41   //
42   // This list is guaranteed to be non-empty after calling
43   // LangId::FindLanguages.  The most likely language code is always the first
44   // item in this list.
45   //
46   // If the model cannot make a prediction, this list contains a single entry:
47   // the language code LangId::kUnknownLanguageCode with probability 1.
48   std::vector<std::pair<string, float>> predictions;
49 };
50 
51 // Class for detecting the language of a document.
52 //
53 // Note: this class does not handle the details of loading the actual model.
54 // Those details have been "outsourced" to the ModelProvider class.
55 //
56 // This class is thread safe.
57 class LangId {
58  public:
59   // Standard BCP-47 language code for Unknown/Undetermined language.
60   static const char kUnknownLanguageCode[];
61 
62   // Constructs a LangId object, based on |model_provider|.
63   //
64   // Note: we don't crash if we detect a problem at construction time (e.g., the
65   // model provider can't read an underlying file).  Instead, we mark the
66   // newly-constructed object as invalid; clients can invoke FindLanguage() on
67   // an invalid object: nothing crashes, but accuracy will be bad.
68   explicit LangId(std::unique_ptr<ModelProvider> model_provider);
69 
70   virtual ~LangId();
71 
72   // Computes the an n-best list of language codes and probabilities
73   // corresponding to the most likely languages the given input text is written
74   // in. The list is sorted in descending order by language probability.
75   //
76   // The input text consists of the |num_bytes| bytes that starts at |data|.
77   //
78   // Note: If this LangId object is not valid (see is_valid()) or if this LangId
79   // object can't make a prediction, this method sets the LangIdResult to
80   // contain a single entry with kUnknownLanguageCode with probability 1.
81   void FindLanguages(const char *data, size_t num_bytes,
82                      LangIdResult *result) const;
83 
84   // Convenience version of FindLanguages(const char *, size_t, LangIdResult *).
FindLanguages(const string & text,LangIdResult * result)85   void FindLanguages(const string &text, LangIdResult *result) const {
86     FindLanguages(text.data(), text.size(), result);
87   }
88 
89   // Returns language code for the most likely language for a piece of text.
90   //
91   // The input text consists of the |num_bytes| bytes that start at |data|.
92   //
93   // Note: this method reports the most likely (1-best) language only if its
94   // probability is high enough; otherwise, it returns
95   // LangId::kUnknownLanguageCode.  The specific probability threshold is tuned
96   // to the needs of an early client.  If you need a different threshold, you
97   // can use FindLanguages (plural) to get the full LangIdResult, and apply your
98   // own threshold.
99   //
100   // Note: if this LangId object is not valid (see is_valid()) or if this LangId
101   // object can't make a prediction, then this method returns
102   // LangId::kUnknownLanguageCode.
103   //
104   string FindLanguage(const char *data, size_t num_bytes) const;
105 
106   // Convenience version of FindLanguage(const char *, size_t).
FindLanguage(const string & text)107   string FindLanguage(const string &text) const {
108     return FindLanguage(text.data(), text.size());
109   }
110 
111   // Returns true if this object has been correctly initialized and is ready to
112   // perform predictions.  For more info, see doc for LangId
113   // constructor above.
114   bool is_valid() const;
115 
116   // Returns the version of the model used by this LangId object.  On success,
117   // the returned version number is a strictly positive integer.  Returns 0 if
118   // the model version can not be determined (e.g., for old models that do not
119   // specify a version number).
120   int GetModelVersion() const;
121 
122   // Returns a typed property stored in the model file.
123   float GetFloatProperty(const string &property, float default_value) const;
124 
125  private:
126   // Pimpl ("pointer to implementation") pattern, to hide all internals from our
127   // clients.
128   std::unique_ptr<LangIdImpl> pimpl_;
129 
130   SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId);
131 };
132 
133 }  // namespace lang_id
134 }  // namespace mobile
135 }  // namespace libtextclassifier3
136 
137 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_
138