1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 19 20 21 #include <stddef.h> 22 23 #include <memory> 24 #include <string> 25 #include <utility> 26 #include <vector> 27 28 #include "lang_id/common/lite_base/macros.h" 29 #include "lang_id/model-provider.h" 30 31 namespace libtextclassifier3 { 32 namespace mobile { 33 namespace lang_id { 34 35 // Forward-declaration of the class that performs all underlying work. 36 class LangIdImpl; 37 38 struct LangIdResult { 39 // An n-best list of possible language codes for a given input sorted in 40 // descending order according to each code's respective probability. 41 // 42 // This list is guaranteed to be non-empty after calling 43 // LangId::FindLanguages. The most likely language code is always the first 44 // item in this array. 45 // 46 // If the model cannot make a prediction, this array contains a single result: 47 // a language code LangId::kUnknownLanguageCode with probability 1. 48 std::vector<std::pair<string, float>> predictions; 49 }; 50 51 // Class for detecting the language of a document. 52 // 53 // Note: this class does not handle the details of loading the actual model. 54 // Those details have been "outsourced" to the ModelProvider class. 55 // 56 // This class is thread safe. 57 class LangId { 58 public: 59 // Standard BCP-47 language code for Unknown/Undetermined language. 60 static const char kUnknownLanguageCode[]; 61 62 // Constructs a LangId object, based on |model_provider|. 63 // 64 // Note: we don't crash if we detect a problem at construction time (e.g., the 65 // model provider can't read an underlying file). Instead, we mark the 66 // newly-constructed object as invalid; clients can invoke FindLanguage() on 67 // an invalid object: nothing crashes, but accuracy will be bad. 68 explicit LangId(std::unique_ptr<ModelProvider> model_provider); 69 70 virtual ~LangId(); 71 72 // Computes the an n-best list of language codes and probabilities 73 // corresponding to the most likely languages the given input text is written 74 // in. The list is sorted in descending order by language probability. 75 // 76 // The input text consists of the |num_bytes| bytes that starts at |data|. 77 // 78 // Note: If this LangId object is not valid (see is_valid()) or if this LangId 79 // object can't make a prediction, this method sets the LangIdResult to 80 // contain a single entry with kUnknownLanguageCode with probability 1. 81 void FindLanguages(const char *data, size_t num_bytes, 82 LangIdResult *result) const; 83 84 // Convenience version of FindLanguages(const char *, size_t, LangIdResult *). FindLanguages(const string & text,LangIdResult * result)85 void FindLanguages(const string &text, LangIdResult *result) const { 86 FindLanguages(text.data(), text.size(), result); 87 } 88 89 // Returns language code for the most likely language for a piece of text. 90 // 91 // The input text consists of the |num_bytes| bytes that start at |data|. 92 // 93 // Note: this method reports the most likely (1-best) language only if its 94 // probability is high enough; otherwise, it returns 95 // LangId::kUnknownLanguageCode. The specific probability threshold is tuned 96 // to the needs of an early client. If you need a different threshold, you 97 // can use FindLanguages (plural) to get the full LangIdResult, and apply your 98 // own threshold. 99 // 100 // Note: if this LangId object is not valid (see is_valid()) or if this LangId 101 // object can't make a prediction, then this method returns 102 // LangId::kUnknownLanguageCode. 103 // 104 string FindLanguage(const char *data, size_t num_bytes) const; 105 106 // Convenience version of FindLanguage(const char *, size_t). FindLanguage(const string & text)107 string FindLanguage(const string &text) const { 108 return FindLanguage(text.data(), text.size()); 109 } 110 111 // Returns true if this object has been correctly initialized and is ready to 112 // perform predictions. For more info, see doc for LangId 113 // constructor above. 114 bool is_valid() const; 115 116 // Returns the version of the model used by this LangId object. On success, 117 // the returned version number is a strictly positive integer. Returns 0 if 118 // the model version can not be determined (e.g., for old models that do not 119 // specify a version number). 120 int GetModelVersion() const; 121 122 // Returns a typed property stored in the model file. 123 float GetFloatProperty(const string &property, float default_value) const; 124 125 private: 126 // Pimpl ("pointer to implementation") pattern, to hide all internals from our 127 // clients. 128 std::unique_ptr<LangIdImpl> pimpl_; 129 130 SAFTM_DISALLOW_COPY_AND_ASSIGN(LangId); 131 }; 132 133 } // namespace lang_id 134 } // namespace mobile 135 } // namespace nlp_saft 136 137 #endif // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_LANG_ID_H_ 138