1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
18 #define NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
19 
20 #include <string>
21 
22 #include "lang_id/common/fel/task-context.h"
23 #include "lang_id/common/lite_strings/stringpiece.h"
24 #include "lang_id/light-sentence.h"
25 
26 namespace libtextclassifier3 {
27 namespace mobile {
28 namespace lang_id {
29 
30 // Custom tokenizer for the LangId model.
31 class TokenizerForLangId {
32  public:
33   void Setup(TaskContext *context);
34 
35   // Tokenizes |text|, placing the tokens into |sentence|.  Customized for
36   // LangId.  Currently (Sep 15, 2016) we tokenize on space, newline, tab, and
37   // any other 1-byte UTF8 character which is not a letter, ignore all empty
38   // tokens, and (for each of the remaining tokens) prepend "^" (special token
39   // begin marker) and append "$" (special token end marker).
40   //
41   // Tokens are stored into the "repeated Token token;" field of *sentence.
42   void Tokenize(StringPiece text, LightSentence *sentence) const;
43 
44  private:
45   // If true, during tokenization, we use the lowercase version of each Unicode
46   // character from the text to tokenize.  E.g., if this is true, the text "Foo
47   // bar" is tokenized as ["foo", "bar"]; otherwise, we get ["Foo", "bar"].
48   bool lowercase_input_ = false;
49 };
50 
51 }  // namespace lang_id
52 }  // namespace mobile
}  // namespace libtextclassifier3
54 
55 #endif  // NLP_SAFT_COMPONENTS_LANG_ID_MOBILE_CUSTOM_TOKENIZER_H_
56