1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "lang_id/relevant-script-feature.h"
18 
19 #include <string>
20 
21 #include "common/feature-extractor.h"
22 #include "common/feature-types.h"
23 #include "common/task-context.h"
24 #include "common/workspace.h"
25 #include "lang_id/script-detector.h"
26 #include "util/base/logging.h"
27 #include "util/strings/utf8.h"
28 
29 namespace libtextclassifier {
30 namespace nlp_core {
31 namespace lang_id {
32 
Setup(TaskContext * context)33 bool RelevantScriptFeature::Setup(TaskContext *context) { return true; }
34 
Init(TaskContext * context)35 bool RelevantScriptFeature::Init(TaskContext *context) {
36   set_feature_type(new NumericFeatureType(name(), kNumRelevantScripts));
37   return true;
38 }
39 
Evaluate(const WorkspaceSet & workspaces,const LightSentence & sentence,FeatureVector * result) const40 void RelevantScriptFeature::Evaluate(const WorkspaceSet &workspaces,
41                                      const LightSentence &sentence,
42                                      FeatureVector *result) const {
43   // We expect kNumRelevantScripts to be small, so we stack-allocate the array
44   // of counts.  Still, if that changes, we want to find out.
45   static_assert(
46       kNumRelevantScripts < 25,
47       "switch counts to vector<int>: too big for stack-allocated int[]");
48 
49   // counts[s] is the number of characters with script s.
50   // Note: {} "value-initializes" the array to zero.
51   int counts[kNumRelevantScripts]{};
52   int total_count = 0;
53   for (int i = 0; i < sentence.num_words(); ++i) {
54     const std::string &word = sentence.word(i);
55     const char *const word_end = word.data() + word.size();
56     const char *curr = word.data();
57 
58     // Skip over token start '^'.
59     TC_DCHECK_EQ(*curr, '^');
60     curr += GetNumBytesForNonZeroUTF8Char(curr);
61     while (true) {
62       const int num_bytes = GetNumBytesForNonZeroUTF8Char(curr);
63       Script script = GetScript(curr, num_bytes);
64 
65       // We do this update and the if (...) break below *before* incrementing
66       // counts[script] in order to skip the token end '$'.
67       curr += num_bytes;
68       if (curr >= word_end) {
69         TC_DCHECK_EQ(*(curr - num_bytes), '$');
70         break;
71       }
72       TC_DCHECK_GE(script, 0);
73       TC_DCHECK_LT(script, kNumRelevantScripts);
74       counts[script]++;
75       total_count++;
76     }
77   }
78 
79   for (int script_id = 0; script_id < kNumRelevantScripts; ++script_id) {
80     int count = counts[script_id];
81     if (count > 0) {
82       const float weight = static_cast<float>(count) / total_count;
83       FloatFeatureValue value(script_id, weight);
84       result->add(feature_type(), value.discrete_value);
85     }
86   }
87 }
88 
89 }  // namespace lang_id
90 }  // namespace nlp_core
91 }  // namespace libtextclassifier
92