1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "actions/feature-processor.h"
18 
19 namespace libtextclassifier3 {
20 namespace {
BuildTokenFeatureExtractorOptions(const ActionsTokenFeatureProcessorOptions * const options)21 TokenFeatureExtractorOptions BuildTokenFeatureExtractorOptions(
22     const ActionsTokenFeatureProcessorOptions* const options) {
23   TokenFeatureExtractorOptions extractor_options;
24   extractor_options.num_buckets = options->num_buckets();
25   if (options->chargram_orders() != nullptr) {
26     for (int order : *options->chargram_orders()) {
27       extractor_options.chargram_orders.push_back(order);
28     }
29   }
30   extractor_options.max_word_length = options->max_token_length();
31   extractor_options.extract_case_feature = options->extract_case_feature();
32   extractor_options.unicode_aware_features = options->unicode_aware_features();
33   extractor_options.extract_selection_mask_feature = false;
34   if (options->regexp_features() != nullptr) {
35     for (const auto& regexp_feauture : *options->regexp_features()) {
36       extractor_options.regexp_features.push_back(regexp_feauture->str());
37     }
38   }
39   extractor_options.remap_digits = options->remap_digits();
40   extractor_options.lowercase_tokens = options->lowercase_tokens();
41   return extractor_options;
42 }
43 }  // namespace
44 
CreateTokenizer(const ActionsTokenizerOptions * options,const UniLib * unilib)45 std::unique_ptr<Tokenizer> CreateTokenizer(
46     const ActionsTokenizerOptions* options, const UniLib* unilib) {
47   std::vector<const TokenizationCodepointRange*> codepoint_config;
48   if (options->tokenization_codepoint_config() != nullptr) {
49     codepoint_config.insert(codepoint_config.end(),
50                             options->tokenization_codepoint_config()->begin(),
51                             options->tokenization_codepoint_config()->end());
52   }
53   std::vector<const CodepointRange*> internal_codepoint_config;
54   if (options->internal_tokenizer_codepoint_ranges() != nullptr) {
55     internal_codepoint_config.insert(
56         internal_codepoint_config.end(),
57         options->internal_tokenizer_codepoint_ranges()->begin(),
58         options->internal_tokenizer_codepoint_ranges()->end());
59   }
60   const bool tokenize_on_script_change =
61       options->tokenization_codepoint_config() != nullptr &&
62       options->tokenize_on_script_change();
63   return std::unique_ptr<Tokenizer>(new Tokenizer(
64       options->type(), unilib, codepoint_config, internal_codepoint_config,
65       tokenize_on_script_change, options->icu_preserve_whitespace_tokens()));
66 }
67 
ActionsFeatureProcessor(const ActionsTokenFeatureProcessorOptions * options,const UniLib * unilib)68 ActionsFeatureProcessor::ActionsFeatureProcessor(
69     const ActionsTokenFeatureProcessorOptions* options, const UniLib* unilib)
70     : options_(options),
71       tokenizer_(CreateTokenizer(options->tokenizer_options(), unilib)),
72       token_feature_extractor_(BuildTokenFeatureExtractorOptions(options),
73                                *unilib) {}
74 
GetTokenEmbeddingSize() const75 int ActionsFeatureProcessor::GetTokenEmbeddingSize() const {
76   return options_->embedding_size() +
77          token_feature_extractor_.DenseFeaturesCount();
78 }
79 
AppendFeatures(const std::vector<int> & sparse_features,const std::vector<float> & dense_features,const EmbeddingExecutor * embedding_executor,std::vector<float> * output_features) const80 bool ActionsFeatureProcessor::AppendFeatures(
81     const std::vector<int>& sparse_features,
82     const std::vector<float>& dense_features,
83     const EmbeddingExecutor* embedding_executor,
84     std::vector<float>* output_features) const {
85   // Embed the sparse features, appending them directly to the output.
86   const int embedding_size = options_->embedding_size();
87   output_features->resize(output_features->size() + embedding_size);
88   float* output_features_end =
89       output_features->data() + output_features->size();
90   if (!embedding_executor->AddEmbedding(
91           TensorView<int>(sparse_features.data(),
92                           {static_cast<int>(sparse_features.size())}),
93           /*dest=*/output_features_end - embedding_size,
94           /*dest_size=*/embedding_size)) {
95     TC3_LOG(ERROR) << "Could not embed token's sparse features.";
96     return false;
97   }
98 
99   // Append the dense features to the output.
100   output_features->insert(output_features->end(), dense_features.begin(),
101                           dense_features.end());
102   return true;
103 }
104 
AppendTokenFeatures(const Token & token,const EmbeddingExecutor * embedding_executor,std::vector<float> * output_features) const105 bool ActionsFeatureProcessor::AppendTokenFeatures(
106     const Token& token, const EmbeddingExecutor* embedding_executor,
107     std::vector<float>* output_features) const {
108   // Extract the sparse and dense features.
109   std::vector<int> sparse_features;
110   std::vector<float> dense_features;
111   if (!token_feature_extractor_.Extract(token, /*(unused) is_in_span=*/false,
112                                         &sparse_features, &dense_features)) {
113     TC3_LOG(ERROR) << "Could not extract token's features.";
114     return false;
115   }
116   return AppendFeatures(sparse_features, dense_features, embedding_executor,
117                         output_features);
118 }
119 
AppendTokenFeatures(const std::vector<Token> & tokens,const EmbeddingExecutor * embedding_executor,std::vector<float> * output_features) const120 bool ActionsFeatureProcessor::AppendTokenFeatures(
121     const std::vector<Token>& tokens,
122     const EmbeddingExecutor* embedding_executor,
123     std::vector<float>* output_features) const {
124   for (const Token& token : tokens) {
125     if (!AppendTokenFeatures(token, embedding_executor, output_features)) {
126       return false;
127     }
128   }
129   return true;
130 }
131 
132 }  // namespace libtextclassifier3
133