1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 // Common feature types for parser components.
18 
19 #ifndef LIBTEXTCLASSIFIER_COMMON_FEATURE_TYPES_H_
20 #define LIBTEXTCLASSIFIER_COMMON_FEATURE_TYPES_H_
21 
22 #include <algorithm>
23 #include <map>
24 #include <string>
25 #include <utility>
26 
27 #include "util/base/integral_types.h"
28 #include "util/base/logging.h"
29 #include "util/strings/numbers.h"
30 
31 namespace libtextclassifier {
32 namespace nlp_core {
33 
34 // TODO(djweiss) Clean this up as well.
35 // Use the same type for feature values as is used for predicated.
36 typedef int64 Predicate;
37 typedef Predicate FeatureValue;
38 
39 // Each feature value in a feature vector has a feature type. The feature type
40 // is used for converting feature type and value pairs to predicate values. The
41 // feature type can also return names for feature values and calculate the size
42 // of the feature value domain. The FeatureType class is abstract and must be
43 // specialized for the concrete feature types.
44 class FeatureType {
45  public:
46   // Initializes a feature type.
FeatureType(const std::string & name)47   explicit FeatureType(const std::string &name)
48       : name_(name), base_(0),
49         is_continuous_(name.find("continuous") != std::string::npos) {
50   }
51 
~FeatureType()52   virtual ~FeatureType() {}
53 
54   // Converts a feature value to a name.
55   virtual std::string GetFeatureValueName(FeatureValue value) const = 0;
56 
57   // Returns the size of the feature values domain.
58   virtual int64 GetDomainSize() const = 0;
59 
60   // Returns the feature type name.
name()61   const std::string &name() const { return name_; }
62 
base()63   Predicate base() const { return base_; }
set_base(Predicate base)64   void set_base(Predicate base) { base_ = base; }
65 
66   // Returns true iff this feature is continuous; see FloatFeatureValue.
is_continuous()67   bool is_continuous() const { return is_continuous_; }
68 
69  private:
70   // Feature type name.
71   std::string name_;
72 
73   // "Base" feature value: i.e. a "slot" in a global ordering of features.
74   Predicate base_;
75 
76   // See doc for is_continuous().
77   bool is_continuous_;
78 };
79 
80 // Feature type that is defined using an explicit map from FeatureValue to
81 // std::string values.  This can reduce some of the boilerplate when defining
82 // features that generate enum values.  Example usage:
83 //
84 //   class BeverageSizeFeature : public FeatureFunction<Beverage>
85 //     enum FeatureValue { SMALL, MEDIUM, LARGE };  // values for this feature
86 //     void Init(TaskContext *context) override {
87 //       set_feature_type(new EnumFeatureType("beverage_size",
88 //           {{SMALL, "SMALL"}, {MEDIUM, "MEDIUM"}, {LARGE, "LARGE"}});
89 //     }
90 //     [...]
91 //   };
92 class EnumFeatureType : public FeatureType {
93  public:
EnumFeatureType(const std::string & name,const std::map<FeatureValue,std::string> & value_names)94   EnumFeatureType(const std::string &name,
95                   const std::map<FeatureValue, std::string> &value_names)
96       : FeatureType(name), value_names_(value_names) {
97     for (const auto &pair : value_names) {
98       TC_DCHECK_GE(pair.first, 0)
99           << "Invalid feature value: " << pair.first << ", " << pair.second;
100       domain_size_ = std::max(domain_size_, pair.first + 1);
101     }
102   }
103 
104   // Returns the feature name for a given feature value.
GetFeatureValueName(FeatureValue value)105   std::string GetFeatureValueName(FeatureValue value) const override {
106     auto it = value_names_.find(value);
107     if (it == value_names_.end()) {
108       TC_LOG(ERROR) << "Invalid feature value " << value << " for " << name();
109       return "<INVALID>";
110     }
111     return it->second;
112   }
113 
114   // Returns the number of possible values for this feature type. This is one
115   // greater than the largest value in the value_names map.
GetDomainSize()116   FeatureValue GetDomainSize() const override { return domain_size_; }
117 
118  protected:
119   // Maximum possible value this feature could take.
120   FeatureValue domain_size_ = 0;
121 
122   // Names of feature values.
123   std::map<FeatureValue, std::string> value_names_;
124 };
125 
126 // Feature type for binary features.
127 class BinaryFeatureType : public FeatureType {
128  public:
BinaryFeatureType(const std::string & name,const std::string & off,const std::string & on)129   BinaryFeatureType(const std::string &name, const std::string &off,
130                     const std::string &on)
131       : FeatureType(name), off_(off), on_(on) {}
132 
133   // Returns the feature name for a given feature value.
GetFeatureValueName(FeatureValue value)134   std::string GetFeatureValueName(FeatureValue value) const override {
135     if (value == 0) return off_;
136     if (value == 1) return on_;
137     return "";
138   }
139 
140   // Binary features always have two feature values.
GetDomainSize()141   FeatureValue GetDomainSize() const override { return 2; }
142 
143  private:
144   // Feature value names for on and off.
145   std::string off_;
146   std::string on_;
147 };
148 
149 // Feature type for numeric features.
150 class NumericFeatureType : public FeatureType {
151  public:
152   // Initializes numeric feature.
NumericFeatureType(const std::string & name,FeatureValue size)153   NumericFeatureType(const std::string &name, FeatureValue size)
154       : FeatureType(name), size_(size) {}
155 
156   // Returns numeric feature value.
GetFeatureValueName(FeatureValue value)157   std::string GetFeatureValueName(FeatureValue value) const override {
158     if (value < 0) return "";
159     return IntToString(value);
160   }
161 
162   // Returns the number of feature values.
GetDomainSize()163   FeatureValue GetDomainSize() const override { return size_; }
164 
165  private:
166   // The underlying size of the numeric feature.
167   FeatureValue size_;
168 };
169 
170 // Feature type for byte features, including an "outside" value.
171 class ByteFeatureType : public NumericFeatureType {
172  public:
ByteFeatureType(const std::string & name)173   explicit ByteFeatureType(const std::string &name)
174       : NumericFeatureType(name, 257) {}
175 
GetFeatureValueName(FeatureValue value)176   std::string GetFeatureValueName(FeatureValue value) const override {
177     if (value == 256) {
178       return "<NULL>";
179     }
180     std::string result;
181     result += static_cast<char>(value);
182     return result;
183   }
184 };
185 
186 }  // namespace nlp_core
187 }  // namespace libtextclassifier
188 
189 #endif  // LIBTEXTCLASSIFIER_COMMON_FEATURE_TYPES_H_
190