// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_
#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_

#include <algorithm>
#include <functional>
#include <limits>
#include <vector>

#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/lib/random/distribution_sampler.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {
namespace tensorforest {

// We hide Eigen's hideous types behind a function that returns the (i, j)-th
// entry of a two-dimensional tensor; this is that function's type.
using GetFeatureFnType = std::function<float(int32, int32)>;

// TODO(gilberth): Put these in protos so they can be shared by C++ and python.
// Indexes in the tree representation's 2nd dimension for children and features.
const int32 CHILDREN_INDEX = 0;
const int32 FEATURE_INDEX = 1;

// Used in the tree's children sub-tensor to indicate leaf and free nodes.
const int32 LEAF_NODE = -1;
const int32 FREE_NODE = -2;

// Used to indicate column types, e.g. categorical vs. float.
enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 };

// Calculates the sum of all entries of a tensor.
template <typename T>
T Sum(Tensor counts) {
  Eigen::Tensor<T, 0, Eigen::RowMajor> count_sum =
      counts.unaligned_flat<T>().sum();
  return count_sum(0);
}
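
// A minimal usage sketch for Sum, with hypothetical values:
//   Tensor counts(DT_FLOAT, TensorShape({3}));
//   counts.flat<float>().setValues({1.0f, 2.0f, 3.0f});
//   float total = Sum<float>(counts);  // total == 6.0f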

// Gets the two best scores and their indices among the first |max| splits,
// as scored by |score_fn|.
void GetTwoBest(int max, const std::function<float(int)>& score_fn,
                float* best_score, int* best_index, float* second_best_score,
                int* second_best_index);
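
// Example sketch for GetTwoBest (gini_of_split is a hypothetical scoring
// callback; "best" here means lowest, matching the impurity-based scores
// used throughout this file):
//   float best_score, second_best_score;
//   int best_index, second_best_index;
//   GetTwoBest(num_splits, [&](int i) { return gini_of_split(i); },
//              &best_score, &best_index,
//              &second_best_score, &second_best_index);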

// If the Gini Impurity is g, this returns n^2 (g - 1). This is an int
// rather than a float, and so can be more easily checked for ties.
int BootstrapGini(int n, int s, const random::DistributionSampler& ds,
                  random::SimplePhilox* rand);
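
// Worked arithmetic for the return value of BootstrapGini: with n = 10
// samples and an impurity of g = 0.5, n^2 (g - 1) = 100 * (-0.5) = -50.
// For a fixed n, a lower (better) impurity yields a smaller integer, so
// ordering is preserved while integer equality can be used to detect ties.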

// Gets the DataColumnTypes value for the given feature.
DataColumnTypes FindDenseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec);
DataColumnTypes FindSparseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec);

// Given an Eigen::Tensor type, calculate the Gini impurity.
template <typename T>
float RawWeightedGiniImpurity(const T& counts) {
  // Our split score is the Gini impurity times the number of examples
  // seen by the leaf. If c(i) denotes the i-th class count and c = sum_i c(i)
  // then
  //   score = c * (1 - sum_i ( c(i) / c )^2 )
  //         = c - sum_i c(i)^2 / c
  const auto sum = counts.sum();
  const auto sum2 = counts.square().sum();
  Eigen::Tensor<float, 0, Eigen::RowMajor> ret = sum - (sum2 / sum);
  return ret(0);
}
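
// Worked example for RawWeightedGiniImpurity with hypothetical counts
// [2, 2]: c = 4 and sum_i c(i)^2 = 8, so score = 4 - 8 / 4 = 2.
// Equivalently, the Gini impurity is 1 - (0.5^2 + 0.5^2) = 0.5,
// weighted by c = 4.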

// Given an Eigen::Tensor type, calculate the smoothed Gini impurity, which we
// use to determine the best split (lowest) and which nodes to allocate first
// (highest).
template <typename T>
float WeightedGiniImpurity(const T& counts) {
  const auto smoothed = counts + counts.constant(1.0f);
  return RawWeightedGiniImpurity(smoothed);
}
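
// Continuing the example above, WeightedGiniImpurity smooths counts [2, 2]
// to [3, 3], giving a score of 6 - 18 / 6 = 3. The +1 smoothing also avoids
// dividing by a zero sum when all counts are zero.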

// Returns count times the total variance over outputs, i.e.
// count * sum_j (E[x_j^2] - E[x_j]^2), with the expectations estimated
// from the per-output sums and sums of squares.
template <typename T1, typename T2>
float WeightedVariance(const T1& sums, const T2& squares, float count) {
  const auto e_x = sums / count;
  const auto e_x2 = squares / count;
  Eigen::Tensor<float, 0, Eigen::RowMajor> ret = (e_x2 - e_x.square()).sum();
  return count * ret(0);
}
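
// Worked example for WeightedVariance with a single output over the
// hypothetical values {1, 2, 3}: sums = [6], squares = [14], count = 3.
// E[x] = 2 and E[x^2] = 14/3, so the variance is 14/3 - 4 = 2/3, and the
// weighted result is 3 * 2/3 = 2.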

// Returns the best split to use based on the (lowest) Gini impurity.
// Takes in the whole total and per-split count tensors because using
// Tensor::Slice returns a tensor of the same dimensionality, which makes
// things a little awkward.
int32 BestFeatureClassification(const Tensor& total_counts,
                                const Tensor& split_counts, int32 accumulator);

// Returns the best split to use based on the (lowest) variance.
int32 BestFeatureRegression(const Tensor& total_sums,
                            const Tensor& total_squares,
                            const Tensor& split_sums,
                            const Tensor& split_squares, int32 accumulator);

// Returns true if the best split's variance is sufficiently smaller than
// that of the next best split.
bool BestSplitDominatesRegression(const Tensor& total_sums,
                                  const Tensor& total_squares,
                                  const Tensor& split_sums,
                                  const Tensor& split_squares,
                                  int32 accumulator);

// Performs bootstrap_samples bootstrap samples of the best split's class
// counts and the second-best split's class counts, and returns true if at
// least dominate_fraction of the time, the former has a better (lower)
// Gini impurity. Does not take ownership of *rand.
bool BestSplitDominatesClassificationBootstrap(
    const Tensor& total_counts, const Tensor& split_counts, int32 accumulator,
    float dominate_fraction, tensorflow::random::SimplePhilox* rand);

// Returns true if the best split's Gini impurity is sufficiently smaller than
// that of the next best split, as measured by the Hoeffding Tree bound.
bool BestSplitDominatesClassificationHoeffding(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction);

// Returns true if the best split's Gini impurity is sufficiently smaller than
// that of the next best split, as measured by a Chebyshev bound.
bool BestSplitDominatesClassificationChebyshev(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction);

// Initializes everything in the given tensor to the given value.
template <typename T>
void Initialize(Tensor counts, T val = 0) {
  auto flat = counts.unaligned_flat<T>();
  std::fill(flat.data(), flat.data() + flat.size(), val);
}
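
// Usage sketch for Initialize, with a hypothetical shape:
//   Tensor counts(DT_FLOAT, TensorShape({2, 3}));
//   Initialize<float>(counts, 0.0f);  // all six entries are now 0.0f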

// Returns a function that accesses the (i, j)-th element of the specified
// two-dimensional tensor.
GetFeatureFnType GetDenseFunctor(const Tensor& dense);

// Returns a function that looks for the j-th feature of the i-th example
// in the sparse data, or the default value if not found. See FindSparseValue.
GetFeatureFnType GetSparseFunctor(const Tensor& sparse_indices,
                                  const Tensor& sparse_values);

// Returns true if the point falls to the right (i.e., the selected feature
// of the input point is greater than the bias threshold), and false if it
// falls to the left.
// Even though our input data is forced into float Tensors, it could have
// originally been something else (e.g. categorical string data) which
// we treat differently.
bool DecideNode(const GetFeatureFnType& get_dense,
                const GetFeatureFnType& get_sparse, int32 i, int32 feature,
                float bias, const tensorforest::TensorForestDataSpec& spec);

// If T is a sparse float matrix represented by sparse_input_indices and
// sparse_input_values, FindSparseValue returns T(i,j), or 0.0 if (i,j)
// isn't present in sparse_input_indices. sparse_input_indices is assumed
// to be sorted.
template <typename T1, typename T2>
float FindSparseValue(const T1& sparse_input_indices,
                      const T2& sparse_input_values, int32 i, int32 j) {
  int32 low = 0;
  int32 high = sparse_input_values.dimension(0);
  while (low < high) {
    int32 mid = (low + high) / 2;
    int64 midi = internal::SubtleMustCopy(sparse_input_indices(mid, 0));
    int64 midj = internal::SubtleMustCopy(sparse_input_indices(mid, 1));
    if (midi == i) {
      if (midj == j) {
        return sparse_input_values(mid);
      }
      if (midj < j) {
        low = mid + 1;
      } else {
        high = mid;
      }
      continue;
    }
    if (midi < i) {
      low = mid + 1;
    } else {
      high = mid;
    }
  }
  return 0.0;
}
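
// Example for FindSparseValue with a hypothetical 3-entry sparse matrix,
// sorted in row-major order:
//   sparse_input_indices = [[0, 1], [0, 3], [2, 2]]
//   sparse_input_values  = [5.0, 7.0, 9.0]
//   FindSparseValue(indices, values, 0, 3) == 7.0
//   FindSparseValue(indices, values, 1, 0) == 0.0  // (1, 0) not present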

// Returns the number of sparse values for example input_index.
// Also returns the index where those features start in *sparse_input_start
// if any were found.
// Assumes that the first column in indices is ordered.
template <typename T1>
int32 GetNumSparseFeatures(const T1& indices, int32 input_index,
                           int64* sparse_input_start) {
  // Binary search for input_index.
  // TODO(gilberth): Consider using std::lower_bound, std::upper_bound
  // for a simpler but possibly slower solution, or searching for
  // input_start and input_end simultaneously.
  const int64 num_total = indices.dimension(0);
  int64 index;
  int64 low = 0;
  int64 high = num_total;
  *sparse_input_start = -1;  // Easy error checking.

  while (true) {
    if (low == high) {
      return 0;
    }
    index = low + (high - low) / 2;
    const int64 feature_index = indices(index, 0);
    if (feature_index == input_index) {
      // Found it.
      break;
    } else if (feature_index < input_index) {
      // Correct for the implicit floor in the index assignment.
      if (low == index) {
        return 0;
      }
      low = index;
    } else {
      high = index;
    }
  }

  // Scan for the start and end of the input_index range.
  int64 input_start = index;
  int64 val = indices(input_start, 0);
  while (val == input_index) {
    --input_start;
    if (input_start < 0) {
      break;
    }
    val = indices(input_start, 0);
  }
  *sparse_input_start = input_start + 1;
  int32 input_end = index;
  val = indices(input_end, 0);
  while (val == input_index) {
    ++input_end;
    if (input_end >= num_total) {
      break;
    }
    val = indices(input_end, 0);
  }
  return input_end - input_start - 1;
}
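
// Example for GetNumSparseFeatures with the same hypothetical indices
// [[0, 1], [0, 3], [2, 2]]:
//   int64 start;
//   GetNumSparseFeatures(indices, 0, &start);  // returns 2, start == 0
//   GetNumSparseFeatures(indices, 1, &start);  // returns 0, start == -1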

// Returns left/right decision between the input value and the threshold bias.
// For floating point types, the decision is value > bias, but for
// categorical data, it is value != bias.
bool Decide(float value, float bias, DataColumnTypes type = kDataFloat);
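
// Illustrative calls to Decide:
//   Decide(3.5f, 2.0f)                    // true: 3.5 > 2.0
//   Decide(2.0f, 2.0f)                    // false: not strictly greater
//   Decide(4.0f, 4.0f, kDataCategorical)  // false: categories are equal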

// Returns true if all the splits are initialized. Since they get initialized
// in order, we can simply infer this from the last split.
// This should only be called for a single accumulator's candidate features
// (i.e. candidate_split_features.Slice(accumulator, accumulator + 1)).
template <typename EigenType>
bool IsAllInitialized(const EigenType& features, int32 accumulator,
                      int32 num_splits) {
  return features(accumulator, num_splits - 1) >= 0;
}

// Tensorforest currently only allows tensors up to 2^31 elements. Returns
// false if any dimension is greater than that, true otherwise.
inline bool CheckTensorBounds(OpKernelContext* context, const Tensor& tensor) {
  for (int i = 0; i < tensor.dims(); ++i) {
    if (!TF_PREDICT_TRUE(tensor.shape().dim_size(i) <
                         std::numeric_limits<int32>::max())) {
      context->CtxFailure(errors::InvalidArgument(
          strings::StrCat("Tensor has a dimension that is greater than 2^31: ",
                          tensor.DebugString())));
      return false;
    }
  }
  return true;
}
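
// Typical use of CheckTensorBounds at the top of an op kernel's Compute
// method (a sketch; input_tensor is a hypothetical input):
//   if (!CheckTensorBounds(context, input_tensor)) return;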

// Fills *mean with the per-output mean of the leaf's values, blending in the
// parent's values when leaf_sum falls below valid_leaf_threshold.
void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
                           float parent_sum, const float* parent_data,
                           float valid_leaf_threshold, int num_outputs,
                           std::vector<float>* mean);

}  // namespace tensorforest
}  // namespace tensorflow

#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_