// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_
#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_

#include <limits>

#include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/lib/random/distribution_sampler.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {
namespace tensorforest {

// We hide Eigen's hideous types behind a function that returns the (i, j)-th
// entry of a two dimensional tensor; this is that function's type.
using GetFeatureFnType = std::function<float(int32, int32)>;

// TODO(gilberth): Put these in protos so they can be shared by C++ and python.
// Indexes in the tree representation's 2nd dimension for children and features.
const int32 CHILDREN_INDEX = 0;
const int32 FEATURE_INDEX = 1;

// Used in the tree's children sub-tensor to indicate leaf and free nodes.
const int32 LEAF_NODE = -1;
const int32 FREE_NODE = -2;

// Used to indicate column types, e.g. categorical vs. float
enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 };

// Calculates the sum of a tensor.
template <typename T>
T Sum(Tensor counts) {
  Eigen::Tensor<T, 0, Eigen::RowMajor> count_sum =
      counts.unaligned_flat<T>().sum();
  return count_sum(0);
}
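// For example, for a one-dimensional int32 tensor t holding {2, 3, 5},
// Sum<int32>(t) returns 10.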

// Gets the two best scores and their indices among the max scores
// score_fn(0) ... score_fn(max - 1).
void GetTwoBest(int max, const std::function<float(int)>& score_fn,
                float* best_score, int* best_index, float* second_best_score,
                int* second_best_index);
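// For example (assuming, as with the Gini-based split scores below, that
// "best" means lowest): with max = 3 and scores {0.5, 0.2, 0.9}, the best
// is 0.2 at index 1 and the second best is 0.5 at index 0.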

// If the Gini Impurity is g, this returns n^2 (g - 1).  This is an int
// rather than a float, and so can be more easily checked for ties.
int BootstrapGini(int n, int s, const random::DistributionSampler& ds,
                  random::SimplePhilox* rand);
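// For example, if the n = 4 bootstrap samples fall into class counts {3, 1},
// then g = 1 - (3/4)^2 - (1/4)^2 = 0.375, and the returned value is
// 16 * (0.375 - 1) = -(3^2 + 1^2) = -10.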

// Get the DataColumnTypes number for the given feature.
DataColumnTypes FindDenseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec);
DataColumnTypes FindSparseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec);

// Given an Eigen::Tensor type, calculate the Gini impurity.
template <typename T>
float RawWeightedGiniImpurity(const T& counts) {
  // Our split score is the Gini impurity times the number of examples
  // seen by the leaf.  If c(i) denotes the i-th class count and c = sum_i c(i)
  // then
  // score = c * (1 - sum_i ( c(i) / c )^2 )
  //       = c - sum_i c(i)^2 / c
  const auto sum = counts.sum();
  const auto sum2 = counts.square().sum();
  Eigen::Tensor<float, 0, Eigen::RowMajor> ret = sum - (sum2 / sum);
  return ret(0);
}
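// Worked example: counts = {3, 1} gives c = 4 and sum_i c(i)^2 = 10, so
// score = 4 - 10 / 4 = 1.5.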

// Given an Eigen::Tensor type, calculate the smoothed Gini impurity, which we
// use to determine the best split (lowest) and which nodes to allocate first
// (highest).
template <typename T>
float WeightedGiniImpurity(const T& counts) {
  const auto smoothed = counts + counts.constant(1.0f);
  return RawWeightedGiniImpurity(smoothed);
}
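// Worked example: counts = {3, 1} is smoothed to {4, 2}, so
// score = 6 - (16 + 4) / 6 = 8 / 3.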

// Given per-output sums and sums of squares over `count` examples, returns
// count * sum_i (E[x_i^2] - E[x_i]^2), i.e. the total variance across
// outputs weighted by the number of examples.
template <typename T1, typename T2>
float WeightedVariance(const T1& sums, const T2& squares, float count) {
  const auto e_x = sums / count;
  const auto e_x2 = squares / count;
  Eigen::Tensor<float, 0, Eigen::RowMajor> ret = (e_x2 - e_x.square()).sum();
  return count * ret(0);
}
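// Worked example: for a single output with observations {1, 2, 3},
// sums = {6}, squares = {14}, count = 3.  Then E[x] = 2 and E[x^2] = 14/3,
// so the result is 3 * (14/3 - 4) = 2.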

// Returns the best split to use based on the (lowest) Gini impurity.
// Takes in the whole total and per-split count tensors because using
// Tensor::Slice returns a tensor of the same dimensionality, which makes
// things a little awkward.
int32 BestFeatureClassification(const Tensor& total_counts,
                                const Tensor& split_counts, int32 accumulator);

// Returns the best split to use based on the (lowest) variance.
int32 BestFeatureRegression(const Tensor& total_sums,
                            const Tensor& total_squares,
                            const Tensor& split_sums,
                            const Tensor& split_squares, int32 accumulator);

// Returns true if the best split's variance is sufficiently smaller than
// that of the next best split.
bool BestSplitDominatesRegression(const Tensor& total_sums,
                                  const Tensor& total_squares,
                                  const Tensor& split_sums,
                                  const Tensor& split_squares,
                                  int32 accumulator);

// Performs bootstrap_samples bootstrap samples of the best split's class
// counts and the second best split's class counts, and returns true if at
// least dominate_fraction of the time, the former has a better (lower)
// Gini impurity.  Does not take over ownership of *rand.
bool BestSplitDominatesClassificationBootstrap(
    const Tensor& total_counts, const Tensor& split_counts, int32 accumulator,
    float dominate_fraction, tensorflow::random::SimplePhilox* rand);

// Returns true if the best split's Gini impurity is sufficiently smaller than
// that of the next best split, as measured by the Hoeffding Tree bound.
bool BestSplitDominatesClassificationHoeffding(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction);

// Returns true if the best split's Gini impurity is sufficiently smaller than
// that of the next best split, as measured by a Chebyshev bound.
bool BestSplitDominatesClassificationChebyshev(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction);

// Initializes everything in the given tensor to the given value.
template <typename T>
void Initialize(Tensor counts, T val = 0) {
  auto flat = counts.unaligned_flat<T>();
  std::fill(flat.data(), flat.data() + flat.size(), val);
}
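// For example, Initialize<float>(counts, -1.0f) sets every element of counts
// to -1.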

// Returns a function that accesses the (i,j)-th element of the specified two
// dimensional tensor.
GetFeatureFnType GetDenseFunctor(const Tensor& dense);

// Returns a function that looks for the j-th feature of the i-th example
// in the sparse data, or the default value if not found.  See FindSparseValue.
GetFeatureFnType GetSparseFunctor(const Tensor& sparse_indices,
                                  const Tensor& sparse_values);

// Returns true if the point falls to the right (i.e., the selected feature
// of the input point is greater than the bias threshold), and false if it
// falls to the left.
// Even though our input data is forced into float Tensors, it could have
// originally been something else (e.g. categorical string data) which
// we treat differently.
bool DecideNode(const GetFeatureFnType& get_dense,
                const GetFeatureFnType& get_sparse, int32 i, int32 feature,
                float bias, const tensorforest::TensorForestDataSpec& spec);
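// Illustrative usage combining the functors above (dense_data,
// sparse_indices, sparse_values, and spec are hypothetical caller-supplied
// inputs):
//   const auto get_dense = GetDenseFunctor(dense_data);
//   const auto get_sparse = GetSparseFunctor(sparse_indices, sparse_values);
//   const bool right = DecideNode(get_dense, get_sparse, i, feature, bias,
//                                 spec);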

// If T is a sparse float matrix represented by sparse_input_indices and
// sparse_input_values, FindSparseValue returns T(i,j), or 0.0 if (i,j)
// isn't present in sparse_input_indices.  sparse_input_indices is assumed
// to be sorted.
template <typename T1, typename T2>
float FindSparseValue(const T1& sparse_input_indices,
                      const T2& sparse_input_values, int32 i, int32 j) {
  int32 low = 0;
  int32 high = sparse_input_values.dimension(0);
  while (low < high) {
    int32 mid = (low + high) / 2;
    int64 midi = internal::SubtleMustCopy(sparse_input_indices(mid, 0));
    int64 midj = internal::SubtleMustCopy(sparse_input_indices(mid, 1));
    if (midi == i) {
      if (midj == j) {
        return sparse_input_values(mid);
      }
      if (midj < j) {
        low = mid + 1;
      } else {
        high = mid;
      }
      continue;
    }
    if (midi < i) {
      low = mid + 1;
    } else {
      high = mid;
    }
  }
  return 0.0;
}
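// For example, with sparse_input_indices = {{0, 1}, {0, 3}, {2, 2}} and
// sparse_input_values = {5.0, 7.0, 9.0}, FindSparseValue(indices, values,
// 0, 3) returns 7.0, while FindSparseValue(indices, values, 1, 0) returns
// 0.0 because (1, 0) is not present.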

// Returns the number of sparse values for example input_index.
// If any are found, also returns in *sparse_input_start the index at which
// those features start.
// Assumes that the first column in indices is ordered.
template <typename T1>
int32 GetNumSparseFeatures(const T1& indices, int32 input_index,
                           int64* sparse_input_start) {
  // Binary search for input_index.
  // TODO(gilberth): Consider using std::lower_bound, std::upper_bound
  // for a simpler but possibly slower solution, or searching for
  // input_start and input_end simultaneously.
  const int64 num_total = indices.dimension(0);
  int64 index;
  int64 low = 0;
  int64 high = num_total;
  *sparse_input_start = -1;  // Easy error checking.

  while (true) {
    if (low == high) {
      return 0;
    }
    index = low + (high - low) / 2;
    const int64 feature_index = indices(index, 0);
    if (feature_index == input_index) {
      // Found it.
      break;
    } else if (feature_index < input_index) {
      // Correct for the implicit floor in the index assignment.
      if (low == index) {
        return 0;
      }
      low = index;
    } else {
      high = index;
    }
  }

  // Scan for the start and end of the input_index range.
  int64 input_start = index;
  int64 val = indices(input_start, 0);
  while (val == input_index) {
    --input_start;
    if (input_start < 0) {
      break;
    }
    val = indices(input_start, 0);
  }
  *sparse_input_start = input_start + 1;
  int32 input_end = index;
  val = indices(input_end, 0);
  while (val == input_index) {
    ++input_end;
    if (input_end >= num_total) {
      break;
    }
    val = indices(input_end, 0);
  }
  return input_end - input_start - 1;
}
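// For example, if the first column of indices is {0, 0, 2, 2, 2, 5}, then
// GetNumSparseFeatures(indices, 2, &start) returns 3 and sets start to 2,
// while GetNumSparseFeatures(indices, 1, &start) returns 0 and leaves
// start at -1.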

// Returns left/right decision between the input value and the threshold bias.
// For floating point types, the decision is value > bias, but for
// categorical data, it is value != bias.
bool Decide(float value, float bias, DataColumnTypes type = kDataFloat);
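// For example, Decide(3.2f, 1.5f) returns true (3.2 > 1.5), while
// Decide(7.0f, 7.0f, kDataCategorical) returns false (the values match).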

// Returns true if all the splits are initialized. Since they get initialized
// in order, we can simply infer this from the last split.
// This should only be called for a single allocator's candidate features
// (i.e. candidate_split_features.Slice(accumulator, accumulator + 1) ).
template <typename EigenType>
bool IsAllInitialized(const EigenType& features, int32 accumulator,
                      int32 num_splits) {
  return features(accumulator, num_splits - 1) >= 0;
}
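// For example, with num_splits = 3 the check reduces to
// features(accumulator, 2) >= 0; a negative value in that last slot means
// not all splits have been allocated yet.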

// Tensorforest currently requires every tensor dimension to be less than
// 2^31 - 1 (the int32 limit).  Returns false (and fails the op kernel
// context) if any dimension is too large, true otherwise.
inline bool CheckTensorBounds(OpKernelContext* context, const Tensor& tensor) {
  for (int i = 0; i < tensor.dims(); ++i) {
    if (!TF_PREDICT_TRUE(tensor.shape().dim_size(i) <
                         std::numeric_limits<int32>::max())) {
      context->CtxFailure(errors::InvalidArgument(
          strings::StrCat("Tensor has a dimension that is greater than 2^31: ",
                          tensor.DebugString())));
      return false;
    }
  }
  return true;
}
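// Typical usage at the top of a kernel's Compute method (sketch; `input` is
// a hypothetical input tensor):
//   if (!CheckTensorBounds(context, input)) return;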

// Computes the per-output mean of the values accumulated at a leaf
// (leaf_data, normalized by leaf_sum), blending in the parent's mean
// (parent_data / parent_sum) when leaf_sum is below valid_leaf_threshold;
// the exact blending weight is defined in the implementation.  Writes
// num_outputs values into *mean.
void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
                           float parent_sum, const float* parent_data,
                           float valid_leaf_threshold, int num_outputs,
                           std::vector<float>* mean);

}  // namespace tensorforest
}  // namespace tensorflow

#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_TREE_UTILS_H_