// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <vector>

#include "tensorflow/core/lib/random/philox_random.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {
namespace tensorforest {

using tensorflow::Tensor;

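// Returns the column type of the given dense input feature, as recorded in
// the data spec.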
DataColumnTypes FindDenseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec) {
  return static_cast<DataColumnTypes>(spec.GetDenseFeatureType(input_feature));
}

DataColumnTypes FindSparseFeatureSpec(
    int32 input_feature, const tensorforest::TensorForestDataSpec& spec) {
  // TODO(thomaswc): Binary search here, especially when we start using more
  // than one sparse column.
  int32 size_sum = spec.sparse(0).size();
  int32 column_num = 0;
  // Stop at the last sparse column so an out-of-range input_feature can't
  // index past the end of the spec.
  while (input_feature >= size_sum && column_num + 1 < spec.sparse_size()) {
    ++column_num;
    size_sum += spec.sparse(column_num).size();
  }

  return static_cast<DataColumnTypes>(spec.sparse(column_num).original_type());
}

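// Evaluates score_fn on indices [0, max) and returns the lowest and
// second-lowest scores along with their indices. Outputs are initialized to
// FLT_MAX / -1, so they keep those sentinel values when there are fewer than
// two candidates.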
void GetTwoBest(int max, const std::function<float(int)>& score_fn,
                float* best_score, int* best_index, float* second_best_score,
                int* second_best_index) {
  *best_index = -1;
  *second_best_index = -1;
  *best_score = FLT_MAX;
  *second_best_score = FLT_MAX;
  for (int i = 0; i < max; i++) {
    float score = score_fn(i);
    if (score < *best_score) {
      *second_best_score = *best_score;
      *second_best_index = *best_index;
      *best_score = score;
      *best_index = i;
    } else if (score < *second_best_score) {
      *second_best_score = score;
      *second_best_index = i;
    }
  }
}

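// Returns the sum of the weighted Gini impurities of the left (split) and
// right (total - split) class counts for candidate split i.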
float ClassificationSplitScore(
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& rights, int32 num_classes,
    int i) {
  Eigen::array<Eigen::Index, 1> offsets;
  // Class counts are stored with the total in [0], so the length of each
  // count vector is num_classes + 1.
  offsets[0] = i * (num_classes + 1) + 1;
  Eigen::array<Eigen::Index, 1> extents;
  extents[0] = num_classes;
  return WeightedGiniImpurity(splits.slice(offsets, extents)) +
         WeightedGiniImpurity(rights.slice(offsets, extents));
}

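// Finds the two candidate splits with the lowest classification scores for
// the given accumulator, scoring each split with ClassificationSplitScore.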
void GetTwoBestClassification(const Tensor& total_counts,
                              const Tensor& split_counts, int32 accumulator,
                              float* best_score, int* best_index,
                              float* second_best_score,
                              int* second_best_index) {
  const int32 num_splits = static_cast<int32>(split_counts.shape().dim_size(1));
  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  // Ideally, Eigen::Tensor::chip would be best to use here but it results
  // in seg faults, so we have to go with flat views of these tensors. However,
  // it is still pretty efficient because we put off evaluation until the
  // score is actually returned.
  const auto tc =
      total_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();

  // TODO(gilberth): See if we can delay evaluation here by templating the
  // arguments to ClassificationSplitScore.
  const Eigen::Tensor<float, 1, Eigen::RowMajor> splits =
      split_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  Eigen::array<Eigen::Index, 1> bcast;
  bcast[0] = num_splits;
  const Eigen::Tensor<float, 1, Eigen::RowMajor> rights =
      tc.broadcast(bcast) - splits;

  std::function<float(int)> score_fn =
      std::bind(ClassificationSplitScore, splits, rights, num_classes,
                std::placeholders::_1);

  GetTwoBest(num_splits, score_fn, best_score, best_index, second_best_score,
             second_best_index);
}

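// Returns the index of the classification split with the best (lowest) Gini
// score for the given accumulator.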
int32 BestFeatureClassification(const Tensor& total_counts,
                                const Tensor& split_counts, int32 accumulator) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  GetTwoBestClassification(total_counts, split_counts, accumulator, &best_score,
                           &best_feature_index, &second_best_score,
                           &second_best_index);
  return best_feature_index;
}

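// Returns the sum of the left and right weighted variances for candidate
// split i. Sums of outputs and of their squares are stored with the sample
// count in position 0, so the per-output values start at offset 1.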
float RegressionSplitScore(
    const Eigen::Tensor<float, 3, Eigen::RowMajor>& splits_count_accessor,
    const Eigen::Tensor<float, 2, Eigen::RowMajor>& totals_count_accessor,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits_sum,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits_square,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_sums,
    const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_squares,
    int32 accumulator, int32 num_regression_dims, int i) {
  Eigen::array<Eigen::Index, 1> offsets = {i * num_regression_dims + 1};
  Eigen::array<Eigen::Index, 1> extents = {num_regression_dims - 1};
  float left_count = splits_count_accessor(accumulator, i, 0);
  float right_count = totals_count_accessor(accumulator, 0) - left_count;

  float score = 0;

  // Guard against divide-by-zero.
  if (left_count > 0) {
    score +=
        WeightedVariance(splits_sum.slice(offsets, extents),
                         splits_square.slice(offsets, extents), left_count);
  }

  if (right_count > 0) {
    score +=
        WeightedVariance(right_sums.slice(offsets, extents),
                         right_squares.slice(offsets, extents), right_count);
  }
  return score;
}

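// Finds the two candidate splits with the lowest regression scores for the
// given accumulator, scoring each split with RegressionSplitScore.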
void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares,
                          const Tensor& split_sums, const Tensor& split_squares,
                          int32 accumulator, float* best_score, int* best_index,
                          float* second_best_score, int* second_best_index) {
  const int32 num_splits = static_cast<int32>(split_sums.shape().dim_size(1));
  const int32 num_regression_dims =
      static_cast<int32>(split_sums.shape().dim_size(2));
  // Ideally, Eigen::Tensor::chip would be best to use here but it results
  // in seg faults, so we have to go with flat views of these tensors. However,
  // it is still pretty efficient because we put off evaluation until the
  // score is actually returned.
  const auto tc_sum =
      total_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  const auto tc_square =
      total_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  const auto splits_sum =
      split_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  const auto splits_square =
      split_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
  // Eigen is infuriating to work with, usually resulting in all kinds of
  // unhelpful compiler errors when trying something that seems sane. This
  // helps us do a simple thing like access the first element (the counts)
  // of these tensors so we can calculate expected value in Variance().
  const auto splits_count_accessor = split_sums.tensor<float, 3>();
  const auto totals_count_accessor = total_sums.tensor<float, 2>();

  Eigen::array<Eigen::Index, 1> bcast;
  bcast[0] = num_splits;
  const auto right_sums = tc_sum.broadcast(bcast) - splits_sum;
  const auto right_squares = tc_square.broadcast(bcast) - splits_square;

  GetTwoBest(num_splits,
             std::bind(RegressionSplitScore, splits_count_accessor,
                       totals_count_accessor, splits_sum, splits_square,
                       right_sums, right_squares, accumulator,
                       num_regression_dims, std::placeholders::_1),
             best_score, best_index, second_best_score, second_best_index);
}

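// Returns the index of the regression split with the best (lowest) variance
// score for the given accumulator.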
int32 BestFeatureRegression(const Tensor& total_sums,
                            const Tensor& total_squares,
                            const Tensor& split_sums,
                            const Tensor& split_squares, int32 accumulator) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  GetTwoBestRegression(total_sums, total_squares, split_sums, split_squares,
                       accumulator, &best_score, &best_feature_index,
                       &second_best_score, &second_best_index);
  return best_feature_index;
}

bool BestSplitDominatesRegression(const Tensor& total_sums,
                                  const Tensor& total_squares,
                                  const Tensor& split_sums,
                                  const Tensor& split_squares,
                                  int32 accumulator) {
  // TODO(thomaswc): Implement this, probably as part of v3.
  return false;
}

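// Draws n samples from the distribution ds over s outcomes and returns
// -sum(counts^2), from which the Gini impurity of the bootstrap sample can
// be recovered (see the comment in the function body).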
int BootstrapGini(int n, int s, const random::DistributionSampler& ds,
                  random::SimplePhilox* rand) {
  std::vector<int> counts(s, 0);
  for (int i = 0; i < n; i++) {
    int j = ds.Sample(rand);
    counts[j] += 1;
  }
  int g = 0;
  for (int j = 0; j < s; j++) {
    g += counts[j] * counts[j];
  }
  // The true Gini impurity is 1 - g / n^2, so for a fixed n, returning -g
  // preserves the ordering of candidates.
  return -g;
}

// Populate *weights with the smoothed per-class frequencies needed to
// initialize a DistributionSampler. Returns the total number of samples
// seen by this accumulator.
int MakeBootstrapWeights(const Tensor& total_counts, const Tensor& split_counts,
                         int32 accumulator, int index,
                         std::vector<float>* weights) {
  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  auto tc = total_counts.tensor<float, 2>();
  auto lc = split_counts.tensor<float, 3>();

  int n = tc(accumulator, 0);

  float denom = static_cast<float>(n) + static_cast<float>(num_classes);

  weights->resize(num_classes * 2);
  for (int i = 0; i < num_classes; i++) {
    // Use the Laplace smoothed per-class probabilities when generating the
    // bootstrap samples.
    float left_count = lc(accumulator, index, i + 1);
    (*weights)[i] = (left_count + 1.0) / denom;
    float right_count = tc(accumulator, i + 1) - left_count;
    (*weights)[num_classes + i] = (right_count + 1.0) / denom;
  }

  return n;
}

bool BestSplitDominatesClassificationBootstrap(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction,
                                               random::SimplePhilox* rand) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  GetTwoBestClassification(total_counts, split_counts, accumulator, &best_score,
                           &best_feature_index, &second_best_score,
                           &second_best_index);

  std::vector<float> weights1;
  int n1 = MakeBootstrapWeights(total_counts, split_counts, accumulator,
                                best_feature_index, &weights1);
  random::DistributionSampler ds1(weights1);

  std::vector<float> weights2;
  int n2 = MakeBootstrapWeights(total_counts, split_counts, accumulator,
                                second_best_index, &weights2);
  random::DistributionSampler ds2(weights2);

  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  float p = 1.0 - dominate_fraction;
  if (p <= 0 || p > 1.0) {
    LOG(FATAL) << "Invalid dominate fraction " << dominate_fraction;
  }

  // Take roughly log2(1 / (1 - dominate_fraction)) bootstrap samples.
  int bootstrap_samples = 1;
  while (p < 1.0) {
    bootstrap_samples += 1;
    p = p * 2;
  }

  // BootstrapGini returns -g <= 0, so start the running max below any
  // achievable value.
  int worst_g1 = std::numeric_limits<int>::min();
  for (int i = 0; i < bootstrap_samples; i++) {
    int g1 = BootstrapGini(n1, 2 * num_classes, ds1, rand);
    worst_g1 = std::max(worst_g1, g1);
  }

  // Similarly, 99 is above any achievable value, so it acts as +infinity.
  int best_g2 = 99;
  for (int i = 0; i < bootstrap_samples; i++) {
    int g2 = BootstrapGini(n2, 2 * num_classes, ds2, rand);
    best_g2 = std::min(best_g2, g2);
  }

  return worst_g1 < best_g2;
}

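// Returns true if the best split's score improves on the second best by
// more than a Hoeffding bound, i.e. if
//   second_best_score - best_score > range * sqrt(ln(1 / (1 - f)) / (2n)),
// where f is dominate_fraction and n is the number of samples seen.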
bool BestSplitDominatesClassificationHoeffding(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  VLOG(1) << "BSDC for accumulator " << accumulator;
  GetTwoBestClassification(total_counts, split_counts, accumulator, &best_score,
                           &best_feature_index, &second_best_score,
                           &second_best_index);
  VLOG(1) << "Best score = " << best_score;
  VLOG(1) << "2nd best score = " << second_best_score;

  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;
  const float n = total_counts.Slice(accumulator, accumulator + 1)
                      .unaligned_flat<float>()(0);

  // Each term in the Gini impurity can range from 0 to 0.5 * 0.5; weighted
  // by the n samples, each class contributes at most 0.25 * n to the score.
  float range = 0.25 * static_cast<float>(num_classes) * n;

  float hoeffding_bound =
      range * sqrt(log(1.0 / (1.0 - dominate_fraction)) / (2.0 * n));

  VLOG(1) << "num_classes = " << num_classes;
  VLOG(1) << "n = " << n;
  VLOG(1) << "range = " << range;
  VLOG(1) << "hoeffding_bound = " << hoeffding_bound;
  return (second_best_score - best_score) > hoeffding_bound;
}

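// Returns the sum of the traces of the covariance matrices of the add-one
// smoothed Dirichlet posteriors over the left and right class counts of the
// given split.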
double DirichletCovarianceTrace(const Tensor& total_counts,
                                const Tensor& split_counts, int32 accumulator,
                                int index) {
  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  auto tc = total_counts.tensor<float, 2>();
  auto lc = split_counts.tensor<float, 3>();

  double leftc = 0.0;
  double leftc2 = 0.0;
  double rightc = 0.0;
  double rightc2 = 0.0;
  for (int i = 1; i <= num_classes; i++) {
    double l = lc(accumulator, index, i) + 1.0;
    leftc += l;
    leftc2 += l * l;

    double r = tc(accumulator, i) - lc(accumulator, index, i) + 1.0;
    rightc += r;
    rightc2 += r * r;
  }

  double left_trace = (1.0 - leftc2 / (leftc * leftc)) / (leftc + 1.0);
  double right_trace = (1.0 - rightc2 / (rightc * rightc)) / (rightc + 1.0);
  return left_trace + right_trace;
}

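// Populates *mu with smoothed per-class frequencies for the left and right
// children of the given split, in the same layout as MakeBootstrapWeights.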
void getDirichletMean(const Tensor& total_counts, const Tensor& split_counts,
                      int32 accumulator, int index, std::vector<float>* mu) {
  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;

  mu->resize(num_classes * 2);
  auto tc = total_counts.tensor<float, 2>();
  auto lc = split_counts.tensor<float, 3>();

  double total = tc(accumulator, 0);

  for (int i = 0; i < num_classes; i++) {
    double l = lc(accumulator, index, i + 1);
    mu->at(i) = (l + 1.0) / (total + num_classes);

    // Class counts are stored with the total in [0], so the count for class
    // i lives at index i + 1.
    double r = tc(accumulator, i + 1) - l;
    mu->at(i + num_classes) = (r + 1.0) / (total + num_classes);
  }
}

// Given lambda3, returns the squared distance from (mu1, mu2) to the nearest
// point on the surface.
double getDistanceFromLambda3(double lambda3, const std::vector<float>& mu1,
                              const std::vector<float>& mu2) {
  if (fabs(lambda3) == 1.0) {
    return 0.0;
  }

  int n = mu1.size();
  double lambda1 = -2.0 * lambda3 / n;
  double lambda2 = 2.0 * lambda3 / n;
  // From the derivation in getChebyshevEpsilon below,
  // x = (lambda_1 1 + 2 mu1) / (2 - 2 lambda_3)
  // y = (lambda_2 1 + 2 mu2) / (2 + 2 lambda_3)
  double dist = 0.0;
  for (size_t i = 0; i < mu1.size(); i++) {
    double diff = (lambda1 + 2.0 * mu1[i]) / (2.0 - 2.0 * lambda3) - mu1[i];
    dist += diff * diff;
    diff = (lambda2 + 2.0 * mu2[i]) / (2.0 + 2.0 * lambda3) - mu2[i];
    dist += diff * diff;
  }
  return dist;
}

// Returns the squared distance between (mu1, mu2) and (x, y), where (x, y) is
// the nearest point that lies on the surface defined by
// {x dot 1 = 1, y dot 1 = 1, x dot x - y dot y = 0}.
double getChebyshevEpsilon(const std::vector<float>& mu1,
                           const std::vector<float>& mu2) {
  // Math time!!
  // We are trying to minimize d = |mu1 - x|^2 + |mu2 - y|^2 over the surface.
  // Using Lagrange multipliers, we get
  // partial d / partial x = -2 mu1 + 2 x = lambda_1 1 + 2 lambda_3 x
  // partial d / partial y = -2 mu2 + 2 y = lambda_2 1 - 2 lambda_3 y
  // or
  // x = (lambda_1 1 + 2 mu1) / (2 - 2 lambda_3)
  // y = (lambda_2 1 + 2 mu2) / (2 + 2 lambda_3)
  // which implies
  // 2 - 2 lambda_3 = lambda_1 1 dot 1 + 2 mu1 dot 1
  // 2 + 2 lambda_3 = lambda_2 1 dot 1 + 2 mu2 dot 1
  // |lambda_1 1 + 2 mu1|^2 (2 + 2 lambda_3)^2 =
  //   |lambda_2 1 + 2 mu2|^2 (2 - 2 lambda_3)^2
  // So solving for the lambda's and using the fact that
  // mu1 dot 1 = 1 and mu2 dot 1 = 1,
  // lambda_1 = -2 lambda_3 / (1 dot 1)
  // lambda_2 = 2 lambda_3 / (1 dot 1)
  // and (letting n = 1 dot 1)
  // | - lambda_3 1 + n mu1 |^2 (1 + lambda_3)^2 =
  //   | lambda_3 1 + n mu2 |^2 (1 - lambda_3)^2
  // or
  // (lambda_3^2 n - 2 n lambda_3 + n^2 mu1 dot mu1)(1 + lambda_3)^2 =
  //   (lambda_3^2 n + 2 n lambda_3 + n^2 mu2 dot mu2)(1 - lambda_3)^2
  // or
  // (lambda_3^2 - 2 lambda_3 + n mu1 dot mu1)(1 + 2 lambda_3 + lambda_3^2) =
  //   (lambda_3^2 + 2 lambda_3 + n mu2 dot mu2)(1 - 2 lambda_3 + lambda_3^2)
  // or
  // lambda_3^2 - 2 lambda_3 + n mu1 dot mu1
  //  + 2 lambda_3^3 - 2 lambda_3^2 + 2n lambda_3 mu1 dot mu1
  //  + lambda_3^4 - 2 lambda_3^3 + n lambda_3^2 mu1 dot mu1
  // =
  // lambda_3^2 + 2 lambda_3 + n mu2 dot mu2
  //  - 2 lambda_3^3 - 4 lambda_3^2 - 2n lambda_3 mu2 dot mu2
  //  + lambda_3^4 + 2 lambda_3^3 + n lambda_3^2 mu2 dot mu2
  // or
  // - 2 lambda_3 + n mu1 dot mu1
  //  - 2 lambda_3^2 + 2n lambda_3 mu1 dot mu1
  //  + n lambda_3^2 mu1 dot mu1
  // =
  // + 2 lambda_3 + n mu2 dot mu2
  //  - 4 lambda_3^2 - 2n lambda_3 mu2 dot mu2
  //  + n lambda_3^2 mu2 dot mu2
  // or
  // lambda_3^2 (2 + n mu1 dot mu1 + n mu2 dot mu2)
  //  + lambda_3 (2n mu1 dot mu1 + 2n mu2 dot mu2 - 4)
  //  + n mu1 dot mu1 - n mu2 dot mu2 = 0
  // which can be solved using the quadratic formula.
  int n = mu1.size();
  double len1 = 0.0;
  for (float m : mu1) {
    len1 += m * m;
  }
  double len2 = 0.0;
  for (float m : mu2) {
    len2 += m * m;
  }
  double a = 2 + n * (len1 + len2);
  double b = 2 * n * (len1 + len2) - 4;
  double c = n * (len1 - len2);
  double discrim = b * b - 4 * a * c;
  if (discrim < 0.0) {
    LOG(WARNING) << "Negative discriminant " << discrim;
    return 0.0;
  }

  double sdiscrim = sqrt(discrim);
  // TODO(thomaswc): Analyze whether one of these is always closer.
  double v1 = (-b + sdiscrim) / (2 * a);
  double v2 = (-b - sdiscrim) / (2 * a);
  double dist1 = getDistanceFromLambda3(v1, mu1, mu2);
  double dist2 = getDistanceFromLambda3(v2, mu1, mu2);
  return std::min(dist1, dist2);
}

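// Decides dominance via a Chebyshev-style bound on the Dirichlet posteriors
// of the two best splits: returns true when 1 - trace / epsilon^2 exceeds
// dominate_fraction, where trace sums the posterior covariance traces and
// epsilon measures the separation of the posterior means.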
bool BestSplitDominatesClassificationChebyshev(const Tensor& total_counts,
                                               const Tensor& split_counts,
                                               int32 accumulator,
                                               float dominate_fraction) {
  float best_score;
  float second_best_score;
  int best_feature_index;
  int second_best_index;
  VLOG(1) << "BSDC for accumulator " << accumulator;
  GetTwoBestClassification(total_counts, split_counts, accumulator, &best_score,
                           &best_feature_index, &second_best_score,
                           &second_best_index);
  VLOG(1) << "Best score = " << best_score;
  VLOG(1) << "2nd best score = " << second_best_score;

  const int32 num_classes =
      static_cast<int32>(split_counts.shape().dim_size(2)) - 1;
  const float n = total_counts.Slice(accumulator, accumulator + 1)
                      .unaligned_flat<float>()(0);

  VLOG(1) << "num_classes = " << num_classes;
  VLOG(1) << "n = " << n;
  double trace = DirichletCovarianceTrace(total_counts, split_counts,
                                          accumulator, best_feature_index) +
                 DirichletCovarianceTrace(total_counts, split_counts,
                                          accumulator, second_best_index);

  std::vector<float> mu1;
  getDirichletMean(total_counts, split_counts, accumulator, best_feature_index,
                   &mu1);
  std::vector<float> mu2;
  getDirichletMean(total_counts, split_counts, accumulator, second_best_index,
                   &mu2);
  double epsilon = getChebyshevEpsilon(mu1, mu2);

  if (epsilon == 0.0) {
    return false;
  }

  double dirichlet_bound = 1.0 - trace / (epsilon * epsilon);
  return dirichlet_bound > dominate_fraction;
}

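// Returns a closure over the dense feature matrix; if the input has no
// dense features, returns a stub that logs an error and yields 0.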
GetFeatureFnType GetDenseFunctor(const Tensor& dense) {
  if (dense.shape().dims() == 2) {
    const auto dense_features = dense.matrix<float>();
    // Here we capture by value, which shouldn't incur a copy of the data
    // because of the underlying use of Eigen::TensorMap.
    return [dense_features](int32 i, int32 feature) {
      return dense_features(i, feature);
    };
  } else {
    return [](int32 i, int32 feature) {
      LOG(ERROR) << "trying to access nonexistent dense features.";
      return 0;
    };
  }
}

GetFeatureFnType GetSparseFunctor(const Tensor& sparse_indices,
                                  const Tensor& sparse_values) {
  if (sparse_indices.shape().dims() == 2) {
    const auto indices = sparse_indices.matrix<int64>();
    const auto values = sparse_values.vec<float>();
    // Here we capture by value, which shouldn't incur a copy of the data
    // because of the underlying use of Eigen::TensorMap.
    return [indices, values](int32 i, int32 feature) {
      return tensorforest::FindSparseValue(indices, values, i, feature);
    };
  } else {
    return [](int32 i, int32 feature) {
      LOG(ERROR) << "trying to access nonexistent sparse features.";
      return 0;
    };
  }
}

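// Routes a feature index to the dense or sparse getter (sparse features are
// numbered after all dense features) and tests the value against the bias.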
bool DecideNode(const GetFeatureFnType& get_dense,
                const GetFeatureFnType& get_sparse, int32 i, int32 feature,
                float bias, const tensorforest::TensorForestDataSpec& spec) {
  if (feature < spec.dense_features_size()) {
    return Decide(get_dense(i, feature), bias,
                  FindDenseFeatureSpec(feature, spec));
  } else {
    const int32 sparse_feature = feature - spec.dense_features_size();
    return Decide(get_sparse(i, sparse_feature), bias,
                  FindSparseFeatureSpec(sparse_feature, spec));
  }
}

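// Returns which way a value goes at a split: float features compare against
// the bias threshold, categorical features test equality with it. For
// example, Decide(3.0f, 2.5f, kDataFloat) is true and
// Decide(1.0f, 2.5f, kDataFloat) is false.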
bool Decide(float value, float bias, DataColumnTypes type) {
  switch (type) {
    case kDataFloat:
      return value >= bias;

    case kDataCategorical:
      // We arbitrarily define categorical equality as going left.
      return value == bias;

    default:
      LOG(ERROR) << "Got unknown column type: " << type;
      return false;
  }
}

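// Computes per-output means into *mean. If the leaf has fewer than
// valid_leaf_threshold samples, the parent's counts are blended in with a
// weight of at most 1 so the effective sum reaches the threshold.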
void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
                           float parent_sum, const float* parent_data,
                           float valid_leaf_threshold, int num_outputs,
                           std::vector<float>* mean) {
  float parent_weight = 0.0;
  // Require parent_sum > 0 so the weight computation below can't divide by
  // zero.
  if (leaf_sum < valid_leaf_threshold && parent_sum > 0) {
    VLOG(1) << "not enough samples at leaf, including parent counts. "
            << "child sum = " << leaf_sum;
    // Weight the parent's counts just enough so that the new sum is
    // valid_leaf_threshold, but never give any counts a weight of
    // more than 1.
    parent_weight =
        std::min(1.0f, (valid_leaf_threshold - leaf_sum) / parent_sum);
    leaf_sum += parent_weight * parent_sum;
    VLOG(1) << "Sum w/ parent included = " << leaf_sum;
  }

  for (int c = 0; c < num_outputs; c++) {
    float w = leaf_data[c];
    if (parent_weight > 0.0) {
      w += parent_weight * parent_data[c];
    }
    (*mean)[c] = w / leaf_sum;
  }
}

}  // namespace tensorforest
}  // namespace tensorflow