1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_
17 #define TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_
18 
19 #define EIGEN_USE_THREADS
20 
21 #include <stddef.h>
22 #include <algorithm>
23 #include <cmath>
24 #include <memory>
25 #include <new>
26 #include <unordered_map>
27 #include <utility>
28 #include <vector>
29 
30 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
31 #include "tensorflow/core/framework/device_base.h"
32 #include "tensorflow/core/framework/op_kernel.h"
33 #include "tensorflow/core/framework/tensor.h"
34 #include "tensorflow/core/framework/tensor_shape.h"
35 #include "tensorflow/core/framework/tensor_types.h"
36 #include "tensorflow/core/framework/types.h"
37 #include "tensorflow/core/kernels/loss.h"
38 #include "tensorflow/core/lib/core/coding.h"
39 #include "tensorflow/core/lib/core/errors.h"
40 #include "tensorflow/core/lib/core/status.h"
41 #include "tensorflow/core/lib/core/stringpiece.h"
42 #include "tensorflow/core/lib/gtl/inlined_vector.h"
43 #include "tensorflow/core/lib/random/distribution_sampler.h"
44 #include "tensorflow/core/lib/strings/stringprintf.h"
45 #include "tensorflow/core/util/guarded_philox_random.h"
46 #include "tensorflow/core/util/work_sharder.h"
47 
48 namespace tensorflow {
49 
50 namespace sdca {
51 
52 // Statistics computed with input (ModelWeights, Example).
53 struct ExampleStatistics {
54   // Logits for each class.
55   // For binary case, this should be a vector of length 1; while for multiclass
56   // case, this vector has the same length as the number of classes, where each
57   // value corresponds to one class.
58   // Use InlinedVector to avoid heap allocation for small number of classes.
59   gtl::InlinedVector<double, 1> wx;
60 
61   // Logits for each class, using the previous weights.
62   gtl::InlinedVector<double, 1> prev_wx;
63 
64   // Sum of squared feature values occurring in the example divided by
65   // L2 * sum(example_weights).
66   double normalized_squared_norm = 0;
67 
68   // Num_weight_vectors equals to the number of classification classes in the
69   // multiclass case; while for binary case, it is 1.
ExampleStatisticsExampleStatistics70   ExampleStatistics(const int num_weight_vectors)
71       : wx(num_weight_vectors, 0.0), prev_wx(num_weight_vectors, 0.0) {}
72 };
73 
74 class Regularizations {
75  public:
Regularizations()76   Regularizations() {}
77 
78   // Initialize() must be called immediately after construction.
Initialize(OpKernelConstruction * const context)79   Status Initialize(OpKernelConstruction* const context) {
80     TF_RETURN_IF_ERROR(context->GetAttr("l1", &symmetric_l1_));
81     TF_RETURN_IF_ERROR(context->GetAttr("l2", &symmetric_l2_));
82     shrinkage_ = symmetric_l1_ / symmetric_l2_;
83     return Status::OK();
84   }
85 
86   // Proximal SDCA shrinking for L1 regularization.
Shrink(const double weight)87   double Shrink(const double weight) const {
88     const double shrinked = std::max(std::abs(weight) - shrinkage_, 0.0);
89     if (shrinked > 0.0) {
90       return std::copysign(shrinked, weight);
91     }
92     return 0.0;
93   }
94 
95   // Vectorized float variant of the above.
EigenShrinkVector(const Eigen::Tensor<float,1,Eigen::RowMajor> weights)96   Eigen::Tensor<float, 1, Eigen::RowMajor> EigenShrinkVector(
97       const Eigen::Tensor<float, 1, Eigen::RowMajor> weights) const {
98     // Proximal step on the weights which is sign(w)*|w - shrinkage|+.
99     return weights.sign() * ((weights.abs() - weights.constant(shrinkage_))
100                                  .cwiseMax(weights.constant(0.0)));
101   }
102 
103   // Matrix float variant of the above.
EigenShrinkMatrix(const Eigen::Tensor<float,2,Eigen::RowMajor> weights)104   Eigen::Tensor<float, 2, Eigen::RowMajor> EigenShrinkMatrix(
105       const Eigen::Tensor<float, 2, Eigen::RowMajor> weights) const {
106     // Proximal step on the weights which is sign(w)*|w - shrinkage|+.
107     return weights.sign() * ((weights.abs() - weights.constant(shrinkage_))
108                                  .cwiseMax(weights.constant(0.0)));
109   }
110 
symmetric_l2()111   float symmetric_l2() const { return symmetric_l2_; }
112 
113  private:
114   float symmetric_l1_ = 0;
115   float symmetric_l2_ = 0;
116 
117   // L1 divided by L2, pre-computed for use during weight shrinking.
118   double shrinkage_ = 0;
119 
120   TF_DISALLOW_COPY_AND_ASSIGN(Regularizations);
121 };
122 
123 class ModelWeights;
124 
125 // Struct describing a single example.
126 class Example {
127  public:
128   // Compute matrix vector product between weights (a matrix) and features
129   // (a vector). This method also computes the normalized example norm used
130   // in SDCA update.
131   // For multiclass case, num_weight_vectors equals to the number of classes;
132   // while for binary case, it is 1.
133   const ExampleStatistics ComputeWxAndWeightedExampleNorm(
134       const int num_loss_partitions, const ModelWeights& model_weights,
135       const Regularizations& regularization,
136       const int num_weight_vectors) const;
137 
example_label()138   float example_label() const { return example_label_; }
139 
example_weight()140   float example_weight() const { return example_weight_; }
141 
squared_norm()142   double squared_norm() const { return squared_norm_; }
143 
144   // Sparse features associated with the example.
145   // Indices and Values are the associated feature index, and values. Values
146   // can be optionally absent, in which we case we implicitly assume a value of
147   // 1.0f.
148   struct SparseFeatures {
149     std::unique_ptr<TTypes<const int64>::UnalignedConstVec> indices;
150     std::unique_ptr<TTypes<const float>::UnalignedConstVec>
151         values;  // nullptr encodes optional.
152   };
153 
154   // A dense vector which is a row-slice of the underlying matrix.
155   struct DenseVector {
156     // Returns a row slice from the matrix.
RowDenseVector157     Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>> Row()
158         const {
159       return Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>>(
160           data_matrix.data() + row_index * data_matrix.dimension(1),
161           data_matrix.dimension(1));
162     }
163 
164     // Returns a row slice as a 1 * F matrix, where F is the number of features.
165     Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>>
RowAsMatrixDenseVector166     RowAsMatrix() const {
167       return Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>>(
168           data_matrix.data() + row_index * data_matrix.dimension(1), 1,
169           data_matrix.dimension(1));
170     }
171 
172     const TTypes<float>::ConstMatrix data_matrix;
173     const int64 row_index;
174   };
175 
176  private:
177   std::vector<SparseFeatures> sparse_features_;
178   std::vector<std::unique_ptr<DenseVector>> dense_vectors_;
179 
180   float example_label_ = 0;
181   float example_weight_ = 0;
182   double squared_norm_ = 0;  // sum squared norm of the features.
183 
184   // Examples fills Example in a multi-threaded way.
185   friend class Examples;
186 
187   // ModelWeights use each example for model update w += \alpha * x_{i};
188   friend class ModelWeights;
189 };
190 
191 // Weights related to features. For example, say you have two sets of sparse
192 // features i.e. age bracket and country, then FeatureWeightsDenseStorage hold
193 // the parameters for it. We keep track of the original weight passed in and the
194 // delta weight which the optimizer learns in each call to the optimizer.
195 class FeatureWeightsDenseStorage {
196  public:
FeatureWeightsDenseStorage(const TTypes<const float>::Matrix nominals,TTypes<float>::Matrix deltas)197   FeatureWeightsDenseStorage(const TTypes<const float>::Matrix nominals,
198                              TTypes<float>::Matrix deltas)
199       : nominals_(nominals), deltas_(deltas) {
200     CHECK_GT(deltas.rank(), 1);
201   }
202 
203   // Check if a feature index is with-in the bounds.
IndexValid(const int64 index)204   bool IndexValid(const int64 index) const {
205     return index >= 0 && index < deltas_.dimension(1);
206   }
207 
208   // Nominals here are the original weight matrix.
nominals()209   TTypes<const float>::Matrix nominals() const { return nominals_; }
210 
211   // Delta weights durining mini-batch updates.
deltas()212   TTypes<float>::Matrix deltas() const { return deltas_; }
213 
214   // Updates delta weights based on active dense features in the example and
215   // the corresponding dual residual.
216   void UpdateDenseDeltaWeights(
217       const Eigen::ThreadPoolDevice& device,
218       const Example::DenseVector& dense_vector,
219       const std::vector<double>& normalized_bounded_dual_delta);
220 
221  private:
222   // The nominal value of the weight for a feature (indexed by its id).
223   const TTypes<const float>::Matrix nominals_;
224   // The accumulated delta weight for a feature (indexed by its id).
225   TTypes<float>::Matrix deltas_;
226 };
227 
228 // Similar to FeatureWeightsDenseStorage, but the underlying weights are stored
229 // in an unordered map.
230 class FeatureWeightsSparseStorage {
231  public:
FeatureWeightsSparseStorage(const TTypes<const int64>::Vec indices,const TTypes<const float>::Matrix nominals,TTypes<float>::Matrix deltas)232   FeatureWeightsSparseStorage(const TTypes<const int64>::Vec indices,
233                               const TTypes<const float>::Matrix nominals,
234                               TTypes<float>::Matrix deltas)
235       : nominals_(nominals), deltas_(deltas) {
236     // Create a map from sparse index to the dense index of the underlying
237     // storage.
238     for (int64 j = 0; j < indices.size(); ++j) {
239       indices_to_id_[indices(j)] = j;
240     }
241   }
242 
243   // Check if a feature index exists.
IndexValid(const int64 index)244   bool IndexValid(const int64 index) const {
245     return indices_to_id_.find(index) != indices_to_id_.end();
246   }
247 
248   // Nominal value at a particular feature index and class label.
nominals(const int class_id,const int64 index)249   float nominals(const int class_id, const int64 index) const {
250     auto it = indices_to_id_.find(index);
251     return nominals_(class_id, it->second);
252   }
253 
254   // Delta weights durining mini-batch updates.
deltas(const int class_id,const int64 index)255   float deltas(const int class_id, const int64 index) const {
256     auto it = indices_to_id_.find(index);
257     return deltas_(class_id, it->second);
258   }
259 
260   // Updates delta weights based on active sparse features in the example and
261   // the corresponding dual residual.
262   void UpdateSparseDeltaWeights(
263       const Eigen::ThreadPoolDevice& device,
264       const Example::SparseFeatures& sparse_features,
265       const std::vector<double>& normalized_bounded_dual_delta);
266 
267  private:
268   // The nominal value of the weight for a feature (indexed by its id).
269   const TTypes<const float>::Matrix nominals_;
270   // The accumulated delta weight for a feature (indexed by its id).
271   TTypes<float>::Matrix deltas_;
272   // Map from feature index to an index to the dense vector.
273   std::unordered_map<int64, int64> indices_to_id_;
274 };
275 
276 // Weights in the model, wraps both current weights, and the delta weights
277 // for both sparse and dense features.
278 class ModelWeights {
279  public:
ModelWeights()280   ModelWeights() {}
281 
SparseIndexValid(const int col,const int64 index)282   bool SparseIndexValid(const int col, const int64 index) const {
283     return sparse_weights_[col].IndexValid(index);
284   }
285 
DenseIndexValid(const int col,const int64 index)286   bool DenseIndexValid(const int col, const int64 index) const {
287     return dense_weights_[col].IndexValid(index);
288   }
289 
290   // Go through all the features present in the example, and update the
291   // weights based on the dual delta.
292   void UpdateDeltaWeights(
293       const Eigen::ThreadPoolDevice& device, const Example& example,
294       const std::vector<double>& normalized_bounded_dual_delta);
295 
296   Status Initialize(OpKernelContext* const context);
297 
sparse_weights()298   const std::vector<FeatureWeightsSparseStorage>& sparse_weights() const {
299     return sparse_weights_;
300   }
301 
dense_weights()302   const std::vector<FeatureWeightsDenseStorage>& dense_weights() const {
303     return dense_weights_;
304   }
305 
306  private:
307   std::vector<FeatureWeightsSparseStorage> sparse_weights_;
308   std::vector<FeatureWeightsDenseStorage> dense_weights_;
309 
310   TF_DISALLOW_COPY_AND_ASSIGN(ModelWeights);
311 };
312 
313 // Examples contains all the training examples that SDCA uses for a mini-batch.
314 class Examples {
315  public:
Examples()316   Examples() {}
317 
318   // Returns the Example at |example_index|.
example(const int example_index)319   const Example& example(const int example_index) const {
320     return examples_.at(example_index);
321   }
322 
sampled_index(const int id)323   int sampled_index(const int id) const { return sampled_index_[id]; }
324 
325   // Adaptive SDCA in the current implementation only works for
326   // binary classification, where the input argument for num_weight_vectors
327   // is 1.
328   Status SampleAdaptiveProbabilities(
329       const int num_loss_partitions, const Regularizations& regularization,
330       const ModelWeights& model_weights,
331       const TTypes<float>::Matrix example_state_data,
332       const std::unique_ptr<DualLossUpdater>& loss_updater,
333       const int num_weight_vectors);
334 
335   void RandomShuffle();
336 
num_examples()337   int num_examples() const { return examples_.size(); }
338 
num_features()339   int num_features() const { return num_features_; }
340 
341   // Initialize() must be called immediately after construction.
342   Status Initialize(OpKernelContext* const context, const ModelWeights& weights,
343                     int num_sparse_features,
344                     int num_sparse_features_with_values,
345                     int num_dense_features);
346 
347  private:
348   // Reads the input tensors, and builds the internal representation for sparse
349   // features per example. This function modifies the |examples| passed in
350   // to build the sparse representations.
351   static Status CreateSparseFeatureRepresentation(
352       const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
353       int num_sparse_features, const ModelWeights& weights,
354       const OpInputList& sparse_example_indices_inputs,
355       const OpInputList& sparse_feature_indices_inputs,
356       const OpInputList& sparse_feature_values_inputs,
357       std::vector<Example>* const examples);
358 
359   // Reads the input tensors, and builds the internal representation for dense
360   // features per example. This function modifies the |examples| passed in
361   // to build the sparse representations.
362   static Status CreateDenseFeatureRepresentation(
363       const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
364       int num_dense_features, const ModelWeights& weights,
365       const OpInputList& dense_features_inputs,
366       std::vector<Example>* const examples);
367 
368   // Computes squared example norm per example i.e |x|^2. This function modifies
369   // the |examples| passed in and adds the squared norm per example.
370   static Status ComputeSquaredNormPerExample(
371       const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
372       int num_sparse_features, int num_dense_features,
373       std::vector<Example>* const examples);
374 
375   // All examples in the batch.
376   std::vector<Example> examples_;
377 
378   // Adaptive sampling variables.
379   std::vector<float> probabilities_;
380   std::vector<int> sampled_index_;
381   std::vector<int> sampled_count_;
382 
383   int num_features_ = 0;
384 
385   TF_DISALLOW_COPY_AND_ASSIGN(Examples);
386 };
387 
388 }  // namespace sdca
389 }  // namespace tensorflow
390 
391 #endif  // TENSORFLOW_CORE_KERNELS_SDCA_INTERNAL_H_
392