1/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#ifndef TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
17#define TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
18
19#define EIGEN_USE_THREADS
20
21#include <stddef.h>
22#include <algorithm>
23#include <cmath>
24#include <memory>
25#include <new>
26#include <unordered_map>
27#include <utility>
28#include <vector>
29
30#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
31#include "tensorflow/core/framework/device_base.h"
32#include "tensorflow/core/framework/op_kernel.h"
33#include "tensorflow/core/framework/tensor.h"
34#include "tensorflow/core/framework/tensor_shape.h"
35#include "tensorflow/core/framework/tensor_types.h"
36#include "tensorflow/core/framework/types.h"
37#include "tensorflow/core/kernels/loss.h"
38#include "tensorflow/core/lib/core/coding.h"
39#include "tensorflow/core/lib/core/errors.h"
40#include "tensorflow/core/lib/core/status.h"
41#include "tensorflow/core/lib/core/stringpiece.h"
42#include "tensorflow/core/lib/gtl/inlined_vector.h"
43#include "tensorflow/core/lib/random/distribution_sampler.h"
44#include "tensorflow/core/lib/strings/stringprintf.h"
45#include "tensorflow/core/util/guarded_philox_random.h"
46#include "tensorflow/core/util/sparse/group_iterator.h"
47#include "tensorflow/core/util/sparse/sparse_tensor.h"
48#include "tensorflow/core/util/work_sharder.h"
49
50namespace tensorflow {
51
52namespace sdca {
53
54// Statistics computed with input (ModelWeights, Example).
55struct ExampleStatistics {
56  // Logits for each class.
57  // For binary case, this should be a vector of length 1; while for multiclass
58  // case, this vector has the same length as the number of classes, where each
59  // value corresponds to one class.
60  // Use InlinedVector to avoid heap allocation for small number of classes.
61  gtl::InlinedVector<double, 1> wx;
62
63  // Logits for each class, using the previous weights.
64  gtl::InlinedVector<double, 1> prev_wx;
65
66  // Sum of squared feature values occurring in the example divided by
67  // L2 * sum(example_weights).
68  double normalized_squared_norm = 0;
69
70  // Num_weight_vectors equals to the number of classification classes in the
71  // multiclass case; while for binary case, it is 1.
72  ExampleStatistics(const int num_weight_vectors)
73      : wx(num_weight_vectors, 0.0), prev_wx(num_weight_vectors, 0.0) {}
74};
75
76class Regularizations {
77 public:
78  Regularizations(){};
79
80  // Initialize() must be called immediately after construction.
81  Status Initialize(OpKernelConstruction* const context) {
82    TF_RETURN_IF_ERROR(context->GetAttr("l1", &symmetric_l1_));
83    TF_RETURN_IF_ERROR(context->GetAttr("l2", &symmetric_l2_));
84    shrinkage_ = symmetric_l1_ / symmetric_l2_;
85    return Status::OK();
86  }
87
88  // Proximal SDCA shrinking for L1 regularization.
89  double Shrink(const double weight) const {
90    const double shrinked = std::max(std::abs(weight) - shrinkage_, 0.0);
91    if (shrinked > 0.0) {
92      return std::copysign(shrinked, weight);
93    }
94    return 0.0;
95  }
96
97  // Vectorized float variant of the above.
98  Eigen::Tensor<float, 1, Eigen::RowMajor> EigenShrinkVector(
99      const Eigen::Tensor<float, 1, Eigen::RowMajor> weights) const {
100    // Proximal step on the weights which is sign(w)*|w - shrinkage|+.
101    return weights.sign() * ((weights.abs() - weights.constant(shrinkage_))
102                                 .cwiseMax(weights.constant(0.0)));
103  }
104
105  // Matrix float variant of the above.
106  Eigen::Tensor<float, 2, Eigen::RowMajor> EigenShrinkMatrix(
107      const Eigen::Tensor<float, 2, Eigen::RowMajor> weights) const {
108    // Proximal step on the weights which is sign(w)*|w - shrinkage|+.
109    return weights.sign() * ((weights.abs() - weights.constant(shrinkage_))
110                                 .cwiseMax(weights.constant(0.0)));
111  }
112
113  float symmetric_l2() const { return symmetric_l2_; }
114
115 private:
116  float symmetric_l1_ = 0;
117  float symmetric_l2_ = 0;
118
119  // L1 divided by L2, pre-computed for use during weight shrinking.
120  double shrinkage_ = 0;
121
122  TF_DISALLOW_COPY_AND_ASSIGN(Regularizations);
123};
124
125class ModelWeights;
126
127// Struct describing a single example.
128class Example {
129 public:
130  // Compute matrix vector product between weights (a matrix) and features
131  // (a vector). This method also computes the normalized example norm used
132  // in SDCA update.
133  // For multiclass case, num_weight_vectors equals to the number of classes;
134  // while for binary case, it is 1.
135  const ExampleStatistics ComputeWxAndWeightedExampleNorm(
136      const int num_loss_partitions, const ModelWeights& model_weights,
137      const Regularizations& regularization,
138      const int num_weight_vectors) const;
139
140  float example_label() const { return example_label_; }
141
142  float example_weight() const { return example_weight_; }
143
144  double squared_norm() const { return squared_norm_; }
145
146  // Sparse features associated with the example.
147  // Indices and Values are the associated feature index, and values. Values
148  // can be optionally absent, in which we case we implicitly assume a value of
149  // 1.0f.
150  struct SparseFeatures {
151    std::unique_ptr<TTypes<const int64>::UnalignedConstVec> indices;
152    std::unique_ptr<TTypes<const float>::UnalignedConstVec>
153        values;  // nullptr encodes optional.
154  };
155
156  // A dense vector which is a row-slice of the underlying matrix.
157  struct DenseVector {
158    // Returns a row slice from the matrix.
159    Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>> Row()
160        const {
161      return Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>>(
162          data_matrix.data() + row_index * data_matrix.dimension(1),
163          data_matrix.dimension(1));
164    }
165
166    // Returns a row slice as a 1 * F matrix, where F is the number of features.
167    Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>>
168    RowAsMatrix() const {
169      return Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor>>(
170          data_matrix.data() + row_index * data_matrix.dimension(1), 1,
171          data_matrix.dimension(1));
172    }
173
174    const TTypes<float>::ConstMatrix data_matrix;
175    const int64 row_index;
176  };
177
178 private:
179  std::vector<SparseFeatures> sparse_features_;
180  std::vector<std::unique_ptr<DenseVector>> dense_vectors_;
181
182  float example_label_ = 0;
183  float example_weight_ = 0;
184  double squared_norm_ = 0;  // sum squared norm of the features.
185
186  // Examples fills Example in a multi-threaded way.
187  friend class Examples;
188
189  // ModelWeights use each example for model update w += \alpha * x_{i};
190  friend class ModelWeights;
191};
192
193// Weights related to features. For example, say you have two sets of sparse
194// features i.e. age bracket and country, then FeatureWeightsDenseStorage hold
195// the parameters for it. We keep track of the original weight passed in and the
196// delta weight which the optimizer learns in each call to the optimizer.
197class FeatureWeightsDenseStorage {
198 public:
199  FeatureWeightsDenseStorage(const TTypes<const float>::Matrix nominals,
200                             TTypes<float>::Matrix deltas)
201      : nominals_(nominals), deltas_(deltas) {
202    CHECK(deltas.rank() > 1);
203  }
204
205  // Check if a feature index is with-in the bounds.
206  bool IndexValid(const int64 index) const {
207    return index >= 0 && index < deltas_.dimension(1);
208  }
209
210  // Nominals here are the original weight matrix.
211  TTypes<const float>::Matrix nominals() const { return nominals_; }
212
213  // Delta weights durining mini-batch updates.
214  TTypes<float>::Matrix deltas() const { return deltas_; }
215
216  // Updates delta weights based on active dense features in the example and
217  // the corresponding dual residual.
218  void UpdateDenseDeltaWeights(
219      const Eigen::ThreadPoolDevice& device,
220      const Example::DenseVector& dense_vector,
221      const std::vector<double>& normalized_bounded_dual_delta);
222
223 private:
224  // The nominal value of the weight for a feature (indexed by its id).
225  const TTypes<const float>::Matrix nominals_;
226  // The accumulated delta weight for a feature (indexed by its id).
227  TTypes<float>::Matrix deltas_;
228};
229
230// Similar to FeatureWeightsDenseStorage, but the underlying weights are stored
231// in an unordered map.
232class FeatureWeightsSparseStorage {
233 public:
234  FeatureWeightsSparseStorage(const TTypes<const int64>::Vec indices,
235                              const TTypes<const float>::Matrix nominals,
236                              TTypes<float>::Matrix deltas)
237      : nominals_(nominals), deltas_(deltas) {
238    // Create a map from sparse index to the dense index of the underlying
239    // storage.
240    for (int64 j = 0; j < indices.size(); ++j) {
241      indices_to_id_[indices(j)] = j;
242    }
243  }
244
245  // Check if a feature index exists.
246  bool IndexValid(const int64 index) const {
247    return indices_to_id_.find(index) != indices_to_id_.end();
248  }
249
250  // Nominal value at a particular feature index and class label.
251  float nominals(const int class_id, const int64 index) const {
252    auto it = indices_to_id_.find(index);
253    return nominals_(class_id, it->second);
254  }
255
256  // Delta weights durining mini-batch updates.
257  float deltas(const int class_id, const int64 index) const {
258    auto it = indices_to_id_.find(index);
259    return deltas_(class_id, it->second);
260  }
261
262  // Updates delta weights based on active sparse features in the example and
263  // the corresponding dual residual.
264  void UpdateSparseDeltaWeights(
265      const Eigen::ThreadPoolDevice& device,
266      const Example::SparseFeatures& sparse_features,
267      const std::vector<double>& normalized_bounded_dual_delta);
268
269 private:
270  // The nominal value of the weight for a feature (indexed by its id).
271  const TTypes<const float>::Matrix nominals_;
272  // The accumulated delta weight for a feature (indexed by its id).
273  TTypes<float>::Matrix deltas_;
274  // Map from feature index to an index to the dense vector.
275  std::unordered_map<int64, int64> indices_to_id_;
276};
277
278// Weights in the model, wraps both current weights, and the delta weights
279// for both sparse and dense features.
280class ModelWeights {
281 public:
282  ModelWeights() {}
283
284  bool SparseIndexValid(const int col, const int64 index) const {
285    return sparse_weights_[col].IndexValid(index);
286  }
287
288  bool DenseIndexValid(const int col, const int64 index) const {
289    return dense_weights_[col].IndexValid(index);
290  }
291
292  // Go through all the features present in the example, and update the
293  // weights based on the dual delta.
294  void UpdateDeltaWeights(
295      const Eigen::ThreadPoolDevice& device, const Example& example,
296      const std::vector<double>& normalized_bounded_dual_delta);
297
298  Status Initialize(OpKernelContext* const context);
299
300  const std::vector<FeatureWeightsSparseStorage>& sparse_weights() const {
301    return sparse_weights_;
302  }
303
304  const std::vector<FeatureWeightsDenseStorage>& dense_weights() const {
305    return dense_weights_;
306  }
307
308 private:
309  std::vector<FeatureWeightsSparseStorage> sparse_weights_;
310  std::vector<FeatureWeightsDenseStorage> dense_weights_;
311
312  TF_DISALLOW_COPY_AND_ASSIGN(ModelWeights);
313};
314
315// Examples contains all the training examples that SDCA uses for a mini-batch.
316class Examples {
317 public:
318  Examples() {}
319
320  // Returns the Example at |example_index|.
321  const Example& example(const int example_index) const {
322    return examples_.at(example_index);
323  }
324
325  int sampled_index(const int id, const bool adaptative) const {
326    if (adaptative) return sampled_index_[id];
327    return id;
328  }
329
330  // Adaptive SDCA in the current implementation only works for
331  // binary classification, where the input argument for num_weight_vectors
332  // is 1.
333  Status SampleAdaptativeProbabilities(
334      const int num_loss_partitions, const Regularizations& regularization,
335      const ModelWeights& model_weights,
336      const TTypes<float>::Matrix example_state_data,
337      const std::unique_ptr<DualLossUpdater>& loss_updater,
338      const int num_weight_vectors);
339
340  int num_examples() const { return examples_.size(); }
341
342  int num_features() const { return num_features_; }
343
344  // Initialize() must be called immediately after construction.
345  Status Initialize(OpKernelContext* const context, const ModelWeights& weights,
346                    int num_sparse_features,
347                    int num_sparse_features_with_values,
348                    int num_dense_features);
349
350 private:
351  // Reads the input tensors, and builds the internal representation for sparse
352  // features per example. This function modifies the |examples| passed in
353  // to build the sparse representations.
354  static Status CreateSparseFeatureRepresentation(
355      const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
356      int num_sparse_features, const ModelWeights& weights,
357      const OpInputList& sparse_example_indices_inputs,
358      const OpInputList& sparse_feature_indices_inputs,
359      const OpInputList& sparse_feature_values_inputs,
360      std::vector<Example>* const examples);
361
362  // Reads the input tensors, and builds the internal representation for dense
363  // features per example. This function modifies the |examples| passed in
364  // to build the sparse representations.
365  static Status CreateDenseFeatureRepresentation(
366      const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
367      int num_dense_features, const ModelWeights& weights,
368      const OpInputList& dense_features_inputs,
369      std::vector<Example>* const examples);
370
371  // Computes squared example norm per example i.e |x|^2. This function modifies
372  // the |examples| passed in and adds the squared norm per example.
373  static void ComputeSquaredNormPerExample(
374      const DeviceBase::CpuWorkerThreads& worker_threads, int num_examples,
375      int num_sparse_features, int num_dense_features,
376      std::vector<Example>* const examples);
377
378  // All examples in the batch.
379  std::vector<Example> examples_;
380
381  // Adaptative sampling variables
382  std::vector<float> probabilities_;
383  std::vector<int> sampled_index_;
384  std::vector<int> sampled_count_;
385
386  int num_features_ = 0;
387
388  TF_DISALLOW_COPY_AND_ASSIGN(Examples);
389};
390
391}  // namespace sdca
392}  // namespace tensorflow
393
394#endif  // TENSORFLOW_KERNELS_SDCA_INTERNAL_H_
395