1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
12
13namespace Eigen {
14
15/** \class TensorEvaluator
16  * \ingroup CXX11_Tensor_Module
17  *
18  * \brief A cost model used to limit the number of threads used for evaluating
19  * tensor expression.
20  *
21  */
22
23// Class storing the cost of evaluating a tensor expression in terms of the
24// estimated number of operand bytes loads, bytes stored, and compute cycles.
25class TensorOpCost {
26 public:
27  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
28  // model based on minimal reciprocal throughput numbers from Intel or
29  // Agner Fog's tables would be better than what is there now.
30  template <typename ArgType>
31  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
32    return internal::functor_traits<
33        internal::scalar_product_op<ArgType, ArgType> >::Cost;
34  }
35  template <typename ArgType>
36  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
37    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
38  }
39  template <typename ArgType>
40  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
41    return internal::functor_traits<
42        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
43  }
44  template <typename ArgType>
45  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
46    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
47  }
48  template <typename SrcType, typename TargetType>
49  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
50    return internal::functor_traits<
51        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
52  }
53
54  EIGEN_DEVICE_FUNC
55  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
56  EIGEN_DEVICE_FUNC
57  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
58      : bytes_loaded_(bytes_loaded),
59        bytes_stored_(bytes_stored),
60        compute_cycles_(compute_cycles) {}
61
62  EIGEN_DEVICE_FUNC
63  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
64               bool vectorized, double packet_size)
65      : bytes_loaded_(bytes_loaded),
66        bytes_stored_(bytes_stored),
67        compute_cycles_(vectorized ? compute_cycles / packet_size
68                                   : compute_cycles) {
69    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
70    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
71    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
72  }
73
74  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
75    return bytes_loaded_;
76  }
77  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
78    return bytes_stored_;
79  }
80  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
81    return compute_cycles_;
82  }
83  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
84      double load_cost, double store_cost, double compute_cost) const {
85    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
86           compute_cost * compute_cycles_;
87  }
88
89  // Drop memory access component. Intended for cases when memory accesses are
90  // sequential or are completely masked by computations.
91  EIGEN_DEVICE_FUNC void dropMemoryCost() {
92    bytes_loaded_ = 0;
93    bytes_stored_ = 0;
94  }
95
96  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
97  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
98      const TensorOpCost& rhs) const {
99    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
100    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
101    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
102    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
103  }
104
105  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
106  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
107      const TensorOpCost& rhs) const {
108    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
109    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
110    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
111    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
112  }
113
114  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
115      const TensorOpCost& rhs) {
116    bytes_loaded_ += rhs.bytes_loaded();
117    bytes_stored_ += rhs.bytes_stored();
118    compute_cycles_ += rhs.compute_cycles();
119    return *this;
120  }
121
122  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
123    bytes_loaded_ *= rhs;
124    bytes_stored_ *= rhs;
125    compute_cycles_ *= rhs;
126    return *this;
127  }
128
129  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
130      TensorOpCost lhs, const TensorOpCost& rhs) {
131    lhs += rhs;
132    return lhs;
133  }
134  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
135      TensorOpCost lhs, double rhs) {
136    lhs *= rhs;
137    return lhs;
138  }
139  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
140      double lhs, TensorOpCost rhs) {
141    rhs *= lhs;
142    return rhs;
143  }
144
145  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
146    return os << "[bytes_loaded = " << tc.bytes_loaded()
147              << ", bytes_stored = " << tc.bytes_stored()
148              << ", compute_cycles = " << tc.compute_cycles() << "]";
149  }
150
151 private:
152  double bytes_loaded_;
153  double bytes_stored_;
154  double compute_cycles_;
155};
156
157// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads
158// in [1:max_threads] instead of just switching multi-threading off for small
159// work units.
160template <typename Device>
161class TensorCostModel {
162 public:
163  // Scaling from Eigen compute cost to device cycles.
164  static const int kDeviceCyclesPerComputeCycle = 1;
165
166 // Costs in device cycles.
167  static const int kStartupCycles = 100000;
168  static const int kPerThreadCycles = 100000;
169  static const int kTaskSize = 40000;
170
171  // Returns the number of threads in [1:max_threads] to use for
172  // evaluating an expression with the given output size and cost per
173  // coefficient.
174  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
175      double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
176    double cost = totalCost(output_size, cost_per_coeff);
177    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
178    return numext::mini(max_threads, numext::maxi(1, threads));
179  }
180
181  // taskSize assesses parallel task size.
182  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
183  // granularity needs to be increased to mitigate parallelization overheads.
184  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
185      double output_size, const TensorOpCost& cost_per_coeff) {
186    return totalCost(output_size, cost_per_coeff) / kTaskSize;
187  }
188
189 private:
190  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
191      double output_size, const TensorOpCost& cost_per_coeff) {
192    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
193    // 11 is L2 cache latency on Haswell.
194    // We don't know whether data is in L1, L2 or L3. But we are most interested
195    // in single-threaded computational time around 100us-10ms (smaller time
196    // is too small for parallelization, larger time is not intersting
197    // either because we are probably using all available threads already).
198    // And for the target time range, L2 seems to be what matters. Data set
199    // fitting into L1 is too small to take noticeable time. Data set fitting
200    // only into L3 presumably will take more than 10ms to load and process.
201    const double kLoadCycles = 1.0 / 64 * 11;
202    const double kStoreCycles = 1.0 / 64 * 11;
203    // Scaling from Eigen compute cost to device cycles.
204    return output_size *
205        cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
206                                  kDeviceCyclesPerComputeCycle);
207  }
208};
209
210}  // namespace Eigen
211
212#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
213