// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

namespace Eigen {

/** \class TensorOpCost
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for evaluating
  * tensor expressions.
  *
  */

// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
class TensorOpCost {
 public:
  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
  // model based on minimal reciprocal throughput numbers from Intel or
  // Agner Fog's tables would be better than what is there now.
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
    return internal::functor_traits<
        internal::scalar_product_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
    return internal::functor_traits<
        internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
  }
  template <typename ArgType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
  }
  template <typename SrcType, typename TargetType>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
    return internal::functor_traits<
        internal::scalar_cast_op<SrcType, TargetType> >::Cost;
  }

  EIGEN_DEVICE_FUNC
  TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(compute_cycles) {}

  EIGEN_DEVICE_FUNC
  TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
               bool vectorized, double packet_size)
      : bytes_loaded_(bytes_loaded),
        bytes_stored_(bytes_stored),
        compute_cycles_(vectorized ? compute_cycles / packet_size
                                   : compute_cycles) {
    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
    return bytes_loaded_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
    return bytes_stored_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
    return compute_cycles_;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
      double load_cost, double store_cost, double compute_cost) const {
    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
           compute_cost * compute_cycles_;
  }

  // Drop memory access component. Intended for cases when memory accesses are
  // sequential or are completely masked by computations.
  EIGEN_DEVICE_FUNC void dropMemoryCost() {
    bytes_loaded_ = 0;
    bytes_stored_ = 0;
  }

  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
      const TensorOpCost& rhs) const {
    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
      const TensorOpCost& rhs) {
    bytes_loaded_ += rhs.bytes_loaded();
    bytes_stored_ += rhs.bytes_stored();
    compute_cycles_ += rhs.compute_cycles();
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
    bytes_loaded_ *= rhs;
    bytes_stored_ *= rhs;
    compute_cycles_ *= rhs;
    return *this;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
      TensorOpCost lhs, const TensorOpCost& rhs) {
    lhs += rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      TensorOpCost lhs, double rhs) {
    lhs *= rhs;
    return lhs;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
      double lhs, TensorOpCost rhs) {
    rhs *= lhs;
    return rhs;
  }

  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
    return os << "[bytes_loaded = " << tc.bytes_loaded()
              << ", bytes_stored = " << tc.bytes_stored()
              << ", compute_cycles = " << tc.compute_cycles() << "]";
  }

 private:
  double bytes_loaded_;
  double bytes_stored_;
  double compute_cycles_;
};
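
// Illustrative usage sketch (not part of the library API; the byte counts and
// the 4-wide packet size below are assumptions chosen only for the example).
// The per-coefficient cost of a vectorized float addition that reads two
// operands and writes one result could be described as:
//
//   TensorOpCost cost(/*bytes_loaded=*/2 * sizeof(float),
//                     /*bytes_stored=*/sizeof(float),
//                     /*compute_cycles=*/TensorOpCost::AddCost<float>(),
//                     /*vectorized=*/true, /*packet_size=*/4);
//
// Costs of sub-expressions compose with operator+ and scale with operator*.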

// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of
// threads in [1:max_threads] instead of just switching multi-threading off
// for small work units.
template <typename Device>
class TensorCostModel {
 public:
  // Scaling from Eigen compute cost to device cycles.
  static const int kDeviceCyclesPerComputeCycle = 1;

  // Costs in device cycles.
  static const int kStartupCycles = 100000;
  static const int kPerThreadCycles = 100000;
  static const int kTaskSize = 40000;

  // Returns the number of threads in [1:max_threads] to use for
  // evaluating an expression with the given output size and cost per
  // coefficient.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
      double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
    double cost = totalCost(output_size, cost_per_coeff);
    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
    return numext::mini(max_threads, numext::maxi(1, threads));
  }
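
  // Worked example (illustrative arithmetic only): with kStartupCycles ==
  // kPerThreadCycles == 100000, a total cost of 1e6 device cycles gives
  // (1e6 - 1e5) / 1e5 + 0.9 = 9.9, which truncates to 9 threads before
  // clamping to [1:max_threads].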

  // taskSize assesses parallel task size.
  // A value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
  // granularity needs to be increased to mitigate parallelization overheads.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
      double output_size, const TensorOpCost& cost_per_coeff) {
    return totalCost(output_size, cost_per_coeff) / kTaskSize;
  }
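
  // Worked example (illustrative): a total cost of 20000 device cycles yields
  // a taskSize of 20000 / 40000 = 0.5, i.e. roughly two such tasks should be
  // combined into one parallel work unit.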

 private:
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
      double output_size, const TensorOpCost& cost_per_coeff) {
    // Cost of memory fetches from L2 cache. 64 is a typical cache line size
    // in bytes; 11 is the L2 cache latency on Haswell.
    // We don't know whether data is in L1, L2 or L3. But we are most interested
    // in single-threaded computational time around 100us-10ms (smaller times
    // are too small for parallelization, larger times are not interesting
    // either because we are probably using all available threads already).
    // For that target time range, L2 seems to be what matters: a data set
    // that fits in L1 is too small to take noticeable time, and a data set
    // that fits only in L3 will presumably take more than 10ms to load and
    // process.
    const double kLoadCycles = 1.0 / 64 * 11;
    const double kStoreCycles = 1.0 / 64 * 11;
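    // For example (a rough approximation; the actual latency depends on which
    // cache level holds the data), loading one 4-byte float is charged about
    // 11.0 / 64 * 4 ~= 0.7 device cycles.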
    // Scaling from Eigen compute cost to device cycles.
    return output_size *
        cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
                                  kDeviceCyclesPerComputeCycle);
  }
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H