1 #ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
2 #define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
3 
4 typedef int TensorIndex;
5 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
6 
7 #include "unsupported/Eigen/CXX11/Tensor"
8 #include "benchmark.h"
9 
10 #define BENCHMARK_RANGE(bench, lo, hi) \
11   BENCHMARK(bench)->Range(lo, hi)
12 
13 using Eigen::Tensor;
14 using Eigen::TensorMap;
15 
// BenchmarkSuite is templatized on the scalar type T as well as on the device,
// since we have users for int8 as well as floats.
18 template <typename Device, typename T> class BenchmarkSuite {
19  public:
BenchmarkSuite(const Device & device,size_t m,size_t k,size_t n)20   BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
21       : m_(m), k_(k), n_(n), device_(device) {
22     initialize();
23   }
24 
BenchmarkSuite(const Device & device,size_t m)25   BenchmarkSuite(const Device& device, size_t m)
26       : m_(m), k_(m), n_(m), device_(device) {
27     initialize();
28   }
29 
~BenchmarkSuite()30   ~BenchmarkSuite() {
31     device_.deallocate(a_);
32     device_.deallocate(b_);
33     device_.deallocate(c_);
34   }
35 
memcpy(int num_iters)36   void memcpy(int num_iters) {
37     eigen_assert(m_ == k_ && k_ == n_);
38     StartBenchmarkTiming();
39     for (int iter = 0; iter < num_iters; ++iter) {
40       device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
41     }
42     // Record the number of values copied per second
43     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
44   }
45 
typeCasting(int num_iters)46   void typeCasting(int num_iters) {
47     eigen_assert(m_ == n_);
48     Eigen::array<TensorIndex, 2> sizes;
49     if (sizeof(T) >= sizeof(int)) {
50       sizes[0] = m_;
51       sizes[1] = k_;
52     } else {
53       sizes[0] = m_ * sizeof(T) / sizeof(int);
54       sizes[1] = k_ * sizeof(T) / sizeof(int);
55     }
56     const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
57     TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
58 
59     StartBenchmarkTiming();
60     for (int iter = 0; iter < num_iters; ++iter) {
61       B.device(device_) = A.template cast<T>();
62     }
63     // Record the number of values copied per second
64     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
65   }
66 
random(int num_iters)67   void random(int num_iters) {
68     eigen_assert(m_ == k_ && k_ == n_);
69     Eigen::array<TensorIndex, 2> sizes;
70     sizes[0] = m_;
71     sizes[1] = m_;
72     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
73 
74     StartBenchmarkTiming();
75     for (int iter = 0; iter < num_iters; ++iter) {
76       C.device(device_) = C.random();
77     }
78     // Record the number of random numbers generated per second
79     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
80   }
81 
slicing(int num_iters)82   void slicing(int num_iters) {
83     eigen_assert(m_ == k_ && k_ == n_);
84     Eigen::array<TensorIndex, 2> sizes;
85     sizes[0] = m_;
86     sizes[1] = m_;
87     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
88     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
89     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
90 
91     const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
92     const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
93     const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
94     const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
95     const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
96 
97     StartBenchmarkTiming();
98     for (int iter = 0; iter < num_iters; ++iter) {
99       C.slice(first_quadrant, quarter_sizes).device(device_) =
100           A.slice(first_quadrant, quarter_sizes);
101       C.slice(second_quadrant, quarter_sizes).device(device_) =
102           B.slice(second_quadrant, quarter_sizes);
103       C.slice(third_quadrant, quarter_sizes).device(device_) =
104           A.slice(third_quadrant, quarter_sizes);
105       C.slice(fourth_quadrant, quarter_sizes).device(device_) =
106           B.slice(fourth_quadrant, quarter_sizes);
107     }
108     // Record the number of values copied from the rhs slice to the lhs slice
109     // each second
110     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
111   }
112 
rowChip(int num_iters)113   void rowChip(int num_iters) {
114     Eigen::array<TensorIndex, 2> input_size;
115     input_size[0] = k_;
116     input_size[1] = n_;
117     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
118     Eigen::array<TensorIndex, 1> output_size;
119     output_size[0] = n_;
120     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
121 
122     StartBenchmarkTiming();
123     for (int iter = 0; iter < num_iters; ++iter) {
124       C.device(device_) = B.chip(iter % k_, 0);
125     }
126     // Record the number of values copied from the rhs chip to the lhs.
127     finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
128   }
129 
colChip(int num_iters)130   void colChip(int num_iters) {
131     Eigen::array<TensorIndex, 2> input_size;
132     input_size[0] = k_;
133     input_size[1] = n_;
134     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
135     Eigen::array<TensorIndex, 1> output_size;
136     output_size[0] = n_;
137     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
138 
139     StartBenchmarkTiming();
140     for (int iter = 0; iter < num_iters; ++iter) {
141       C.device(device_) = B.chip(iter % n_, 1);
142     }
143     // Record the number of values copied from the rhs chip to the lhs.
144     finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
145   }
146 
shuffling(int num_iters)147   void shuffling(int num_iters) {
148     eigen_assert(m_ == n_);
149     Eigen::array<TensorIndex, 2> size_a;
150     size_a[0] = m_;
151     size_a[1] = k_;
152     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
153     Eigen::array<TensorIndex, 2> size_b;
154     size_b[0] = k_;
155     size_b[1] = m_;
156     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
157 
158     Eigen::array<int, 2> shuffle;
159     shuffle[0] = 1;
160     shuffle[1] = 0;
161 
162     StartBenchmarkTiming();
163     for (int iter = 0; iter < num_iters; ++iter) {
164       B.device(device_) = A.shuffle(shuffle);
165     }
166     // Record the number of values shuffled from A and copied to B each second
167     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
168   }
169 
padding(int num_iters)170  void padding(int num_iters) {
171     eigen_assert(m_ == k_);
172     Eigen::array<TensorIndex, 2> size_a;
173     size_a[0] = m_;
174     size_a[1] = k_-3;
175     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
176     Eigen::array<TensorIndex, 2> size_b;
177     size_b[0] = k_;
178     size_b[1] = m_;
179     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
180 
181 #if defined(EIGEN_HAS_INDEX_LIST)
182     Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
183                          Eigen::type2indexpair<2, 1> > paddings;
184 #else
185     Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
186     paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
187     paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
188 #endif
189 
190     StartBenchmarkTiming();
191     for (int iter = 0; iter < num_iters; ++iter) {
192       B.device(device_) = A.pad(paddings);
193     }
194     // Record the number of values copied from the padded tensor A each second
195     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
196   }
197 
striding(int num_iters)198  void striding(int num_iters) {
199     eigen_assert(m_ == k_);
200     Eigen::array<TensorIndex, 2> size_a;
201     size_a[0] = m_;
202     size_a[1] = k_;
203     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
204     Eigen::array<TensorIndex, 2> size_b;
205     size_b[0] = m_;
206     size_b[1] = k_/2;
207     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);
208 
209 #ifndef EIGEN_HAS_INDEX_LIST
210     Eigen::array<TensorIndex, 2> strides;
211     strides[0] = 1;
212     strides[1] = 2;
213 #else
214     // Take advantage of cxx11 to give the compiler information it can use to
215     // optimize the code.
216     Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
217 #endif
218 
219     StartBenchmarkTiming();
220     for (int iter = 0; iter < num_iters; ++iter) {
221       B.device(device_) = A.stride(strides);
222     }
223     // Record the number of values copied from the padded tensor A each second
224     finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
225   }
226 
broadcasting(int num_iters)227   void broadcasting(int num_iters) {
228     Eigen::array<TensorIndex, 2> size_a;
229     size_a[0] = m_;
230     size_a[1] = 1;
231     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
232     Eigen::array<TensorIndex, 2> size_c;
233     size_c[0] = m_;
234     size_c[1] = n_;
235     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);
236 
237 #ifndef EIGEN_HAS_INDEX_LIST
238     Eigen::array<int, 2> broadcast;
239     broadcast[0] = 1;
240     broadcast[1] = n_;
241 #else
242     // Take advantage of cxx11 to give the compiler information it can use to
243     // optimize the code.
244     Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
245     broadcast.set(1, n_);
246 #endif
247 
248     StartBenchmarkTiming();
249     for (int iter = 0; iter < num_iters; ++iter) {
250       C.device(device_) = A.broadcast(broadcast);
251     }
252     // Record the number of values broadcasted from A and copied to C each second
253     finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
254   }
255 
coeffWiseOp(int num_iters)256   void coeffWiseOp(int num_iters) {
257     eigen_assert(m_ == k_ && k_ == n_);
258     Eigen::array<TensorIndex, 2> sizes;
259     sizes[0] = m_;
260     sizes[1] = m_;
261     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
262     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
263     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
264 
265     StartBenchmarkTiming();
266     for (int iter = 0; iter < num_iters; ++iter) {
267       C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
268     }
269     // Record the number of FLOP executed per second (2 multiplications and
270     // 1 addition per value)
271     finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
272   }
273 
algebraicFunc(int num_iters)274   void algebraicFunc(int num_iters) {
275     eigen_assert(m_ == k_ && k_ == n_);
276     Eigen::array<TensorIndex, 2> sizes;
277     sizes[0] = m_;
278     sizes[1] = m_;
279     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
280     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
281     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
282 
283     StartBenchmarkTiming();
284     for (int iter = 0; iter < num_iters; ++iter) {
285       C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
286     }
287     // Record the number of FLOP executed per second (assuming one operation
288     // per value)
289     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
290   }
291 
transcendentalFunc(int num_iters)292   void transcendentalFunc(int num_iters) {
293     eigen_assert(m_ == k_ && k_ == n_);
294     Eigen::array<TensorIndex, 2> sizes;
295     sizes[0] = m_;
296     sizes[1] = m_;
297     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
298     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
299     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
300 
301     StartBenchmarkTiming();
302     for (int iter = 0; iter < num_iters; ++iter) {
303       C.device(device_) = A.exp() + B.log();
304     }
305     // Record the number of FLOP executed per second (assuming one operation
306     // per value)
307     finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
308   }
309 
310  // Row reduction
rowReduction(int num_iters)311   void rowReduction(int num_iters) {
312     Eigen::array<TensorIndex, 2> input_size;
313     input_size[0] = k_;
314     input_size[1] = n_;
315     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
316     Eigen::array<TensorIndex, 1> output_size;
317     output_size[0] = n_;
318     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
319 
320 #ifndef EIGEN_HAS_INDEX_LIST
321     Eigen::array<TensorIndex, 1> sum_along_dim;
322     sum_along_dim[0] = 0;
323 #else
324     // Take advantage of cxx11 to give the compiler information it can use to
325     // optimize the code.
326     Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
327 #endif
328 
329     StartBenchmarkTiming();
330     for (int iter = 0; iter < num_iters; ++iter) {
331       C.device(device_) = B.sum(sum_along_dim);
332     }
333     // Record the number of FLOP executed per second (assuming one operation
334     // per value)
335     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
336   }
337 
338   // Column reduction
colReduction(int num_iters)339   void colReduction(int num_iters) {
340     Eigen::array<TensorIndex, 2> input_size;
341     input_size[0] = k_;
342     input_size[1] = n_;
343     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
344         b_, input_size);
345     Eigen::array<TensorIndex, 1> output_size;
346     output_size[0] = k_;
347     TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
348         c_, output_size);
349 
350 #ifndef EIGEN_HAS_INDEX_LIST
351     Eigen::array<TensorIndex, 1> sum_along_dim;
352     sum_along_dim[0] = 1;
353 #else
354     // Take advantage of cxx11 to give the compiler information it can use to
355     // optimize the code.
356     Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
357 #endif
358 
359     StartBenchmarkTiming();
360     for (int iter = 0; iter < num_iters; ++iter) {
361       C.device(device_) = B.sum(sum_along_dim);
362     }
363     // Record the number of FLOP executed per second (assuming one operation
364     // per value)
365     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
366   }
367 
368   // Full reduction
fullReduction(int num_iters)369   void fullReduction(int num_iters) {
370     Eigen::array<TensorIndex, 2> input_size;
371     input_size[0] = k_;
372     input_size[1] = n_;
373     const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
374         b_, input_size);
375     Eigen::array<TensorIndex, 0> output_size;
376     TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
377         c_, output_size);
378 
379     StartBenchmarkTiming();
380     for (int iter = 0; iter < num_iters; ++iter) {
381       C.device(device_) = B.sum();
382     }
383     // Record the number of FLOP executed per second (assuming one operation
384     // per value)
385     finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
386   }
387 
388   // do a contraction which is equivalent to a matrix multiplication
contraction(int num_iters)389   void contraction(int num_iters) {
390     Eigen::array<TensorIndex, 2> sizeA;
391     sizeA[0] = m_;
392     sizeA[1] = k_;
393     Eigen::array<TensorIndex, 2> sizeB;
394     sizeB[0] = k_;
395     sizeB[1] = n_;
396     Eigen::array<TensorIndex, 2> sizeC;
397     sizeC[0] = m_;
398     sizeC[1] = n_;
399 
400     const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
401     const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
402     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);
403 
404     typedef typename Tensor<T, 2>::DimensionPair DimPair;
405     Eigen::array<DimPair, 1> dims;
406     dims[0] = DimPair(1, 0);
407 
408     StartBenchmarkTiming();
409     for (int iter = 0; iter < num_iters; ++iter) {
410       C.device(device_) = A.contract(B, dims);
411     }
412     // Record the number of FLOP executed per second (size_ multiplications and
413     // additions for each value in the resulting tensor)
414     finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
415   }
416 
convolution(int num_iters,int kernel_x,int kernel_y)417   void convolution(int num_iters, int kernel_x, int kernel_y) {
418     Eigen::array<TensorIndex, 2> input_sizes;
419     input_sizes[0] = m_;
420     input_sizes[1] = n_;
421     TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
422     Eigen::array<TensorIndex, 2> kernel_sizes;
423     kernel_sizes[0] = kernel_x;
424     kernel_sizes[1] = kernel_y;
425     TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
426     Eigen::array<TensorIndex, 2> result_sizes;
427     result_sizes[0] = m_ - kernel_x + 1;
428     result_sizes[1] = n_ - kernel_y + 1;
429     TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
430     Eigen::array<TensorIndex, 2> dims;
431     dims[0] = 0;
432     dims[1] = 1;
433 
434     StartBenchmarkTiming();
435     for (int iter = 0; iter < num_iters; ++iter) {
436       C.device(device_) = A.convolve(B, dims);
437     }
438     // Record the number of FLOP executed per second (kernel_size
439     // multiplications and additions for each value in the resulting tensor)
440     finalizeBenchmark(static_cast<int64_t>(2) *
441         (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters);
442   }
443 
444  private:
initialize()445   void initialize() {
446     a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
447     b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
448     c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));
449 
450     // Initialize the content of the memory pools to prevent asan from
451     // complaining.
452     device_.memset(a_, 12, m_ * k_ * sizeof(T));
453     device_.memset(b_, 23, k_ * n_ * sizeof(T));
454     device_.memset(c_, 31, m_ * n_ * sizeof(T));
455 
456     //BenchmarkUseRealTime();
457   }
458 
finalizeBenchmark(int64_t num_items)459   inline void finalizeBenchmark(int64_t num_items) {
460 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
461     if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
462       device_.synchronize();
463     }
464 #endif
465     StopBenchmarkTiming();
466     SetBenchmarkFlopsProcessed(num_items);
467   }
468 
469 
470   TensorIndex m_;
471   TensorIndex k_;
472   TensorIndex n_;
473   T* a_;
474   T* b_;
475   T* c_;
476   Device device_;
477 };
478 #endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
479