1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
17 
18 #include "absl/strings/str_cat.h"
19 #include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
20 #include "tensorflow/core/platform/test.h"
21 #include "tensorflow/core/platform/test_benchmark.h"
22 
23 namespace Eigen {
24 
// Asserts |a - b| is within a relative tolerance of 1e-3, scaled by the
// smaller magnitude of the two operands. Arguments are parenthesized in the
// subtraction so expression arguments (e.g. `x + y`) expand correctly.
#define EigenApprox(a, b)                        \
  {                                              \
    ASSERT_TRUE(std::abs((a) - (b)) <=           \
                std::min(std::abs(a), std::abs(b)) * 1e-3); \
  }
// Integer ceiling division: smallest integer >= a/b for positive a, b.
static int ceil_div(int a, int b) { return (a + b - 1) / b; }
28 
// Checks a col-major spatial convolution with default (SAME-like) padding:
// the output keeps the input's spatial size, and out-of-bounds taps of the
// kernel contribute zero.
TEST(EigenSpatialConvolutionsTest, Simple) {
  const int input_depth = 7;
  const int input_rows = 4;
  const int input_cols = 5;
  const int output_depth = 10;
  const int patch_rows = 3;
  const int patch_cols = 4;
  const int output_rows = input_rows;
  const int output_cols = input_cols;

  Tensor<float, 3> input(input_depth, input_rows, input_cols);
  Tensor<float, 4> kernel(output_depth, input_depth, patch_rows, patch_cols);
  Tensor<float, 3> result(output_depth, output_rows, output_cols);

  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  result = SpatialConvolution(input, kernel);

  EXPECT_EQ(result.dimension(0), output_depth);
  EXPECT_EQ(result.dimension(1), output_rows);
  EXPECT_EQ(result.dimension(2), output_cols);

  // Compare every output coefficient against a naive reference convolution.
  for (int od = 0; od < output_depth; ++od) {
    for (int out_r = 0; out_r < output_rows; ++out_r) {
      for (int out_c = 0; out_c < output_cols; ++out_c) {
        float expected = 0.0f;
        for (int kc = 0; kc < patch_cols; ++kc) {
          for (int kr = 0; kr < patch_rows; ++kr) {
            // Patch anchor is offset by -1 in each spatial dimension.
            const int in_r = kr - 1 + out_r;
            const int in_c = kc - 1 + out_c;
            const bool inside = in_r >= 0 && in_c >= 0 &&
                                in_r < output_rows && in_c < output_cols;
            if (!inside) continue;  // zero-padded region contributes nothing
            for (int id = 0; id < input_depth; ++id) {
              expected += input(id, in_r, in_c) * kernel(od, id, kr, kc);
            }
          }
        }
        EigenApprox(result(od, out_r, out_c), expected);
      }
    }
  }
}
73 
// Same check as Simple, but with row-major tensors: all dimension orders are
// reversed (input is cols x rows x depth; kernel is cols x rows x in x out).
TEST(EigenSpatialConvolutionsTest, SimpleRowMajor) {
  const int input_depth = 7;
  const int input_rows = 4;
  const int input_cols = 5;
  const int output_depth = 10;
  const int patch_rows = 3;
  const int patch_cols = 4;
  const int output_rows = input_rows;
  const int output_cols = input_cols;

  Tensor<float, 3, RowMajor> input(input_cols, input_rows, input_depth);
  Tensor<float, 4, RowMajor> kernel(patch_cols, patch_rows, input_depth,
                                    output_depth);
  Tensor<float, 3, RowMajor> result(output_cols, output_rows, output_depth);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  result = SpatialConvolution(input, kernel);

  EXPECT_EQ(result.dimension(0), output_cols);
  EXPECT_EQ(result.dimension(1), output_rows);
  EXPECT_EQ(result.dimension(2), output_depth);

  // Reference convolution with implicit zero padding: out-of-range taps
  // (offset -1 anchor) are skipped by the bounds check below.
  for (int od = 0; od < output_depth; ++od) {
    for (int i = 0; i < output_rows; ++i) {
      for (int j = 0; j < output_cols; ++j) {
        float expected = 0.0f;
        for (int c = 0; c < patch_cols; ++c) {
          for (int r = 0; r < patch_rows; ++r) {
            for (int id = 0; id < input_depth; ++id) {
              if (r - 1 + i >= 0 && c - 1 + j >= 0 && r - 1 + i < output_rows &&
                  c - 1 + j < output_cols) {
                expected +=
                    input(c - 1 + j, r - 1 + i, id) * kernel(c, r, id, od);
              }
            }
          }
        }
        EigenApprox(result(j, i, od), expected);
      }
    }
  }
}
118 
// Batched version of the SAME-padded col-major check: 13 independent 5x5
// images with 10 channels, convolved with a 3x3 kernel into 7 output
// channels. Each batch entry must be convolved independently.
TEST(EigenSpatialConvolutionsTest, BatchedSpatialConvolution) {
  Tensor<float, 4> input(10, 5, 5, 13);
  Tensor<float, 4> kernel(7, 10, 3, 3);
  Tensor<float, 4> result(7, 5, 5, 13);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  result = SpatialConvolution(input, kernel);

  EXPECT_EQ(result.dimension(0), 7);
  EXPECT_EQ(result.dimension(1), 5);
  EXPECT_EQ(result.dimension(2), 5);
  // Also verify the batch dimension is preserved (previously unchecked).
  EXPECT_EQ(result.dimension(3), 13);

  for (int b = 0; b < 13; ++b) {
    for (int od = 0; od < 7; ++od) {
      for (int i = 0; i < 5; ++i) {
        for (int j = 0; j < 5; ++j) {
          float expected = 0.0f;
          for (int c = 0; c < 3; ++c) {
            for (int r = 0; r < 3; ++r) {
              for (int id = 0; id < 10; ++id) {
                // Taps falling outside the zero-padded input are skipped.
                if (r - 1 + i >= 0 && c - 1 + j >= 0 && r - 1 + i < 5 &&
                    c - 1 + j < 5) {
                  expected +=
                      input(id, r - 1 + i, c - 1 + j, b) * kernel(od, id, r, c);
                }
              }
            }
          }
          EigenApprox(result(od, i, j, b), expected);
        }
      }
    }
  }
}
155 
// Row-major variant of BatchedSpatialConvolution: dimension order is fully
// reversed (batch is the leading dimension).
TEST(EigenSpatialConvolutionsTest, BatchedSpatialConvolutionRowMajor) {
  Tensor<float, 4, RowMajor> input(13, 5, 5, 10);
  Tensor<float, 4, RowMajor> kernel(3, 3, 10, 7);
  Tensor<float, 4, RowMajor> result(13, 5, 5, 7);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  result = SpatialConvolution(input, kernel);

  // Verify the batch dimension too (previously unchecked).
  EXPECT_EQ(result.dimension(0), 13);
  EXPECT_EQ(result.dimension(1), 5);
  EXPECT_EQ(result.dimension(2), 5);
  EXPECT_EQ(result.dimension(3), 7);

  for (int b = 0; b < 13; ++b) {
    for (int od = 0; od < 7; ++od) {
      for (int i = 0; i < 5; ++i) {
        for (int j = 0; j < 5; ++j) {
          float expected = 0.0f;
          for (int c = 0; c < 3; ++c) {
            for (int r = 0; r < 3; ++r) {
              for (int id = 0; id < 10; ++id) {
                // Taps falling outside the zero-padded input are skipped.
                if (r - 1 + i >= 0 && c - 1 + j >= 0 && r - 1 + i < 5 &&
                    c - 1 + j < 5) {
                  expected +=
                      input(b, c - 1 + j, r - 1 + i, id) * kernel(c, r, id, od);
                }
              }
            }
          }
          EigenApprox(result(b, j, i, od), expected);
        }
      }
    }
  }
}
192 
// VALID padding: no zero padding is applied, so the output shrinks to
// input - patch + 1 (here 5 - 4 + 1 = 2) along each spatial dimension.
TEST(EigenSpatialConvolutionsTest, ValidSpatialConvolution) {
  const int input_depth = 10;
  const int input_rows = 5;
  const int input_cols = 5;
  const int num_batches = 13;
  const int output_depth = 7;
  const int patch_rows = 4;
  const int patch_cols = 4;
  const int output_rows = input_rows - patch_rows + 1;
  const int output_cols = input_cols - patch_cols + 1;

  Tensor<float, 4> input(input_depth, input_rows, input_cols, num_batches);
  Tensor<float, 4> kernel(output_depth, input_depth, patch_rows, patch_cols);
  Tensor<float, 4> result(output_depth, output_rows, output_cols, num_batches);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 4x4 kernel, valid padding, and a stride
  // of 1.
  const int stride = 1;
  result = SpatialConvolution(input, kernel, stride, stride, PADDING_VALID);

  EXPECT_EQ(result.dimension(0), output_depth);
  EXPECT_EQ(result.dimension(1), output_rows);
  EXPECT_EQ(result.dimension(2), output_cols);
  EXPECT_EQ(result.dimension(3), num_batches);

  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int i = 0; i < output_rows; ++i) {
        for (int j = 0; j < output_cols; ++j) {
          float expected = 0.0f;
          // With VALID padding every patch lies fully inside the input, so
          // the reference computation needs no bounds checks.
          for (int c = 0; c < patch_cols; ++c) {
            for (int r = 0; r < patch_rows; ++r) {
              for (int id = 0; id < input_depth; ++id) {
                expected += input(id, r + i, c + j, b) * kernel(od, id, r, c);
              }
            }
          }
          // Diagnostic print before the assertion so a mismatching index is
          // easy to locate in the test log.
          if (result(od, i, j, b) != expected) {
            std::cout << "at od=" << od << " b=" << b << " i=" << i
                      << " j=" << j << " " << result(od, i, j, b) << " vs "
                      << expected << std::endl;
          }
          EigenApprox(result(od, i, j, b), expected);
        }
      }
    }
  }
}
244 
// VALID convolution with different row/column strides: 5x5 input, 4x4
// kernel, row stride 1 and column stride 2 give a 2x1 output.
TEST(EigenSpatialConvolutionsTest, ValidSpatialConvolutionUnequalStrides) {
  const int input_depth = 10;
  const int input_rows = 5;
  const int input_cols = 5;
  const int num_batches = 13;
  const int output_depth = 7;
  const int patch_rows = 4;
  const int patch_cols = 4;

  const int row_stride = 1;
  const int col_stride = 2;
  const int output_rows = 2;
  const int output_cols = 1;

  Tensor<float, 4> input(input_depth, input_rows, input_cols, num_batches);
  Tensor<float, 4> kernel(output_depth, input_depth, patch_rows, patch_cols);
  Tensor<float, 4> result(output_depth, output_rows, output_cols, num_batches);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 4x4 kernel, valid padding, a row
  // stride of 1, and a column stride of 2.
  result =
      SpatialConvolution(input, kernel, row_stride, col_stride, PADDING_VALID);

  EXPECT_EQ(result.dimension(0), output_depth);
  EXPECT_EQ(result.dimension(1), output_rows);
  EXPECT_EQ(result.dimension(2), output_cols);
  EXPECT_EQ(result.dimension(3), num_batches);
  // NOTE(review): this early return disables the element-wise verification
  // below, so only the output dimensions are actually checked. Presumably
  // the value check was failing or too slow when this was written -- TODO:
  // confirm whether the loop can be re-enabled.
  if (true) return;

  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int i = 0; i < output_rows; ++i) {
        for (int j = 0; j < output_cols; ++j) {
          float expected = 0.0f;
          // Reference: patch anchored at (row_stride*i, col_stride*j).
          for (int c = 0; c < patch_cols; ++c) {
            for (int r = 0; r < patch_rows; ++r) {
              for (int id = 0; id < input_depth; ++id) {
                expected +=
                    input(id, r + row_stride * i, c + col_stride * j, b) *
                    kernel(od, id, r, c);
              }
            }
          }
          if (result(od, i, j, b) != expected) {
            std::cout << "at od=" << od << " b=" << b << " i=" << i
                      << " j=" << j << " " << result(od, i, j, b) << " vs "
                      << expected << std::endl;
          }
          EigenApprox(result(od, i, j, b), expected);
        }
      }
    }
  }
}
302 
// Row-major variant of ValidSpatialConvolution. Note that in the row-major
// layout the kernel is (patch_cols, patch_rows, in_depth, out_depth), so the
// first kernel index (`c` below) spans columns and the second (`r`) rows.
TEST(EigenSpatialConvolutionsTest, ValidSpatialConvolutionRowMajor) {
  const int input_depth = 10;
  const int input_rows = 5;
  const int input_cols = 5;
  const int num_batches = 13;
  const int output_depth = 7;
  const int patch_rows = 4;
  const int patch_cols = 4;
  const int output_rows = input_rows - patch_rows + 1;
  const int output_cols = input_cols - patch_cols + 1;

  Tensor<float, 4, RowMajor> input(num_batches, input_cols, input_rows,
                                   input_depth);
  Tensor<float, 4, RowMajor> kernel(patch_cols, patch_rows, input_depth,
                                    output_depth);
  Tensor<float, 4, RowMajor> result(num_batches, output_cols, output_rows,
                                    output_depth);

  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 4x4 kernel, valid padding, and a stride
  // of 1.
  const int stride = 1;
  result = SpatialConvolution(input, kernel, stride, stride, PADDING_VALID);

  EXPECT_EQ(result.dimension(0), num_batches);
  EXPECT_EQ(result.dimension(1), output_cols);
  EXPECT_EQ(result.dimension(2), output_rows);
  EXPECT_EQ(result.dimension(3), output_depth);

  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int i = 0; i < output_rows; ++i) {
        for (int j = 0; j < output_cols; ++j) {
          float expected = 0.0f;
          // Fixed loop bounds: `c` indexes the column dimension of both
          // input and kernel, so it must span patch_cols (and `r`
          // patch_rows). The old bounds were swapped and only worked
          // because patch_rows == patch_cols here.
          for (int c = 0; c < patch_cols; ++c) {
            for (int r = 0; r < patch_rows; ++r) {
              for (int id = 0; id < input_depth; ++id) {
                expected += input(b, c + j, r + i, id) * kernel(c, r, id, od);
              }
            }
          }
          // Diagnostic print before the assertion to pinpoint mismatches.
          if (result(b, j, i, od) != expected) {
            std::cout << "at od=" << od << " b=" << b << " i=" << i
                      << " j=" << j << " " << result(b, j, i, od) << " vs "
                      << expected << std::endl;
          }
          EigenApprox(result(b, j, i, od), expected);
        }
      }
    }
  }
}
358 
// Strided VALID convolution: a 3x3 kernel over a 5x5 input with stride 2
// yields a 2x2 output (patch anchors at rows/cols 0 and 2).
TEST(EigenSpatialConvolutionsTest, StridedSpatialConvolution) {
  const int input_depth = 10;
  const int input_rows = 5;
  const int input_cols = 5;
  const int num_batches = 13;
  const int output_depth = 7;
  const int patch_rows = 3;
  const int patch_cols = 3;
  const int output_rows = 2;
  const int output_cols = 2;

  Tensor<float, 4> input(input_depth, input_rows, input_cols, num_batches);
  Tensor<float, 4> kernel(output_depth, input_depth, patch_rows, patch_cols);
  Tensor<float, 4> result(output_depth, output_rows, output_cols, num_batches);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 3x3 kernel, valid padding, and a stride
  // of 2.
  int stride = 2;
  result = SpatialConvolution(input, kernel, stride, stride, PADDING_VALID);

  EXPECT_EQ(result.dimension(0), output_depth);
  EXPECT_EQ(result.dimension(1), output_rows);
  EXPECT_EQ(result.dimension(2), output_cols);
  EXPECT_EQ(result.dimension(3), num_batches);

  // Reference: anchor each patch at (stride * row, stride * col) and
  // accumulate over the full patch — VALID padding needs no bounds checks.
  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int out_r = 0; out_r < output_rows; ++out_r) {
        const int base_r = stride * out_r;
        for (int out_c = 0; out_c < output_cols; ++out_c) {
          const int base_c = stride * out_c;
          float expected = 0.0f;
          for (int kc = 0; kc < patch_cols; ++kc) {
            for (int kr = 0; kr < patch_rows; ++kr) {
              for (int id = 0; id < input_depth; ++id) {
                expected += input(id, base_r + kr, base_c + kc, b) *
                            kernel(od, id, kr, kc);
              }
            }
          }
          EigenApprox(result(od, out_r, out_c, b), expected);
        }
      }
    }
  }
}
406 
// Exercises the case where the kernel (1x1) is smaller than the stride (2):
// the convolution subsamples the input, reading positions 0 and 2 of the
// 3x3 spatial grid.
TEST(EigenSpatialConvolutionsTest, KernelSmallerThanStride) {
  const int input_depth = 2;
  const int input_rows = 3;
  const int input_cols = 3;
  const int num_batches = 5;
  const int output_depth = 6;
  const int patch_rows = 1;
  const int patch_cols = 1;
  const int output_rows = 2;
  const int output_cols = 2;

  Tensor<float, 4> input(input_depth, input_rows, input_cols, num_batches);
  Tensor<float, 4> kernel(output_depth, input_depth, patch_rows, patch_cols);
  Tensor<float, 4> result(output_depth, output_rows, output_cols, num_batches);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 1x1 kernel, valid padding, and a stride
  // of 2.
  int stride = 2;
  result = SpatialConvolution(input, kernel, stride, stride, PADDING_VALID);

  EXPECT_EQ(result.dimension(0), output_depth);
  EXPECT_EQ(result.dimension(1), output_rows);
  EXPECT_EQ(result.dimension(2), output_cols);
  EXPECT_EQ(result.dimension(3), num_batches);

  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int i = 0; i < output_rows; ++i) {
        for (int j = 0; j < output_cols; ++j) {
          float expected = 0.0f;
          // Reference: 1x1 patch anchored at (stride*i, stride*j).
          for (int c = 0; c < patch_cols; ++c) {
            for (int r = 0; r < patch_rows; ++r) {
              for (int id = 0; id < input_depth; ++id) {
                expected += input(id, r + stride * i, c + stride * j, b) *
                            kernel(od, id, r, c);
              }
            }
          }
          EigenApprox(result(od, i, j, b), expected);
        }
      }
    }
  }
}
454 
// Row-major variant of StridedSpatialConvolution: all dimension orders are
// reversed (batch leading, depth trailing).
TEST(EigenSpatialConvolutionsTest, StridedSpatialConvolutionRowMajor) {
  const int input_depth = 10;
  const int input_rows = 5;
  const int input_cols = 5;
  const int num_batches = 13;
  const int output_depth = 7;
  const int patch_rows = 3;
  const int patch_cols = 3;
  const int output_rows = 2;
  const int output_cols = 2;

  Tensor<float, 4, RowMajor> input(num_batches, input_cols, input_rows,
                                   input_depth);
  Tensor<float, 4, RowMajor> kernel(patch_cols, patch_rows, input_depth,
                                    output_depth);
  Tensor<float, 4, RowMajor> result(num_batches, output_cols, output_rows,
                                    output_depth);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 3x3 kernel, valid padding, and a stride
  // of 2.
  int stride = 2;
  result = SpatialConvolution(input, kernel, stride, stride, PADDING_VALID);

  EXPECT_EQ(result.dimension(0), num_batches);
  EXPECT_EQ(result.dimension(1), output_cols);
  EXPECT_EQ(result.dimension(2), output_rows);
  EXPECT_EQ(result.dimension(3), output_depth);

  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int i = 0; i < output_rows; ++i) {
        for (int j = 0; j < output_cols; ++j) {
          float expected = 0.0f;
          // Reference: patch anchored at (stride*j, stride*i) in the
          // (col, row) order used by the row-major layout.
          for (int c = 0; c < patch_cols; ++c) {
            for (int r = 0; r < patch_rows; ++r) {
              for (int id = 0; id < input_depth; ++id) {
                expected += input(b, c + stride * j, r + stride * i, id) *
                            kernel(c, r, id, od);
              }
            }
          }
          EigenApprox(result(b, j, i, od), expected);
        }
      }
    }
  }
}
505 
// Atrous (dilated) convolution: with an input stride of 2, the 3x3 kernel
// taps are spread 2 apart, giving an effective extent of 5, so a 7x7 input
// produces a 3x3 output under VALID padding.
TEST(EigenSpatialConvolutionsTest, AtrousSpatial) {
  const int input_depth = 10;
  const int input_rows = 7;
  const int input_cols = 7;
  const int num_batches = 13;
  const int output_depth = 7;
  const int patch_rows = 3;
  const int patch_cols = 3;
  const int output_rows = 3;
  const int output_cols = 3;

  Tensor<float, 4> input(input_depth, input_rows, input_cols, num_batches);
  Tensor<float, 4> kernel(output_depth, input_depth, patch_rows, patch_cols);
  Tensor<float, 4> result(output_depth, output_rows, output_cols, num_batches);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 3x3 kernel, valid padding
  // output (standard) stride 1, and input (atrous) stride of 2.
  int stride = 1;
  int in_stride = 2;
  result = SpatialConvolution(input, kernel, stride, stride, PADDING_VALID,
                              in_stride, in_stride);

  EXPECT_EQ(result.dimension(0), output_depth);
  EXPECT_EQ(result.dimension(1), output_rows);
  EXPECT_EQ(result.dimension(2), output_cols);
  EXPECT_EQ(result.dimension(3), num_batches);

  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int i = 0; i < output_rows; ++i) {
        for (int j = 0; j < output_cols; ++j) {
          float expected = 0.0f;
          // Reference: kernel tap (r, c) reads the input at the dilated
          // offset (in_stride * r, in_stride * c) from the patch anchor.
          for (int c = 0; c < patch_cols; ++c) {
            for (int r = 0; r < patch_rows; ++r) {
              for (int id = 0; id < input_depth; ++id) {
                expected += input(id, in_stride * r + stride * i,
                                  in_stride * c + stride * j, b) *
                            kernel(od, id, r, c);
              }
            }
          }
          EigenApprox(result(od, i, j, b), expected);
        }
      }
    }
  }
}
556 
// Row-major variant of AtrousSpatial: same dilated convolution with all
// tensor dimension orders reversed.
TEST(EigenSpatialConvolutionsTest, AtrousSpatialRowMajor) {
  const int input_depth = 10;
  const int input_rows = 7;
  const int input_cols = 7;
  const int num_batches = 13;
  const int output_depth = 7;
  const int patch_rows = 3;
  const int patch_cols = 3;
  const int output_rows = 3;
  const int output_cols = 3;

  Tensor<float, 4, RowMajor> input(num_batches, input_cols, input_rows,
                                   input_depth);
  Tensor<float, 4, RowMajor> kernel(patch_cols, patch_rows, input_depth,
                                    output_depth);
  Tensor<float, 4, RowMajor> result(num_batches, output_cols, output_rows,
                                    output_depth);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 3x3 kernel, valid padding
  // output (standard) stride 1, and input (atrous) stride of 2.
  int stride = 1;
  int in_stride = 2;
  result = SpatialConvolution(input, kernel, stride, stride, PADDING_VALID,
                              in_stride, in_stride);

  EXPECT_EQ(result.dimension(0), num_batches);
  EXPECT_EQ(result.dimension(1), output_cols);
  EXPECT_EQ(result.dimension(2), output_rows);
  EXPECT_EQ(result.dimension(3), output_depth);

  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int i = 0; i < output_rows; ++i) {
        for (int j = 0; j < output_cols; ++j) {
          float expected = 0.0f;
          // Reference: dilated tap offsets in (col, row) order to match the
          // row-major input layout.
          for (int c = 0; c < patch_cols; ++c) {
            for (int r = 0; r < patch_rows; ++r) {
              for (int id = 0; id < input_depth; ++id) {
                expected += input(b, in_stride * c + stride * j,
                                  in_stride * r + stride * i, id) *
                            kernel(c, r, id, od);
              }
            }
          }
          EigenApprox(result(b, j, i, od), expected);
        }
      }
    }
  }
}
610 
// Dilated row-major convolution with unequal strides in every direction:
// row/col output strides of 1/2 combined with row/col input (atrous)
// strides of 3/1 give a 1x3 output from the 7x7 input.
TEST(EigenSpatialConvolutionsTest, AtrousSpatialRowMajorUnequalStrides) {
  const int input_depth = 10;
  const int input_rows = 7;
  const int input_cols = 7;
  const int num_batches = 13;
  const int output_depth = 7;
  const int patch_rows = 3;
  const int patch_cols = 3;
  const int output_rows = 1;
  const int output_cols = 3;

  Tensor<float, 4, RowMajor> input(num_batches, input_cols, input_rows,
                                   input_depth);
  Tensor<float, 4, RowMajor> kernel(patch_cols, patch_rows, input_depth,
                                    output_depth);
  Tensor<float, 4, RowMajor> result(num_batches, output_cols, output_rows,
                                    output_depth);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Apply a spatial convolution using a 3x3 kernel, valid padding, output
  // (standard) row/col strides of 1/2, and input (atrous) row/col strides
  // of 3/1.
  int row_stride = 1;
  int col_stride = 2;
  int row_in_stride = 3;
  int col_in_stride = 1;
  result = SpatialConvolution(input, kernel, row_stride, col_stride,
                              PADDING_VALID, row_in_stride, col_in_stride);

  EXPECT_EQ(result.dimension(0), num_batches);
  EXPECT_EQ(result.dimension(1), output_cols);
  EXPECT_EQ(result.dimension(2), output_rows);
  EXPECT_EQ(result.dimension(3), output_depth);

  for (int b = 0; b < num_batches; ++b) {
    for (int od = 0; od < output_depth; ++od) {
      for (int i = 0; i < output_rows; ++i) {
        for (int j = 0; j < output_cols; ++j) {
          float expected = 0.0f;
          // Reference: each kernel tap is dilated by its per-axis input
          // stride and the patch anchor moves by the per-axis output stride.
          for (int c = 0; c < patch_cols; ++c) {
            for (int r = 0; r < patch_rows; ++r) {
              for (int id = 0; id < input_depth; ++id) {
                expected += input(b, col_in_stride * c + col_stride * j,
                                  row_in_stride * r + row_stride * i, id) *
                            kernel(c, r, id, od);
              }
            }
          }
          EigenApprox(result(b, j, i, od), expected);
        }
      }
    }
  }
}
666 
// 3D (cuboid) convolution with default padding: the output keeps the input's
// spatial size and out-of-bounds taps contribute zero. The kernel is
// centered via the off_p/off_r/off_c offsets computed below.
TEST(EigenSpatialConvolutionsTest, Cuboid) {
  const int in_channels = 10;
  const int in_depth = 5;
  const int in_rows = 8;
  const int in_cols = 7;

  const int kern_filters = 7;
  const int kern_depth = 3;
  const int kern_width = 4;
  const int kern_height = 4;

  const int out_depth = in_depth;
  const int out_height = in_rows;
  const int out_width = in_cols;

  Tensor<float, 4> input(in_channels, in_depth, in_rows, in_cols);
  Tensor<float, 5> kernel(kern_filters, in_channels, kern_depth, kern_height,
                          kern_width);
  Tensor<float, 4> result(kern_filters, out_depth, out_height, out_width);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  result = CuboidConvolution(input, kernel);

  EXPECT_EQ(result.dimension(0), kern_filters);
  EXPECT_EQ(result.dimension(1), out_depth);
  EXPECT_EQ(result.dimension(2), out_height);
  EXPECT_EQ(result.dimension(3), out_width);

  // Centering offsets: half the kernel extent (rounded down) per axis.
  const int off_p = (kern_depth - 1) / 2;
  const int off_r = (kern_height - 1) / 2;
  const int off_c = (kern_width - 1) / 2;

  for (int od = 0; od < kern_filters; ++od) {
    for (int i = 0; i < out_depth; ++i) {
      for (int j = 0; j < out_height; ++j) {
        for (int k = 0; k < out_width; ++k) {
          float expected = 0.0f;
          for (int c = 0; c < kern_width; ++c) {
            for (int r = 0; r < kern_height; ++r) {
              for (int p = 0; p < kern_depth; ++p) {
                for (int id = 0; id < in_channels; ++id) {
                  // Skip taps that fall outside the zero-padded input.
                  if (p - off_p + i >= 0 && r - off_r + j >= 0 &&
                      c - off_c + k >= 0 && p - off_p + i < in_depth &&
                      r - off_r + j < in_rows && c - off_c + k < in_cols) {
                    expected +=
                        input(id, p - off_p + i, r - off_r + j, c - off_c + k) *
                        kernel(od, id, p, r, c);
                  }
                }
              }
            }
          }
          EigenApprox(result(od, i, j, k), expected);
        }
      }
    }
  }
}
727 
// Row-major variant of Cuboid: the same centered 3D convolution with all
// tensor dimension orders reversed (width leading, channels/filters
// trailing).
TEST(EigenSpatialConvolutionsTest, CuboidRowMajor) {
  const int in_channels = 10;
  const int in_depth = 5;
  const int in_rows = 8;
  const int in_cols = 7;

  const int kern_filters = 7;
  const int kern_depth = 3;
  const int kern_width = 4;
  const int kern_height = 4;

  const int out_depth = in_depth;
  const int out_height = in_rows;
  const int out_width = in_cols;

  Tensor<float, 4, RowMajor> input(in_cols, in_rows, in_depth, in_channels);
  Tensor<float, 5, RowMajor> kernel(kern_width, kern_height, kern_depth,
                                    in_channels, kern_filters);
  Tensor<float, 4, RowMajor> result(out_width, out_height, out_depth,
                                    kern_filters);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  result = CuboidConvolution(input, kernel);

  EXPECT_EQ(result.dimension(3), kern_filters);
  EXPECT_EQ(result.dimension(2), out_depth);
  EXPECT_EQ(result.dimension(1), out_height);
  EXPECT_EQ(result.dimension(0), out_width);

  // Centering offsets: half the kernel extent (rounded down) per axis.
  const int off_p = (kern_depth - 1) / 2;
  const int off_r = (kern_height - 1) / 2;
  const int off_c = (kern_width - 1) / 2;

  for (int od = 0; od < kern_filters; ++od) {
    for (int i = 0; i < out_depth; ++i) {
      for (int j = 0; j < out_height; ++j) {
        for (int k = 0; k < out_width; ++k) {
          float expected = 0.0f;
          for (int c = 0; c < kern_width; ++c) {
            for (int r = 0; r < kern_height; ++r) {
              for (int p = 0; p < kern_depth; ++p) {
                for (int id = 0; id < in_channels; ++id) {
                  // Skip taps that fall outside the zero-padded input.
                  if (p - off_p + i >= 0 && r - off_r + j >= 0 &&
                      c - off_c + k >= 0 && p - off_p + i < in_depth &&
                      r - off_r + j < in_rows && c - off_c + k < in_cols) {
                    expected +=
                        input(c - off_c + k, r - off_r + j, p - off_p + i, id) *
                        kernel(c, r, p, id, od);
                  }
                }
              }
            }
          }
          EigenApprox(result(k, j, i, od), expected);
        }
      }
    }
  }
}
789 
// Cuboid convolution with VALID padding: output shrinks to
// input - kernel + 1 (here 5 - 3 + 1 = 3) along each spatial axis, so the
// reference computation needs no bounds checks.
TEST(EigenSpatialConvolutionsTest, ValidCuboid) {
  const int in_channels = 10;
  const int in_depth = 5;
  const int in_rows = 5;
  const int in_cols = 5;

  const int kern_filters = 7;
  const int kern_depth = 3;
  const int kern_width = 3;
  const int kern_height = 3;

  const int out_depth = 3;
  const int out_height = 3;
  const int out_width = 3;

  Tensor<float, 4> input(in_channels, in_depth, in_rows, in_cols);
  Tensor<float, 5> kernel(kern_filters, in_channels, kern_depth, kern_height,
                          kern_width);
  Tensor<float, 4> result(kern_filters, out_depth, out_height, out_width);
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  // Unit strides along all three spatial axes, no padding.
  result = CuboidConvolution(input, kernel, 1, 1, 1, PADDING_VALID);

  EXPECT_EQ(result.dimension(0), kern_filters);
  EXPECT_EQ(result.dimension(1), out_depth);
  EXPECT_EQ(result.dimension(2), out_height);
  EXPECT_EQ(result.dimension(3), out_width);

  for (int od = 0; od < kern_filters; ++od) {
    for (int i = 0; i < out_depth; ++i) {
      for (int j = 0; j < out_height; ++j) {
        for (int k = 0; k < out_width; ++k) {
          float expected = 0.0f;
          // Reference: patch anchored at (i, j, k), fully inside the input.
          for (int c = 0; c < kern_width; ++c) {
            for (int r = 0; r < kern_height; ++r) {
              for (int p = 0; p < kern_depth; ++p) {
                for (int id = 0; id < in_channels; ++id) {
                  expected +=
                      input(id, p + i, r + j, c + k) * kernel(od, id, p, r, c);
                }
              }
            }
          }
          EigenApprox(result(od, i, j, k), expected);
        }
      }
    }
  }
}
841 
// Same as ValidCuboid, but with RowMajor tensors (all dimension orders are
// reversed relative to the ColMajor variant).
TEST(EigenSpatialConvolutionsTest, ValidCuboidRowMajor) {
  const int in_channels = 10;
  const int in_depth = 5;
  const int in_rows = 5;
  const int in_cols = 5;

  const int kern_filters = 7;
  const int kern_depth = 3;
  const int kern_width = 3;
  const int kern_height = 3;

  const int out_depth = 3;
  const int out_height = 3;
  const int out_width = 3;

  Tensor<float, 4, RowMajor> input(in_cols, in_rows, in_depth, in_channels);
  Tensor<float, 5, RowMajor> kernel(kern_width, kern_height, kern_depth,
                                    in_channels, kern_filters);
  Tensor<float, 4, RowMajor> result(out_width, out_height, out_depth,
                                    kern_filters);
  // Shift random data away from zero so mismatches are clearly visible.
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  result = CuboidConvolution(input, kernel, 1, 1, 1, PADDING_VALID);

  EXPECT_EQ(result.dimension(3), kern_filters);
  EXPECT_EQ(result.dimension(2), out_depth);
  EXPECT_EQ(result.dimension(1), out_height);
  EXPECT_EQ(result.dimension(0), out_width);

  // Compare every output coefficient against a naive 3D convolution.
  for (int f = 0; f < kern_filters; ++f) {
    for (int dz = 0; dz < out_depth; ++dz) {
      for (int dy = 0; dy < out_height; ++dy) {
        for (int dx = 0; dx < out_width; ++dx) {
          float acc = 0.0f;
          for (int kx = 0; kx < kern_width; ++kx) {
            for (int ky = 0; ky < kern_height; ++ky) {
              for (int kz = 0; kz < kern_depth; ++kz) {
                for (int ch = 0; ch < in_channels; ++ch) {
                  acc += input(kx + dx, ky + dy, kz + dz, ch) *
                         kernel(kx, ky, kz, ch, f);
                }
              }
            }
          }
          EigenApprox(result(dx, dy, dz, f), acc);
        }
      }
    }
  }
}
894 
TEST(EigenSpatialConvolutionsTest,BatchedCuboid)895 TEST(EigenSpatialConvolutionsTest, BatchedCuboid) {
896   const int batches = 2;
897   const int in_channels = 10;
898   const int in_depth = 5;
899   const int in_rows = 8;
900   const int in_cols = 7;
901 
902   const int kern_filters = 7;
903   const int kern_depth = 3;
904   const int kern_width = 4;
905   const int kern_height = 4;
906 
907   const int out_depth = in_depth;
908   const int out_height = in_rows;
909   const int out_width = in_cols;
910 
911   Tensor<float, 5> input(in_channels, in_depth, in_rows, in_cols, batches);
912   Tensor<float, 5> kernel(kern_filters, in_channels, kern_depth, kern_height,
913                           kern_width);
914   Tensor<float, 5> result(kern_filters, out_depth, out_height, out_width,
915                           batches);
916   input = input.constant(11.0f) + input.random();
917   kernel = kernel.constant(2.0f) + kernel.random();
918   result.setRandom();
919 
920   result = CuboidConvolution(input, kernel);
921 
922   EXPECT_EQ(result.dimension(0), kern_filters);
923   EXPECT_EQ(result.dimension(1), out_depth);
924   EXPECT_EQ(result.dimension(2), out_height);
925   EXPECT_EQ(result.dimension(3), out_width);
926   EXPECT_EQ(result.dimension(4), batches);
927 
928   const int off_p = (kern_depth - 1) / 2;
929   const int off_r = (kern_height - 1) / 2;
930   const int off_c = (kern_width - 1) / 2;
931 
932   for (int b = 0; b < batches; b++) {
933     for (int od = 0; od < kern_filters; ++od) {
934       for (int i = 0; i < out_depth; ++i) {
935         for (int j = 0; j < out_height; ++j) {
936           for (int k = 0; k < out_width; ++k) {
937             float expected = 0.0f;
938             for (int c = 0; c < kern_width; ++c) {
939               for (int r = 0; r < kern_height; ++r) {
940                 for (int p = 0; p < kern_depth; ++p) {
941                   for (int id = 0; id < in_channels; ++id) {
942                     if (p - off_p + i >= 0 && r - off_r + j >= 0 &&
943                         c - off_c + k >= 0 && p - off_p + i < in_depth &&
944                         r - off_r + j < in_rows && c - off_c + k < in_cols) {
945                       expected += input(id, p - off_p + i, r - off_r + j,
946                                         c - off_c + k, b) *
947                                   kernel(od, id, p, r, c);
948                     }
949                   }
950                 }
951               }
952             }
953             EigenApprox(result(od, i, j, k, b), expected);
954           }
955         }
956       }
957     }
958   }
959 }
960 
TEST(EigenSpatialConvolutionsTest,BatchedCuboidRowMajor)961 TEST(EigenSpatialConvolutionsTest, BatchedCuboidRowMajor) {
962   const int batches = 2;
963   const int in_channels = 10;
964   const int in_depth = 5;
965   const int in_rows = 8;
966   const int in_cols = 7;
967 
968   const int kern_filters = 7;
969   const int kern_depth = 3;
970   const int kern_width = 4;
971   const int kern_height = 4;
972 
973   const int out_depth = in_depth;
974   const int out_height = in_rows;
975   const int out_width = in_cols;
976 
977   Tensor<float, 5, RowMajor> input(batches, in_cols, in_rows, in_depth,
978                                    in_channels);
979   Tensor<float, 5, RowMajor> kernel(kern_width, kern_height, kern_depth,
980                                     in_channels, kern_filters);
981   Tensor<float, 5, RowMajor> result(batches, out_width, out_height, out_depth,
982                                     kern_filters);
983   input = input.constant(11.0f) + input.random();
984   kernel = kernel.constant(2.0f) + kernel.random();
985   result.setRandom();
986 
987   result = CuboidConvolution(input, kernel);
988 
989   EXPECT_EQ(result.dimension(4), kern_filters);
990   EXPECT_EQ(result.dimension(3), out_depth);
991   EXPECT_EQ(result.dimension(2), out_height);
992   EXPECT_EQ(result.dimension(1), out_width);
993   EXPECT_EQ(result.dimension(0), batches);
994 
995   const int off_p = (kern_depth - 1) / 2;
996   const int off_r = (kern_height - 1) / 2;
997   const int off_c = (kern_width - 1) / 2;
998 
999   for (int b = 0; b < batches; b++) {
1000     for (int od = 0; od < kern_filters; ++od) {
1001       for (int i = 0; i < out_depth; ++i) {
1002         for (int j = 0; j < out_height; ++j) {
1003           for (int k = 0; k < out_width; ++k) {
1004             float expected = 0.0f;
1005             for (int c = 0; c < kern_width; ++c) {
1006               for (int r = 0; r < kern_height; ++r) {
1007                 for (int p = 0; p < kern_depth; ++p) {
1008                   for (int id = 0; id < in_channels; ++id) {
1009                     if (p - off_p + i >= 0 && r - off_r + j >= 0 &&
1010                         c - off_c + k >= 0 && p - off_p + i < in_depth &&
1011                         r - off_r + j < in_rows && c - off_c + k < in_cols) {
1012                       expected += input(b, c - off_c + k, r - off_r + j,
1013                                         p - off_p + i, id) *
1014                                   kernel(c, r, p, id, od);
1015                     }
1016                   }
1017                 }
1018               }
1019             }
1020             EigenApprox(result(b, k, j, i, od), expected);
1021           }
1022         }
1023       }
1024     }
1025   }
1026 }
1027 
// Verifies CuboidConvolution with VALID padding and a non-unit stride in all
// three spatial dimensions (ColMajor layout).
TEST(EigenSpatialConvolutionsTest, StridedValidCuboid) {
  const int in_channels = 10;
  const int in_depth = 8;
  const int in_rows = 7;
  const int in_cols = 5;

  const int kern_filters = 7;
  const int kern_depth = 3;
  const int kern_width = 3;
  const int kern_height = 3;

  const int out_depth = 3;
  const int out_height = 3;
  const int out_width = 2;

  Tensor<float, 4> input(in_channels, in_depth, in_rows, in_cols);
  Tensor<float, 5> kernel(kern_filters, in_channels, kern_depth, kern_height,
                          kern_width);
  Tensor<float, 4> result(kern_filters, out_depth, out_height, out_width);
  // Shift random data away from zero so mismatches are clearly visible.
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  const int stride = 2;
  result =
      CuboidConvolution(input, kernel, stride, stride, stride, PADDING_VALID);

  EXPECT_EQ(result.dimension(0), kern_filters);
  EXPECT_EQ(result.dimension(1), out_depth);
  EXPECT_EQ(result.dimension(2), out_height);
  EXPECT_EQ(result.dimension(3), out_width);

  // With VALID padding every window is fully inside the input, so no bounds
  // checks are needed in the reference computation.
  for (int f = 0; f < kern_filters; ++f) {
    for (int dz = 0; dz < out_depth; ++dz) {
      for (int dy = 0; dy < out_height; ++dy) {
        for (int dx = 0; dx < out_width; ++dx) {
          float acc = 0.0f;
          for (int kx = 0; kx < kern_width; ++kx) {
            for (int ky = 0; ky < kern_height; ++ky) {
              for (int kz = 0; kz < kern_depth; ++kz) {
                for (int ch = 0; ch < in_channels; ++ch) {
                  acc += input(ch, kz + stride * dz, ky + stride * dy,
                               kx + stride * dx) *
                         kernel(f, ch, kz, ky, kx);
                }
              }
            }
          }
          EigenApprox(result(f, dz, dy, dx), acc);
        }
      }
    }
  }
}
1082 
// Same as StridedValidCuboid, but with RowMajor tensors (all dimension orders
// are reversed relative to the ColMajor variant).
TEST(EigenSpatialConvolutionsTest, StridedValidCuboidRowMajor) {
  const int in_channels = 10;
  const int in_depth = 8;
  const int in_rows = 7;
  const int in_cols = 5;

  const int kern_filters = 7;
  const int kern_depth = 3;
  const int kern_width = 3;
  const int kern_height = 3;

  const int out_depth = 3;
  const int out_height = 3;
  const int out_width = 2;

  Tensor<float, 4, RowMajor> input(in_cols, in_rows, in_depth, in_channels);
  Tensor<float, 5, RowMajor> kernel(kern_width, kern_height, kern_depth,
                                    in_channels, kern_filters);
  Tensor<float, 4, RowMajor> result(out_width, out_height, out_depth,
                                    kern_filters);
  // Shift random data away from zero so mismatches are clearly visible.
  input = input.constant(11.0f) + input.random();
  kernel = kernel.constant(2.0f) + kernel.random();
  result.setRandom();

  const int stride = 2;
  result =
      CuboidConvolution(input, kernel, stride, stride, stride, PADDING_VALID);

  EXPECT_EQ(result.dimension(3), kern_filters);
  EXPECT_EQ(result.dimension(2), out_depth);
  EXPECT_EQ(result.dimension(1), out_height);
  EXPECT_EQ(result.dimension(0), out_width);

  // With VALID padding every window is fully inside the input, so no bounds
  // checks are needed in the reference computation.
  for (int f = 0; f < kern_filters; ++f) {
    for (int dz = 0; dz < out_depth; ++dz) {
      for (int dy = 0; dy < out_height; ++dy) {
        for (int dx = 0; dx < out_width; ++dx) {
          float acc = 0.0f;
          for (int kx = 0; kx < kern_width; ++kx) {
            for (int ky = 0; ky < kern_height; ++ky) {
              for (int kz = 0; kz < kern_depth; ++kz) {
                for (int ch = 0; ch < in_channels; ++ch) {
                  acc += input(kx + stride * dx, ky + stride * dy,
                               kz + stride * dz, ch) *
                         kernel(kx, ky, kz, ch, f);
                }
              }
            }
          }
          EigenApprox(result(dx, dy, dz, f), acc);
        }
      }
    }
  }
}
1138 
TEST(EigenSpatialConvolutionsTest,StridedSameCuboid)1139 TEST(EigenSpatialConvolutionsTest, StridedSameCuboid) {
1140   const int in_channels = 10;
1141   const int in_depth = 8;
1142   const int in_rows = 7;
1143   const int in_cols = 5;
1144 
1145   const int kern_filters = 7;
1146   const int kern_depth = 3;
1147   const int kern_width = 3;
1148   const int kern_height = 3;
1149 
1150   const int stride = 2;
1151   const int out_depth = ceil_div(in_depth, stride);
1152   const int out_height = ceil_div(in_rows, stride);
1153   const int out_width = ceil_div(in_cols, stride);
1154 
1155   Tensor<float, 4> input(in_channels, in_depth, in_rows, in_cols);
1156   Tensor<float, 5> kernel(kern_filters, in_channels, kern_depth, kern_height,
1157                           kern_width);
1158   Tensor<float, 4> result(kern_filters, out_depth, out_height, out_width);
1159   input = input.constant(11.0f) + input.random();
1160   kernel = kernel.constant(2.0f) + kernel.random();
1161   result.setRandom();
1162 
1163   result =
1164       CuboidConvolution(input, kernel, stride, stride, stride, PADDING_SAME);
1165 
1166   EXPECT_EQ(result.dimension(0), kern_filters);
1167   EXPECT_EQ(result.dimension(1), out_depth);
1168   EXPECT_EQ(result.dimension(2), out_height);
1169   EXPECT_EQ(result.dimension(3), out_width);
1170 
1171   const int pad_p = (out_depth - 1) * stride - in_depth + kern_depth;
1172   const int pad_r = (out_height - 1) * stride - in_rows + kern_height;
1173   const int pad_c = (out_width - 1) * stride - in_cols + kern_width;
1174 
1175   // Number of pixels the input is extended with at the lower end in every
1176   // dimension.
1177   const int dp = pad_p / 2;
1178   const int dr = pad_r / 2;
1179   const int dc = pad_c / 2;
1180 
1181   for (int od = 0; od < kern_filters; ++od) {
1182     for (int i = 0; i < out_depth; ++i) {
1183       for (int j = 0; j < out_height; ++j) {
1184         for (int k = 0; k < out_width; ++k) {
1185           float expected = 0.0f;
1186           for (int c = 0; c < kern_width; ++c) {
1187             for (int r = 0; r < kern_height; ++r) {
1188               for (int p = 0; p < kern_depth; ++p) {
1189                 for (int id = 0; id < in_channels; ++id) {
1190                   const int in_p = p - dp + i * stride;
1191                   const int in_r = r - dr + j * stride;
1192                   const int in_c = c - dc + k * stride;
1193                   if (in_p >= 0 && in_r >= 0 && in_c >= 0 && in_p < in_depth &&
1194                       in_r < in_rows && in_c < in_cols) {
1195                     expected +=
1196                         input(id, in_p, in_r, in_c) * kernel(od, id, p, r, c);
1197                   }
1198                 }
1199               }
1200             }
1201           }
1202           EigenApprox(result(od, i, j, k), expected);
1203         }
1204       }
1205     }
1206   }
1207 }
1208 
TEST(EigenSpatialConvolutionsTest,StridedSameCuboidRowMajor)1209 TEST(EigenSpatialConvolutionsTest, StridedSameCuboidRowMajor) {
1210   const int in_channels = 10;
1211   const int in_depth = 8;
1212   const int in_rows = 7;
1213   const int in_cols = 5;
1214 
1215   const int kern_filters = 7;
1216   const int kern_depth = 3;
1217   const int kern_width = 3;
1218   const int kern_height = 3;
1219 
1220   const int stride = 2;
1221   const int out_depth = ceil_div(in_depth, stride);
1222   const int out_height = ceil_div(in_rows, stride);
1223   const int out_width = ceil_div(in_cols, stride);
1224 
1225   Tensor<float, 4, RowMajor> input(in_cols, in_rows, in_depth, in_channels);
1226   Tensor<float, 5, RowMajor> kernel(kern_width, kern_height, kern_depth,
1227                                     in_channels, kern_filters);
1228   Tensor<float, 4, RowMajor> result(out_width, out_height, out_depth,
1229                                     kern_filters);
1230   input = input.constant(11.0f) + input.random();
1231   kernel = kernel.constant(2.0f) + kernel.random();
1232   result.setRandom();
1233 
1234   result =
1235       CuboidConvolution(input, kernel, stride, stride, stride, PADDING_SAME);
1236 
1237   EXPECT_EQ(result.dimension(3), kern_filters);
1238   EXPECT_EQ(result.dimension(2), out_depth);
1239   EXPECT_EQ(result.dimension(1), out_height);
1240   EXPECT_EQ(result.dimension(0), out_width);
1241 
1242   const int pad_p = (out_depth - 1) * stride - in_depth + kern_depth;
1243   const int pad_r = (out_height - 1) * stride - in_rows + kern_height;
1244   const int pad_c = (out_width - 1) * stride - in_cols + kern_width;
1245 
1246   // Number of pixels the input is extended with at the lower end in every
1247   // dimension.
1248   const int dp = pad_p / 2;
1249   const int dr = pad_r / 2;
1250   const int dc = pad_c / 2;
1251 
1252   for (int od = 0; od < kern_filters; ++od) {
1253     for (int i = 0; i < out_depth; ++i) {
1254       for (int j = 0; j < out_height; ++j) {
1255         for (int k = 0; k < out_width; ++k) {
1256           float expected = 0.0f;
1257           for (int c = 0; c < kern_width; ++c) {
1258             for (int r = 0; r < kern_height; ++r) {
1259               for (int p = 0; p < kern_depth; ++p) {
1260                 for (int id = 0; id < in_channels; ++id) {
1261                   const int in_p = p - dp + i * stride;
1262                   const int in_r = r - dr + j * stride;
1263                   const int in_c = c - dc + k * stride;
1264                   if (in_p >= 0 && in_r >= 0 && in_c >= 0 && in_p < in_depth &&
1265                       in_r < in_rows && in_c < in_cols) {
1266                     expected +=
1267                         input(in_c, in_r, in_p, id) * kernel(c, r, p, id, od);
1268                   }
1269                 }
1270               }
1271             }
1272           }
1273           EigenApprox(result(k, j, i, od), expected);
1274         }
1275       }
1276     }
1277   }
1278 }
1279 
// A test case discovered when testing backward spatial convolution where the
// special tensor contraction mapper for spatial convolution contains a bug.
// The test checks that computing the input gradient directly (via strided
// extract_image_patches) matches the reference computation that explicitly
// inflates and pads the output gradient first.
TEST(EigenSpatialConvolutionsTest, SpatialConvContractionMapper) {
  // We have a 3x4 input image with 2x2 patch and stride of 2.
  // The output has size 1x2.
  typedef Tensor<float, 1>::DimensionPair DimPair;
  Tensor<float, 4> out(1, 1, 2, 1);
  Tensor<float, 4> kern(1, 1, 2, 2);
  // Fill kernel and output gradient with deterministic values 1, 2, 3, ...
  // so the expected gradient coefficients below are easy to derive by hand.
  for (int i = 0; i < kern.size(); ++i) {
    kern.coeffRef(i) = static_cast<float>(i) + 1;
  }
  for (int i = 0; i < out.size(); ++i) {
    out.coeffRef(i) = static_cast<float>(i) + 1;
  }

  // Inflation strides used by the reference path (insert zeros between the
  // output-gradient coefficients to undo the forward stride of 2).
  DSizes<ptrdiff_t, 4> strides;
  strides[0] = 1;
  strides[1] = 2;
  strides[2] = 2;
  strides[3] = 1;

  // Explicit zero padding applied after inflation in the reference path.
  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
  paddings[0] = std::make_pair(0, 0);
  paddings[1] = std::make_pair(1, 2);
  paddings[2] = std::make_pair(1, 1);
  paddings[3] = std::make_pair(0, 0);

  // Shape of the extracted patches after reshaping into a rank-3 tensor
  // (depth x patch size x number of patches).
  DSizes<ptrdiff_t, 3> out_dim;
  out_dim[0] = 1;
  out_dim[1] = 4;
  out_dim[2] = 12;

  // The kernel is flipped along its two spatial dimensions for backprop.
  array<bool, 4> kernel_reverse;
  kernel_reverse[0] = false;
  kernel_reverse[1] = false;
  kernel_reverse[2] = true;
  kernel_reverse[3] = true;

  // Reversed kernel reshaped to a rank-3 tensor for the contraction.
  DSizes<ptrdiff_t, 3> k_dims;
  k_dims[0] = 1;
  k_dims[1] = 1;
  k_dims[2] = 4;

  // Contract kernel dims (0, 2) against patch dims (0, 1).
  array<DimPair, 2> contract_dims;
  contract_dims[0] = DimPair(0, 0);
  contract_dims[1] = DimPair(2, 1);

  // Final shape of the computed input gradient.
  DSizes<ptrdiff_t, 4> in_dim;
  in_dim[0] = 1;
  in_dim[1] = 3;
  in_dim[2] = 4;
  in_dim[3] = 1;

  DSizes<ptrdiff_t, 2> in_dbg_dim;
  in_dbg_dim[0] = 3;
  in_dbg_dim[1] = 4;

  DSizes<ptrdiff_t, 2> out_dbg_dim;
  out_dbg_dim[0] = 4;
  out_dbg_dim[1] = 12;

  // This is the formula for computing the backward prop for input with a
  // spatial convolution. The strided/inflated patch extraction here is what
  // exercises the contraction mapper code path under test.
  Tensor<float, 4> direct =
      kern.reverse(kernel_reverse)
          .reshape(k_dims)
          .contract(
              out.extract_image_patches(2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 0)
                  .reshape(out_dim),
              contract_dims)
          .reshape(in_dim);

  // Reference: identical math, but with explicit inflate + pad so patch
  // extraction runs with all-default (unit) strides and no padding.
  Tensor<float, 4> indirect =
      kern.reverse(kernel_reverse)
          .reshape(k_dims)
          .contract(
              out.inflate(strides)
                  .pad(paddings)
                  .extract_image_patches(2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0)
                  .reshape(out_dim),
              contract_dims)
          .reshape(in_dim);

  // Both paths must agree element-wise.
  eigen_assert(dimensions_match(direct.dimensions(), indirect.dimensions()));
  for (size_t i = 0; i < direct.dimensions().TotalSize(); ++i) {
    EigenApprox(direct.data()[i], indirect.data()[i]);
  }
  // Spot-check a few coefficients against hand-computed expected values.
  EigenApprox(1.0f, direct(0, 0, 0, 0));
  EigenApprox(3.0f, direct(0, 0, 1, 0));
  EigenApprox(2.0f, direct(0, 0, 2, 0));
  EigenApprox(6.0f, direct(0, 0, 3, 0));

  EigenApprox(2.0f, direct(0, 1, 0, 0));
  EigenApprox(4.0f, direct(0, 1, 1, 0));
  EigenApprox(4.0f, direct(0, 1, 2, 0));
  EigenApprox(8.0f, direct(0, 1, 3, 0));
}
1377 
1378 template <typename T>
PackRhsHelper(::testing::benchmark::State & state,int input_batches,int input_cols,int input_rows,int input_depth,int filter_count,int filter_cols,int filter_rows,Eigen::PaddingType padding,int col_strides,int row_strides,int patch_col_inflate_stride,int patch_row_inflate_stride,Index block_rows,Index block_cols)1379 static void PackRhsHelper(::testing::benchmark::State& state,
1380                           /* Input dimensions: */
1381                           int input_batches, int input_cols, int input_rows,
1382                           int input_depth,
1383                           /* Filter (kernel) dimensions: */
1384                           int filter_count, int filter_cols, int filter_rows,
1385                           Eigen::PaddingType padding,
1386                           /* Input strides: */
1387                           int col_strides, int row_strides,
1388                           /* Patch inflate strides: */
1389                           int patch_col_inflate_stride,
1390                           int patch_row_inflate_stride,
1391                           /* Block dimensions: */
1392                           Index block_rows, Index block_cols) {
1393   // Set random seed for benchmark repeatability.
1394   srand(12345);
1395 
1396   using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
1397 
1398   // Default Eigen::Tensor layout is column major, so we configure dimensions
1399   // starting from the inner most (channels aka depth in this case).
1400   Dimensions input_dims(input_depth, input_rows, input_cols, input_batches);
1401 
1402   static const int packet_size = Eigen::internal::packet_traits<T>::size;
1403 
1404   // Reshape dimensions.
1405   using NewDimension = Eigen::DSizes<Index, 2>;
1406 
1407   // Contraction dimensions.
1408   using nocontract_t = Eigen::array<Eigen::Index, 1>;
1409   using contract_t = Eigen::array<Eigen::Index, 1>;
1410 
1411   // Input to the TensorImagePatchOp. It is the tensorflow TTypes<T>::Tensor
1412   // with ColMajor layout, instead of RowMajor. But that doesn't make any
1413   // difference, because TensorContraction swaps LHS with RHS for row major
1414   // inputs, and contraction mapper always works with column major data.
1415   using ArgType = TensorMap<Tensor<T, 4>, Eigen::Aligned>;
1416 
1417   using Evaluator = TensorEvaluator<
1418       const TensorReshapingOp<
1419           NewDimension, const TensorImagePatchOp<Dynamic, Dynamic, ArgType>>,
1420       Eigen::DefaultDevice>;
1421 
1422   using InputMapper = Eigen::internal::TensorContractionInputMapper<
1423       T, Index, Eigen::internal::Rhs, Evaluator,  //
1424       nocontract_t, contract_t,                   //
1425       packet_size,                                //
1426       /*inner_dim_contiguous*/ true,              //
1427       /*inner_dim_reordered*/ false,              //
1428       /*Alignment*/ 0>;
1429 
1430   using SubMapper = Eigen::internal::TensorContractionSubMapper<
1431       T, Index, Eigen::internal::Rhs, Evaluator,  //
1432       nocontract_t, contract_t,                   //
1433       packet_size,                                //
1434       /*inner_dim_contiguous*/ true,              //
1435       /*inner_dim_reordered*/ false,              //
1436       /*Alignment*/ 0>;
1437 
1438 #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
1439   using PackRhsImpl =
1440       Eigen::internal::gemm_pack_colmajor_block<T, Eigen::Index, SubMapper,
1441                                                 ColMajor>;
1442 #else
1443   using Traits = typename Eigen::internal::gebp_traits<T, T>;
1444   using PackRhsImpl =
1445       Eigen::internal::gemm_pack_rhs<T, Eigen::Index, SubMapper,  //
1446                                      Traits::nr,                  //
1447                                      ColMajor,                    //
1448                                      /*Conjugate*/ false,         //
1449                                      /*PanelMode*/ false>;
1450 #endif
1451 
1452   Eigen::DefaultDevice device;
1453 
1454   // Actual contract dimensions are not important.
1455   const Eigen::Index not_important = -1234;
1456   nocontract_t nocontract_dim = {not_important};
1457   contract_t contract_dim = {not_important};
1458 
1459   // We use tensor of the same dimensions to store packed data.
1460   Tensor<T, 4> packed(input_dims);
1461 
1462   // We generate multiple input tensors, around 512mb in total size to measure
1463   // realistic workload when input data in not in L1-L3 cache.
1464   size_t input_bytes = input_dims.TotalSize() * sizeof(T);
1465   size_t mem_size_bytes = 1024 * 1024 * 512;
1466   size_t num_inputs =
1467       std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
1468 
1469   std::vector<Tensor<T, 4>> inputs;
1470   std::vector<Evaluator> evaluators;
1471   std::vector<InputMapper> input_mappers;
1472 
1473   inputs.reserve(num_inputs);
1474   evaluators.reserve(num_inputs);
1475   input_mappers.reserve(num_inputs);
1476 
1477   for (int i = 0; i < num_inputs; ++i) {
1478     inputs.emplace_back(input_dims);
1479     inputs[i].setRandom();
1480 
1481     ArgType tensor_map(inputs[i].data(), input_dims);
1482 
1483     // 1. Extract image patches from input tensor. All strides are `1`.
1484     const auto image_patch_op = TensorImagePatchOp<Dynamic, Dynamic, ArgType>(
1485         tensor_map,                                          //
1486         filter_rows, filter_cols,                            //
1487         row_strides, col_strides,                            //
1488         /*in_row_strides=*/1, /*in_col_strides=*/1,          //
1489         patch_row_inflate_stride, patch_col_inflate_stride,  //
1490         padding, /*padding_value=*/0.0);
1491 
1492     // 2. Reshape extracted patches into "virtual" 2d tensor.
1493     Index input_rows_eff = (input_rows - 1) * patch_row_inflate_stride + 1;
1494     Index input_cols_eff = (input_cols - 1) * patch_col_inflate_stride + 1;
1495 
1496     Index output_rows = 0;
1497     Index output_cols = 0;
1498 
1499     if (padding == Eigen::PADDING_SAME) {
1500       output_rows = input_rows_eff / row_strides;
1501       output_cols = input_cols_eff / col_strides;
1502     } else if (padding == Eigen::PADDING_VALID) {
1503       output_rows =
1504           numext::ceil((input_rows_eff - filter_rows + 1.f) / row_strides);
1505       output_cols =
1506           numext::ceil((input_cols_eff - filter_cols + 1.f) / col_strides);
1507     } else {
1508       eigen_assert(false && "not supported");
1509     }
1510 
1511     NewDimension reshape_dims;
1512     reshape_dims[0] = input_depth * filter_rows * filter_cols;    // patch size
1513     reshape_dims[1] = output_rows * output_cols * input_batches;  // num_patches
1514 
1515     const auto reshape_op =
1516         TensorReshapingOp<NewDimension, decltype(image_patch_op)>(
1517             image_patch_op, reshape_dims);
1518 
1519     evaluators.emplace_back(reshape_op, device);
1520 
1521     input_mappers.emplace_back(evaluators[i], nocontract_dim, nocontract_dim,
1522                                contract_dim, contract_dim);
1523   }
1524 
1525   // We read properties of extracted image patches directly from evaluator.
1526   const Index patch_depth = evaluators[0].impl().dimensions()[0];
1527   const Index patch_rows = evaluators[0].impl().dimensions()[1];
1528   const Index patch_cols = evaluators[0].impl().dimensions()[2];
1529 
1530   // Number of patches is the same as the maximum column available through the
1531   // InputMapper (SubMapper).
1532   const Index num_patches = evaluators[0].impl().dimensions()[3];
1533 
1534   // The size of a single patch, it's the same as the maximum depth available
1535   // through the InputMapper (SubMapper).
1536   const Index patch_size = patch_depth * patch_rows * patch_cols;
1537 
1538   PackRhsImpl pack_rhs;
1539 
1540   const Index packed_total_size = input_dims.TotalSize();
1541 
1542   // Round up row/col/memory offsets to make them multiple of packet size.
1543   const auto round_up = [](const Index idx) {
1544     return (idx / packet_size) * packet_size;
1545   };
1546 
1547   for (auto s : state) {
1548     int input_idx =
1549         num_inputs == 1 ? 1 : internal::random<int>(0, num_inputs - 1);
1550 
1551     // Depth offset must be a multiple packet size.
1552     Index depth_offset =
1553         (patch_size > block_rows)
1554             ? round_up(internal::random<Index>(0, patch_size - 10))
1555             : 0;
1556     Index col_offset = internal::random<Index>(0, num_patches - 10);
1557 
1558     Index depth = std::min(block_rows, patch_size - depth_offset);
1559     Index cols = std::min(block_cols, num_patches - col_offset);
1560 
1561     // Write packed data to random memory location to emulate cold caches.
1562     Index packed_size = depth * cols;
1563     Index packed_offset =
1564         internal::random<Index>(0, packed_total_size - packed_size - 1);
1565 
1566     SubMapper sub_mapper =
1567         input_mappers[input_idx].getSubMapper(depth_offset, col_offset);
1568     pack_rhs(packed.data() + packed_offset, sub_mapper, depth, cols);
1569   }
1570 
1571   state.SetLabel(
1572       absl::StrCat("patch: ", patch_rows, "x", patch_cols, " D", patch_depth,
1573                    "; num_patches=", num_patches, " patch_size=", patch_size,
1574                    " num_inputs=", num_inputs, " padding=", padding));
1575 }
1576 
1577 template <typename T>
PackLhsHelper(::testing::benchmark::State & state,int input_depth,int filter_count,int filter_cols,int filter_rows,Index block_rows,Index block_cols)1578 static void PackLhsHelper(::testing::benchmark::State& state,
1579                           /* Input dimensions: */
1580                           int input_depth,
1581                           /* Filter (kernel) dimensions: */
1582                           int filter_count, int filter_cols, int filter_rows,
1583                           /* Block dimensions: */
1584                           Index block_rows, Index block_cols) {
1585   // Set random seed for benchmark repeatability.
1586   srand(12345);
1587 
1588   eigen_assert(block_rows <= filter_count);
1589   eigen_assert(block_cols <= input_depth * filter_rows * filter_cols);
1590 
1591   using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
1592 
1593   // Default Eigen::Tensor layout is column major, so we configure dimensions
1594   // starting from the inner most (`filter count` aka `kernel filers`).
1595   Dimensions filter_dims(filter_count, filter_rows, filter_cols, input_depth);
1596 
1597   static const int packet_size = Eigen::internal::packet_traits<T>::size;
1598 
1599   // We are going to reshape filter into 2D tensor.
1600   using NewDimension = Eigen::DSizes<Index, 2>;
1601 
1602   // Contraction dimensions.
1603   using nocontract_t = Eigen::array<Eigen::Index, 1>;
1604   using contract_t = Eigen::array<Eigen::Index, 1>;
1605 
1606   // Input to the ReshapeOp. It is the tensorflow TTypes<T>::Tensor
1607   // with ColMajor layout, instead of RowMajor. But that doesn't make any
1608   // difference, because TensorContraction swaps LHS with RHS for row major
1609   // inputs, and contraction mapper always works with column major data.
1610   using ArgType = TensorMap<Tensor<T, 4>, Eigen::Aligned>;
1611 
1612   using Evaluator =
1613       TensorEvaluator<const TensorReshapingOp<NewDimension, ArgType>,
1614                       Eigen::DefaultDevice>;
1615 
1616   using InputMapper = Eigen::internal::TensorContractionInputMapper<
1617       T, Index, Eigen::internal::Lhs, Evaluator,  //
1618       nocontract_t, contract_t,                   //
1619       packet_size,                                //
1620       /*inner_dim_contiguous*/ true,              //
1621       /*inner_dim_reordered*/ false,              //
1622       /*Alignment*/ 0>;
1623 
1624   using SubMapper = Eigen::internal::TensorContractionSubMapper<
1625       T, Index, Eigen::internal::Lhs, Evaluator,  //
1626       nocontract_t, contract_t,                   //
1627       packet_size,                                //
1628       /*inner_dim_contiguous*/ true,              //
1629       /*inner_dim_reordered*/ false,              //
1630       /*Alignment*/ 0>;
1631 
1632 #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
1633   using PackLhsImpl =
1634       Eigen::internal::gemm_pack_colmajor_block<T, Eigen::Index, SubMapper,
1635                                                 ColMajor>;
1636 #else
1637   using Traits = typename Eigen::internal::gebp_traits<T, T>;
1638   using PackLhsImpl =
1639       Eigen::internal::gemm_pack_lhs<T, Eigen::Index, SubMapper,          //
1640                                      Traits::mr,                          //
1641                                      Traits::LhsProgress,                 //
1642                                      typename Traits::LhsPacket4Packing,  //
1643                                      ColMajor>;
1644 #endif
1645 
1646   Eigen::DefaultDevice device;
1647 
1648   // We will reshape kernel into 2D tensor.
1649   NewDimension reshape_dims;
1650   reshape_dims[0] = filter_count;
1651   reshape_dims[1] = input_depth * filter_rows * filter_cols;
1652 
1653   // We are going to contract along the 'in_depth * filter_rows * filter_cols`.
1654   nocontract_t nocontract_dim = {0};
1655   contract_t contract_dim = {1};
1656 
1657   // These values computed using the algorithm in TensorContraction.h, with
1658   // 'nocontract_dim' and 'contract_dim' values specified above.
1659   nocontract_t nocontract_strides = {1};
1660   contract_t contract_strides = {filter_count};
1661   nocontract_t i_strides = {1};
1662   contract_t k_strides = {1};
1663 
1664   // We use tensor of the same dimensions to store packed data.
1665   Tensor<T, 4> packed(filter_dims);
1666 
1667   // We generate multiple filter tensors, around 512mb in total size to measure
1668   // realistic workload when input data in not in L1-L3 cache.
1669   size_t input_bytes = filter_dims.TotalSize() * sizeof(T);
1670   size_t mem_size_bytes = 1024 * 1024 * 512;
1671   size_t num_filters =
1672       std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
1673 
1674   std::vector<Tensor<T, 4>> filters;
1675   std::vector<Evaluator> evaluators;
1676   std::vector<InputMapper> input_mappers;
1677 
1678   filters.reserve(num_filters);
1679   evaluators.reserve(num_filters);
1680   input_mappers.reserve(num_filters);
1681 
1682   for (int i = 0; i < num_filters; ++i) {
1683     filters.emplace_back(filter_dims);
1684     filters[i].setRandom();
1685 
1686     ArgType tensor_map(filters[i].data(), filter_dims);
1687 
1688     const auto reshape_op =
1689         TensorReshapingOp<NewDimension, ArgType>(tensor_map, reshape_dims);
1690 
1691     evaluators.emplace_back(reshape_op, device);
1692 
1693     input_mappers.emplace_back(evaluators[i], nocontract_strides, i_strides,
1694                                contract_strides, k_strides);
1695   }
1696 
1697   PackLhsImpl pack_lhs;
1698 
1699   const Index packed_total_size = filter_dims.TotalSize();
1700 
1701   // Round up row/col/memory offsets to make them multiple of packet size.
1702   const auto round_up = [](const Index idx) {
1703     return (idx / packet_size) * packet_size;
1704   };
1705 
1706   // Block rows is in the [0, filter_count) range.
1707   // Block cols is in the [0, filter_rows * filter_cols * input_depth) range.
1708 
1709   const Index max_row = filter_count;
1710   const Index max_col = filter_rows * filter_cols * input_depth;
1711 
1712   for (auto s : state) {
1713     int filter_idx =
1714         num_filters == 1 ? 1 : internal::random<int>(0, num_filters - 1);
1715 
1716     Index row_offset = round_up(internal::random<Index>(0, max_row - 10));
1717     Index col_offset = round_up(internal::random<Index>(0, max_col - 10));
1718 
1719     Index rows = std::min(block_rows, max_row - row_offset);
1720     Index cols = std::min(block_cols, max_col - col_offset);
1721 
1722     // Write packed data to random memory location to emulate cold caches.
1723     Index packed_offset = round_up(
1724         internal::random<Index>(0, packed_total_size - rows * cols - 1));
1725 
1726     SubMapper sub_mapper =
1727         input_mappers[filter_idx].getSubMapper(row_offset, col_offset);
1728 
1729 // NOTE: Eigen gemm_pack_lhs accepts contraction depth (k-th dimension) as a
1730 // first argument (aka block cols). MKL-DNN pack is generic for lhs and rhs
1731 // and accepts block rows and cols in the same order for lhs and rhs.
1732 #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
1733     pack_lhs(packed.data() + packed_offset, sub_mapper, rows, cols);
1734 #else
1735     pack_lhs(packed.data() + packed_offset, sub_mapper, cols, rows);
1736 #endif
1737   }
1738   state.SetLabel(absl::StrCat(
1739       "filter: count=", filter_count, " dims=", filter_rows, "x", filter_cols,
1740       "; input: depth=", input_depth, "; num_filers=", num_filters));
1741 }
1742 
1743 // -------------------------------------------------------------------------- //
1744 // Pack RHS
1745 //
1746 // Macro argument names:
1747 //    N: batch size
1748 //    H: height
1749 //    W: width
1750 //    C: input channels
1751 //   FC: filter channels
1752 //   FH: filter height
1753 //   FW: filter width
1754 //   SH: stride in height dimensions
1755 //   SW: stride in width dimensions
1756 //  ISH: patch inflate stride in height dimension
1757 //  ISW: patch inflate stride in width dimension
1758 //   BR: block rows
1759 //   BC: block cols
1760 
// Token-pasting helper used to build unique benchmark function names.
#define BM_CONCAT(a, b) a##b

// Builds a benchmark name encoding every RHS packing parameter, e.g.
// BM_PackRhs_float_32_64x64_IC32_FC64_5x5_VALID_s1x1_is1x1_B256x56.
#define BM_RHS_NAME(prefix, T, N, H, W, C, FC, FH, FW, PAD, SH, SW, ISH, ISW, \
                    BR, BC)                                                   \
  BM_CONCAT(                                                                  \
      BM_##prefix##_##T##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW,     \
      _##PAD##_s##SH##x##SW##_is##ISH##x##ISW##_B##BR##x##BC)

// Defines and registers (via BENCHMARK) a PackRhsHelper benchmark with the
// given configuration; PAD is expanded to an Eigen PADDING_* enumerator.
#define BM_PackRhs(T, N, H, W, C, FC, FH, FW, PAD, SH, SW, ISH, ISW, BR, BC)  \
  static void BM_RHS_NAME(PackRhs, T, N, H, W, C, FC, FH, FW, PAD, SH, SW,    \
                          ISH, ISW, BR,                                       \
                          BC)(::testing::benchmark::State & state) {          \
    PackRhsHelper<T>(state, N, H, W, C, FC, FH, FW, PADDING_##PAD, SH, SW,    \
                     ISH, ISW, BR, BC);                                       \
  }                                                                           \
  BENCHMARK(BM_RHS_NAME(PackRhs, T, N, H, W, C, FC, FH, FW, PAD, SH, SW, ISH, \
                        ISW, BR, BC))                                         \
      ->UseRealTime()
1779 
// The number of input channels (input depth) is equal to the number of patch
// channels (patch depth).
1782 
// Fast path: input channel dimension is the multiple of the packet size.
// VALID padding, unit stride.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 32,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ VALID,              //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// SAME padding, unit stride.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 32,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ SAME,               //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// VALID padding, stride 2x2.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 32,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ VALID,              //
           /*stride*/ 2, 2,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// SAME padding, stride 2x2.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 32,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ SAME,               //
           /*stride*/ 2, 2,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// Slow path: input channel dimension is not the multiple of the packet size.
// SAME padding, unit stride.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 30,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ SAME,               //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// VALID padding, unit stride.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 30,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ VALID,              //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// SAME padding, stride 2x2.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 30,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ SAME,               //
           /*stride*/ 2, 2,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// VALID padding, stride 2x2.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 30,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ VALID,              //
           /*stride*/ 2, 2,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// Slow path with input channel dimension smaller than the packet size.
// SAME padding, unit stride, large 256x256 image.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 256, 256,             //
           /*channels*/ 4,                 //
           /*num_filters*/ 16,             //
           /*filter*/ 8, 8,                //
           /*padding*/ SAME,               //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// VALID padding, unit stride.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 256, 256,             //
           /*channels*/ 4,                 //
           /*num_filters*/ 16,             //
           /*filter*/ 8, 8,                //
           /*padding*/ VALID,              //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// SAME padding, asymmetric stride 2x4.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 256, 256,             //
           /*channels*/ 4,                 //
           /*num_filters*/ 16,             //
           /*filter*/ 8, 8,                //
           /*padding*/ SAME,               //
           /*stride*/ 2, 4,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// VALID padding, asymmetric stride 2x4.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 256, 256,             //
           /*channels*/ 4,                 //
           /*num_filters*/ 16,             //
           /*filter*/ 8, 8,                //
           /*padding*/ VALID,              //
           /*stride*/ 2, 4,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);

// Short and wide block with small input channel dimension.
// SAME padding, unit stride.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 4,                 //
           /*num_filters*/ 16,             //
           /*filter*/ 3, 3,                //
           /*padding*/ SAME,               //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 36, 432);

// Short and wide block with small input channel dimension.
// VALID padding, unit stride.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 4,                 //
           /*num_filters*/ 16,             //
           /*filter*/ 3, 3,                //
           /*padding*/ VALID,              //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 36, 432);

// SAME padding, stride 2x2.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 4,                 //
           /*num_filters*/ 16,             //
           /*filter*/ 3, 3,                //
           /*padding*/ SAME,               //
           /*stride*/ 2, 2,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 36, 432);

// VALID padding, stride 2x2.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 4,                 //
           /*num_filters*/ 16,             //
           /*filter*/ 3, 3,                //
           /*padding*/ VALID,              //
           /*stride*/ 2, 2,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 36, 432);

// Non standard patches with inflated strides.
// SAME padding, patch inflate stride 2x2.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 32, 32,               //
           /*channels*/ 96,                //
           /*num_filters*/ 96,             //
           /*filter*/ 5, 5,                //
           /*padding*/ SAME,               //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 2, 2,  //
           /*block*/ 272, 240);

// VALID padding, patch inflate stride 2x2.
BM_PackRhs(/*type*/ float,                 //
           /*batch*/ 32,                   //
           /*image*/ 32, 32,               //
           /*channels*/ 96,                //
           /*num_filters*/ 96,             //
           /*filter*/ 5, 5,                //
           /*padding*/ VALID,              //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 2, 2,  //
           /*block*/ 272, 240);

// Quantized-type packing, only available with the custom contraction kernel.
#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
using qint8 = Eigen::QInt8;
BM_PackRhs(/*type*/ qint8,                 //
           /*batch*/ 32,                   //
           /*image*/ 64, 64,               //
           /*channels*/ 32,                //
           /*num_filters*/ 64,             //
           /*filter*/ 5, 5,                //
           /*padding*/ SAME,               //
           /*stride*/ 1, 1,                //
           /*patch inflate stride*/ 1, 1,  //
           /*block*/ 256, 56);
#endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
2000 
2001 // -------------------------------------------------------------------------- //
2002 // Pack LHS
2003 //
2004 // Macro argument names:
2005 //    C: input channels
2006 //   FC: filter channels
2007 //   FH: filter height
2008 //   FW: filter width
2009 //   BR: block rows
2010 //   BC: block cols
2011 
// Builds a benchmark name encoding every LHS packing parameter, e.g.
// BM_PackLhs_float_128_FC1024_3x3_B256x56.
#define BM_LHS_NAME(prefix, T, C, FC, FH, FW, BR, BC) \
  BM_CONCAT(BM_##prefix##_##T##_##C##_FC##FC##_##FH##x##FW, _B##BR##x##BC)

// Defines and registers (via BENCHMARK) a PackLhsHelper benchmark with the
// given configuration.
#define BM_PackLhs(T, C, FC, FH, FW, BR, BC)                         \
  static void BM_LHS_NAME(PackLhs, T, C, FC, FH, FW, BR,             \
                          BC)(::testing::benchmark::State & state) { \
    PackLhsHelper<T>(state, C, FC, FH, FW, BR, BC);                  \
  }                                                                  \
  BENCHMARK(BM_LHS_NAME(PackLhs, T, C, FC, FH, FW, BR, BC))->UseRealTime()
2021 
// The number of input channels (input depth) is equal to the number of patch
// channels (patch depth).
2024 
// Block taller than wide (256x56).
BM_PackLhs(/*type*/ float,            //
           /*input channels*/ 128,    //
           /*filter channels*/ 1024,  //
           /*filter dims*/ 3, 3,      //
           /*block*/ 256, 56);

// Block wider than tall (56x256).
BM_PackLhs(/*type*/ float,            //
           /*input channels*/ 128,    //
           /*filter channels*/ 1024,  //
           /*filter dims*/ 3, 3,      //
           /*block*/ 56, 256);

// Input depth not a multiple of the packet size; tall block.
BM_PackLhs(/*type*/ float,          //
           /*input channels*/ 30,   //
           /*filter channels*/ 64,  //
           /*filter dims*/ 3, 3,    //
           /*block*/ 256, 56);

// Input depth not a multiple of the packet size; wide block.
BM_PackLhs(/*type*/ float,          //
           /*input channels*/ 50,   //
           /*filter channels*/ 64,  //
           /*filter dims*/ 3, 3,    //
           /*block*/ 56, 256);
2048 }  // namespace Eigen
2049