/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Note: the following header is used in both TF and TFLite. In particular, it
// is used for the float TFLite Conv2D kernel.
#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"

#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#include "tensorflow/core/kernels/eigen_contraction_kernel.h"

namespace Eigen {
namespace internal {

// After vectorizing all full-packet loads from the underlying tensor with
// Packet ops, we still have to finalize the remaining coefficients that do
// not fill a whole packet.
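// Illustrative example (not from the original code): with packet_size == 4,
// depth == 10 and max_depth == 13 there are 3 leftover coefficients; the
// generic implementation below copies them one by one, while the masked-store
// specialization writes them with a single partial-packet store.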
template <typename Scalar, typename DataMapper, int packet_size,
          bool masked_load_store>
struct FinalizeDataMapperCoeffs {
  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
                                            const DataMapper& rhs,
                                            Index base_idx, Index depth,
                                            Index max_depth, bool pad = false) {
    const Index num_coeffs = max_depth - depth;
    eigen_assert(num_coeffs <= packet_size);

    for (; depth < max_depth; ++depth) {
      *block = pad ? Scalar(0) : rhs.coeffNoPadding(depth, base_idx);
      ++block;
    }

    return num_coeffs;
  }
};

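// Specialization used when the underlying tensor evaluator supports partial
// packet loads and the packet type supports masked stores (see the
// CoeffFinalizer alias below): the leftover coefficients are finalized with a
// single partial-packet load and masked store instead of a scalar loop.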
template <typename Scalar, typename DataMapper, int packet_size>
struct FinalizeDataMapperCoeffs<Scalar, DataMapper, packet_size,
                                /*masked_load_store=*/true> {
  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
                                            const DataMapper& rhs,
                                            Index base_idx, Index depth,
                                            Index max_depth, bool pad = false) {
    Index num_coeffs = max_depth - depth;
    eigen_assert(num_coeffs <= packet_size);
    if (num_coeffs == 0) return 0;

    using Packet = typename packet_traits<Scalar>::type;
    Packet p = pad ? pset1<Packet>(Scalar(0))
                   : rhs.partialPacketNoPadding(depth, base_idx, num_coeffs);
    internal::pstoreu(block, p, mask<Packet>(0, num_coeffs));

    return num_coeffs;
  }
};

// Pack a block of the right input matrix (in our case it is always a
// "virtual matrix" constructed from extracted image patches) into a
// contiguous block in column-major storage order. Knowing the properties of
// the original patch op, we can do this more efficiently than the default
// gemm_pack_colmajor_block.
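//
// This specialization is selected when the rhs mapper wraps a
// TensorImagePatchOp reshaped into a rank-2 tensor (the im2col-style
// expression built by SpatialConvolution in eigen_spatial_convolutions-inl.h).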
template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
          typename Device, typename Scalar, typename StorageIndex,
          typename nocontract_t, typename contract_t, int packet_size,
          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
struct gemm_pack_colmajor_block<
    Scalar, StorageIndex,
    TensorContractionSubMapper<
        Scalar, StorageIndex, Rhs,
        TensorEvaluator<
            const TensorReshapingOp<
                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
            Device>,
        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
        inner_dim_reordered, Alignment>,
    ColMajor> {
  typedef TensorContractionSubMapper<
      Scalar, StorageIndex, Rhs,
      TensorEvaluator<
          const TensorReshapingOp<
              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
          Device>,
      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
      inner_dim_reordered, Alignment>
      SubMapper;

  typedef SubMapper DataMapper;
  typedef typename packet_traits<Scalar>::type Packet;

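  // Use the masked-store finalizer only when the tensor evaluator can produce
  // partial packets and the packet type supports masked stores; otherwise
  // fall back to the scalar finalizer.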
  using CoeffFinalizer = FinalizeDataMapperCoeffs<
      Scalar, DataMapper, packet_size,
      TensorEvaluatorHasPartialPacket<typename DataMapper::TensorEvaluatorT,
                                      Packet, Index>::value &&
          unpacket_traits<Packet>::masked_store_available>;

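  // Dispatch to one of four packing paths depending on whether the patches
  // are standard (unit inflate strides) and whether the patch depth is a
  // multiple of the packet size.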
  EIGEN_DONT_INLINE
  void operator()(Scalar* block, const DataMapper rhs, StorageIndex rows,
                  StorageIndex cols) {
    const bool standard_patches = !rhs.nonStandardPatches();

    if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
      // A single packet always belongs to a single patch (row, col).
      if (rhs.hasPadding()) {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/true,
                            /*has_padding=*/true>(block, rhs, rows, cols);
      } else {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/true,
                            /*has_padding=*/false>(block, rhs, rows, cols);
      }

    } else if (standard_patches) {
      // A single packet can span multiple patch rows or columns.
      if (rhs.hasPadding()) {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/false,
                            /*has_padding=*/true>(block, rhs, rows, cols);
      } else {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/false,
                            /*has_padding=*/false>(block, rhs, rows, cols);
      }

    } else if (rhs.patchDepth() % packet_size == 0) {
      // A single packet always belongs to a single patch (row, col).
      packNonStandardPatches</*patch_depth_is_multiple_of_packet_size=*/
                             true>(block, rhs, rows, cols);

    } else {
      // A single packet can span multiple patch rows or columns.
      packNonStandardPatches</*patch_depth_is_multiple_of_packet_size=*/
                             false>(block, rhs, rows, cols);
    }
  }

 private:
  // (A) Standard image patches:
  //
  //  (1) patch_row_inflate_strides == 1    AND
  //  (2) patch_col_inflate_strides == 1
  //
  // Standard patches guarantee that the two innermost dimensions (depth and
  // rows) are contiguous in memory, so we can try to squeeze reads from them.
  //
  // (B) Non-standard image patches: in_row/in_col and patch_row/patch_col
  // strides can differ from 1, and for each [row, col] inside a patch we have
  // to do additional computations to find the corresponding row and col in
  // the input tensor. We also can no longer squeeze reads from the inner
  // dimensions.
  //
  // Additional parameters:
  // - patch_depth_is_multiple_of_packet_size=true: the depth dimension size
  //   is guaranteed to be a multiple of the packet size, so we can skip all
  //   non-vectorized loads and checks, because the block size is also
  //   guaranteed to be a multiple of the packet size (see
  //   TensorContractionBlocking).
  //
  // - has_padding: the input tensor has non-zero padding. In this case, for
  //   each patch col and row we need to check that it does not correspond to
  //   the padded region of the original input.
  template <bool patch_depth_is_multiple_of_packet_size, bool has_padding>
  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
                                               const DataMapper rhs,
                                               StorageIndex rows,
                                               StorageIndex cols) {
    eigen_assert(!rhs.nonStandardPatches());

    // Number of rows that can be packed with full packets; this is called
    // `peeled_k` to match the other gemm_pack_rhs implementations above.
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;
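    // Illustrative example (not from the original code): with rows == 35 and
    // packet_size == 8, peeled_k == 32, and the 3 leftover coefficients are
    // copied by the scalar loop at the end of this function.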

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);
    const StorageIndex rhs_depth_offset = rhs.depthOffset();

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_col = has_padding && lm.padCol(c);

        eigen_assert(has_padding || !lm.padCol(c));
        eigen_assert(has_padding || !lm.padAnyRow(start_row, max_row - 1));

        // We can squeeze reads for all rows in the [start_row, max_row) range.
        if (!has_padding ||
            (!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
          const StorageIndex start_depth =
              (c == start_col) ? rhs_depth_offset : 0;

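          // Stop the squeezed read at whichever comes first: the number of
          // coefficients left in the peeled range, or the end of the
          // contiguous [start_row, max_row) x depth region.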
          const StorageIndex max_depth =
              std::min<StorageIndex>(start_depth + (peeled_k - k),
                                     (max_row - start_row) * rhs.patchDepth());

          const StorageIndex base_idx = lm.baseIndex(start_row, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If the patch depth is a multiple of the packet size, it is
            // guaranteed that we can process all values in the depth
            // dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

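            // Process four packets per iteration so several independent loads
            // and stores can overlap; full packets that do not fit the
            // unrolled loop are handled by the single-packet loop below.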
            const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;

            const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            const StorageIndex vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

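            // Finalize the trailing (at most packet_size - 1) coefficients
            // that do not fill a whole packet.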
            eigen_assert(k <= peeled_k);
            const Index num_coeffs =
                CoeffFinalizer::finalize(block, rhs, base_idx, d, max_depth);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }

          // Go to the next column.
          continue;
        }

        // If we are not allowed to squeeze reads along the `row` and `depth`
        // dimensions, we must process rows one by one.
        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad = has_padding && (pad_col || lm.padRow(r));
          eigen_assert(has_padding || !lm.padRow(r));

          const StorageIndex base_idx = lm.baseIndex(r, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If the patch depth is a multiple of the packet size, it is
            // guaranteed that we can process all values in the depth
            // dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;

            const StorageIndex vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, has_padding && pad);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loop above should fill peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill remaining elements using loadCoeffStandard.
      for (; k < rows; ++k) {
        *block = lm.loadCoeffStandard(k);
        ++block;
      }
    }
  }

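  // Packs non-standard patches (see (B) above). Reads cannot be squeezed
  // across rows here, so each (row, col) inside a patch is mapped back to the
  // original input coordinates with padOrSkip{Row,Col} and processed
  // independently.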
  template <bool patch_depth_is_multiple_of_packet_size>
  EIGEN_ALWAYS_INLINE void packNonStandardPatches(Scalar* block,
                                                  const DataMapper rhs,
                                                  StorageIndex rows,
                                                  StorageIndex cols) {
    eigen_assert(rhs.nonStandardPatches());

    // Number of rows that can be packed with full packets; this is called
    // `peeled_k` to match the other gemm_pack_rhs implementations above.
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);
    const StorageIndex rhs_depth_offset = rhs.depthOffset();

    // Original input column and row after applying all non-standard strides
    // and dilations. Computed by padOrSkip{Row,Col}.
    Index orig_c = 0;
    Index orig_r = 0;

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_or_skip_col = lm.padOrSkipCol(c, &orig_c);

        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad_or_skip =
              pad_or_skip_col || lm.padOrSkipRow(r, &orig_r);
          const StorageIndex base_idx = lm.origBaseIndex(orig_r, orig_c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If the patch depth is a multiple of the packet size, it is
            // guaranteed that we can process all values in the depth
            // dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad_or_skip ? pset1<Packet>(Scalar(0))
                                           : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            const StorageIndex vectorized_depth = max_depth - packet_size;
            StorageIndex d = start_depth;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad_or_skip ? pset1<Packet>(Scalar(0))
                                           : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, pad_or_skip);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loop above should fill peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill remaining elements using loadCoeff.
      for (; k < rows; ++k) {
        *block = lm(k);
        ++block;
      }
    }
  }
};
}  // namespace internal
}  // namespace Eigen
#endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
