1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 namespace Eigen {
14 
/** \class TensorConvolution
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor convolution class.
  *
  * Computes the ("valid", i.e. unpadded) convolution of an input tensor with
  * a kernel tensor along an arbitrary subset of the input's dimensions.
  */
22 namespace internal {
23 
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  // Precomputes strides that translate between two index spaces:
  //  - the tensor's native (ColMajor/RowMajor) flat offsets, and
  //  - a permuted "cuda" space in which the NumKernelDims convolved
  //    dimensions come first (ColMajor) or last (RowMajor) and all the
  //    remaining dimensions are enumerated as consecutive "planes".
  // The GPU kernels below iterate planes and kernel-window coordinates
  // separately and use the mapCuda* helpers to recover tensor offsets.
  //
  // input_dims:  dimensions of the full input tensor.
  // kernel_dims: extent of the kernel along each convolved dimension.
  // indices:     which input dimension each kernel dimension applies to.
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {

    // A "valid" convolution shrinks each convolved dimension to
    // input_dim - kernel_dim + 1; the other dimensions keep their size.
    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    // Flat strides of the input and of the (shrunken) output tensor in the
    // tensor's native storage order.
    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }

    // Build the dimension permutation: the convolved dimensions are placed
    // at the innermost positions (starting at `offset`), ...
    array<Index, NumDims> cudaInputDimensions;
    array<Index, NumDims> cudaOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;  // mark this dimension as already placed
      cudaInputDimensions[index] = input_dims[indices[i]];
      cudaOutputDimensions[index] = dimensions[indices[i]];
    }

    // ... and the non-convolved dimensions fill the remaining slots in their
    // original relative order (tmp[i] >= 0 means "not yet placed").
    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        cudaInputDimensions[written] = input_dims[i];
        cudaOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    // Native tensor strides, re-ordered into the permuted dimension order.
    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    // Strides within the permuted space. Slots occupied by the kernel
    // dimensions get stride 1: the plane index only enumerates the
    // non-convolved dimensions, while the kernel-window coordinates are
    // addressed separately through the mapCuda*Kernel* helpers below.
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        // NOTE(review): `i + 1 < offset` mixes int with size_t `offset`;
        // safe here since i + 1 >= 0, but a cast would silence warnings.
        if (i + 1 < offset) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    }
  }

  // Maps a plane index p (flat index over the non-convolved dimensions in
  // the permuted space) to the matching base offset in the input tensor.
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      // Peel off one permuted coordinate at a time, outermost first.
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      // The remainder is the coordinate of the first non-kernel dimension.
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      // RowMajor: the non-convolved dimensions occupy slots [0, limit].
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  // Same mapping as above, but into the output tensor.
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }

  // The overloads below map 1, 2 or 3 coordinates along the convolved
  // dimension(s) to a tensor offset. The result is added to the plane
  // offset computed by the functions above to obtain a full offset.

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  // Native tensor strides, permuted into the "cuda" dimension order.
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  // Strides within the permuted plane-enumeration space.
  array<Index, NumDims> m_cudaInputStrides;
  array<Index, NumDims> m_cudaOutputStrides;
};
215 
216 
217 
218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
220 {
221   // Type promotion to handle the case where the types of the lhs and the rhs are different.
222   typedef typename promote_storage_type<typename InputXprType::Scalar,
223                                         typename KernelXprType::Scalar>::ret Scalar;
224   typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
225                                         typename traits<KernelXprType>::StorageKind>::ret StorageKind;
226   typedef typename promote_index_type<typename traits<InputXprType>::Index,
227                                       typename traits<KernelXprType>::Index>::type Index;
228   typedef typename InputXprType::Nested LhsNested;
229   typedef typename KernelXprType::Nested RhsNested;
230   typedef typename remove_reference<LhsNested>::type _LhsNested;
231   typedef typename remove_reference<RhsNested>::type _RhsNested;
232   static const int NumDimensions = traits<InputXprType>::NumDimensions;
233   static const int Layout = traits<InputXprType>::Layout;
234 
235   enum {
236     Flags = 0
237   };
238 };
239 
240 template<typename Dimensions, typename InputXprType, typename KernelXprType>
241 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
242 {
243   typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
244 };
245 
246 template<typename Dimensions, typename InputXprType, typename KernelXprType>
247 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
248 {
249   typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
250 };
251 
252 }  // end namespace internal
253 
254 
255 
256 template<typename Indices, typename InputXprType, typename KernelXprType>
257 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
258 {
259   public:
260   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
261   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
262   typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
263                                                   typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
264   typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
265   typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
266   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
267 
268   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
269       : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
270 
271     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
272     const Indices& indices() const { return m_indices; }
273 
274     /** \returns the nested expressions */
275     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
276     const typename internal::remove_all<typename InputXprType::Nested>::type&
277     inputExpression() const { return m_input_xpr; }
278 
279     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
280     const typename internal::remove_all<typename KernelXprType::Nested>::type&
281     kernelExpression() const { return m_kernel_xpr; }
282 
283   protected:
284     typename InputXprType::Nested m_input_xpr;
285     typename KernelXprType::Nested m_kernel_xpr;
286     const Indices m_indices;
287 };
288 
289 
290 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
291 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
292 {
293   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
294 
295   static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
296   static const int NumKernelDims = internal::array_size<Indices>::value;
297   typedef typename XprType::Index Index;
298   typedef DSizes<Index, NumDims> Dimensions;
299 
300   typedef typename XprType::Scalar Scalar;
301   typedef typename XprType::CoeffReturnType CoeffReturnType;
302   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
303   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
304 
305   enum {
306     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
307     PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
308     Layout = TensorEvaluator<InputArgType, Device>::Layout,
309     CoordAccess = false,  // to be implemented
310     RawAccess = false
311   };
312 
313   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
314       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
315   {
316     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
317 
318     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
319     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
320 
321     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
322       m_inputStride[0] = 1;
323       for (int i = 1; i < NumDims; ++i) {
324         m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
325       }
326     } else {
327       m_inputStride[NumDims - 1] = 1;
328       for (int i = NumDims - 2; i >= 0; --i) {
329         m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
330       }
331     }
332 
333     m_dimensions = m_inputImpl.dimensions();
334     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
335       for (int i = 0; i < NumKernelDims; ++i) {
336         const Index index = op.indices()[i];
337         const Index input_dim = input_dims[index];
338         const Index kernel_dim = kernel_dims[i];
339         const Index result_dim = input_dim - kernel_dim + 1;
340         m_dimensions[index] = result_dim;
341         if (i > 0) {
342           m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
343         } else {
344           m_kernelStride[0] = 1;
345         }
346         m_indexStride[i] = m_inputStride[index];
347       }
348 
349       m_outputStride[0] = 1;
350       for (int i = 1; i < NumDims; ++i) {
351         m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
352       }
353     } else {
354       for (int i = NumKernelDims - 1; i >= 0; --i) {
355         const Index index = op.indices()[i];
356         const Index input_dim = input_dims[index];
357         const Index kernel_dim = kernel_dims[i];
358         const Index result_dim = input_dim - kernel_dim + 1;
359         m_dimensions[index] = result_dim;
360         if (i < NumKernelDims - 1) {
361           m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
362         } else {
363           m_kernelStride[NumKernelDims - 1] = 1;
364         }
365         m_indexStride[i] = m_inputStride[index];
366       }
367 
368       m_outputStride[NumDims - 1] = 1;
369       for (int i = NumDims - 2; i >= 0; --i) {
370         m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
371       }
372     }
373   }
374 
375   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
376 
377   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
378     m_inputImpl.evalSubExprsIfNeeded(NULL);
379     preloadKernel();
380     return true;
381   }
382   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
383     m_inputImpl.cleanup();
384     if (m_local_kernel) {
385       m_device.deallocate((void*)m_kernel);
386       m_local_kernel = false;
387     }
388     m_kernel = NULL;
389   }
390 
391   void evalTo(typename XprType::Scalar* buffer) {
392     evalSubExprsIfNeeded(NULL);
393     for (int i = 0; i < dimensions().TotalSize(); ++i) {
394       buffer[i] += coeff(i);
395     }
396     cleanup();
397   }
398 
399   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
400   {
401     CoeffReturnType result = CoeffReturnType(0);
402     convolve(firstInput(index), 0, NumKernelDims-1, result);
403     return result;
404   }
405 
406   template<int LoadMode>
407   EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
408   {
409     Index indices[2] = {index, index+PacketSize-1};
410     Index startInputs[2] = {0, 0};
411     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
412       for (int i = NumDims - 1; i > 0; --i) {
413         const Index idx0 = indices[0] / m_outputStride[i];
414         const Index idx1 = indices[1] / m_outputStride[i];
415         startInputs[0] += idx0 * m_inputStride[i];
416         startInputs[1] += idx1 * m_inputStride[i];
417         indices[0] -= idx0 * m_outputStride[i];
418         indices[1] -= idx1 * m_outputStride[i];
419       }
420     } else {
421       for (int i = 0; i < NumDims - 1; ++i) {
422         const Index idx0 = indices[0] / m_outputStride[i];
423         const Index idx1 = indices[1] / m_outputStride[i];
424         startInputs[0] += idx0 * m_inputStride[i];
425         startInputs[1] += idx1 * m_inputStride[i];
426         indices[0] -= idx0 * m_outputStride[i];
427         indices[1] -= idx1 * m_outputStride[i];
428       }
429     }
430     startInputs[0] += indices[0];
431     startInputs[1] += indices[1];
432 
433     if (startInputs[1]-startInputs[0] == PacketSize-1) {
434       PacketReturnType result = internal::pset1<PacketReturnType>(0);
435       convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
436       return result;
437     } else {
438       EIGEN_ALIGN_MAX Scalar data[PacketSize];
439       data[0] = Scalar(0);
440       convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
441       for (int i = 1; i < PacketSize-1; ++i) {
442         data[i] = Scalar(0);
443         convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
444       }
445       data[PacketSize-1] = Scalar(0);
446       convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
447       return internal::pload<PacketReturnType>(data);
448     }
449   }
450 
451   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
452   costPerCoeff(bool vectorized) const {
453     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
454     // We ignore the use of fused multiply-add.
455     const double convolve_compute_cost =
456         TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
457     const double firstIndex_compute_cost =
458         NumDims *
459         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
460          TensorOpCost::DivCost<Index>());
461     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
462            kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
463                           m_kernelImpl.costPerCoeff(vectorized) +
464                           TensorOpCost(0, 0, convolve_compute_cost, vectorized,
465                                        PacketSize));
466   }
467 
468   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
469 
470  private:
471   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
472     Index startInput = 0;
473     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
474       for (int i = NumDims - 1; i > 0; --i) {
475         const Index idx = index / m_outputStride[i];
476         startInput += idx * m_inputStride[i];
477         index -= idx * m_outputStride[i];
478       }
479     } else {
480       for (int i = 0; i < NumDims - 1; ++i) {
481         const Index idx = index / m_outputStride[i];
482         startInput += idx * m_inputStride[i];
483         index -= idx * m_outputStride[i];
484       }
485     }
486     startInput += index;
487     return startInput;
488   }
489 
490   EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
491     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
492       const Index input = firstIndex + j * m_indexStride[DimIndex];
493       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
494       if (DimIndex > 0) {
495         convolve(input, kernel, DimIndex-1, accum);
496       } else {
497         accum += m_inputImpl.coeff(input) * m_kernel[kernel];
498       }
499     }
500   }
501 
502   template <typename Packet>
503   EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
504     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
505       const Index input = firstIndex + j * m_indexStride[DimIndex];
506       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
507       if (DimIndex > 0) {
508         convolvePacket(input, kernel, DimIndex-1, accum);
509       } else {
510         accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
511       }
512     }
513   }
514 
515   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
516     // Don't make a local copy of the kernel unless we have to (i.e. it's an
517     // expression that needs to be evaluated)
518     const Scalar* in_place = m_kernelImpl.data();
519     if (in_place) {
520       m_kernel = in_place;
521       m_local_kernel = false;
522     } else {
523       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
524       Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
525       typedef TensorEvalToOp<const KernelArgType> EvalTo;
526       EvalTo evalToTmp(local, m_kernelArg);
527       const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
528       internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
529 
530       m_kernel = local;
531       m_local_kernel = true;
532     }
533   }
534 
535   array<Index, NumDims> m_inputStride;
536   array<Index, NumDims> m_outputStride;
537 
538   array<Index, NumKernelDims> m_indexStride;
539   array<Index, NumKernelDims> m_kernelStride;
540   TensorEvaluator<InputArgType, Device> m_inputImpl;
541   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
542   Dimensions m_dimensions;
543 
544   KernelArgType m_kernelArg;
545   const Scalar* m_kernel;
546   bool m_local_kernel;
547   const Device& m_device;
548 };
549 
550 
551 
552 
553 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
554 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
555 
556 template <int StaticKernelSize>
557 struct GetKernelSize {
558   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
559     return StaticKernelSize;
560   }
561 };
562 template <>
563 struct GetKernelSize<Dynamic> {
564   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
565     return kernelSize;
566   }
567 };
568 
// 1D convolution: each block processes a tile of maxX output points per
// plane, staging the required input slice in shared memory first.
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
  // Dynamically sized shared-memory scratch area (size set at launch).
  extern __shared__ float s[];

  // Range of output x coordinates handled by this block.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  // The input tile is wider than the output tile by kernelSize - 1.
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load inputs to shared memory
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    // Each thread row (threadIdx.y) stages its own plane's input slice.
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    #pragma unroll
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    // Wait until the whole tile is staged before reading it.
    __syncthreads();

    // Compute the convolution
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      #pragma unroll
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
      buffer[tensor_index] = result;
    }
    // Don't let the next iteration overwrite the tile while it's being read.
    __syncthreads();
  }
};
616 
// 2D convolution: each block processes a maxX-by-maxY output tile per plane,
// staging the corresponding (larger) input tile in shared memory.
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
  // Dynamically sized shared-memory scratch area (size set at launch).
  extern __shared__ float s[];

  // Output tile extents along x and y for this block; the staged input tile
  // is larger by kernelSize - 1 in each convolved dimension.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    // Each z-layer of threads stages its own plane's tile.
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load inputs to shared memory
    #pragma unroll
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    // Wait until the whole tile is staged before reading it.
    __syncthreads();

    // Convolution
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        #pragma unroll
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          // Kernel is stored row-by-row (x fastest).
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          #pragma unroll
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
        buffer[tensor_index] = result;
      }
    }

    // Don't let the next iteration overwrite the tile while it's being read.
    __syncthreads();
  }
};
684 
// 3D convolution: each block processes a maxX-by-maxY-by-maxZ output tile.
// Unlike the 1D/2D kernels, planes are iterated sequentially by every block
// (no plane parallelism across threadIdx.z: z indexes the tile here).
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
  // Dynamically sized shared-memory scratch area (size set at launch).
  extern __shared__ float s[];

  // Load inputs to shared memory
  // Output tile extents; the staged input tile is larger by
  // kernelSize - 1 in each of the three convolved dimensions.
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    // Stage the 3D input tile, threads striding over each axis.
    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    // Wait until the whole tile is staged before reading it.
    __syncthreads();

    // Convolution
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          // Kernel layout: x fastest, then y, then z.
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    // Don't let the next plane overwrite the tile while it's being read.
    __syncthreads();
  }
};
750 
751 
752 
// GPU evaluator for TensorConvolutionOp. The convolution is materialized
// eagerly into a buffer (either caller-supplied or device-allocated) by
// launching one of the dimension-specific CUDA kernels above; coeff()/packet()
// then simply read back from that buffer.
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims =  internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  // NOTE(review): the initializer list names m_kernelArg before m_kernelImpl,
  // but members are declared in the opposite order, so actual initialization
  // follows declaration order (m_kernelImpl first) — harmless here, but a
  // -Wreorder warning candidate.
  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    // Output dimensions: input size shrinks by kernel_dim - 1 along each
    // convolved dimension ("valid" convolution); other dims are unchanged.
    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  // Evaluates the whole convolution eagerly. Returns false when the result
  // was written into the caller-provided buffer (no further evaluation
  // needed), true when an internal buffer was allocated that coeff() will
  // read from (and cleanup() must free).
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  // Releases the internal result buffer (if any) and the local kernel copy
  // (only when preloadKernel() had to materialize one).
  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      // The kernel data is already materialized in memory; use it directly.
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      // Evaluate the kernel expression into a freshly allocated device
      // buffer; ownership is tracked via m_local_kernel for cleanup().
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  // Integer ceiling division: smallest k with k * denom >= num.
  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }

  // Chooses a launch configuration (block shape, grid shape, shared-memory
  // size) based on the device limits and the kernel/output sizes, then
  // launches the 1D/2D/3D convolution kernel, writing the result into data.
  // The kernel-size switch statements dispatch to template instantiations
  // with statically known kernel sizes (4, 7) so the inner loops can unroll.
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        // numX: output extent along the convolved dim; numP: product of all
        // remaining ("plane") dimensions.
        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse
          // Convolved dim is the contiguous one: make the X tile as large as
          // shared memory allows (rounded up to a multiple of the warp size).
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }
        else {
          // Read as much as possible alongside the inner most dimension, that is the plane
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
        }

        // One row of (tile + halo) per plane handled by the block.
        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        // Size the grid's Y extent to keep all multiprocessors busy without
        // exceeding the number of planes to process.
        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));


        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch(kernel_size) {
          case 4: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }

      case 2: {
        // idxX is the contiguous (fastest-varying) convolved dim for the
        // current layout; idxY the other one.
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX*numY);

        // Split shared memory between the X and Y tile extents in proportion
        // to the kernel aspect ratio.
        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to warp size
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

        // One 2D (tile + halo) slab per plane handled by the block.
        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));


        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }

      case 3: {
        // Map the three convolved dims so idxX is the contiguous one for the
        // current layout (note idxY is 1 in both layouts).
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        // Greedily size the tile dim by dim (X first, then Y given X, then Z
        // given X and Y), capped at 128 per dimension.
        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        // Single 3D (tile + halo) block; planes are iterated inside the
        // kernel, reusing the same shared buffer.
        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
                                      m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        // Only 1, 2 and 3 convolved dimensions are supported on GPU.
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  // Reads a coefficient of the precomputed result; only valid after
  // evalSubExprsIfNeeded(NULL) allocated and filled m_buf.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;               // result buffer (owned iff allocated here)
  const Scalar* m_kernel;      // kernel coefficients (owned iff m_local_kernel)
  bool m_local_kernel;         // true when m_kernel was allocated by preloadKernel()

  const GpuDevice& m_device;
};
1099 #endif
1100 
1101 
1102 } // end namespace Eigen
1103 
1104 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
1105