1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
12 
13 namespace Eigen {
14 
15 namespace internal {
16 
17 template <typename Op, typename XprType>
18 struct traits<TensorScanOp<Op, XprType> >
19     : public traits<XprType> {
20   typedef typename XprType::Scalar Scalar;
21   typedef traits<XprType> XprTraits;
22   typedef typename XprTraits::StorageKind StorageKind;
23   typedef typename XprType::Nested Nested;
24   typedef typename remove_reference<Nested>::type _Nested;
25   static const int NumDimensions = XprTraits::NumDimensions;
26   static const int Layout = XprTraits::Layout;
27 };
28 
29 template<typename Op, typename XprType>
30 struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
31 {
32   typedef const TensorScanOp<Op, XprType>& type;
33 };
34 
35 template<typename Op, typename XprType>
36 struct nested<TensorScanOp<Op, XprType>, 1,
37             typename eval<TensorScanOp<Op, XprType> >::type>
38 {
39   typedef TensorScanOp<Op, XprType> type;
40 };
41 } // end namespace internal
42 
43 /** \class TensorScan
44   * \ingroup CXX11_Tensor_Module
45   *
46   * \brief Tensor scan class.
47   */
48 template <typename Op, typename XprType>
49 class TensorScanOp
50     : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
51 public:
52   typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
53   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
54   typedef typename XprType::CoeffReturnType CoeffReturnType;
55   typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
56   typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
57   typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
58 
59   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
60       const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
61       : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
62 
63   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
64   const Index axis() const { return m_axis; }
65   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
66   const XprType& expression() const { return m_expr; }
67   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
68   const Op accumulator() const { return m_accumulator; }
69   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
70   bool exclusive() const { return m_exclusive; }
71 
72 protected:
73   typename XprType::Nested m_expr;
74   const Index m_axis;
75   const Op m_accumulator;
76   const bool m_exclusive;
77 };
78 
79 template <typename Self, typename Reducer, typename Device>
80 struct ScanLauncher;
81 
82 // Eval as rvalue
83 template <typename Op, typename ArgType, typename Device>
84 struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
85 
86   typedef TensorScanOp<Op, ArgType> XprType;
87   typedef typename XprType::Index Index;
88   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
89   typedef DSizes<Index, NumDims> Dimensions;
90   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
91   typedef typename XprType::CoeffReturnType CoeffReturnType;
92   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
93   typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
94 
95   enum {
96     IsAligned = false,
97     PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
98     BlockAccess = false,
99     Layout = TensorEvaluator<ArgType, Device>::Layout,
100     CoordAccess = false,
101     RawAccess = true
102   };
103 
104   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
105                                                         const Device& device)
106       : m_impl(op.expression(), device),
107         m_device(device),
108         m_exclusive(op.exclusive()),
109         m_accumulator(op.accumulator()),
110         m_size(m_impl.dimensions()[op.axis()]),
111         m_stride(1),
112         m_output(NULL) {
113 
114     // Accumulating a scalar isn't supported.
115     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
116     eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
117 
118     // Compute stride of scan axis
119     const Dimensions& dims = m_impl.dimensions();
120     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
121       for (int i = 0; i < op.axis(); ++i) {
122         m_stride = m_stride * dims[i];
123       }
124     } else {
125       for (int i = NumDims - 1; i > op.axis(); --i) {
126         m_stride = m_stride * dims[i];
127       }
128     }
129   }
130 
131   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
132     return m_impl.dimensions();
133   }
134 
135   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
136     return m_stride;
137   }
138 
139   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
140     return m_size;
141   }
142 
143   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
144     return m_accumulator;
145   }
146 
147   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
148     return m_exclusive;
149   }
150 
151   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
152     return m_impl;
153   }
154 
155   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
156     return m_device;
157   }
158 
159   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
160     m_impl.evalSubExprsIfNeeded(NULL);
161     ScanLauncher<Self, Op, Device> launcher;
162     if (data) {
163       launcher(*this, data);
164       return false;
165     }
166 
167     const Index total_size = internal::array_prod(dimensions());
168     m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
169     launcher(*this, m_output);
170     return true;
171   }
172 
173   template<int LoadMode>
174   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
175     return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
176   }
177 
178   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
179   {
180     return m_output;
181   }
182 
183   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
184   {
185     return m_output[index];
186   }
187 
188   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
189     return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
190   }
191 
192   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
193     if (m_output != NULL) {
194       m_device.deallocate(m_output);
195       m_output = NULL;
196     }
197     m_impl.cleanup();
198   }
199 
200 protected:
201   TensorEvaluator<ArgType, Device> m_impl;
202   const Device& m_device;
203   const bool m_exclusive;
204   Op m_accumulator;
205   const Index m_size;
206   Index m_stride;
207   CoeffReturnType* m_output;
208 };
209 
210 // CPU implementation of scan
211 // TODO(ibab) This single-threaded implementation should be parallelized,
212 // at least by running multiple scans at the same time.
213 template <typename Self, typename Reducer, typename Device>
214 struct ScanLauncher {
215   void operator()(Self& self, typename Self::CoeffReturnType *data) {
216     Index total_size = internal::array_prod(self.dimensions());
217 
218     // We fix the index along the scan axis to 0 and perform a
219     // scan per remaining entry. The iteration is split into two nested
220     // loops to avoid an integer division by keeping track of each idx1 and idx2.
221     for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
222       for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
223         // Calculate the starting offset for the scan
224         Index offset = idx1 + idx2;
225 
226         // Compute the scan along the axis, starting at the calculated offset
227         typename Self::CoeffReturnType accum = self.accumulator().initialize();
228         for (Index idx3 = 0; idx3 < self.size(); idx3++) {
229           Index curr = offset + idx3 * self.stride();
230 
231           if (self.exclusive()) {
232             data[curr] = self.accumulator().finalize(accum);
233             self.accumulator().reduce(self.inner().coeff(curr), &accum);
234           } else {
235             self.accumulator().reduce(self.inner().coeff(curr), &accum);
236             data[curr] = self.accumulator().finalize(accum);
237           }
238         }
239       }
240     }
241   }
242 };
243 
244 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
245 
246 // GPU implementation of scan
247 // TODO(ibab) This placeholder implementation performs multiple scans in
248 // parallel, but it would be better to use a parallel scan algorithm and
249 // optimize memory access.
250 template <typename Self, typename Reducer>
251 __global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
252   // Compute offset as in the CPU version
253   Index val = threadIdx.x + blockIdx.x * blockDim.x;
254   Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
255 
256   if (offset + (self.size() - 1) * self.stride() < total_size) {
257     // Compute the scan along the axis, starting at the calculated offset
258     typename Self::CoeffReturnType accum = self.accumulator().initialize();
259     for (Index idx = 0; idx < self.size(); idx++) {
260       Index curr = offset + idx * self.stride();
261       if (self.exclusive()) {
262         data[curr] = self.accumulator().finalize(accum);
263         self.accumulator().reduce(self.inner().coeff(curr), &accum);
264       } else {
265         self.accumulator().reduce(self.inner().coeff(curr), &accum);
266         data[curr] = self.accumulator().finalize(accum);
267       }
268     }
269   }
270   __syncthreads();
271 
272 }
273 
274 template <typename Self, typename Reducer>
275 struct ScanLauncher<Self, Reducer, GpuDevice> {
276   void operator()(const Self& self, typename Self::CoeffReturnType* data) {
277      Index total_size = internal::array_prod(self.dimensions());
278      Index num_blocks = (total_size / self.size() + 63) / 64;
279      Index block_size = 64;
280      LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
281   }
282 };
283 #endif  // EIGEN_USE_GPU && __CUDACC__
284 
285 }  // end namespace Eigen
286 
287 #endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
288