1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #ifndef __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
44 #define __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
45 
46 #include "../common.hpp"
47 #include "../vec_traits.hpp"
48 #include "../functional.hpp"
49 
50 //! @cond IGNORED
51 
52 namespace cv { namespace cuda { namespace device
53 {
54     namespace transform_detail
55     {
56         //! Read Write Traits
57 
58         template <typename T, typename D, int shift> struct UnaryReadWriteTraits
59         {
60             typedef typename TypeVec<T, shift>::vec_type read_type;
61             typedef typename TypeVec<D, shift>::vec_type write_type;
62         };
63 
64         template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
65         {
66             typedef typename TypeVec<T1, shift>::vec_type read_type1;
67             typedef typename TypeVec<T2, shift>::vec_type read_type2;
68             typedef typename TypeVec<D, shift>::vec_type write_type;
69         };
70 
71         //! Transform kernels
72 
73         template <int shift> struct OpUnroller;
74         template <> struct OpUnroller<1>
75         {
76             template <typename T, typename D, typename UnOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller77             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
78             {
79                 if (mask(y, x_shifted))
80                     dst.x = op(src.x);
81             }
82 
83             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller84             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
85             {
86                 if (mask(y, x_shifted))
87                     dst.x = op(src1.x, src2.x);
88             }
89         };
90         template <> struct OpUnroller<2>
91         {
92             template <typename T, typename D, typename UnOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller93             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
94             {
95                 if (mask(y, x_shifted))
96                     dst.x = op(src.x);
97                 if (mask(y, x_shifted + 1))
98                     dst.y = op(src.y);
99             }
100 
101             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller102             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
103             {
104                 if (mask(y, x_shifted))
105                     dst.x = op(src1.x, src2.x);
106                 if (mask(y, x_shifted + 1))
107                     dst.y = op(src1.y, src2.y);
108             }
109         };
110         template <> struct OpUnroller<3>
111         {
112             template <typename T, typename D, typename UnOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller113             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
114             {
115                 if (mask(y, x_shifted))
116                     dst.x = op(src.x);
117                 if (mask(y, x_shifted + 1))
118                     dst.y = op(src.y);
119                 if (mask(y, x_shifted + 2))
120                     dst.z = op(src.z);
121             }
122 
123             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller124             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
125             {
126                 if (mask(y, x_shifted))
127                     dst.x = op(src1.x, src2.x);
128                 if (mask(y, x_shifted + 1))
129                     dst.y = op(src1.y, src2.y);
130                 if (mask(y, x_shifted + 2))
131                     dst.z = op(src1.z, src2.z);
132             }
133         };
134         template <> struct OpUnroller<4>
135         {
136             template <typename T, typename D, typename UnOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller137             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
138             {
139                 if (mask(y, x_shifted))
140                     dst.x = op(src.x);
141                 if (mask(y, x_shifted + 1))
142                     dst.y = op(src.y);
143                 if (mask(y, x_shifted + 2))
144                     dst.z = op(src.z);
145                 if (mask(y, x_shifted + 3))
146                     dst.w = op(src.w);
147             }
148 
149             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller150             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
151             {
152                 if (mask(y, x_shifted))
153                     dst.x = op(src1.x, src2.x);
154                 if (mask(y, x_shifted + 1))
155                     dst.y = op(src1.y, src2.y);
156                 if (mask(y, x_shifted + 2))
157                     dst.z = op(src1.z, src2.z);
158                 if (mask(y, x_shifted + 3))
159                     dst.w = op(src1.w, src2.w);
160             }
161         };
162         template <> struct OpUnroller<8>
163         {
164             template <typename T, typename D, typename UnOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller165             static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
166             {
167                 if (mask(y, x_shifted))
168                     dst.a0 = op(src.a0);
169                 if (mask(y, x_shifted + 1))
170                     dst.a1 = op(src.a1);
171                 if (mask(y, x_shifted + 2))
172                     dst.a2 = op(src.a2);
173                 if (mask(y, x_shifted + 3))
174                     dst.a3 = op(src.a3);
175                 if (mask(y, x_shifted + 4))
176                     dst.a4 = op(src.a4);
177                 if (mask(y, x_shifted + 5))
178                     dst.a5 = op(src.a5);
179                 if (mask(y, x_shifted + 6))
180                     dst.a6 = op(src.a6);
181                 if (mask(y, x_shifted + 7))
182                     dst.a7 = op(src.a7);
183             }
184 
185             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
unrollcv::cuda::device::transform_detail::OpUnroller186             static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
187             {
188                 if (mask(y, x_shifted))
189                     dst.a0 = op(src1.a0, src2.a0);
190                 if (mask(y, x_shifted + 1))
191                     dst.a1 = op(src1.a1, src2.a1);
192                 if (mask(y, x_shifted + 2))
193                     dst.a2 = op(src1.a2, src2.a2);
194                 if (mask(y, x_shifted + 3))
195                     dst.a3 = op(src1.a3, src2.a3);
196                 if (mask(y, x_shifted + 4))
197                     dst.a4 = op(src1.a4, src2.a4);
198                 if (mask(y, x_shifted + 5))
199                     dst.a5 = op(src1.a5, src2.a5);
200                 if (mask(y, x_shifted + 6))
201                     dst.a6 = op(src1.a6, src2.a6);
202                 if (mask(y, x_shifted + 7))
203                     dst.a7 = op(src1.a7, src2.a7);
204             }
205         };
206 
207         template <typename T, typename D, typename UnOp, typename Mask>
transformSmart(const PtrStepSz<T> src_,PtrStep<D> dst_,const Mask mask,const UnOp op)208         static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
209         {
210             typedef TransformFunctorTraits<UnOp> ft;
211             typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
212             typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
213 
214             const int x = threadIdx.x + blockIdx.x * blockDim.x;
215             const int y = threadIdx.y + blockIdx.y * blockDim.y;
216             const int x_shifted = x * ft::smart_shift;
217 
218             if (y < src_.rows)
219             {
220                 const T* src = src_.ptr(y);
221                 D* dst = dst_.ptr(y);
222 
223                 if (x_shifted + ft::smart_shift - 1 < src_.cols)
224                 {
225                     const read_type src_n_el = ((const read_type*)src)[x];
226                     write_type dst_n_el = ((const write_type*)dst)[x];
227 
228                     OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
229 
230                     ((write_type*)dst)[x] = dst_n_el;
231                 }
232                 else
233                 {
234                     for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
235                     {
236                         if (mask(y, real_x))
237                             dst[real_x] = op(src[real_x]);
238                     }
239                 }
240             }
241         }
242 
243         template <typename T, typename D, typename UnOp, typename Mask>
transformSimple(const PtrStepSz<T> src,PtrStep<D> dst,const Mask mask,const UnOp op)244         __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
245         {
246             const int x = blockDim.x * blockIdx.x + threadIdx.x;
247             const int y = blockDim.y * blockIdx.y + threadIdx.y;
248 
249             if (x < src.cols && y < src.rows && mask(y, x))
250             {
251                 dst.ptr(y)[x] = op(src.ptr(y)[x]);
252             }
253         }
254 
255         template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
transformSmart(const PtrStepSz<T1> src1_,const PtrStep<T2> src2_,PtrStep<D> dst_,const Mask mask,const BinOp op)256         static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
257             const Mask mask, const BinOp op)
258         {
259             typedef TransformFunctorTraits<BinOp> ft;
260             typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
261             typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
262             typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
263 
264             const int x = threadIdx.x + blockIdx.x * blockDim.x;
265             const int y = threadIdx.y + blockIdx.y * blockDim.y;
266             const int x_shifted = x * ft::smart_shift;
267 
268             if (y < src1_.rows)
269             {
270                 const T1* src1 = src1_.ptr(y);
271                 const T2* src2 = src2_.ptr(y);
272                 D* dst = dst_.ptr(y);
273 
274                 if (x_shifted + ft::smart_shift - 1 < src1_.cols)
275                 {
276                     const read_type1 src1_n_el = ((const read_type1*)src1)[x];
277                     const read_type2 src2_n_el = ((const read_type2*)src2)[x];
278                     write_type dst_n_el = ((const write_type*)dst)[x];
279 
280                     OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
281 
282                     ((write_type*)dst)[x] = dst_n_el;
283                 }
284                 else
285                 {
286                     for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
287                     {
288                         if (mask(y, real_x))
289                             dst[real_x] = op(src1[real_x], src2[real_x]);
290                     }
291                 }
292             }
293         }
294 
295         template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
transformSimple(const PtrStepSz<T1> src1,const PtrStep<T2> src2,PtrStep<D> dst,const Mask mask,const BinOp op)296         static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
297             const Mask mask, const BinOp op)
298         {
299             const int x = blockDim.x * blockIdx.x + threadIdx.x;
300             const int y = blockDim.y * blockIdx.y + threadIdx.y;
301 
302             if (x < src1.cols && y < src1.rows && mask(y, x))
303             {
304                 const T1 src1_data = src1.ptr(y)[x];
305                 const T2 src2_data = src2.ptr(y)[x];
306                 dst.ptr(y)[x] = op(src1_data, src2_data);
307             }
308         }
309 
310         template <bool UseSmart> struct TransformDispatcher;
311         template<> struct TransformDispatcher<false>
312         {
313             template <typename T, typename D, typename UnOp, typename Mask>
callcv::cuda::device::transform_detail::TransformDispatcher314             static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
315             {
316                 typedef TransformFunctorTraits<UnOp> ft;
317 
318                 const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
319                 const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
320 
321                 transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
322                 cudaSafeCall( cudaGetLastError() );
323 
324                 if (stream == 0)
325                     cudaSafeCall( cudaDeviceSynchronize() );
326             }
327 
328             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
callcv::cuda::device::transform_detail::TransformDispatcher329             static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
330             {
331                 typedef TransformFunctorTraits<BinOp> ft;
332 
333                 const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
334                 const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
335 
336                 transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
337                 cudaSafeCall( cudaGetLastError() );
338 
339                 if (stream == 0)
340                     cudaSafeCall( cudaDeviceSynchronize() );
341             }
342         };
343         template<> struct TransformDispatcher<true>
344         {
345             template <typename T, typename D, typename UnOp, typename Mask>
callcv::cuda::device::transform_detail::TransformDispatcher346             static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
347             {
348                 typedef TransformFunctorTraits<UnOp> ft;
349 
350                 CV_StaticAssert(ft::smart_shift != 1, "");
351 
352                 if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
353                     !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
354                 {
355                     TransformDispatcher<false>::call(src, dst, op, mask, stream);
356                     return;
357                 }
358 
359                 const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
360                 const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
361 
362                 transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
363                 cudaSafeCall( cudaGetLastError() );
364 
365                 if (stream == 0)
366                     cudaSafeCall( cudaDeviceSynchronize() );
367             }
368 
369             template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
callcv::cuda::device::transform_detail::TransformDispatcher370             static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
371             {
372                 typedef TransformFunctorTraits<BinOp> ft;
373 
374                 CV_StaticAssert(ft::smart_shift != 1, "");
375 
376                 if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
377                     !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
378                     !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
379                 {
380                     TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
381                     return;
382                 }
383 
384                 const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
385                 const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
386 
387                 transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
388                 cudaSafeCall( cudaGetLastError() );
389 
390                 if (stream == 0)
391                     cudaSafeCall( cudaDeviceSynchronize() );
392             }
393         };
394     } // namespace transform_detail
395 }}} // namespace cv { namespace cuda { namespace cudev
396 
397 //! @endcond
398 
399 #endif // __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
400