/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
#define __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__

#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../functional.hpp"

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
    namespace transform_detail
    {
        //! Read Write Traits

        template <typename T, typename D, int shift> struct UnaryReadWriteTraits
        {
            typedef typename TypeVec<T, shift>::vec_type read_type;
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };

        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
        {
            typedef typename TypeVec<T1, shift>::vec_type read_type1;
            typedef typename TypeVec<T2, shift>::vec_type read_type2;
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };
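        // The traits above are a thin wrapper over TypeVec from vec_traits.hpp: they
        // map a scalar element type and an unroll factor ("shift") onto the matching
        // vector type, so the "smart" kernels below can move several elements per
        // memory transaction. A minimal illustration, assuming a functor whose
        // smart_shift is 4 and uchar data:
        //
        //     typedef UnaryReadWriteTraits<uchar, uchar, 4> Traits;
        //     // Traits::read_type  -> uchar4 (one load  covers four source pixels)
        //     // Traits::write_type -> uchar4 (one store covers four result pixels)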
        //! Transform kernels

        template <int shift> struct OpUnroller;
        template <> struct OpUnroller<1>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
            }
        };
        template <> struct OpUnroller<2>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
            }
        };
        template <> struct OpUnroller<3>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src.z);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src1.z, src2.z);
            }
        };
        template <> struct OpUnroller<4>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src.z);
                if (mask(y, x_shifted + 3))
                    dst.w = op(src.w);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src1.z, src2.z);
                if (mask(y, x_shifted + 3))
                    dst.w = op(src1.w, src2.w);
            }
        };
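        // The 1/2/3/4-element specializations above address the .x/.y/.z/.w fields of
        // the built-in CUDA vector types. The <8> specialization below instead uses
        // a0..a7, which is presumably the field naming of the 8-element aggregate
        // types declared in vec_traits.hpp; each lane is still guarded by its own
        // mask lookup, so a vectorized load never bypasses per-pixel masking.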
        template <> struct OpUnroller<8>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.a0 = op(src.a0);
                if (mask(y, x_shifted + 1))
                    dst.a1 = op(src.a1);
                if (mask(y, x_shifted + 2))
                    dst.a2 = op(src.a2);
                if (mask(y, x_shifted + 3))
                    dst.a3 = op(src.a3);
                if (mask(y, x_shifted + 4))
                    dst.a4 = op(src.a4);
                if (mask(y, x_shifted + 5))
                    dst.a5 = op(src.a5);
                if (mask(y, x_shifted + 6))
                    dst.a6 = op(src.a6);
                if (mask(y, x_shifted + 7))
                    dst.a7 = op(src.a7);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.a0 = op(src1.a0, src2.a0);
                if (mask(y, x_shifted + 1))
                    dst.a1 = op(src1.a1, src2.a1);
                if (mask(y, x_shifted + 2))
                    dst.a2 = op(src1.a2, src2.a2);
                if (mask(y, x_shifted + 3))
                    dst.a3 = op(src1.a3, src2.a3);
                if (mask(y, x_shifted + 4))
                    dst.a4 = op(src1.a4, src2.a4);
                if (mask(y, x_shifted + 5))
                    dst.a5 = op(src1.a5, src2.a5);
                if (mask(y, x_shifted + 6))
                    dst.a6 = op(src1.a6, src2.a6);
                if (mask(y, x_shifted + 7))
                    dst.a7 = op(src1.a7, src2.a7);
            }
        };

        template <typename T, typename D, typename UnOp, typename Mask>
        static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
        {
            typedef TransformFunctorTraits<UnOp> ft;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * ft::smart_shift;

            if (y < src_.rows)
            {
                const T* src = src_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src_.cols)
                {
                    const read_type src_n_el = ((const read_type*)src)[x];
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src[real_x]);
                    }
                }
            }
        }

        template <typename T, typename D, typename UnOp, typename Mask>
        __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < src.cols && y < src.rows && mask(y, x))
            {
                dst.ptr(y)[x] = op(src.ptr(y)[x]);
            }
        }
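        // Indexing sketch for the transformSmart kernels (assuming smart_shift == 4
        // and blockDim.x == 32): the thread with threadIdx.x == 5 in blockIdx.x == 2
        // gets x = 5 + 2 * 32 = 69 and processes columns x_shifted = 276..279 with a
        // single vectorized load/store. A thread whose window would run past
        // src_.cols takes the scalar tail loop instead, so rows whose width is not a
        // multiple of smart_shift never trigger out-of-bounds accesses. The binary
        // variants below follow the same pattern with two source rows.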
        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
                                              const Mask mask, const BinOp op)
        {
            typedef TransformFunctorTraits<BinOp> ft;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * ft::smart_shift;

            if (y < src1_.rows)
            {
                const T1* src1 = src1_.ptr(y);
                const T2* src2 = src2_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
                {
                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src1[real_x], src2[real_x]);
                    }
                }
            }
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
                                               const Mask mask, const BinOp op)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < src1.cols && y < src1.rows && mask(y, x))
            {
                const T1 src1_data = src1.ptr(y)[x];
                const T2 src2_data = src2.ptr(y)[x];
                dst.ptr(y)[x] = op(src1_data, src2_data);
            }
        }

        template <bool UseSmart> struct TransformDispatcher;
        template<> struct TransformDispatcher<false>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<UnOp> ft;

                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);

                transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);

                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        };
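        // The <true> specialization below takes the vectorized path only when the
        // base pointer and row step of every operand are aligned to
        // smart_shift * sizeof(element); otherwise it delegates to the scalar
        // dispatcher above. For example, with T == uchar and smart_shift == 4, both
        // src.data and src.step must be multiples of 4 bytes for the uchar4
        // loads/stores to be valid.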
        template<> struct TransformDispatcher<true>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<UnOp> ft;

                CV_StaticAssert(ft::smart_shift != 1, "");

                if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
                {
                    TransformDispatcher<false>::call(src, dst, op, mask, stream);
                    return;
                }

                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);

                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

                CV_StaticAssert(ft::smart_shift != 1, "");

                if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
                    !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
                {
                    TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
                    return;
                }

                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);

                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        };
    } // namespace transform_detail
}}} // namespace cv { namespace cuda { namespace device

//! @endcond

#endif // __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__