/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
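// This file implements the device-side kernels behind cv::cuda::StereoBeliefPropagation:
// a coarse-to-fine, min-sum loopy belief propagation stereo matcher in the spirit of
// Felzenszwalb and Huttenlocher's "Efficient Belief Propagation for Early Vision".
//
// A simplified sketch of how the host-side driver (which lives in the cudastereo
// module, not in this file) is expected to use the launchers below; the exact call
// sequence and buffer management are on the host side:
//
//   load_constants(ndisp, max_data_term, data_weight, max_disc_term, disc_single_jump);
//   comp_data_gpu<uchar, short>(left, right, data, stream); // per-pixel data costs
//   // build a cost pyramid with data_step_down_gpu(), then, from the coarsest level
//   // to the finest:
//   //   level_up_messages_gpu()   -- upsample the u/d/l/r messages to the finer level
//   //   calc_all_iterations_gpu() -- run the BP message-passing sweeps
//   output_gpu<short>(u, d, l, r, data, disp, stream);      // winner-take-all readout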
#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/limits.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace stereobp
    {
        ///////////////////////////////////////////////////////////////
        /////////////////////// load constants ////////////////////////
        ///////////////////////////////////////////////////////////////

        __constant__ int   cndisp;
        __constant__ float cmax_data_term;
        __constant__ float cdata_weight;
        __constant__ float cmax_disc_term;
        __constant__ float cdisc_single_jump;

        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
        {
            cudaSafeCall( cudaMemcpyToSymbol(cndisp,            &ndisp,            sizeof(int  )) );
            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
        }
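        // All kernels in this file read the __constant__ variables above, so
        // load_constants() must run before any of the launchers below. This form of
        // cudaMemcpyToSymbol() behaves synchronously, so kernel launches issued
        // afterwards are guaranteed to see the updated values.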
        ///////////////////////////////////////////////////////////////
        ////////////////////////// comp data //////////////////////////
        ///////////////////////////////////////////////////////////////

        template <int cn> struct PixDiff;
        template <> struct PixDiff<1>
        {
            __device__ __forceinline__ PixDiff(const uchar* ls)
            {
                l = *ls;
            }
            __device__ __forceinline__ float operator()(const uchar* rs) const
            {
                return ::abs((int)l - *rs);
            }
            uchar l;
        };
        template <> struct PixDiff<3>
        {
            __device__ __forceinline__ PixDiff(const uchar* ls)
            {
                l = *((uchar3*)ls);
            }
            __device__ __forceinline__ float operator()(const uchar* rs) const
            {
                const float tr = 0.299f;
                const float tg = 0.587f;
                const float tb = 0.114f;

                float val = tb * ::abs((int)l.x - rs[0]);
                val += tg * ::abs((int)l.y - rs[1]);
                val += tr * ::abs((int)l.z - rs[2]);

                return val;
            }
            uchar3 l;
        };
        template <> struct PixDiff<4>
        {
            __device__ __forceinline__ PixDiff(const uchar* ls)
            {
                l = *((uchar4*)ls);
            }
            __device__ __forceinline__ float operator()(const uchar* rs) const
            {
                const float tr = 0.299f;
                const float tg = 0.587f;
                const float tb = 0.114f;

                uchar4 r = *((uchar4*)rs);

                float val = tb * ::abs((int)l.x - r.x);
                val += tg * ::abs((int)l.y - r.y);
                val += tr * ::abs((int)l.z - r.z);

                return val;
            }
            uchar4 l;
        };

        template <int cn, typename D>
        __global__ void comp_data(const PtrStepSzb left, const PtrStepb right, PtrStep<D> data)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
            {
                const uchar* ls = left.ptr(y) + x * cn;
                const PixDiff<cn> pixDiff(ls);
                const uchar* rs = right.ptr(y) + x * cn;

                D* ds = data.ptr(y) + x;
                const size_t disp_step = data.step * left.rows / sizeof(D);

                for (int disp = 0; disp < cndisp; disp++)
                {
                    if (x - disp >= 1)
                    {
                        float val = pixDiff(rs - disp * cn);

                        ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
                    }
                    else
                    {
                        ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
                    }
                }
            }
        }
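        // comp_data() fills the data-cost volume with the truncated linear term
        //
        //   D(x, y, d) = data_weight * min(diff(x, y, d), max_data_term),
        //
        // where diff() is the absolute intensity difference for single-channel input
        // and, for 3/4-channel (BGR) input, the per-channel absolute differences
        // blended with the BT.601 luma weights (0.114, 0.587, 0.299). The volume is
        // stored disparity-major: the plane for disparity d begins d * rows image
        // rows into `data`, hence disp_step = data.step * left.rows / sizeof(D).
        // Pixels whose match would fall off the left image border (x - disp < 1)
        // receive the maximum truncated cost.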
        template<typename T, typename D>
        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);

        template <> void comp_data_gpu<uchar, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        template <> void comp_data_gpu<uchar, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <> void comp_data_gpu<uchar3, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        template <> void comp_data_gpu<uchar3, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <> void comp_data_gpu<uchar4, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        template <> void comp_data_gpu<uchar4, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
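        // The six explicit specializations above differ only in the <cn, D> arguments
        // forwarded to comp_data; each launches 32x8 thread blocks over a divUp()
        // grid covering the image. As with every launcher in this file, passing the
        // null stream (0) appends a cudaDeviceSynchronize(), making the call
        // blocking; with a non-default stream the launch stays asynchronous.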
        ///////////////////////////////////////////////////////////////
        //////////////////////// data step down ///////////////////////
        ///////////////////////////////////////////////////////////////

        template <typename T>
        __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst_cols && y < dst_rows)
            {
                for (int d = 0; d < cndisp; ++d)
                {
                    float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
                    dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
                    dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
                    dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];

                    dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
                }
            }
        }

        template<typename T>
        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(dst_cols, threads.x);
            grid.y = divUp(dst_rows, threads.y);

            data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
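        // data_step_down() builds the next-coarser level of the cost pyramid: for
        // each of the cndisp disparity planes, every coarse-level cost is the
        // saturated sum of the 2x2 block of fine-level costs beneath it.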
        ///////////////////////////////////////////////////////////////
        /////////////////// level up messages  ////////////////////////
        ///////////////////////////////////////////////////////////////

        template <typename T>
        __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst_cols && y < dst_rows)
            {
                const size_t dst_disp_step = dst.step * dst_rows / sizeof(T);
                const size_t src_disp_step = src.step * src_rows / sizeof(T);

                T*       dstr = dst.ptr(y  ) + x;
                const T* srcr = src.ptr(y/2) + x/2;

                for (int d = 0; d < cndisp; ++d)
                    dstr[d * dst_disp_step] = srcr[d * src_disp_step];
            }
        }

        template <typename T>
        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(dst_cols, threads.x);
            grid.y = divUp(dst_rows, threads.y);

            int src_idx = (dst_idx + 1) & 1;

            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mus[src_idx], (PtrStepSz<T>)mus[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mds[src_idx], (PtrStepSz<T>)mds[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mls[src_idx], (PtrStepSz<T>)mls[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mrs[src_idx], (PtrStepSz<T>)mrs[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
        template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        ////////////////////  calc all iterations /////////////////////
        ///////////////////////////////////////////////////////////////

        template <typename T>
        __device__ void calc_min_linear_penalty(T* dst, size_t step)
        {
            float prev = dst[0];
            float cur;
            for (int disp = 1; disp < cndisp; ++disp)
            {
                prev += cdisc_single_jump;
                cur = dst[step * disp];
                if (prev < cur)
                {
                    cur = prev;
                    dst[step * disp] = saturate_cast<T>(prev);
                }
                prev = cur;
            }

            prev = dst[(cndisp - 1) * step];
            for (int disp = cndisp - 2; disp >= 0; disp--)
            {
                prev += cdisc_single_jump;
                cur = dst[step * disp];
                if (prev < cur)
                {
                    cur = prev;
                    dst[step * disp] = saturate_cast<T>(prev);
                }
                prev = cur;
            }
        }

        template <typename T>
        __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
        {
            float minimum = device::numeric_limits<float>::max();

            for (int i = 0; i < cndisp; ++i)
            {
                float dst_reg = msg1[msg_disp_step * i];
                dst_reg += msg2[msg_disp_step * i];
                dst_reg += msg3[msg_disp_step * i];
                dst_reg += data[data_disp_step * i];

                if (dst_reg < minimum)
                    minimum = dst_reg;

                dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
            }

            calc_min_linear_penalty(dst, msg_disp_step);

            minimum += cmax_disc_term;

            float sum = 0;
            for (int i = 0; i < cndisp; ++i)
            {
                float dst_reg = dst[msg_disp_step * i];
                if (dst_reg > minimum)
                {
                    dst_reg = minimum;
                    dst[msg_disp_step * i] = saturate_cast<T>(minimum);
                }
                sum += dst_reg;
            }
            sum /= cndisp;

            for (int i = 0; i < cndisp; ++i)
                dst[msg_disp_step * i] -= sum;
        }

        template <typename T>
        __global__ void one_iteration(int t, int elem_step, T* u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
        {
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);

            if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
            {
                T* us = u + y * elem_step + x;
                T* ds = d + y * elem_step + x;
                T* ls = l + y * elem_step + x;
                T* rs = r + y * elem_step + x;
                const T* dt = data.ptr(y) + x;

                size_t msg_disp_step = elem_step * rows;
                size_t data_disp_step = data.step * rows / sizeof(T);

                message(us + elem_step, ls        + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
                message(ds - elem_step, ls        + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
                message(us + elem_step, ds - elem_step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
                message(us + elem_step, ds - elem_step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
            }
        }

        template <typename T>
        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(cols, threads.x << 1);
            grid.y = divUp(rows, threads.y);

            int elem_step = (int)(u.step / sizeof(T));

            for (int t = 0; t < iters; ++t)
            {
                one_iteration<T><<<grid, threads, 0, stream>>>(t, elem_step, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        }

        template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
        template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
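        // The kernels above implement the min-sum belief propagation sweeps:
        //
        //  - one_iteration() uses a checkerboard (red-black) schedule: on iteration t
        //    only pixels with (x + y + t) even are updated, so each half-sweep reads
        //    messages written by the opposite color in the previous half-sweep.
        //  - message() accumulates the three incoming messages plus the data cost,
        //    then calc_min_linear_penalty() applies the linear smoothness term via
        //    the two-pass O(cndisp) distance-transform trick (a forward and a
        //    backward pass, each adding cdisc_single_jump per disparity step)
        //    instead of a naive O(cndisp^2) min-convolution. The result is clamped
        //    to minimum + cmax_disc_term (the truncation) and shifted to zero mean,
        //    which keeps the values small and centered for the short (fixed-point)
        //    message specialization.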
        ///////////////////////////////////////////////////////////////
        /////////////////////////// output ////////////////////////////
        ///////////////////////////////////////////////////////////////

        // Winner-take-all readout: for each pixel, sum the four incoming messages
        // and the data cost at every disparity and select the disparity with the
        // smallest total (the minimum belief).
        template <typename T>
        __global__ void output(const int elem_step, const T* u, const T* d, const T* l, const T* r, const T* data,
            PtrStepSz<short> disp)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
            {
                const T* us = u + (y + 1) * elem_step + x;
                const T* ds = d + (y - 1) * elem_step + x;
                const T* ls = l + y * elem_step + (x + 1);
                const T* rs = r + y * elem_step + (x - 1);
                const T* dt = data + y * elem_step + x;

                size_t disp_step = disp.rows * elem_step;

                int best = 0;
                float best_val = numeric_limits<float>::max();
                for (int di = 0; di < cndisp; ++di) // 'di' rather than 'd', which would shadow the message pointer
                {
                    float val = us[di * disp_step];
                    val += ds[di * disp_step];
                    val += ls[di * disp_step];
                    val += rs[di * disp_step];
                    val += dt[di * disp_step];

                    if (val < best_val)
                    {
                        best_val = val;
                        best = di;
                    }
                }

                disp.ptr(y)[x] = saturate_cast<short>(best);
            }
        }

        template <typename T>
        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
            const PtrStepSz<short>& disp, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(disp.cols, threads.x);
            grid.y = divUp(disp.rows, threads.y);

            int elem_step = static_cast<int>(u.step / sizeof(T));

            output<T><<<grid, threads, 0, stream>>>(elem_step, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void output_gpu<short>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
        template void output_gpu<float>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
    } // namespace stereobp
}}} // namespace cv { namespace cuda { namespace device

#endif /* CUDA_DISABLER */