/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/limits.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace stereobp
    {
        ///////////////////////////////////////////////////////////////
        /////////////////////// load constants ////////////////////////
        ///////////////////////////////////////////////////////////////

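        // StereoBP parameters, uploaded once by load_constants() and read by all
        // kernels below from constant memory: the number of disparity levels, the
        // truncation threshold and weight of the data term, and the truncation
        // threshold and per-level slope of the smoothness (discontinuity) term.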
        __constant__ int   cndisp;
        __constant__ float cmax_data_term;
        __constant__ float cdata_weight;
        __constant__ float cmax_disc_term;
        __constant__ float cdisc_single_jump;

        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
        {
            cudaSafeCall( cudaMemcpyToSymbol(cndisp,            &ndisp,            sizeof(int  )) );
            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
        }

        ///////////////////////////////////////////////////////////////
        ////////////////////////// comp data //////////////////////////
        ///////////////////////////////////////////////////////////////

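        // PixDiff measures the dissimilarity between a left-image pixel (captured
        // in the constructor) and a right-image pixel. The 1-channel version is a
        // plain absolute difference; the 3- and 4-channel versions take a weighted
        // sum of per-channel absolute differences using the BT.601 luma weights
        // (0.299, 0.587, 0.114), applied in OpenCV's BGR channel order.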
        template <int cn> struct PixDiff;
        template <> struct PixDiff<1>
        {
            __device__ __forceinline__ PixDiff(const uchar* ls)
            {
                l = *ls;
            }
            __device__ __forceinline__ float operator()(const uchar* rs) const
            {
                return ::abs((int)l - *rs);
            }
            uchar l;
        };
        template <> struct PixDiff<3>
        {
            __device__ __forceinline__ PixDiff(const uchar* ls)
            {
                l = *((uchar3*)ls);
            }
            __device__ __forceinline__ float operator()(const uchar* rs) const
            {
                const float tr = 0.299f;
                const float tg = 0.587f;
                const float tb = 0.114f;

                float val  = tb * ::abs((int)l.x - rs[0]);
                      val += tg * ::abs((int)l.y - rs[1]);
                      val += tr * ::abs((int)l.z - rs[2]);

                return val;
            }
            uchar3 l;
        };
        template <> struct PixDiff<4>
        {
            __device__ __forceinline__ PixDiff(const uchar* ls)
            {
                l = *((uchar4*)ls);
            }
            __device__ __forceinline__ float operator()(const uchar* rs) const
            {
                const float tr = 0.299f;
                const float tg = 0.587f;
                const float tb = 0.114f;

                uchar4 r = *((uchar4*)rs);

                float val  = tb * ::abs((int)l.x - r.x);
                      val += tg * ::abs((int)l.y - r.y);
                      val += tr * ::abs((int)l.z - r.z);

                return val;
            }
            uchar4 l;
        };

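        // Data cost kernel: for every interior pixel and every disparity it stores
        // the truncated linear cost min(cdata_weight * diff, cdata_weight * cmax_data_term)
        // into a cndisp x rows x cols cost volume; disparities that would read past
        // the left image border receive the maximum (truncated) cost.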
        template <int cn, typename D>
        __global__ void comp_data(const PtrStepSzb left, const PtrStepb right, PtrStep<D> data)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
            {
                const uchar* ls = left.ptr(y) + x * cn;
                const PixDiff<cn> pixDiff(ls);
                const uchar* rs = right.ptr(y) + x * cn;

                D* ds = data.ptr(y) + x;
                const size_t disp_step = data.step * left.rows / sizeof(D);

                for (int disp = 0; disp < cndisp; disp++)
                {
                    if (x - disp >= 1)
                    {
                        float val = pixDiff(rs - disp * cn);

                        ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
                    }
                    else
                    {
                        ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
                    }
                }
            }
        }

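        // Host-side launchers. Each explicit specialization fixes the channel count
        // and cost type at compile time and launches comp_data over the whole image
        // with 32x8 thread blocks; when the default (null) stream is used, the call
        // blocks until the kernel has finished.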
        template<typename T, typename D>
        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);

        template <> void comp_data_gpu<uchar, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        template <> void comp_data_gpu<uchar, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <> void comp_data_gpu<uchar3, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        template <> void comp_data_gpu<uchar3, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <> void comp_data_gpu<uchar4, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<short>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        template <> void comp_data_gpu<uchar4, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(left.cols, threads.x);
            grid.y = divUp(left.rows, threads.y);

            comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<float>)data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////
        //////////////////////// data step down ///////////////////////
        ///////////////////////////////////////////////////////////////

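        // Pyramid down-sampling of the data cost: each cell of the coarser level is
        // the sum of the corresponding 2x2 block of finer-level cells, computed
        // independently for every disparity plane.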
        template <typename T>
        __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst_cols && y < dst_rows)
            {
                for (int d = 0; d < cndisp; ++d)
                {
                    float dst_reg  = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
                          dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
                          dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
                          dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];

                    dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
                }
            }
        }

        template<typename T>
        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(dst_cols, threads.x);
            grid.y = divUp(dst_rows, threads.y);

            data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        /////////////////// level up messages  ////////////////////////
        ///////////////////////////////////////////////////////////////

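        // Message up-sampling between pyramid levels: each pixel of the finer level
        // copies the messages of its coarse-level parent (x/2, y/2), giving a
        // nearest-neighbor initialization for the next round of iterations.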
        template <typename T>
        __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < dst_cols && y < dst_rows)
            {
                const size_t dst_disp_step = dst.step * dst_rows / sizeof(T);
                const size_t src_disp_step = src.step * src_rows / sizeof(T);

                T*       dstr = dst.ptr(y  ) + x;
                const T* srcr = src.ptr(y/2) + x/2;

                for (int d = 0; d < cndisp; ++d)
                    dstr[d * dst_disp_step] = srcr[d * src_disp_step];
            }
        }

        template <typename T>
        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(dst_cols, threads.x);
            grid.y = divUp(dst_rows, threads.y);

            int src_idx = (dst_idx + 1) & 1;

            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mus[src_idx], (PtrStepSz<T>)mus[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mds[src_idx], (PtrStepSz<T>)mds[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mls[src_idx], (PtrStepSz<T>)mls[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)mrs[src_idx], (PtrStepSz<T>)mrs[dst_idx]);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
        template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        ////////////////////  calc all iterations /////////////////////
        ///////////////////////////////////////////////////////////////

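        // In-place lower envelope of a message vector under the linear smoothness
        // penalty: a forward and a backward pass propagate dst[d'] + |d - d'| *
        // cdisc_single_jump in O(cndisp), the distance-transform trick used by
        // efficient belief propagation.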
        template <typename T>
        __device__ void calc_min_linear_penalty(T* dst, size_t step)
        {
            float prev = dst[0];
            float cur;
            for (int disp = 1; disp < cndisp; ++disp)
            {
                prev += cdisc_single_jump;
                cur = dst[step * disp];
                if (prev < cur)
                {
                    cur = prev;
                    dst[step * disp] = saturate_cast<T>(prev);
                }
                prev = cur;
            }

            prev = dst[(cndisp - 1) * step];
            for (int disp = cndisp - 2; disp >= 0; disp--)
            {
                prev += cdisc_single_jump;
                cur = dst[step * disp];
                if (prev < cur)
                {
                    cur = prev;
                    dst[step * disp] = saturate_cast<T>(prev);
                }
                prev = cur;
            }
        }

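        // Computes one outgoing message: per disparity, sum the three incoming
        // messages and the data cost, apply the linear penalty, truncate at
        // (minimum + cmax_disc_term), then normalize by subtracting the mean so
        // message values stay bounded across iterations.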
        template <typename T>
        __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
        {
            float minimum = device::numeric_limits<float>::max();

            for(int i = 0; i < cndisp; ++i)
            {
                float dst_reg  = msg1[msg_disp_step * i];
                      dst_reg += msg2[msg_disp_step * i];
                      dst_reg += msg3[msg_disp_step * i];
                      dst_reg += data[data_disp_step * i];

                if (dst_reg < minimum)
                    minimum = dst_reg;

                dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
            }

            calc_min_linear_penalty(dst, msg_disp_step);

            minimum += cmax_disc_term;

            float sum = 0;
            for(int i = 0; i < cndisp; ++i)
            {
                float dst_reg = dst[msg_disp_step * i];
                if (dst_reg > minimum)
                {
                    dst_reg = minimum;
                    dst[msg_disp_step * i] = saturate_cast<T>(minimum);
                }
                sum += dst_reg;
            }
            sum /= cndisp;

            for(int i = 0; i < cndisp; ++i)
                dst[msg_disp_step * i] -= sum;
        }

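        // One checkerboard (red-black) relaxation sweep: on iteration t only pixels
        // with x + y + t even are updated, alternating the two colors between
        // iterations; each updated pixel recomputes its four outgoing messages from
        // its neighbors' current messages and its own data cost.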
        template <typename T>
        __global__ void one_iteration(int t, int elem_step, T* u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
        {
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);

            if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
            {
                T* us = u + y * elem_step + x;
                T* ds = d + y * elem_step + x;
                T* ls = l + y * elem_step + x;
                T* rs = r + y * elem_step + x;
                const T* dt = data.ptr(y) + x;

                size_t msg_disp_step = elem_step * rows;
                size_t data_disp_step = data.step * rows / sizeof(T);

                message(us + elem_step, ls         + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
                message(ds - elem_step, ls         + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
                message(us + elem_step, ds - elem_step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
                message(us + elem_step, ds - elem_step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
            }
        }

        template <typename T>
        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(cols, threads.x << 1);
            grid.y = divUp(rows, threads.y);

            int elem_step = (int)(u.step / sizeof(T));

            for(int t = 0; t < iters; ++t)
            {
                one_iteration<T><<<grid, threads, 0, stream>>>(t, elem_step, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        }

        template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
        template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);

        ///////////////////////////////////////////////////////////////
        /////////////////////////// output ////////////////////////////
        ///////////////////////////////////////////////////////////////

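        // Winner-take-all read-out: for every interior pixel, sum the four incoming
        // messages and the data cost over all disparities and write the arg-min
        // disparity to the output map.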
        template <typename T>
        __global__ void output(const int elem_step, const T* u, const T* d, const T* l, const T* r, const T* data,
            PtrStepSz<short> disp)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
            {
                const T* us = u + (y + 1) * elem_step + x;
                const T* ds = d + (y - 1) * elem_step + x;
                const T* ls = l + y * elem_step + (x + 1);
                const T* rs = r + y * elem_step + (x - 1);
                const T* dt = data + y * elem_step + x;

                size_t disp_step = disp.rows * elem_step;

                int best = 0;
                float best_val = numeric_limits<float>::max();
                for (int d = 0; d < cndisp; ++d)
                {
                    float val  = us[d * disp_step];
                          val += ds[d * disp_step];
                          val += ls[d * disp_step];
                          val += rs[d * disp_step];
                          val += dt[d * disp_step];

                    if (val < best_val)
                    {
                        best_val = val;
                        best = d;
                    }
                }

                disp.ptr(y)[x] = saturate_cast<short>(best);
            }
        }

        template <typename T>
        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
            const PtrStepSz<short>& disp, cudaStream_t stream)
        {
            dim3 threads(32, 8, 1);
            dim3 grid(1, 1, 1);

            grid.x = divUp(disp.cols, threads.x);
            grid.y = divUp(disp.rows, threads.y);

            int elem_step = static_cast<int>(u.step / sizeof(T));

            output<T><<<grid, threads, 0, stream>>>(elem_step, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void output_gpu<short>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
        template void output_gpu<float>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
    } // namespace stereobp
}}} // namespace cv { namespace cuda { namespace device

#endif /* CUDA_DISABLER */