/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"

// Shorthand for the built-in launch-geometry variables.
#define tx threadIdx.x
#define ty threadIdx.y
#define bx blockIdx.x
#define by blockIdx.y
#define bdx blockDim.x
#define bdy blockDim.y

#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100

namespace cv { namespace cuda { namespace device { namespace optflow_farneback
{
    // Polynomial-expansion filter taps (separable 1-D kernels) and the inverse
    // Gram-matrix coefficients; filled by setPolynomialExpansionConsts().
    __constant__ float c_g[8];
    __constant__ float c_xg[8];
    __constant__ float c_xxg[8];
    __constant__ float c_ig11, c_ig03, c_ig33, c_ig55;


    // Computes the 5-band polynomial-expansion image of src into dst
    // (dst is 5*height rows tall, one band per coefficient).
    //
    // Launch layout (see polynomialExpansionGpu): 1-D blocks of bdx threads,
    // each block producing bdx - 2*polyN output columns of one row; threads at
    // the block edges only load the polyN-wide apron.  Dynamic shared memory:
    // 3 * bdx floats (three intermediate vertical-filter responses per column).
    template <int polyN>
    __global__ void polynomialExpansion(
            const int height, const int width, const PtrStepf src, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * (bdx - 2*polyN) + tx - polyN;

        // NOTE: bdy == 1 and gridDim.y == height in the launcher, so this test
        // is uniform across the block and the barrier below is safe.
        if (y < height)
        {
            extern __shared__ float smem[];
            volatile float *row = smem + tx;
            int xWarped = ::min(::max(x, 0), width - 1);   // clamp (replicate border)

            // Vertical pass: three separable responses per column.
            row[0] = src(y, xWarped) * c_g[0];
            row[bdx] = 0.f;
            row[2*bdx] = 0.f;

            for (int k = 1; k <= polyN; ++k)
            {
                float t0 = src(::max(y - k, 0), xWarped);
                float t1 = src(::min(y + k, height - 1), xWarped);

                row[0] += c_g[k] * (t0 + t1);
                row[bdx] += c_xg[k] * (t1 - t0);
                row[2*bdx] += c_xxg[k] * (t0 + t1);
            }

            __syncthreads();

            // Horizontal pass: only interior threads (those with a full apron)
            // produce output.
            if (tx >= polyN && tx + polyN < bdx && x < width)
            {
                float b1 = c_g[0] * row[0];
                float b3 = c_g[0] * row[bdx];
                float b5 = c_g[0] * row[2*bdx];
                float b2 = 0, b4 = 0, b6 = 0;

                for (int k = 1; k <= polyN; ++k)
                {
                    b1 += (row[k] + row[-k]) * c_g[k];
                    b4 += (row[k] + row[-k]) * c_xxg[k];
                    b2 += (row[k] - row[-k]) * c_xg[k];
                    b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
                    b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
                    b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
                }

                // Solve for the polynomial coefficients via the precomputed
                // inverse Gram-matrix entries.
                dst(y, xWarped) = b3*c_ig11;
                dst(height + y, xWarped) = b2*c_ig11;
                dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
                dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
                dst(4*height + y, xWarped) = b6*c_ig55;
            }
        }
    }


    // Uploads the expansion filter taps and inverse Gram-matrix coefficients
    // to constant memory.  polyN must be <= 7 (c_g/c_xg/c_xxg hold 8 floats).
    void setPolynomialExpansionConsts(
            int polyN, const float *g, const float *xg, const float *xxg,
            float ig11, float ig03, float ig33, float ig55)
    {
        cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
        cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
        cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
        cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
        cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
        cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
        cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
    }


    // Host wrapper for polynomialExpansion; only polyN == 5 or 7 is supported
    // (other values launch nothing).
    void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
    {
        dim3 block(256);
        dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
        int smem = 3 * block.x * sizeof(float);

        if (polyN == 5)
            polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
        else if (polyN == 7)
            polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    // Border attenuation weights; filled by setUpdateMatricesConsts().
    __constant__ float c_border[BORDER_SIZE + 1];

    // Builds the 5-band linear-system image M from the two polynomial-expansion
    // images R0/R1 and the current flow estimate (flowx, flowy).  R1 is sampled
    // at the flow-displaced position with bilinear interpolation; samples near
    // the image border are down-weighted by c_border.  One thread per pixel.
    __global__ void updateMatrices(
            const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
            const PtrStepf R0, const PtrStepf R1, PtrStepf M)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        if (y < height && x < width)
        {
            float dx = flowx(y, x);
            float dy = flowy(y, x);
            float fx = x + dx;
            float fy = y + dy;

            int x1 = floorf(fx);
            int y1 = floorf(fy);
            fx -= x1; fy -= y1;          // fractional parts for bilinear weights

            float r2, r3, r4, r5, r6;

            if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
            {
                float a00 = (1.f - fx) * (1.f - fy);
                float a01 = fx * (1.f - fy);
                float a10 = (1.f - fx) * fy;
                float a11 = fx * fy;

                r2 = a00 * R1(y1, x1) +
                     a01 * R1(y1, x1 + 1) +
                     a10 * R1(y1 + 1, x1) +
                     a11 * R1(y1 + 1, x1 + 1);

                r3 = a00 * R1(height + y1, x1) +
                     a01 * R1(height + y1, x1 + 1) +
                     a10 * R1(height + y1 + 1, x1) +
                     a11 * R1(height + y1 + 1, x1 + 1);

                r4 = a00 * R1(2*height + y1, x1) +
                     a01 * R1(2*height + y1, x1 + 1) +
                     a10 * R1(2*height + y1 + 1, x1) +
                     a11 * R1(2*height + y1 + 1, x1 + 1);

                r5 = a00 * R1(3*height + y1, x1) +
                     a01 * R1(3*height + y1, x1 + 1) +
                     a10 * R1(3*height + y1 + 1, x1) +
                     a11 * R1(3*height + y1 + 1, x1 + 1);

                r6 = a00 * R1(4*height + y1, x1) +
                     a01 * R1(4*height + y1, x1 + 1) +
                     a10 * R1(4*height + y1 + 1, x1) +
                     a11 * R1(4*height + y1 + 1, x1 + 1);

                // Average the quadratic terms of both frames.
                r4 = (R0(2*height + y, x) + r4) * 0.5f;
                r5 = (R0(3*height + y, x) + r5) * 0.5f;
                r6 = (R0(4*height + y, x) + r6) * 0.25f;
            }
            else
            {
                // Displaced sample falls outside the image: use frame 0 only.
                r2 = r3 = 0.f;
                r4 = R0(2*height + y, x);
                r5 = R0(3*height + y, x);
                r6 = R0(4*height + y, x) * 0.5f;
            }

            r2 = (R0(y, x) - r2) * 0.5f;
            r3 = (R0(height + y, x) - r3) * 0.5f;

            r2 += r4*dy + r6*dx;
            r3 += r6*dy + r5*dx;

            float scale =
                    c_border[::min(x, BORDER_SIZE)] *
                    c_border[::min(y, BORDER_SIZE)] *
                    c_border[::min(width - x - 1, BORDER_SIZE)] *
                    c_border[::min(height - y - 1, BORDER_SIZE)];

            r2 *= scale; r3 *= scale; r4 *= scale;
            r5 *= scale; r6 *= scale;

            // Normal-equation entries: G (bands 0-2) and h (bands 3-4).
            M(y, x)            = r4*r4 + r6*r6;
            M(height + y, x)   = (r4 + r5)*r6;
            M(2*height + y, x) = r5*r5 + r6*r6;
            M(3*height + y, x) = r4*r2 + r6*r3;
            M(4*height + y, x) = r6*r2 + r5*r3;
        }
    }


    // Uploads the fixed border-attenuation ramp to constant memory.
    void setUpdateMatricesConsts()
    {
        static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
        cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
    }


    // Host wrapper for updateMatrices (one thread per pixel, 32x8 blocks).
    void updateMatricesGpu(
            const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
            PtrStepSzf M, cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));

        updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    // Solves the per-pixel 2x2 system stored in M for the flow update.
    // The 1e-3f term regularizes the determinant against division by zero.
    __global__ void updateFlow(
            const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        if (y < height && x < width)
        {
            float g11 = M(y, x);
            float g12 = M(height + y, x);
            float g22 = M(2*height + y, x);
            float h1 = M(3*height + y, x);
            float h2 = M(4*height + y, x);

            float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);

            flowx(y, x) = (g11*h2 - g12*h1) * detInv;
            flowy(y, x) = (g22*h1 - g12*h2) * detInv;
        }
    }


    // Host wrapper for updateFlow (one thread per pixel, 32x8 blocks).
    void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));

        updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    /*__global__ void boxFilter(
            const int height, const int width, const PtrStepf src,
            const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        extern __shared__ float smem[];
        volatile float *row = smem + ty * (bdx + 2*ksizeHalf);

        if (y < height)
        {
            // Vertical pass
            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
            {
                int xExt = int(bx * bdx) + i - ksizeHalf;
                xExt = ::min(::max(xExt, 0), width - 1);

                row[i] = src(y, xExt);
                for (int j = 1; j <= ksizeHalf; ++j)
                    row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
            }

            if (x < width)
            {
                __syncthreads();

                // Horizontal pass
                row += tx + ksizeHalf;
                float res = row[0];
                for (int i = 1; i <= ksizeHalf; ++i)
                    res += row[-i] + row[i];
                dst(y, x) = res * boxAreaInv;
            }
        }
    }


    void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        dim3 block(256);
        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
        int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);

        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
        boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }*/


    // Separable box filter applied to all 5 bands of a stacked (5*height rows)
    // image at once.  Dynamic shared memory: 5 * (bdx + 2*ksizeHalf) * bdy
    // floats holding the vertical-pass responses plus a ksizeHalf-wide apron.
    __global__ void boxFilter5(
            const int height, const int width, const PtrStepf src,
            const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        extern __shared__ float smem[];

        const int smw = bdx + 2*ksizeHalf; // shared memory "width"
        volatile float *row = smem + 5 * ty * smw;

        // bdy == 1 and gridDim.y == height in the launchers, so this test is
        // uniform across the block and the barrier below is reached by all.
        if (y < height)
        {
            // Vertical pass
            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
            {
                int xExt = int(bx * bdx) + i - ksizeHalf;
                xExt = ::min(::max(xExt, 0), width - 1);

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    row[k*smw + i] = src(k*height + y, xExt);

                for (int j = 1; j <= ksizeHalf; ++j)
                    #pragma unroll
                    for (int k = 0; k < 5; ++k)
                        row[k*smw + i] +=
                            src(k*height + ::max(y - j, 0), xExt) +
                            src(k*height + ::min(y + j, height - 1), xExt);
            }

            // Barrier hoisted out of the divergent `x < width` branch: every
            // thread wrote shared memory above, so every thread must sync
            // before any thread reads it (a barrier inside divergent control
            // flow is undefined behavior).
            __syncthreads();

            if (x < width)
            {
                // Horizontal pass
                row += tx + ksizeHalf;
                float res[5];

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    res[k] = row[k*smw];

                for (int i = 1; i <= ksizeHalf; ++i)
                    #pragma unroll
                    for (int k = 0; k < 5; ++k)
                        res[k] += row[k*smw - i] + row[k*smw + i];

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    dst(k*height + y, x) = res[k] * boxAreaInv;
            }
        }
    }


    // Host wrapper for boxFilter5; src/dst are 5-band stacked images
    // (src.rows == 5 * height).
    void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;

        dim3 block(256);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);

        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    // Same as boxFilter5Gpu but with 128-thread blocks (smaller shared-memory
    // footprint for compute-capability 1.1 devices).
    void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;

        dim3 block(128);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);

        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    // Gaussian kernel taps; filled by setGaussianBlurKernel().
    __constant__ float c_gKer[MAX_KSIZE_HALF + 1];

    // Separable Gaussian blur of a single-band image using the border handler
    // `b` for out-of-range rows/columns.  Dynamic shared memory:
    // (bdx + 2*ksizeHalf) * bdy floats.
    template <typename Border>
    __global__ void gaussianBlur(
            const int height, const int width, const PtrStepf src, const int ksizeHalf,
            const Border b, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        extern __shared__ float smem[];
        volatile float *row = smem + ty * (bdx + 2*ksizeHalf);

        // Block-uniform (bdy == 1, gridDim.y == height in the caller).
        if (y < height)
        {
            // Vertical pass
            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
            {
                int xExt = int(bx * bdx) + i - ksizeHalf;
                xExt = b.idx_col(xExt);
                row[i] = src(y, xExt) * c_gKer[0];
                for (int j = 1; j <= ksizeHalf; ++j)
                    row[i] +=
                        (src(b.idx_row_low(y - j), xExt) +
                         src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
            }

            // Barrier hoisted out of the divergent `x < width` branch (see
            // boxFilter5): all writers must sync before any reader proceeds.
            __syncthreads();

            if (x < width)
            {
                // Horizontal pass
                row += tx + ksizeHalf;
                float res = row[0] * c_gKer[0];
                for (int i = 1; i <= ksizeHalf; ++i)
                    res += (row[-i] + row[i]) * c_gKer[i];
                dst(y, x) = res;
            }
        }
    }


    // Uploads the Gaussian taps to constant memory; ksizeHalf must be
    // <= MAX_KSIZE_HALF.
    void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
    {
        cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
    }


    // Launches gaussianBlur with a concrete border handler.
    template <typename Border>
    void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows;
        int width = src.cols;

        dim3 block(256);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
        Border b(height, width);

        gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    // Dispatches on borderMode; only BORDER_REPLICATE (1) and
    // BORDER_REFLECT_101 (4) are supported — other entries are null.
    void gaussianBlurGpu(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
            0 /*gaussianBlurCaller<BrdConstant<float> >*/,
            gaussianBlurCaller<BrdReplicate<float> >,
            0 /*gaussianBlurCaller<BrdReflect<float> >*/,
            0 /*gaussianBlurCaller<BrdWrap<float> >*/,
            gaussianBlurCaller<BrdReflect101<float> >
        };

        callers[borderMode](src, ksizeHalf, dst, stream);
    }


    // Separable Gaussian blur applied to all 5 bands of a stacked image at
    // once (same layout and shared-memory scheme as boxFilter5).
    template <typename Border>
    __global__ void gaussianBlur5(
            const int height, const int width, const PtrStepf src, const int ksizeHalf,
            const Border b, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        extern __shared__ float smem[];

        const int smw = bdx + 2*ksizeHalf; // shared memory "width"
        volatile float *row = smem + 5 * ty * smw;

        // Block-uniform (bdy == 1, gridDim.y == height in the caller).
        if (y < height)
        {
            // Vertical pass
            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
            {
                int xExt = int(bx * bdx) + i - ksizeHalf;
                xExt = b.idx_col(xExt);

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];

                for (int j = 1; j <= ksizeHalf; ++j)
                    #pragma unroll
                    for (int k = 0; k < 5; ++k)
                        row[k*smw + i] +=
                            (src(k*height + b.idx_row_low(y - j), xExt) +
                             src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
            }

            // Barrier hoisted out of the divergent `x < width` branch (see
            // boxFilter5): all writers must sync before any reader proceeds.
            __syncthreads();

            if (x < width)
            {
                // Horizontal pass
                row += tx + ksizeHalf;
                float res[5];

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    res[k] = row[k*smw] * c_gKer[0];

                for (int i = 1; i <= ksizeHalf; ++i)
                    #pragma unroll
                    for (int k = 0; k < 5; ++k)
                        res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    dst(k*height + y, x) = res[k];
            }
        }
    }


    // Launches gaussianBlur5 with a concrete border handler and block width.
    template <typename Border, int blockDimX>
    void gaussianBlur5Caller(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;

        dim3 block(blockDimX);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
        Border b(height, width);

        gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    // 5-band Gaussian blur dispatch; only border modes 1 and 4 are supported.
    void gaussianBlur5Gpu(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
            0 /*gaussianBlur5Caller<BrdConstant<float>,256>*/,
            gaussianBlur5Caller<BrdReplicate<float>,256>,
            0 /*gaussianBlur5Caller<BrdReflect<float>,256>*/,
            0 /*gaussianBlur5Caller<BrdWrap<float>,256>*/,
            gaussianBlur5Caller<BrdReflect101<float>,256>
        };

        callers[borderMode](src, ksizeHalf, dst, stream);
    }

    // Same dispatch with 128-thread blocks for compute-capability 1.1 devices.
    void gaussianBlur5Gpu_CC11(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
            0 /*gaussianBlur5Caller<BrdConstant<float>,128>*/,
            gaussianBlur5Caller<BrdReplicate<float>,128>,
            0 /*gaussianBlur5Caller<BrdReflect<float>,128>*/,
            0 /*gaussianBlur5Caller<BrdWrap<float>,128>*/,
            gaussianBlur5Caller<BrdReflect101<float>,128>
        };

        callers[borderMode](src, ksizeHalf, dst, stream);
    }

}}}} // namespace cv { namespace cuda { namespace device { namespace optflow_farneback


#endif /* CUDA_DISABLER */