/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
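
// Connected component labeling (CCL) for the cv::cuda device layer.
//
// The pipeline below has two entry points:
//   1. computeEdges<T>()  - builds a per-pixel connectivity mask: the UP/DOWN/LEFT/RIGHT
//                           bits are set when the intensity difference to the corresponding
//                           neighbour lies inside the [lo, hi] interval.
//   2. labelComponents()  - labels the mask: each tile is labelled locally in shared memory,
//                           tiles are merged pairwise across their borders, and every label
//                           is finally flattened to its root.
//
// Illustrative host-side chaining (a sketch only; the GpuMat buffers, element type and
// threshold values below are assumptions, not taken from this file):
//
//   cv::cuda::GpuMat image;                                  // CV_8UC1 input
//   cv::cuda::GpuMat mask  (image.size(), CV_8UC1);          // connectivity mask
//   cv::cuda::GpuMat labels(image.size(), CV_32SC1);         // per-pixel component labels
//   float4 lo = make_float4(0, 0, 0, 0), hi = make_float4(2, 2, 2, 2);
//   cudaStream_t stream = 0;
//   cv::cuda::device::ccl::computeEdges<uchar>(image, mask, lo, hi, stream);
//   cv::cuda::device::ccl::labelComponents(mask, labels, 0, stream);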

#if !defined CUDA_DISABLER

#include <opencv2/core/cuda/common.hpp>
#include <opencv2/core/cuda/vec_traits.hpp>
#include <opencv2/core/cuda/vec_math.hpp>
#include <opencv2/core/cuda/emulation.hpp>

#include <iostream>
#include <stdio.h>

namespace cv { namespace cuda { namespace device
{
    namespace ccl
    {
        enum
        {
            WARP_SIZE  = 32,
            WARP_LOG   = 5,

            CTA_SIZE_X = 32,
            CTA_SIZE_Y = 8,

            STA_SIZE_MERGE_Y = 4,
            STA_SIZE_MERGE_X = 32,

            TPB_X = 1,
            TPB_Y = 4,

            TILE_COLS = CTA_SIZE_X * TPB_X,
            TILE_ROWS = CTA_SIZE_Y * TPB_Y
        };

        template<typename T> struct IntervalsTraits
        {
            typedef T elem_type;
        };

        template<> struct IntervalsTraits<unsigned char>
        {
            typedef int dist_type;
            enum {ch = 1};
        };

        template<> struct IntervalsTraits<uchar3>
        {
            typedef int3 dist_type;
            enum {ch = 3};
        };

        template<> struct IntervalsTraits<uchar4>
        {
            typedef int4 dist_type;
            enum {ch = 4};
        };

        template<> struct IntervalsTraits<unsigned short>
        {
            typedef int dist_type;
            enum {ch = 1};
        };

        template<> struct IntervalsTraits<ushort3>
        {
            typedef int3 dist_type;
            enum {ch = 3};
        };

        template<> struct IntervalsTraits<ushort4>
        {
            typedef int4 dist_type;
            enum {ch = 4};
        };

        template<> struct IntervalsTraits<float>
        {
            typedef float dist_type;
            enum {ch = 1};
        };

        template<> struct IntervalsTraits<int>
        {
            typedef int dist_type;
            enum {ch = 1};
        };

        typedef unsigned char component;
        enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };

        template<typename T, int CH> struct InInterval {};

        template<typename T> struct InInterval<T, 1>
        {
            typedef typename VecTraits<T>::elem_type E;
            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) { }
            T lo, hi;

            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
            {
                I d = a - b;
                return lo <= d && d <= hi;
            }
        };


        template<typename T> struct InInterval<T, 3>
        {
            typedef typename VecTraits<T>::elem_type E;
            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
                : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)) { }
            T lo, hi;

            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
            {
                I d = saturate_cast<I>(a - b);
                return lo.x <= d.x && d.x <= hi.x &&
                       lo.y <= d.y && d.y <= hi.y &&
                       lo.z <= d.z && d.z <= hi.z;
            }
        };

        template<typename T> struct InInterval<T, 4>
        {
            typedef typename VecTraits<T>::elem_type E;
            __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
                : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)) { }
            T lo, hi;

            template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
            {
                I d = saturate_cast<I>(a - b);
                return lo.x <= d.x && d.x <= hi.x &&
                       lo.y <= d.y && d.y <= hi.y &&
                       lo.z <= d.z && d.z <= hi.z &&
                       lo.w <= d.w && d.w <= hi.w;
            }
        };

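        // computeConnectivity: one thread per pixel. The 'connected' functor compares the
        // pixel with its four direct neighbours and the corresponding UP/DOWN/LEFT/RIGHT
        // bits are set in the output mask. computeEdges<T> is the host-side launcher that
        // instantiates the InInterval functor matching the channel count of T.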
        template<typename T, typename F>
        __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
        {
            int x = threadIdx.x + blockIdx.x * blockDim.x;
            int y = threadIdx.y + blockIdx.y * blockDim.y;

            if (x >= image.cols || y >= image.rows) return;

            T intensity = image(y, x);
            component c = 0;

            if ( x > 0 && connected(intensity, image(y, x - 1)))
                c |= LEFT;

            if ( y > 0 && connected(intensity, image(y - 1, x)))
                c |= UP;

            if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
                c |= RIGHT;

            if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
                c |= DOWN;

            components(y, x) = c;
        }

        template<typename T>
        void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
        {
            dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
            dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));

            typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;

            Int_t inInt(lo, hi);
            computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);

            cudaSafeCall( cudaGetLastError() );
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template void computeEdges<uchar>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
        template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
        template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
        template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
        template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
        template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
        template void computeEdges<int>    (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
        template void computeEdges<float>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
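
        // lableTiles: labels one TILE_COLS x TILE_ROWS tile per block. Each thread owns a
        // TPB_Y x TPB_X group of pixels; edge bits that point outside the tile are cleared,
        // every pixel starts with its own in-tile index as label, and the block then iterates
        // min-propagation / path-compaction rounds in shared memory until no label changes.
        // The final in-tile labels are converted to global row-major indices and written to comps.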
        __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
        {
            int x = threadIdx.x + blockIdx.x * TILE_COLS;
            int y = threadIdx.y + blockIdx.y * TILE_ROWS;

            if (x >= edges.cols || y >= edges.rows) return;

            //currently x is 1
            int bounds = ((y + TPB_Y) < edges.rows);

            __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
            __shared__ int  edgesTile[TILE_ROWS][TILE_COLS];

            int new_labels[TPB_Y][TPB_X];
            int old_labels[TPB_Y][TPB_X];

            #pragma unroll
            for (int i = 0; i < TPB_Y; ++i)
                #pragma unroll
                for (int j = 0; j < TPB_X; ++j)
                {
                    int yloc = threadIdx.y + CTA_SIZE_Y * i;
                    int xloc = threadIdx.x + CTA_SIZE_X * j;
                    component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);

                    if (!xloc) c &= ~LEFT;
                    if (!yloc) c &= ~UP;

                    if (xloc == TILE_COLS -1) c &= ~RIGHT;
                    if (yloc == TILE_ROWS -1) c &= ~DOWN;

                    new_labels[i][j] = yloc * TILE_COLS + xloc;
                    edgesTile[yloc][xloc] = c;
                }

            for (int k = 0; ;++k)
            {
                //1. backup
                #pragma unroll
                for (int i = 0; i < TPB_Y; ++i)
                    #pragma unroll
                    for (int j = 0; j < TPB_X; ++j)
                    {
                        int yloc = threadIdx.y + CTA_SIZE_Y * i;
                        int xloc = threadIdx.x + CTA_SIZE_X * j;

                        old_labels[i][j]       = new_labels[i][j];
                        labelsTile[yloc][xloc] = new_labels[i][j];
                    }

                __syncthreads();

                //2. compare local arrays
                #pragma unroll
                for (int i = 0; i < TPB_Y; ++i)
                    #pragma unroll
                    for (int j = 0; j < TPB_X; ++j)
                    {
                        int yloc = threadIdx.y + CTA_SIZE_Y * i;
                        int xloc = threadIdx.x + CTA_SIZE_X * j;

                        component c = edgesTile[yloc][xloc];
                        int label = new_labels[i][j];

                        if (c & UP)
                            label = ::min(label, labelsTile[yloc - 1][xloc]);

                        if (c & DOWN)
                            label = ::min(label, labelsTile[yloc + 1][xloc]);

                        if (c & LEFT)
                            label = ::min(label, labelsTile[yloc][xloc - 1]);

                        if (c & RIGHT)
                            label = ::min(label, labelsTile[yloc][xloc + 1]);

                        new_labels[i][j] = label;
                    }

                __syncthreads();

                //3. determine: Is any value changed?
                int changed = 0;
                #pragma unroll
                for (int i = 0; i < TPB_Y; ++i)
                    #pragma unroll
                    for (int j = 0; j < TPB_X; ++j)
                    {
                        if (new_labels[i][j] < old_labels[i][j])
                        {
                            changed = 1;
                            Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
                        }
                    }

                changed = Emulation::syncthreadsOr(changed);

                if (!changed)
                    break;

                //4. Compact paths
                const int *labels = &labelsTile[0][0];
                #pragma unroll
                for (int i = 0; i < TPB_Y; ++i)
                    #pragma unroll
                    for (int j = 0; j < TPB_X; ++j)
                    {
                        int label = new_labels[i][j];

                        while( labels[label] < label ) label = labels[label];

                        new_labels[i][j] = label;
                    }
                __syncthreads();
            }

            #pragma unroll
            for (int i = 0; i < TPB_Y; ++i)
                #pragma unroll
                for (int j = 0; j < TPB_X; ++j)
                {
                    int label = new_labels[i][j];
                    int yloc = label / TILE_COLS;
                    int xloc = label - yloc * TILE_COLS;

                    xloc += blockIdx.x * TILE_COLS;
                    yloc += blockIdx.y * TILE_ROWS;

                    label = yloc * edges.cols + xloc;
                    // do it for x too.
                    if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
                }
        }
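
        // Merge stage helpers. root() follows parent links in comps until it reaches a label
        // that points to itself. isConnected() unites the trees of two labels by atomically
        // writing the smaller root into the larger one and flags that another pass is needed.
        // crossMerge() applies isConnected() along the horizontal and vertical borders between
        // the tilesNumY x tilesNumX tiles owned by one block, looping until no task reports a change.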
        __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
        {
            while(1)
            {
                int y = label / comps.cols;
                int x = label - y * comps.cols;

                int parent = comps(y, x);

                if (label == parent) break;

                label = parent;
            }
            return label;
        }

        __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
        {
            int r1 = root(comps, l1);
            int r2 = root(comps, l2);

            if (r1 == r2) return;

            int mi = ::min(r1, r2);
            int ma = ::max(r1, r2);

            int y = ma / comps.cols;
            int x = ma - y * comps.cols;

            atomicMin(&comps.ptr(y)[x], mi);
            changed = true;
        }

        __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
            const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
        {
            int tid = threadIdx.y * blockDim.x + threadIdx.x;
            int stride = blockDim.y * blockDim.x;

            int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
            int yend   = ybegin + tilesNumY * tileSizeY;

            if (blockIdx.y == gridDim.y - 1)
            {
                yend -= yIncomplete * tileSizeY;
                yend -= tileSizeY;
                tileSizeY = (edges.rows % tileSizeY);

                yend += tileSizeY;
            }

            int xbegin = blockIdx.x * tilesNumX * tileSizeX;
            int xend   = xbegin + tilesNumX * tileSizeX;

            if (blockIdx.x == gridDim.x - 1)
            {
                if (xIncomplete) yend = ybegin;
                xend -= xIncomplete * tileSizeX;
                xend -= tileSizeX;
                tileSizeX = (edges.cols % tileSizeX);

                xend += tileSizeX;
            }

            if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
            {
                xend = xbegin;
            }

            int tasksV = (tilesNumX - 1) * (yend - ybegin);
            int tasksH = (tilesNumY - 1) * (xend - xbegin);

            int total = tasksH + tasksV;

            bool changed;
            do
            {
                changed = false;
                for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
                {
                    if (taskIdx < tasksH)
                    {
                        int indexH = taskIdx;

                        int row = indexH / (xend - xbegin);
                        int col = indexH - row * (xend - xbegin);

                        int y = ybegin + (row + 1) * tileSizeY;
                        int x = xbegin + col;

                        component e = edges( x, y);
                        if (e & UP)
                        {
                            int lc = comps(y,x);
                            int lu = comps(y - 1, x);

                            isConnected(comps, lc, lu, changed);
                        }
                    }
                    else
                    {
                        int indexV = taskIdx - tasksH;

                        int col = indexV / (yend - ybegin);
                        int row = indexV - col * (yend - ybegin);

                        int x = xbegin + (col + 1) * tileSizeX;
                        int y = ybegin + row;

                        component e = edges(x, y);
                        if (e & LEFT)
                        {
                            int lc = comps(y, x);
                            int ll = comps(y, x - 1);

                            isConnected(comps, lc, ll, changed);
                        }
                    }
                }
            } while (Emulation::syncthreadsOr(changed));
        }

        __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
        {
            int x = threadIdx.x + blockIdx.x * blockDim.x;
            int y = threadIdx.y + blockIdx.y * blockDim.y;

            if( x < comps.cols && y < comps.rows)
                comps(y, x) = root(comps, comps(y, x));
        }

        enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
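
        // labelComponents: host-side driver. It labels every tile locally with lableTiles,
        // then repeatedly launches crossMerge on 2x2 groups of tiles, doubling tileSizeX and
        // tileSizeY until one tile covers the whole image, and finally runs flatten so that
        // every pixel stores the root label of its component. The 'flags' argument
        // (CC_NO_COMPACT / CC_COMPACT_LABELS) is currently ignored.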
        void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
        {
            (void) flags;
            dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
            dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));

            lableTiles<<<grid, block, 0, stream>>>(edges, comps);
            cudaSafeCall( cudaGetLastError() );

            int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
            while (grid.x > 1 || grid.y > 1)
            {
                dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
                dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
                // debug log
                // std::cout << "merging: " << grid.y << " x " << grid.x << " ---> " << mergeGrid.y << " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
                crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
                tileSizeX <<= 1;
                tileSizeY <<= 1;
                grid = mergeGrid;

                cudaSafeCall( cudaGetLastError() );
            }

            grid.x = divUp(edges.cols, block.x);
            grid.y = divUp(edges.rows, block.y);
            flatten<<<grid, block, 0, stream>>>(edges, comps);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    }
} } }

#endif /* CUDA_DISABLER */