/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef __OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP__

#include "../../common.hpp"
#include "../../warp/shuffle.hpp"
#include "../../block/scan.hpp"
#include "../../ptr2d/glob.hpp"

namespace cv { namespace cudev {

namespace integral_detail
{
    // horizontal_pass

    template <int NUM_SCAN_THREADS, class SrcPtr, typename D>
    __global__ void horizontal_pass(const SrcPtr src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        carryElem = 0;

        __syncthreads();

        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src(blockIdx.x, curElemOffs);

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <int NUM_SCAN_THREADS, typename T, typename D>
    __global__ void horizontal_pass(const GlobPtr<T> src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        carryElem = 0;

        __syncthreads();

        const T* src_row = src.row(blockIdx.x);
        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src_row[curElemOffs];

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <class SrcPtr, typename D>
    __host__ void horizontal_pass(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
        const int NUM_SCAN_THREADS = 256;

        const dim3 block(NUM_SCAN_THREADS);
        const dim3 grid(rows);

        horizontal_pass<NUM_SCAN_THREADS><<<grid, block, 0, stream>>>(src, dst, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // horisontal_pass_8u_shfl

    __device__ static uchar4 int_to_uchar4(unsigned int in)
    {
        uchar4 bytes;
        bytes.x = (in & 0x000000ff) >> 0;
        bytes.y = (in & 0x0000ff00) >> 8;
        bytes.z = (in & 0x00ff0000) >> 16;
        bytes.w = (in & 0xff000000) >> 24;
        return bytes;
    }
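
    // Added note: both horisontal_pass_8u_shfl_kernel and vertical_pass build on
    // the same warp-level inclusive scan idiom based on shfl_up(). A minimal
    // sketch of that idiom (illustration only, assuming a full 32-lane warp;
    // the kernels below inline it with different bounds and guards):
    //
    //   __device__ int warp_inclusive_scan(int value, int lane_id)
    //   {
    //       #pragma unroll
    //       for (int offset = 1; offset < 32; offset *= 2)
    //       {
    //           const int n = shfl_up(value, offset, 32);
    //           if (lane_id >= offset)
    //               value += n;
    //       }
    //       return value; // lane i now holds the sum of lanes 0..i
    //   }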

    __global__ static void horisontal_pass_8u_shfl_kernel(const GlobPtr<uint4> img, GlobPtr<uint4> integral)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ int sums[128];

        const int id = threadIdx.x;
        const int lane_id = id % warpSize;
        const int warp_id = id / warpSize;

        const uint4 data = img(blockIdx.x, id);

        const uchar4 a = int_to_uchar4(data.x);
        const uchar4 b = int_to_uchar4(data.y);
        const uchar4 c = int_to_uchar4(data.z);
        const uchar4 d = int_to_uchar4(data.w);

        int result[16];

        result[0] = a.x;
        result[1] = result[0] + a.y;
        result[2] = result[1] + a.z;
        result[3] = result[2] + a.w;

        result[4] = result[3] + b.x;
        result[5] = result[4] + b.y;
        result[6] = result[5] + b.z;
        result[7] = result[6] + b.w;

        result[8]  = result[7]  + c.x;
        result[9]  = result[8]  + c.y;
        result[10] = result[9]  + c.z;
        result[11] = result[10] + c.w;

        result[12] = result[11] + d.x;
        result[13] = result[12] + d.y;
        result[14] = result[13] + d.z;
        result[15] = result[14] + d.w;

        int sum = result[15];

        // The prefix sum over each thread's 16 values is computed.
        // Now the per-thread totals (result[15]) need to be shared
        // with the other threads and added in. To do this,
        // the shfl_up() instruction is used and a shuffle scan
        // operation is performed to distribute the sums to the
        // correct threads.
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
            const int n = shfl_up(sum, i, 32);

            if (lane_id >= i)
            {
                #pragma unroll
                for (int k = 0; k < 16; ++k)
                    result[k] += n;

                sum += n;
            }
        }

        // Now the final sum for each warp must be shared between warps.
        // The last thread in each warp (which holds the warp's total)
        // stores it to shared memory; warp 0 then loads these values
        // and computes a prefix sum over them, again using shfl_up.
        // The results are uniformly added back to the warps.
        if (threadIdx.x % warpSize == warpSize - 1)
            sums[warp_id] = result[15];

        __syncthreads();

        if (warp_id == 0)
        {
            int warp_sum = sums[lane_id];

            #pragma unroll
            for (int i = 1; i <= 32; i *= 2)
            {
                const int n = shfl_up(warp_sum, i, 32);

                if (lane_id >= i)
                    warp_sum += n;
            }

            sums[lane_id] = warp_sum;
        }

        __syncthreads();

        int blockSum = 0;

        // fold in the total of the preceding warps
        if (warp_id > 0)
        {
            blockSum = sums[warp_id - 1];

            #pragma unroll
            for (int k = 0; k < 16; ++k)
                result[k] += blockSum;
        }

        // assemble result
        // Each thread has 16 values to write, which are now integer
        // data (to avoid overflow). Instead of each thread writing
        // consecutive uint4s, the approach shown here uses the shuffle
        // instruction to reformat the data inside the registers so that
        // each thread holds consecutive data to be written, allowing
        // larger contiguous segments to be assembled for writing.

        /*
        For example, data that needs to be written as

        GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3

        but is stored in registers (r0..r3), in four threads (0..3) as:

        threadId   0  1  2  3
          r0       x0 y0 z0 w0
          r1       x1 y1 z1 w1
          r2       x2 y2 z2 w2
          r3       x3 y3 z3 w3

        after applying shfl_xor operations to move data between registers r1..r3:

        threadId   00 01 10 11
                   x0 y0 z0 w0
         xor(01)-> y1 x1 w1 z1
         xor(10)-> z2 w2 x2 y2
         xor(11)-> w3 z3 y3 x3

        and now x0..x3 and z0..z3 can be written out in order by all threads.

        In the current code, each register above actually represents
        four integers to be written as uint4's to GMEM.
        */
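
        // Added note: shfl_xor(v, mask, 32) reads v from the lane whose ID is
        // (lane_id ^ mask), so XOR-ing with 1, 2 and 3 below performs exactly
        // the in-register rotation illustrated in the table above.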

        result[4] = shfl_xor(result[4], 1, 32);
        result[5] = shfl_xor(result[5], 1, 32);
        result[6] = shfl_xor(result[6], 1, 32);
        result[7] = shfl_xor(result[7], 1, 32);

        result[8]  = shfl_xor(result[8],  2, 32);
        result[9]  = shfl_xor(result[9],  2, 32);
        result[10] = shfl_xor(result[10], 2, 32);
        result[11] = shfl_xor(result[11], 2, 32);

        result[12] = shfl_xor(result[12], 3, 32);
        result[13] = shfl_xor(result[13], 3, 32);
        result[14] = shfl_xor(result[14], 3, 32);
        result[15] = shfl_xor(result[15], 3, 32);

        uint4* integral_row = integral.row(blockIdx.x);
        uint4 output;

        ///////

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;

        // Continuing from the above example, this use of shfl_xor()
        // places the y0..y3 and w0..w3 data in order.

        #pragma unroll
        for (int i = 0; i < 16; ++i)
            result[i] = shfl_xor(result[i], 1, 32);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
    #endif
    }

    __host__ static void horisontal_pass_8u_shfl(const GlobPtr<uchar> src, GlobPtr<uint> integral, int rows, int cols, cudaStream_t stream)
    {
        // each thread handles 16 values, use 1 block/row
        // (safe, because the step can't actually be less than 512 bytes)
        const int block = cols / 16;

        // launch 1 block / row
        const int grid = rows;

        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(horisontal_pass_8u_shfl_kernel, cudaFuncCachePreferL1) );

        GlobPtr<uint4> src4 = globPtr((uint4*) src.data, src.step);
        GlobPtr<uint4> integral4 = globPtr((uint4*) integral.data, integral.step);

        horisontal_pass_8u_shfl_kernel<<<grid, block, 0, stream>>>(src4, integral4);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // vertical
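
    // Added note: at this point every row of `integral` already holds its
    // inclusive horizontal prefix sums, and the kernel below completes the
    // integral image by prefix-summing each column in place. A serial sketch
    // of what the vertical pass computes (illustration only, not used here):
    //
    //   for (int x = 0; x < cols; ++x)
    //       for (int y = 1; y < rows; ++y)
    //           integral(y, x) += integral(y - 1, x);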

    template <typename T>
    __global__ void vertical_pass(GlobPtr<T> integral, const int rows, const int cols)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ T sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        T stepSum = 0;

        int numBuckets = divUp(rows, blockDim.y);
        int y = threadIdx.y;

        while (numBuckets--)
        {
            T* p = integral.row(y) + tidx;

            T sum = (tidx < cols) && (y < rows) ? *p : 0;

            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();

            // place into SMEM
            // shfl-scan reduce the SMEM, reformatting so the column
            // sums are computed in a warp,
            // then read out properly
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            T partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                T n = shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            if ((tidx < cols) && (y < rows))
            {
                *p = sum;
            }

            y += blockDim.y;
        }
    #else
        __shared__ T smem[32][32];
        __shared__ T prevVals[32];

        volatile T* smem_row = &smem[0][0] + 64 * threadIdx.y;

        if (threadIdx.y == 0)
            prevVals[threadIdx.x] = 0;

        __syncthreads();

        const int x = blockIdx.x * blockDim.x + threadIdx.x;

        int numBuckets = divUp(rows, 8 * 4);
        int offsetY = 0;

        while (numBuckets--)
        {
            const int curRowOffs = offsetY + threadIdx.y;

            T curElems[4];
            T temp[4];

            // load patch

            smem[threadIdx.y +  0][threadIdx.x] = 0.0f;
            smem[threadIdx.y +  8][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 16][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 24][threadIdx.x] = 0.0f;

            if (x < cols)
            {
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        smem[threadIdx.y + i * 8][threadIdx.x] = integral(curRowOffs + i * 8, x);
                }
            }

            __syncthreads();

            // reduce

            curElems[0] = smem[threadIdx.x][threadIdx.y     ];
            curElems[1] = smem[threadIdx.x][threadIdx.y +  8];
            curElems[2] = smem[threadIdx.x][threadIdx.y + 16];
            curElems[3] = smem[threadIdx.x][threadIdx.y + 24];

            __syncthreads();

            temp[0] = curElems[0] = warpScanInclusive(curElems[0], smem_row, threadIdx.x);
            temp[1] = curElems[1] = warpScanInclusive(curElems[1], smem_row, threadIdx.x);
            temp[2] = curElems[2] = warpScanInclusive(curElems[2], smem_row, threadIdx.x);
            temp[3] = curElems[3] = warpScanInclusive(curElems[3], smem_row, threadIdx.x);

            curElems[0] += prevVals[threadIdx.y     ];
            curElems[1] += prevVals[threadIdx.y +  8];
            curElems[2] += prevVals[threadIdx.y + 16];
            curElems[3] += prevVals[threadIdx.y + 24];

            __syncthreads();

            if (threadIdx.x == 31)
            {
                prevVals[threadIdx.y     ] += temp[0];
                prevVals[threadIdx.y +  8] += temp[1];
                prevVals[threadIdx.y + 16] += temp[2];
                prevVals[threadIdx.y + 24] += temp[3];
            }

            smem[threadIdx.y     ][threadIdx.x] = curElems[0];
            smem[threadIdx.y +  8][threadIdx.x] = curElems[1];
            smem[threadIdx.y + 16][threadIdx.x] = curElems[2];
            smem[threadIdx.y + 24][threadIdx.x] = curElems[3];

            __syncthreads();

            // store patch

            if (x < cols)
            {
                // write 4 values per thread back, transposed through SMEM
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        integral(curRowOffs + i * 8, x) = smem[threadIdx.x][threadIdx.y + i * 8];
                }
            }

            __syncthreads();

            offsetY += 8 * 4;
        }
    #endif
    }
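
    // Added note: the launcher below uses 32x8 thread blocks; the 1-D grid
    // covers the image width in 32-column strips, and each block walks down
    // all rows of its strip (carrying stepSum / prevVals between tiles), so
    // no inter-block synchronization is required.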

    template <typename T>
    __host__ void vertical_pass(const GlobPtr<T>& integral, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(cols, block.x));

        vertical_pass<<<grid, block, 0, stream>>>(integral, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // integral

    template <class SrcPtr, typename D>
    __host__ void integral(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
        horizontal_pass(src, dst, rows, cols, stream);
        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
    {
        if (deviceSupports(FEATURE_SET_COMPUTE_30)
            && (cols % 16 == 0)
            && reinterpret_cast<intptr_t>(src.data) % 32 == 0
            && reinterpret_cast<intptr_t>(dst.data) % 32 == 0)
        {
            horisontal_pass_8u_shfl(src, dst, rows, cols, stream);
        }
        else
        {
            horizontal_pass(src, dst, rows, cols, stream);
        }

        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
    {
        GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
        integral(src, dstui, rows, cols, stream);
    }
}

}}

#endif