/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef __OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP__

#include "../../common.hpp"
#include "../../warp/shuffle.hpp"
#include "../../block/scan.hpp"
#include "../../ptr2d/glob.hpp"

namespace cv { namespace cudev {

namespace integral_detail
{
    // horizontal_pass

    template <int NUM_SCAN_THREADS, class SrcPtr, typename D>
    __global__ void horizontal_pass(const SrcPtr src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
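        // carryElem accumulates the running row total across successive
        // NUM_SCAN_THREADS-wide buckets, so each bucket's scan is offset by it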
        __shared__ D carryElem;

        carryElem = 0;

        __syncthreads();

        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src(blockIdx.x, curElemOffs);

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <int NUM_SCAN_THREADS, typename T, typename D>
    __global__ void horizontal_pass(const GlobPtr<T> src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        carryElem = 0;

        __syncthreads();

        const T* src_row = src.row(blockIdx.x);
        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src_row[curElemOffs];

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <class SrcPtr, typename D>
    __host__ void horizontal_pass(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
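        // one NUM_SCAN_THREADS-wide block per image row; each block sweeps its
        // row bucket by bucket, so rows of arbitrary width are handled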
        const int NUM_SCAN_THREADS = 256;

        const dim3 block(NUM_SCAN_THREADS);
        const dim3 grid(rows);

        horizontal_pass<NUM_SCAN_THREADS><<<grid, block, 0, stream>>>(src, dst, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // horisontal_pass_8u_shfl

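    // unpack a 32-bit word into its four bytes (x = least significant byte)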
    __device__ static uchar4 int_to_uchar4(unsigned int in)
    {
        uchar4 bytes;
        bytes.x = (in & 0x000000ff) >>  0;
        bytes.y = (in & 0x0000ff00) >>  8;
        bytes.z = (in & 0x00ff0000) >> 16;
        bytes.w = (in & 0xff000000) >> 24;
        return bytes;
    }

    __global__ static void horisontal_pass_8u_shfl_kernel(const GlobPtr<uint4> img, GlobPtr<uint4> integral)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ int sums[128];

        const int id = threadIdx.x;
        const int lane_id = id % warpSize;
        const int warp_id = id / warpSize;

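        // each thread loads 16 consecutive 8-bit pixels of its row as a single
        // vectorized 16-byte (uint4) load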
        const uint4 data = img(blockIdx.x, id);

        const uchar4 a = int_to_uchar4(data.x);
        const uchar4 b = int_to_uchar4(data.y);
        const uchar4 c = int_to_uchar4(data.z);
        const uchar4 d = int_to_uchar4(data.w);

        int result[16];

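        // sequential inclusive prefix sum over the 16 byte values held by this thread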
        result[0]  =              a.x;
        result[1]  = result[0]  + a.y;
        result[2]  = result[1]  + a.z;
        result[3]  = result[2]  + a.w;

        result[4]  = result[3]  + b.x;
        result[5]  = result[4]  + b.y;
        result[6]  = result[5]  + b.z;
        result[7]  = result[6]  + b.w;

        result[8]  = result[7]  + c.x;
        result[9]  = result[8]  + c.y;
        result[10] = result[9]  + c.z;
        result[11] = result[10] + c.w;

        result[12] = result[11] + d.x;
        result[13] = result[12] + d.y;
        result[14] = result[13] + d.z;
        result[15] = result[14] + d.w;

        int sum = result[15];

        // the prefix sum over each thread's 16 values is computed;
        // now the final sums (result[15]) need to be shared
        // with the other threads and added. To do this,
        // the shfl_up() instruction is used and a shuffle scan
        // operation is performed to distribute the sums to the correct
        // threads
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
            const int n = shfl_up(sum, i, 32);

            if (lane_id >= i)
            {
                #pragma unroll
                for (int k = 0; k < 16; ++k)
                    result[k] += n;

                sum += n;
            }
        }

        // Now the final sum of each warp must be shared
        // between warps. This is done by having the last thread
        // in each warp store the warp's sum to shared memory;
        // warp 0 then loads those values and computes a prefix
        // sum over them, again using shfl_up. The results are
        // uniformly added back to the warps.
        if (threadIdx.x % warpSize == warpSize - 1)
            sums[warp_id] = result[15];

        __syncthreads();

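        // warp 0 performs an inclusive shuffle scan over the per-warp totals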
        if (warp_id == 0)
        {
            int warp_sum = sums[lane_id];

            #pragma unroll
            for (int i = 1; i <= 32; i *= 2)
            {
                const int n = shfl_up(warp_sum, i, 32);

                if (lane_id >= i)
                    warp_sum += n;
            }

            sums[lane_id] = warp_sum;
        }

        __syncthreads();

        int blockSum = 0;

        // add the total of all preceding warps to this warp's values
        if (warp_id > 0)
        {
            blockSum = sums[warp_id - 1];

            #pragma unroll
            for (int k = 0; k < 16; ++k)
                result[k] += blockSum;
        }

        // assemble result
        // Each thread has 16 values to write, which are
        // now integer data (to avoid overflow). Instead of
        // each thread writing consecutive uint4's, the
        // shuffle command is used to reformat the data
        // inside the registers so that each thread holds
        // consecutive data to be written, and larger contiguous
        // segments can be assembled for writing.

        /*
            For example, data that needs to be written as

            GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
            but is stored in registers (r0..r3), in four threads (0..3) as:

            threadId   0  1  2  3
              r0      x0 y0 z0 w0
              r1      x1 y1 z1 w1
              r2      x2 y2 z2 w2
              r3      x3 y3 z3 w3

              after applying shfl_xor operations to move data between registers r1..r3:

            threadId  00 01 10 11
                      x0 y0 z0 w0
             xor(01)->y1 x1 w1 z1
             xor(10)->z2 w2 x2 y2
             xor(11)->w3 z3 y3 x3

             and now x0..x3, and z0..z3 can be written out in order by all threads.

             In the current code, each register above is actually representing
             four integers to be written as uint4's to GMEM.
        */

        result[4]  = shfl_xor(result[4] , 1, 32);
        result[5]  = shfl_xor(result[5] , 1, 32);
        result[6]  = shfl_xor(result[6] , 1, 32);
        result[7]  = shfl_xor(result[7] , 1, 32);

        result[8]  = shfl_xor(result[8] , 2, 32);
        result[9]  = shfl_xor(result[9] , 2, 32);
        result[10] = shfl_xor(result[10], 2, 32);
        result[11] = shfl_xor(result[11], 2, 32);

        result[12] = shfl_xor(result[12], 3, 32);
        result[13] = shfl_xor(result[13], 3, 32);
        result[14] = shfl_xor(result[14], 3, 32);
        result[15] = shfl_xor(result[15], 3, 32);

        uint4* integral_row = integral.row(blockIdx.x);
        uint4 output;

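        // Four store rounds follow. In each round every thread in a group of
        // four writes one uint4, so after all four rounds the group has written
        // its 64 prefix-sum values as 16 consecutive uint4's.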
        ///////

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;

        // continuing from the above example,
        // this use of shfl_xor() places the y0..y3 and w0..w3 data
        // in order.

        #pragma unroll
        for (int i = 0; i < 16; ++i)
            result[i] = shfl_xor(result[i], 1, 32);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
    #endif
    }

    __host__ static void horisontal_pass_8u_shfl(const GlobPtr<uchar> src, GlobPtr<uint> integral, int rows, int cols, cudaStream_t stream)
    {
        // each thread handles 16 values, use 1 block/row
        // safe, because the row step actually can't be less than 512 bytes
        const int block = cols / 16;

        // launch 1 block / row
        const int grid = rows;

        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(horisontal_pass_8u_shfl_kernel, cudaFuncCachePreferL1) );

        GlobPtr<uint4> src4 = globPtr((uint4*) src.data, src.step);
        GlobPtr<uint4> integral4 = globPtr((uint4*) integral.data, integral.step);

        horisontal_pass_8u_shfl_kernel<<<grid, block, 0, stream>>>(src4, integral4);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // vertical

    template <typename T>
    __global__ void vertical_pass(GlobPtr<T> integral, const int rows, const int cols)
    {
    #if CV_CUDEV_ARCH >= 300
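        // 32x8 tile of partial sums in shared memory; the row stride of 9 pads
        // each row by one element, presumably to avoid shared-memory bank
        // conflicts when the tile is read back in transposed order below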
        __shared__ T sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        T stepSum = 0;

        int numBuckets = divUp(rows, blockDim.y);
        int y = threadIdx.y;

        while (numBuckets--)
        {
            T* p = integral.row(y) + tidx;

            T sum = (tidx < cols) && (y < rows) ? *p : 0;

            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();

            // the tile has been placed into SMEM; re-read it in a reformatted
            // (transposed) order so that each column's sums can be reduced with
            // a shuffle scan inside a warp, then read the results out properly
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            T partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                T n = shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            if ((tidx < cols) && (y < rows))
            {
                *p = sum;
            }

            y += blockDim.y;
        }
    #else
        __shared__ T smem[32][32];
        __shared__ T prevVals[32];

        volatile T* smem_row = &smem[0][0] + 64 * threadIdx.y;

        if (threadIdx.y == 0)
            prevVals[threadIdx.x] = 0;

        __syncthreads();

        const int x = blockIdx.x * blockDim.x + threadIdx.x;

        int numBuckets = divUp(rows, 8 * 4);
        int offsetY = 0;

        while (numBuckets--)
        {
            const int curRowOffs = offsetY + threadIdx.y;

            T curElems[4];
            T temp[4];

            // load patch

            smem[threadIdx.y +  0][threadIdx.x] = 0.0f;
            smem[threadIdx.y +  8][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 16][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 24][threadIdx.x] = 0.0f;

            if (x < cols)
            {
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        smem[threadIdx.y + i * 8][threadIdx.x] = integral(curRowOffs + i * 8, x);
                }
            }

            __syncthreads();

            // reduce

            curElems[0] = smem[threadIdx.x][threadIdx.y     ];
            curElems[1] = smem[threadIdx.x][threadIdx.y +  8];
            curElems[2] = smem[threadIdx.x][threadIdx.y + 16];
            curElems[3] = smem[threadIdx.x][threadIdx.y + 24];

            __syncthreads();

            temp[0] = curElems[0] = warpScanInclusive(curElems[0], smem_row, threadIdx.x);
            temp[1] = curElems[1] = warpScanInclusive(curElems[1], smem_row, threadIdx.x);
            temp[2] = curElems[2] = warpScanInclusive(curElems[2], smem_row, threadIdx.x);
            temp[3] = curElems[3] = warpScanInclusive(curElems[3], smem_row, threadIdx.x);

            curElems[0] += prevVals[threadIdx.y     ];
            curElems[1] += prevVals[threadIdx.y +  8];
            curElems[2] += prevVals[threadIdx.y + 16];
            curElems[3] += prevVals[threadIdx.y + 24];

            __syncthreads();

            if (threadIdx.x == 31)
            {
                prevVals[threadIdx.y     ] += temp[0];
                prevVals[threadIdx.y +  8] += temp[1];
                prevVals[threadIdx.y + 16] += temp[2];
                prevVals[threadIdx.y + 24] += temp[3];
            }

            smem[threadIdx.y     ][threadIdx.x] = curElems[0];
            smem[threadIdx.y +  8][threadIdx.x] = curElems[1];
            smem[threadIdx.y + 16][threadIdx.x] = curElems[2];
            smem[threadIdx.y + 24][threadIdx.x] = curElems[3];

            __syncthreads();

            // store patch

            if (x < cols)
            {
                // write the 4 values back to global memory
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        integral(curRowOffs + i * 8, x) = smem[threadIdx.x][threadIdx.y + i * 8];
                }
            }

            __syncthreads();

            offsetY += 8 * 4;
        }
    #endif
    }

    template <typename T>
    __host__ void vertical_pass(const GlobPtr<T>& integral, int rows, int cols, cudaStream_t stream)
    {
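        // 32x8 threads per block, one block per 32-column stripe;
        // each block iterates over all rows of its stripe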
        const dim3 block(32, 8);
        const dim3 grid(divUp(cols, block.x));

        vertical_pass<<<grid, block, 0, stream>>>(integral, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // integral

    template <class SrcPtr, typename D>
    __host__ void integral(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
        horizontal_pass(src, dst, rows, cols, stream);
        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
    {
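        // take the vectorized shuffle-scan path only when each row is a whole
        // number of 16-pixel (uint4) loads and both buffers are sufficiently
        // aligned for 16-byte vector access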
        if (deviceSupports(FEATURE_SET_COMPUTE_30)
            && (cols % 16 == 0)
            && reinterpret_cast<intptr_t>(src.data) % 32 == 0
            && reinterpret_cast<intptr_t>(dst.data) % 32 == 0)
        {
            horisontal_pass_8u_shfl(src, dst, rows, cols, stream);
        }
        else
        {
            horizontal_pass(src, dst, rows, cols, stream);
        }

        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
    {
        GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
        integral(src, dstui, rows, cols, stream);
    }
}

}}

#endif