/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "lbp.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace lbp
    {
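        // LBP feature evaluator. Each feature is a 3x3 grid of equally sized
        // cells; operator() reads the sixteen integral-image lattice points
        // that bound those cells, forms the nine cell sums s0..s8, and
        // compares the eight outer cells against the centre cell s4. The
        // eight sign bits are split between the returned value (s0..s2) and
        // the 'shift' output parameter (s3, s5..s8).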
        struct LBP
        {
            __host__ __device__ __forceinline__ LBP() {}

            __device__ __forceinline__ int operator() (const int* integral, int ty, int fh, int fw, int& shift) const
            {
                int anchors[9];

                anchors[0]  = integral[ty];
                anchors[1]  = integral[ty + fw];
                anchors[0] -= anchors[1];
                anchors[2]  = integral[ty + fw * 2];
                anchors[1] -= anchors[2];
                anchors[2] -= integral[ty + fw * 3];

                ty += fh;
                anchors[3]  = integral[ty];
                anchors[4]  = integral[ty + fw];
                anchors[3] -= anchors[4];
                anchors[5]  = integral[ty + fw * 2];
                anchors[4] -= anchors[5];
                anchors[5] -= integral[ty + fw * 3];

                anchors[0] -= anchors[3];
                anchors[1] -= anchors[4];
                anchors[2] -= anchors[5];
                // 0 - 2 contains s0 - s2

                ty += fh;
                anchors[6]  = integral[ty];
                anchors[7]  = integral[ty + fw];
                anchors[6] -= anchors[7];
                anchors[8]  = integral[ty + fw * 2];
                anchors[7] -= anchors[8];
                anchors[8] -= integral[ty + fw * 3];

                anchors[3] -= anchors[6];
                anchors[4] -= anchors[7];
                anchors[5] -= anchors[8];
                // 3 - 5 contains s3 - s5

                anchors[0] -= anchors[4];
                anchors[1] -= anchors[4];
                anchors[2] -= anchors[4];
                anchors[3] -= anchors[4];
                anchors[5] -= anchors[4];

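                // (v >> 31) is all ones for negative v and zero otherwise
                // (arithmetic shift), so (~(v >> 31)) & k yields k exactly
                // when a cell sum is >= the centre sum. The masks below pack
                // the comparison results into 'response' and 'shift' without
                // branches.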
                int response = (~(anchors[0] >> 31)) & 4;
                response |= (~(anchors[1] >> 31)) & 2;
                response |= (~(anchors[2] >> 31)) & 1;

                shift = (~(anchors[5] >> 31)) & 16;
                shift |= (~(anchors[3] >> 31)) & 1;

                ty += fh;
                anchors[0]  = integral[ty];
                anchors[1]  = integral[ty + fw];
                anchors[0] -= anchors[1];
                anchors[2]  = integral[ty + fw * 2];
                anchors[1] -= anchors[2];
                anchors[2] -= integral[ty + fw * 3];

                anchors[6] -= anchors[0];
                anchors[7] -= anchors[1];
                anchors[8] -= anchors[2];
                // 6 - 8 contains s6 - s8

                anchors[6] -= anchors[4];
                anchors[7] -= anchors[4];
                anchors[8] -= anchors[4];

                shift |= (~(anchors[6] >> 31)) & 2;
                shift |= (~(anchors[7] >> 31)) & 4;
                shift |= (~(anchors[8] >> 31)) & 8;
                return response;
            }
        };
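
        // For reference, a minimal host-side sketch of the same computation,
        // written with the usual four-corner integral-image identity. The
        // names (ii, pitch, cellSum) are illustrative only and not part of
        // this file; the device code above computes identical cell sums but
        // shares the sixteen lattice loads between neighbouring cells
        // instead of issuing four loads per cell:
        //
        //   static int cellSum(const int* ii, int pitch, int x, int y, int w, int h)
        //   {
        //       // sum over the w x h cell whose top-left corner is (x, y)
        //       return ii[y * pitch + x] - ii[y * pitch + x + w]
        //            - ii[(y + h) * pitch + x] + ii[(y + h) * pitch + x + w];
        //   }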

        template<typename Pr>
        __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            unsigned int tid = threadIdx.x;
            extern __shared__ int sbuff[];

            int* labels = sbuff;
            int* rrects = sbuff + n;
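            // sbuff is split in two: labels[0..n) receives each candidate's
            // equivalence class from partition(), and rrects[0..4n)
            // accumulates the per-class rectangle components (x, y, w, h),
            // matching the n * (sizeof(int) + sizeof(int4)) bytes of dynamic
            // shared memory passed by the host.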

            Pr predicate(grouping_eps);
            partition(candidates, n, labels, predicate);

            rrects[tid * 4 + 0] = 0;
            rrects[tid * 4 + 1] = 0;
            rrects[tid * 4 + 2] = 0;
            rrects[tid * 4 + 3] = 0;
            __syncthreads();

            int cls = labels[tid];
            Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);

            __syncthreads();
            labels[tid] = 0;

            __syncthreads();
            Emulation::smem::atomicInc((unsigned int*)labels + cls, n);

            __syncthreads();
            *nclasses = 0;

            int active = labels[tid];
            if (active)
            {
                int* r1 = rrects + tid * 4;
                float s = 1.f / active;
                r1[0] = saturate_cast<int>(r1[0] * s);
                r1[1] = saturate_cast<int>(r1[1] * s);
                r1[2] = saturate_cast<int>(r1[2] * s);
                r1[3] = saturate_cast<int>(r1[3] * s);
            }
            __syncthreads();

            if (active && active >= groupThreshold)
            {
                int* r1 = rrects + tid * 4;
                int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);

                int aidx = Emulation::smem::atomicInc(nclasses, n);
                objects[aidx] = r_out;
            }
        }

        void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            if (!ncandidates) return;
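            // Launch a single block with one thread per candidate: disjoin
            // synchronises all candidates with __syncthreads(), which only
            // works within one block, so this assumes ncandidates stays
            // within the device's maximum block size.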
            int block = ncandidates;
            int smem  = block * ( sizeof(int) + sizeof(int4) );
            disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
            cudaSafeCall( cudaGetLastError() );
        }

        struct Cascade
        {
            __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
                const int* _subsets, const uchar4* _features, int _subsetSize)
            : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}

            __device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch) const
            {
                int current_node = 0;
                int current_leave = 0;

                for (int s = 0; s < nstages; ++s)
                {
                    float sum = 0;
                    Stage stage = stages[s];
                    for (int t = 0; t < stage.ntrees; t++)
                    {
                        ClNode node = nodes[current_node];
                        uchar4 feature = features[node.featureIdx];

                        int shift;
                        int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
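                        // The eight cell comparisons form an 8-bit LBP code
                        // split in two: 'c' (3 bits) indexes into the node's
                        // subset words and 'shift' (5 bits) selects a bit
                        // within the chosen 32-bit word, deciding between the
                        // left and right leaf of the tree.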
                        int idx = (subsets[current_node * subsetSize + c] & (1 << shift)) ? current_leave : current_leave + 1;
                        sum += leaves[idx];

                        current_node += 1;
                        current_leave += 2;
                    }

                    if (sum < stage.threshold)
                        return false;
                }

                return true;
            }

            const Stage*  stages;
            const int nstages;

            const ClNode* nodes;
            const float* leaves;
            const int* subsets;
            const uchar4* features;

            const int subsetSize;
            const LBP evaluator;
        };

        // stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - y * width_k
        __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
            const int total, int* integral, const int pitch, PtrStepSz<int4> objects, unsigned int* classified)
        {
            int ftid = blockIdx.x * blockDim.x + threadIdx.x;
            if (ftid >= total) return;

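            // Each thread owns one sliding-window position in the scale
            // pyramid. The loop below subtracts each level's window count
            // from scaleTid until the thread's own level is reached; wshift
            // accumulates the x offset of that level inside the integral
            // image, where the scaled frames are laid out side by side.
            // Windows are placed every 2 pixels while scale <= 2 and every
            // pixel at larger scales ('step' is used as a shift amount).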
            int step = (scale <= 2.f);

            int windowsForLine = (__float2int_rn( __fdividef(frameW, scale)) - windowW) >> step;
            int stotal = windowsForLine * ( (__float2int_rn( __fdividef(frameH, scale)) - windowH) >> step);
            int wshift = 0;

            int scaleTid = ftid;

            while (scaleTid >= stotal)
            {
                scaleTid -= stotal;
                wshift += __float2int_rn(__fdividef(frameW, scale)) + 1;
                scale *= factor;
                step = (scale <= 2.f);
                windowsForLine = ( ((__float2int_rn(__fdividef(frameW, scale)) - windowW) >> step));
                stotal = windowsForLine * ( (__float2int_rn(__fdividef(frameH, scale)) - windowH) >> step);
            }

            int y = __fdividef(scaleTid, windowsForLine);
            int x = scaleTid - y * windowsForLine;

            x <<= step;
            y <<= step;

            if (cascade(y, x + wshift, integral, pitch))
            {
                if(x >= __float2int_rn(__fdividef(frameW, scale)) - windowW) return;

                int4 rect;
                rect.x = __float2int_rn(x * scale);
                rect.y = __float2int_rn(y * scale);
                rect.z = __float2int_rn(windowW * scale);
                rect.w = __float2int_rn(windowH * scale);

                int res = atomicInc(classified, (unsigned int)objects.cols);
                objects(0, res) = rect;
            }
        }

        void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
            const PtrStepSzb& mstages, const int nstages, const PtrStepSzi& mnodes, const PtrStepSzf& mleaves, const PtrStepSzi& msubsets, const PtrStepSzb& mfeatures,
            const int subsetSize, PtrStepSz<int4> objects, unsigned int* classified, PtrStepSzi integral)
        {
            const int block = 128;
            int grid = divUp(workAmount, block);
            cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
            Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
            lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
            cudaSafeCall( cudaGetLastError() );
        }
    }
}}}

#endif /* CUDA_DISABLER */