1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #if !defined CUDA_DISABLER
44 
45 #include <thrust/device_ptr.h>
46 #include <thrust/transform.h>
47 
48 #include "opencv2/core/cuda/common.hpp"
49 #include "opencv2/core/cuda/emulation.hpp"
50 #include "opencv2/core/cuda/vec_math.hpp"
51 #include "opencv2/core/cuda/functional.hpp"
52 
53 #include "opencv2/opencv_modules.hpp"
54 
55 #ifdef HAVE_OPENCV_CUDAARITHM
56 
57 namespace cv { namespace cuda { namespace device
58 {
59     namespace ght
60     {
61         __device__ int g_counter;
62 
63         template <typename T, int PIXELS_PER_THREAD>
buildEdgePointList(const PtrStepSzb edges,const PtrStep<T> dx,const PtrStep<T> dy,unsigned int * coordList,float * thetaList)64         __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy,
65                                            unsigned int* coordList, float* thetaList)
66         {
67             __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
68             __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
69             __shared__ int s_sizes[4];
70             __shared__ int s_globStart[4];
71 
72             const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
73             const int y = blockIdx.y * blockDim.y + threadIdx.y;
74 
75             if (threadIdx.x == 0)
76                 s_sizes[threadIdx.y] = 0;
77             __syncthreads();
78 
79             if (y < edges.rows)
80             {
81                 // fill the queue
82                 const uchar* edgesRow = edges.ptr(y);
83                 const T* dxRow = dx.ptr(y);
84                 const T* dyRow = dy.ptr(y);
85 
86                 for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
87                 {
88                     const T dxVal = dxRow[xx];
89                     const T dyVal = dyRow[xx];
90 
91                     if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
92                     {
93                         const unsigned int coord = (y << 16) | xx;
94 
95                         float theta = ::atan2f(dyVal, dxVal);
96                         if (theta < 0)
97                             theta += 2.0f * CV_PI_F;
98 
99                         const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);
100 
101                         s_coordLists[threadIdx.y][qidx] = coord;
102                         s_thetaLists[threadIdx.y][qidx] = theta;
103                     }
104                 }
105             }
106 
107             __syncthreads();
108 
109             // let one thread reserve the space required in the global list
110             if (threadIdx.x == 0 && threadIdx.y == 0)
111             {
112                 // find how many items are stored in each list
113                 int totalSize = 0;
114                 for (int i = 0; i < blockDim.y; ++i)
115                 {
116                     s_globStart[i] = totalSize;
117                     totalSize += s_sizes[i];
118                 }
119 
120                 // calculate the offset in the global list
121                 const int globalOffset = atomicAdd(&g_counter, totalSize);
122                 for (int i = 0; i < blockDim.y; ++i)
123                     s_globStart[i] += globalOffset;
124             }
125 
126             __syncthreads();
127 
128             // copy local queues to global queue
129             const int qsize = s_sizes[threadIdx.y];
130             int gidx = s_globStart[threadIdx.y] + threadIdx.x;
131             for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
132             {
133                 coordList[gidx] = s_coordLists[threadIdx.y][i];
134                 thetaList[gidx] = s_thetaLists[threadIdx.y][i];
135             }
136         }
137 
138         template <typename T>
buildEdgePointList_gpu(PtrStepSzb edges,PtrStepSzb dx,PtrStepSzb dy,unsigned int * coordList,float * thetaList)139         int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
140         {
141             const int PIXELS_PER_THREAD = 8;
142 
143             void* counterPtr;
144             cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
145 
146             cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
147 
148             const dim3 block(32, 4);
149             const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));
150 
151             cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
152 
153             buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
154             cudaSafeCall( cudaGetLastError() );
155 
156             cudaSafeCall( cudaDeviceSynchronize() );
157 
158             int totalCount;
159             cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
160 
161             return totalCount;
162         }
163 
164         template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
165         template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
166         template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
167 
buildRTable(const unsigned int * coordList,const float * thetaList,const int pointsCount,PtrStep<short2> r_table,int * r_sizes,int maxSize,const short2 templCenter,const float thetaScale)168         __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
169                                     PtrStep<short2> r_table, int* r_sizes, int maxSize,
170                                     const short2 templCenter, const float thetaScale)
171         {
172             const int tid = blockIdx.x * blockDim.x + threadIdx.x;
173 
174             if (tid >= pointsCount)
175                 return;
176 
177             const unsigned int coord = coordList[tid];
178             short2 p;
179             p.x = (coord & 0xFFFF);
180             p.y = (coord >> 16) & 0xFFFF;
181 
182             const float theta = thetaList[tid];
183             const int n = __float2int_rn(theta * thetaScale);
184 
185             const int ind = ::atomicAdd(r_sizes + n, 1);
186             if (ind < maxSize)
187                 r_table(n, ind) = saturate_cast<short2>(p - templCenter);
188         }
189 
buildRTable_gpu(const unsigned int * coordList,const float * thetaList,int pointsCount,PtrStepSz<short2> r_table,int * r_sizes,short2 templCenter,int levels)190         void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
191                              PtrStepSz<short2> r_table, int* r_sizes,
192                              short2 templCenter, int levels)
193         {
194             const dim3 block(256);
195             const dim3 grid(divUp(pointsCount, block.x));
196 
197             const float thetaScale = levels / (2.0f * CV_PI_F);
198 
199             buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
200             cudaSafeCall( cudaGetLastError() );
201 
202             cudaSafeCall( cudaDeviceSynchronize() );
203         }
204 
205         ////////////////////////////////////////////////////////////////////////
206         // Ballard_Pos
207 
Ballard_Pos_calcHist(const unsigned int * coordList,const float * thetaList,const int pointsCount,const PtrStep<short2> r_table,const int * r_sizes,PtrStepSzi hist,const float idp,const float thetaScale)208         __global__ void Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
209                                              const PtrStep<short2> r_table, const int* r_sizes,
210                                              PtrStepSzi hist,
211                                              const float idp, const float thetaScale)
212         {
213             const int tid = blockIdx.x * blockDim.x + threadIdx.x;
214 
215             if (tid >= pointsCount)
216                 return;
217 
218             const unsigned int coord = coordList[tid];
219             short2 p;
220             p.x = (coord & 0xFFFF);
221             p.y = (coord >> 16) & 0xFFFF;
222 
223             const float theta = thetaList[tid];
224             const int n = __float2int_rn(theta * thetaScale);
225 
226             const short2* r_row = r_table.ptr(n);
227             const int r_row_size = r_sizes[n];
228 
229             for (int j = 0; j < r_row_size; ++j)
230             {
231                 short2 c = saturate_cast<short2>(p - r_row[j]);
232 
233                 c.x = __float2int_rn(c.x * idp);
234                 c.y = __float2int_rn(c.y * idp);
235 
236                 if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
237                     ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
238             }
239         }
240 
Ballard_Pos_calcHist_gpu(const unsigned int * coordList,const float * thetaList,int pointsCount,PtrStepSz<short2> r_table,const int * r_sizes,PtrStepSzi hist,float dp,int levels)241         void Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
242                                       PtrStepSz<short2> r_table, const int* r_sizes,
243                                       PtrStepSzi hist,
244                                       float dp, int levels)
245         {
246             const dim3 block(256);
247             const dim3 grid(divUp(pointsCount, block.x));
248 
249             const float idp = 1.0f / dp;
250             const float thetaScale = levels / (2.0f * CV_PI_F);
251 
252             Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
253             cudaSafeCall( cudaGetLastError() );
254 
255             cudaSafeCall( cudaDeviceSynchronize() );
256         }
257 
Ballard_Pos_findPosInHist(const PtrStepSzi hist,float4 * out,int3 * votes,const int maxSize,const float dp,const int threshold)258         __global__ void Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes,
259                                                   const int maxSize, const float dp, const int threshold)
260         {
261             const int x = blockIdx.x * blockDim.x + threadIdx.x;
262             const int y = blockIdx.y * blockDim.y + threadIdx.y;
263 
264             if (x >= hist.cols - 2 || y >= hist.rows - 2)
265                 return;
266 
267             const int curVotes = hist(y + 1, x + 1);
268 
269             if (curVotes > threshold &&
270                 curVotes >  hist(y + 1, x) &&
271                 curVotes >= hist(y + 1, x + 2) &&
272                 curVotes >  hist(y, x + 1) &&
273                 curVotes >= hist(y + 2, x + 1))
274             {
275                 const int ind = ::atomicAdd(&g_counter, 1);
276 
277                 if (ind < maxSize)
278                 {
279                     out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
280                     votes[ind] = make_int3(curVotes, 0, 0);
281                 }
282             }
283         }
284 
Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist,float4 * out,int3 * votes,int maxSize,float dp,int threshold)285         int Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
286         {
287             void* counterPtr;
288             cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
289 
290             cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
291 
292             const dim3 block(32, 8);
293             const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
294 
295             cudaSafeCall( cudaFuncSetCacheConfig(Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );
296 
297             Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
298             cudaSafeCall( cudaGetLastError() );
299 
300             cudaSafeCall( cudaDeviceSynchronize() );
301 
302             int totalCount;
303             cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
304 
305             totalCount = ::min(totalCount, maxSize);
306 
307             return totalCount;
308         }
309 
310         ////////////////////////////////////////////////////////////////////////
311         // Guil_Full
312 
313         struct FeatureTable
314         {
315             uchar* p1_pos_data;
316             size_t p1_pos_step;
317 
318             uchar* p1_theta_data;
319             size_t p1_theta_step;
320 
321             uchar* p2_pos_data;
322             size_t p2_pos_step;
323 
324             uchar* d12_data;
325             size_t d12_step;
326 
327             uchar* r1_data;
328             size_t r1_step;
329 
330             uchar* r2_data;
331             size_t r2_step;
332         };
333 
334         __constant__ FeatureTable c_templFeatures;
335         __constant__ FeatureTable c_imageFeatures;
336 
Guil_Full_setTemplFeatures(PtrStepb p1_pos,PtrStepb p1_theta,PtrStepb p2_pos,PtrStepb d12,PtrStepb r1,PtrStepb r2)337         void Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
338         {
339             FeatureTable tbl;
340 
341             tbl.p1_pos_data = p1_pos.data;
342             tbl.p1_pos_step = p1_pos.step;
343 
344             tbl.p1_theta_data = p1_theta.data;
345             tbl.p1_theta_step = p1_theta.step;
346 
347             tbl.p2_pos_data = p2_pos.data;
348             tbl.p2_pos_step = p2_pos.step;
349 
350             tbl.d12_data = d12.data;
351             tbl.d12_step = d12.step;
352 
353             tbl.r1_data = r1.data;
354             tbl.r1_step = r1.step;
355 
356             tbl.r2_data = r2.data;
357             tbl.r2_step = r2.step;
358 
359             cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
360         }
Guil_Full_setImageFeatures(PtrStepb p1_pos,PtrStepb p1_theta,PtrStepb p2_pos,PtrStepb d12,PtrStepb r1,PtrStepb r2)361         void Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
362         {
363             FeatureTable tbl;
364 
365             tbl.p1_pos_data = p1_pos.data;
366             tbl.p1_pos_step = p1_pos.step;
367 
368             tbl.p1_theta_data = p1_theta.data;
369             tbl.p1_theta_step = p1_theta.step;
370 
371             tbl.p2_pos_data = p2_pos.data;
372             tbl.p2_pos_step = p2_pos.step;
373 
374             tbl.d12_data = d12.data;
375             tbl.d12_step = d12.step;
376 
377             tbl.r1_data = r1.data;
378             tbl.r1_step = r1.step;
379 
380             tbl.r2_data = r2.data;
381             tbl.r2_step = r2.step;
382 
383             cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
384         }
385 
386         struct TemplFeatureTable
387         {
p1_poscv::cuda::device::ght::TemplFeatureTable388             static __device__ float2* p1_pos(int n)
389             {
390                 return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
391             }
p1_thetacv::cuda::device::ght::TemplFeatureTable392             static __device__ float* p1_theta(int n)
393             {
394                 return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
395             }
p2_poscv::cuda::device::ght::TemplFeatureTable396             static __device__ float2* p2_pos(int n)
397             {
398                 return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
399             }
400 
d12cv::cuda::device::ght::TemplFeatureTable401             static __device__ float* d12(int n)
402             {
403                 return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
404             }
405 
r1cv::cuda::device::ght::TemplFeatureTable406             static __device__ float2* r1(int n)
407             {
408                 return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
409             }
r2cv::cuda::device::ght::TemplFeatureTable410             static __device__ float2* r2(int n)
411             {
412                 return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
413             }
414         };
415         struct ImageFeatureTable
416         {
p1_poscv::cuda::device::ght::ImageFeatureTable417             static __device__ float2* p1_pos(int n)
418             {
419                 return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
420             }
p1_thetacv::cuda::device::ght::ImageFeatureTable421             static __device__ float* p1_theta(int n)
422             {
423                 return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
424             }
p2_poscv::cuda::device::ght::ImageFeatureTable425             static __device__ float2* p2_pos(int n)
426             {
427                 return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
428             }
429 
d12cv::cuda::device::ght::ImageFeatureTable430             static __device__ float* d12(int n)
431             {
432                 return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
433             }
434 
r1cv::cuda::device::ght::ImageFeatureTable435             static __device__ float2* r1(int n)
436             {
437                 return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
438             }
r2cv::cuda::device::ght::ImageFeatureTable439             static __device__ float2* r2(int n)
440             {
441                 return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
442             }
443         };
444 
clampAngle(float a)445         __device__ float clampAngle(float a)
446         {
447             float res = a;
448 
449             while (res > 2.0f * CV_PI_F)
450                 res -= 2.0f * CV_PI_F;
451             while (res < 0.0f)
452                 res += 2.0f * CV_PI_F;
453 
454             return res;
455         }
456 
angleEq(float a,float b,float eps)457         __device__ bool angleEq(float a, float b, float eps)
458         {
459             return (::fabs(clampAngle(a - b)) <= eps);
460         }
461 
462         template <class FT, bool isTempl>
Guil_Full_buildFeatureList(const unsigned int * coordList,const float * thetaList,const int pointsCount,int * sizes,const int maxSize,const float xi,const float angleEpsilon,const float alphaScale,const float2 center,const float maxDist)463         __global__ void Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
464                                                    int* sizes, const int maxSize,
465                                                    const float xi, const float angleEpsilon, const float alphaScale,
466                                                    const float2 center, const float maxDist)
467         {
468             const float p1_theta = thetaList[blockIdx.x];
469             const unsigned int coord1 = coordList[blockIdx.x];
470             float2 p1_pos;
471             p1_pos.x = (coord1 & 0xFFFF);
472             p1_pos.y = (coord1 >> 16) & 0xFFFF;
473 
474             for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
475             {
476                 const float p2_theta = thetaList[i];
477                 const unsigned int coord2 = coordList[i];
478                 float2 p2_pos;
479                 p2_pos.x = (coord2 & 0xFFFF);
480                 p2_pos.y = (coord2 >> 16) & 0xFFFF;
481 
482                 if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
483                 {
484                     const float2 d = p1_pos - p2_pos;
485 
486                     float alpha12 = clampAngle(::atan2(d.y, d.x) - p1_theta);
487                     float d12 = ::sqrtf(d.x * d.x + d.y * d.y);
488 
489                     if (d12 > maxDist)
490                         continue;
491 
492                     float2 r1 = p1_pos - center;
493                     float2 r2 = p2_pos - center;
494 
495                     const int n = __float2int_rn(alpha12 * alphaScale);
496 
497                     const int ind = ::atomicAdd(sizes + n, 1);
498 
499                     if (ind < maxSize)
500                     {
501                         if (!isTempl)
502                         {
503                             FT::p1_pos(n)[ind] = p1_pos;
504                             FT::p2_pos(n)[ind] = p2_pos;
505                         }
506 
507                         FT::p1_theta(n)[ind] = p1_theta;
508 
509                         FT::d12(n)[ind] = d12;
510 
511                         if (isTempl)
512                         {
513                             FT::r1(n)[ind] = r1;
514                             FT::r2(n)[ind] = r2;
515                         }
516                     }
517                 }
518             }
519         }
520 
521         template <class FT, bool isTempl>
Guil_Full_buildFeatureList_caller(const unsigned int * coordList,const float * thetaList,int pointsCount,int * sizes,int maxSize,float xi,float angleEpsilon,int levels,float2 center,float maxDist)522         void Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
523                                                int* sizes, int maxSize,
524                                                float xi, float angleEpsilon, int levels,
525                                                float2 center, float maxDist)
526         {
527             const dim3 block(256);
528             const dim3 grid(pointsCount);
529 
530             const float alphaScale = levels / (2.0f * CV_PI_F);
531 
532             Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
533                                                                      sizes, maxSize,
534                                                                      xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
535                                                                      center, maxDist);
536             cudaSafeCall( cudaGetLastError() );
537 
538             cudaSafeCall( cudaDeviceSynchronize() );
539 
540             thrust::device_ptr<int> sizesPtr(sizes);
541             thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
542         }
543 
Guil_Full_buildTemplFeatureList_gpu(const unsigned int * coordList,const float * thetaList,int pointsCount,int * sizes,int maxSize,float xi,float angleEpsilon,int levels,float2 center,float maxDist)544         void Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
545                                                  int* sizes, int maxSize,
546                                                  float xi, float angleEpsilon, int levels,
547                                                  float2 center, float maxDist)
548         {
549             Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
550                                                                        sizes, maxSize,
551                                                                        xi, angleEpsilon, levels,
552                                                                        center, maxDist);
553         }
Guil_Full_buildImageFeatureList_gpu(const unsigned int * coordList,const float * thetaList,int pointsCount,int * sizes,int maxSize,float xi,float angleEpsilon,int levels,float2 center,float maxDist)554         void Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
555                                                  int* sizes, int maxSize,
556                                                  float xi, float angleEpsilon, int levels,
557                                                  float2 center, float maxDist)
558         {
559             Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
560                                                                         sizes, maxSize,
561                                                                         xi, angleEpsilon, levels,
562                                                                         center, maxDist);
563         }
564 
Guil_Full_calcOHist(const int * templSizes,const int * imageSizes,int * OHist,const float minAngle,const float maxAngle,const float iAngleStep,const int angleRange)565         __global__ void Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
566                                             const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
567         {
568             extern __shared__ int s_OHist[];
569             for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
570                 s_OHist[i] = 0;
571             __syncthreads();
572 
573             const int tIdx = blockIdx.x;
574             const int level = blockIdx.y;
575 
576             const int tSize = templSizes[level];
577 
578             if (tIdx < tSize)
579             {
580                 const int imSize = imageSizes[level];
581 
582                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];
583 
584                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
585                 {
586                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
587 
588                     const float angle = clampAngle(im_p1_theta - t_p1_theta);
589 
590                     if (angle >= minAngle && angle <= maxAngle)
591                     {
592                         const int n = __float2int_rn((angle - minAngle) * iAngleStep);
593                         Emulation::smem::atomicAdd(&s_OHist[n], 1);
594                     }
595                 }
596             }
597             __syncthreads();
598 
599             for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
600                 ::atomicAdd(OHist + i, s_OHist[i]);
601         }
602 
Guil_Full_calcOHist_gpu(const int * templSizes,const int * imageSizes,int * OHist,float minAngle,float maxAngle,float angleStep,int angleRange,int levels,int tMaxSize)603         void Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
604                                      float minAngle, float maxAngle, float angleStep, int angleRange,
605                                      int levels, int tMaxSize)
606         {
607             const dim3 block(256);
608             const dim3 grid(tMaxSize, levels + 1);
609 
610             minAngle *= (CV_PI_F / 180.0f);
611             maxAngle *= (CV_PI_F / 180.0f);
612             angleStep *= (CV_PI_F / 180.0f);
613 
614             const size_t smemSize = (angleRange + 1) * sizeof(float);
615 
616             Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
617                                                            minAngle, maxAngle, 1.0f / angleStep, angleRange);
618             cudaSafeCall( cudaGetLastError() );
619 
620             cudaSafeCall( cudaDeviceSynchronize() );
621         }
622 
Guil_Full_calcSHist(const int * templSizes,const int * imageSizes,int * SHist,const float angle,const float angleEpsilon,const float minScale,const float maxScale,const float iScaleStep,const int scaleRange)623         __global__ void Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
624                                             const float angle, const float angleEpsilon,
625                                             const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
626         {
627             extern __shared__ int s_SHist[];
628             for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
629                 s_SHist[i] = 0;
630             __syncthreads();
631 
632             const int tIdx = blockIdx.x;
633             const int level = blockIdx.y;
634 
635             const int tSize = templSizes[level];
636 
637             if (tIdx < tSize)
638             {
639                 const int imSize = imageSizes[level];
640 
641                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
642                 const float t_d12 = TemplFeatureTable::d12(level)[tIdx] + angle;
643 
644                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
645                 {
646                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
647                     const float im_d12 = ImageFeatureTable::d12(level)[i];
648 
649                     if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
650                     {
651                         const float scale = im_d12 / t_d12;
652 
653                         if (scale >= minScale && scale <= maxScale)
654                         {
655                             const int s = __float2int_rn((scale - minScale) * iScaleStep);
656                             Emulation::smem::atomicAdd(&s_SHist[s], 1);
657                         }
658                     }
659                 }
660             }
661             __syncthreads();
662 
663             for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
664                 ::atomicAdd(SHist + i, s_SHist[i]);
665         }
666 
Guil_Full_calcSHist_gpu(const int * templSizes,const int * imageSizes,int * SHist,float angle,float angleEpsilon,float minScale,float maxScale,float iScaleStep,int scaleRange,int levels,int tMaxSize)667         void Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
668                                      float angle, float angleEpsilon,
669                                      float minScale, float maxScale, float iScaleStep, int scaleRange,
670                                      int levels, int tMaxSize)
671         {
672             const dim3 block(256);
673             const dim3 grid(tMaxSize, levels + 1);
674 
675             angle *= (CV_PI_F / 180.0f);
676             angleEpsilon *= (CV_PI_F / 180.0f);
677 
678             const size_t smemSize = (scaleRange + 1) * sizeof(float);
679 
680             Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
681                                                            angle, angleEpsilon,
682                                                            minScale, maxScale, iScaleStep, scaleRange);
683             cudaSafeCall( cudaGetLastError() );
684 
685             cudaSafeCall( cudaDeviceSynchronize() );
686         }
687 
Guil_Full_calcPHist(const int * templSizes,const int * imageSizes,PtrStepSzi PHist,const float angle,const float sinVal,const float cosVal,const float angleEpsilon,const float scale,const float idp)688         __global__ void Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
689                                             const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
690                                             const float idp)
691         {
692             const int tIdx = blockIdx.x;
693             const int level = blockIdx.y;
694 
695             const int tSize = templSizes[level];
696 
697             if (tIdx < tSize)
698             {
699                 const int imSize = imageSizes[level];
700 
701                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
702 
703                 float2 r1 = TemplFeatureTable::r1(level)[tIdx];
704                 float2 r2 = TemplFeatureTable::r2(level)[tIdx];
705 
706                 r1 = r1 * scale;
707                 r2 = r2 * scale;
708 
709                 r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
710                 r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);
711 
712                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
713                 {
714                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
715 
716                     const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
717                     const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];
718 
719                     if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
720                     {
721                         float2 c1, c2;
722 
723                         c1 = im_p1_pos - r1;
724                         c1 = c1 * idp;
725 
726                         c2 = im_p2_pos - r2;
727                         c2 = c2 * idp;
728 
729                         if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
730                             continue;
731 
732                         if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
733                             ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
734                     }
735                 }
736             }
737         }
738 
Guil_Full_calcPHist_gpu(const int * templSizes,const int * imageSizes,PtrStepSzi PHist,float angle,float angleEpsilon,float scale,float dp,int levels,int tMaxSize)739         void Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
740                                      float angle, float angleEpsilon, float scale,
741                                      float dp,
742                                      int levels, int tMaxSize)
743         {
744             const dim3 block(256);
745             const dim3 grid(tMaxSize, levels + 1);
746 
747             angle *= (CV_PI_F / 180.0f);
748             angleEpsilon *= (CV_PI_F / 180.0f);
749 
750             const float sinVal = ::sinf(angle);
751             const float cosVal = ::cosf(angle);
752 
753             cudaSafeCall( cudaFuncSetCacheConfig(Guil_Full_calcPHist, cudaFuncCachePreferL1) );
754 
755             Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
756                                                  angle, sinVal, cosVal, angleEpsilon, scale,
757                                                  1.0f / dp);
758             cudaSafeCall( cudaGetLastError() );
759 
760             cudaSafeCall( cudaDeviceSynchronize() );
761         }
762 
Guil_Full_findPosInHist(const PtrStepSzi hist,float4 * out,int3 * votes,const int maxSize,const float angle,const int angleVotes,const float scale,const int scaleVotes,const float dp,const int threshold)763         __global__ void Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
764                                                 const float angle, const int angleVotes, const float scale, const int scaleVotes,
765                                                 const float dp, const int threshold)
766         {
767             const int x = blockIdx.x * blockDim.x + threadIdx.x;
768             const int y = blockIdx.y * blockDim.y + threadIdx.y;
769 
770             if (x >= hist.cols - 2 || y >= hist.rows - 2)
771                 return;
772 
773             const int curVotes = hist(y + 1, x + 1);
774 
775             if (curVotes > threshold &&
776                 curVotes >  hist(y + 1, x) &&
777                 curVotes >= hist(y + 1, x + 2) &&
778                 curVotes >  hist(y, x + 1) &&
779                 curVotes >= hist(y + 2, x + 1))
780             {
781                 const int ind = ::atomicAdd(&g_counter, 1);
782 
783                 if (ind < maxSize)
784                 {
785                     out[ind] = make_float4(x * dp, y * dp, scale, angle);
786                     votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
787                 }
788             }
789         }
790 
Guil_Full_findPosInHist_gpu(PtrStepSzi hist,float4 * out,int3 * votes,int curSize,int maxSize,float angle,int angleVotes,float scale,int scaleVotes,float dp,int threshold)791         int Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
792                                         float angle, int angleVotes, float scale, int scaleVotes,
793                                         float dp, int threshold)
794         {
795             void* counterPtr;
796             cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
797 
798             cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );
799 
800             const dim3 block(32, 8);
801             const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
802 
803             cudaSafeCall( cudaFuncSetCacheConfig(Guil_Full_findPosInHist, cudaFuncCachePreferL1) );
804 
805             Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
806                                                      angle, angleVotes, scale, scaleVotes,
807                                                      dp, threshold);
808             cudaSafeCall( cudaGetLastError() );
809 
810             cudaSafeCall( cudaDeviceSynchronize() );
811 
812             int totalCount;
813             cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
814 
815             totalCount = ::min(totalCount, maxSize);
816 
817             return totalCount;
818         }
819     }
820 }}}
821 
822 #endif // HAVE_OPENCV_CUDAARITHM
823 
824 #endif /* CUDA_DISABLER */
825