1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #if !defined CUDA_DISABLER
44 
45 #include <thrust/device_ptr.h>
46 #include <thrust/sort.h>
47 
48 #include "opencv2/core/cuda/common.hpp"
49 #include "opencv2/core/cuda/reduce.hpp"
50 #include "opencv2/core/cuda/functional.hpp"
51 
52 namespace cv { namespace cuda { namespace device
53 {
54     namespace orb
55     {
56         ////////////////////////////////////////////////////////////////////////////////////////////////////////
57         // cull
58 
cull_gpu(int * loc,float * response,int size,int n_points)59         int cull_gpu(int* loc, float* response, int size, int n_points)
60         {
61             thrust::device_ptr<int> loc_ptr(loc);
62             thrust::device_ptr<float> response_ptr(response);
63 
64             thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
65 
66             return n_points;
67         }
68 
69         ////////////////////////////////////////////////////////////////////////////////////////////////////////
70         // HarrisResponses
71 
HarrisResponses(const PtrStepb img,const short2 * loc_,float * response,const int npoints,const int blockSize,const float harris_k)72         __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
73         {
74             __shared__ int smem0[8 * 32];
75             __shared__ int smem1[8 * 32];
76             __shared__ int smem2[8 * 32];
77 
78             const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
79 
80             if (ptidx < npoints)
81             {
82                 const short2 loc = loc_[ptidx];
83 
84                 const int r = blockSize / 2;
85                 const int x0 = loc.x - r;
86                 const int y0 = loc.y - r;
87 
88                 int a = 0, b = 0, c = 0;
89 
90                 for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
91                 {
92                     const int i = ind / blockSize;
93                     const int j = ind % blockSize;
94 
95                     int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
96                         (img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
97                         (img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
98 
99                     int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
100                         (img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
101                         (img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
102 
103                     a += Ix * Ix;
104                     b += Iy * Iy;
105                     c += Ix * Iy;
106                 }
107 
108                 int* srow0 = smem0 + threadIdx.y * blockDim.x;
109                 int* srow1 = smem1 + threadIdx.y * blockDim.x;
110                 int* srow2 = smem2 + threadIdx.y * blockDim.x;
111 
112                 plus<int> op;
113                 reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
114 
115                 if (threadIdx.x == 0)
116                 {
117                     float scale = (1 << 2) * blockSize * 255.0f;
118                     scale = 1.0f / scale;
119                     const float scale_sq_sq = scale * scale * scale * scale;
120 
121                     response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
122                 }
123             }
124         }
125 
HarrisResponses_gpu(PtrStepSzb img,const short2 * loc,float * response,const int npoints,int blockSize,float harris_k,cudaStream_t stream)126         void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
127         {
128             dim3 block(32, 8);
129 
130             dim3 grid;
131             grid.x = divUp(npoints, block.y);
132 
133             HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
134 
135             cudaSafeCall( cudaGetLastError() );
136 
137             if (stream == 0)
138                 cudaSafeCall( cudaDeviceSynchronize() );
139         }
140 
141         ////////////////////////////////////////////////////////////////////////////////////////////////////////
142         // IC_Angle
143 
144         __constant__ int c_u_max[32];
145 
loadUMax(const int * u_max,int count)146         void loadUMax(const int* u_max, int count)
147         {
148             cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
149         }
150 
IC_Angle(const PtrStepb image,const short2 * loc_,float * angle,const int npoints,const int half_k)151         __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
152         {
153             __shared__ int smem0[8 * 32];
154             __shared__ int smem1[8 * 32];
155 
156             int* srow0 = smem0 + threadIdx.y * blockDim.x;
157             int* srow1 = smem1 + threadIdx.y * blockDim.x;
158 
159             plus<int> op;
160 
161             const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
162 
163             if (ptidx < npoints)
164             {
165                 int m_01 = 0, m_10 = 0;
166 
167                 const short2 loc = loc_[ptidx];
168 
169                 // Treat the center line differently, v=0
170                 for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
171                     m_10 += u * image(loc.y, loc.x + u);
172 
173                 reduce<32>(srow0, m_10, threadIdx.x, op);
174 
175                 for (int v = 1; v <= half_k; ++v)
176                 {
177                     // Proceed over the two lines
178                     int v_sum = 0;
179                     int m_sum = 0;
180                     const int d = c_u_max[v];
181 
182                     for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
183                     {
184                         int val_plus = image(loc.y + v, loc.x + u);
185                         int val_minus = image(loc.y - v, loc.x + u);
186 
187                         v_sum += (val_plus - val_minus);
188                         m_sum += u * (val_plus + val_minus);
189                     }
190 
191                     reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
192 
193                     m_10 += m_sum;
194                     m_01 += v * v_sum;
195                 }
196 
197                 if (threadIdx.x == 0)
198                 {
199                     float kp_dir = ::atan2f((float)m_01, (float)m_10);
200                     kp_dir += (kp_dir < 0) * (2.0f * CV_PI_F);
201                     kp_dir *= 180.0f / CV_PI_F;
202 
203                     angle[ptidx] = kp_dir;
204                 }
205             }
206         }
207 
IC_Angle_gpu(PtrStepSzb image,const short2 * loc,float * angle,int npoints,int half_k,cudaStream_t stream)208         void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
209         {
210             dim3 block(32, 8);
211 
212             dim3 grid;
213             grid.x = divUp(npoints, block.y);
214 
215             IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);
216 
217             cudaSafeCall( cudaGetLastError() );
218 
219             if (stream == 0)
220                 cudaSafeCall( cudaDeviceSynchronize() );
221         }
222 
223         ////////////////////////////////////////////////////////////////////////////////////////////////////////
224         // computeOrbDescriptor
225 
226         template <int WTA_K> struct OrbDescriptor;
227 
228         #define GET_VALUE(idx) \
229             img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
230                 loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
231 
232         template <> struct OrbDescriptor<2>
233         {
calccv::cuda::device::orb::OrbDescriptor234             __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
235             {
236                 pattern_x += 16 * i;
237                 pattern_y += 16 * i;
238 
239                 int t0, t1, val;
240 
241                 t0 = GET_VALUE(0); t1 = GET_VALUE(1);
242                 val = t0 < t1;
243 
244                 t0 = GET_VALUE(2); t1 = GET_VALUE(3);
245                 val |= (t0 < t1) << 1;
246 
247                 t0 = GET_VALUE(4); t1 = GET_VALUE(5);
248                 val |= (t0 < t1) << 2;
249 
250                 t0 = GET_VALUE(6); t1 = GET_VALUE(7);
251                 val |= (t0 < t1) << 3;
252 
253                 t0 = GET_VALUE(8); t1 = GET_VALUE(9);
254                 val |= (t0 < t1) << 4;
255 
256                 t0 = GET_VALUE(10); t1 = GET_VALUE(11);
257                 val |= (t0 < t1) << 5;
258 
259                 t0 = GET_VALUE(12); t1 = GET_VALUE(13);
260                 val |= (t0 < t1) << 6;
261 
262                 t0 = GET_VALUE(14); t1 = GET_VALUE(15);
263                 val |= (t0 < t1) << 7;
264 
265                 return val;
266             }
267         };
268 
269         template <> struct OrbDescriptor<3>
270         {
calccv::cuda::device::orb::OrbDescriptor271             __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
272             {
273                 pattern_x += 12 * i;
274                 pattern_y += 12 * i;
275 
276                 int t0, t1, t2, val;
277 
278                 t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
279                 val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
280 
281                 t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
282                 val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
283 
284                 t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
285                 val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
286 
287                 t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
288                 val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
289 
290                 return val;
291             }
292         };
293 
294         template <> struct OrbDescriptor<4>
295         {
calccv::cuda::device::orb::OrbDescriptor296             __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
297             {
298                 pattern_x += 16 * i;
299                 pattern_y += 16 * i;
300 
301                 int t0, t1, t2, t3, k, val;
302                 int a, b;
303 
304                 t0 = GET_VALUE(0); t1 = GET_VALUE(1);
305                 t2 = GET_VALUE(2); t3 = GET_VALUE(3);
306                 a = 0, b = 2;
307                 if( t1 > t0 ) t0 = t1, a = 1;
308                 if( t3 > t2 ) t2 = t3, b = 3;
309                 k = t0 > t2 ? a : b;
310                 val = k;
311 
312                 t0 = GET_VALUE(4); t1 = GET_VALUE(5);
313                 t2 = GET_VALUE(6); t3 = GET_VALUE(7);
314                 a = 0, b = 2;
315                 if( t1 > t0 ) t0 = t1, a = 1;
316                 if( t3 > t2 ) t2 = t3, b = 3;
317                 k = t0 > t2 ? a : b;
318                 val |= k << 2;
319 
320                 t0 = GET_VALUE(8); t1 = GET_VALUE(9);
321                 t2 = GET_VALUE(10); t3 = GET_VALUE(11);
322                 a = 0, b = 2;
323                 if( t1 > t0 ) t0 = t1, a = 1;
324                 if( t3 > t2 ) t2 = t3, b = 3;
325                 k = t0 > t2 ? a : b;
326                 val |= k << 4;
327 
328                 t0 = GET_VALUE(12); t1 = GET_VALUE(13);
329                 t2 = GET_VALUE(14); t3 = GET_VALUE(15);
330                 a = 0, b = 2;
331                 if( t1 > t0 ) t0 = t1, a = 1;
332                 if( t3 > t2 ) t2 = t3, b = 3;
333                 k = t0 > t2 ? a : b;
334                 val |= k << 6;
335 
336                 return val;
337             }
338         };
339 
340         #undef GET_VALUE
341 
342         template <int WTA_K>
computeOrbDescriptor(const PtrStepb img,const short2 * loc,const float * angle_,const int npoints,const int * pattern_x,const int * pattern_y,PtrStepb desc,int dsize)343         __global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
344             const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
345         {
346             const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
347             const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;
348 
349             if (ptidx < npoints && descidx < dsize)
350             {
351                 float angle = angle_[ptidx];
352                 angle *= (float)(CV_PI_F / 180.f);
353 
354                 float sina, cosa;
355                 ::sincosf(angle, &sina, &cosa);
356 
357                 desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
358             }
359         }
360 
computeOrbDescriptor_gpu(PtrStepb img,const short2 * loc,const float * angle,const int npoints,const int * pattern_x,const int * pattern_y,PtrStepb desc,int dsize,int WTA_K,cudaStream_t stream)361         void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
362             const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
363         {
364             dim3 block(32, 8);
365 
366             dim3 grid;
367             grid.x = divUp(dsize, block.x);
368             grid.y = divUp(npoints, block.y);
369 
370             switch (WTA_K)
371             {
372             case 2:
373                 computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
374                 break;
375 
376             case 3:
377                 computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
378                 break;
379 
380             case 4:
381                 computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
382                 break;
383             }
384 
385             cudaSafeCall( cudaGetLastError() );
386 
387             if (stream == 0)
388                 cudaSafeCall( cudaDeviceSynchronize() );
389         }
390 
391         ////////////////////////////////////////////////////////////////////////////////////////////////////////
392         // mergeLocation
393 
mergeLocation(const short2 * loc_,float * x,float * y,const int npoints,float scale)394         __global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
395         {
396             const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;
397 
398             if (ptidx < npoints)
399             {
400                 short2 loc = loc_[ptidx];
401 
402                 x[ptidx] = loc.x * scale;
403                 y[ptidx] = loc.y * scale;
404             }
405         }
406 
mergeLocation_gpu(const short2 * loc,float * x,float * y,int npoints,float scale,cudaStream_t stream)407         void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
408         {
409             dim3 block(256);
410 
411             dim3 grid;
412             grid.x = divUp(npoints, block.x);
413 
414             mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);
415 
416             cudaSafeCall( cudaGetLastError() );
417 
418             if (stream == 0)
419                 cudaSafeCall( cudaDeviceSynchronize() );
420         }
421     }
422 }}}
423 
424 #endif /* CUDA_DISABLER */
425