1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #if !defined CUDA_DISABLER
44 
45 #include "opencv2/core/cuda/common.hpp"
46 #include "opencv2/core/cuda/reduce.hpp"
47 #include "opencv2/core/cuda/functional.hpp"
48 #include "opencv2/core/cuda/warp_shuffle.hpp"
49 
50 namespace cv { namespace cuda { namespace device
51 {
52     // Other values are not supported
53     #define CELL_WIDTH 8
54     #define CELL_HEIGHT 8
55     #define CELLS_PER_BLOCK_X 2
56     #define CELLS_PER_BLOCK_Y 2
57 
58     namespace hog
59     {
60         __constant__ int cnbins;
61         __constant__ int cblock_stride_x;
62         __constant__ int cblock_stride_y;
63         __constant__ int cnblocks_win_x;
64         __constant__ int cnblocks_win_y;
65         __constant__ int cblock_hist_size;
66         __constant__ int cblock_hist_size_2up;
67         __constant__ int cdescr_size;
68         __constant__ int cdescr_width;
69 
70 
71         /* Returns the nearest upper power of two, works only for
72         the typical GPU thread count (pert block) values */
power_2up(unsigned int n)73         int power_2up(unsigned int n)
74         {
75             if (n < 1) return 1;
76             else if (n < 2) return 2;
77             else if (n < 4) return 4;
78             else if (n < 8) return 8;
79             else if (n < 16) return 16;
80             else if (n < 32) return 32;
81             else if (n < 64) return 64;
82             else if (n < 128) return 128;
83             else if (n < 256) return 256;
84             else if (n < 512) return 512;
85             else if (n < 1024) return 1024;
86             return -1; // Input is too big
87         }
88 
89 
set_up_constants(int nbins,int block_stride_x,int block_stride_y,int nblocks_win_x,int nblocks_win_y)90         void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
91                               int nblocks_win_x, int nblocks_win_y)
92         {
93             cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
94             cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
95             cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
96             cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
97             cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
98 
99             int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
100             cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
101 
102             int block_hist_size_2up = power_2up(block_hist_size);
103             cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
104 
105             int descr_width = nblocks_win_x * block_hist_size;
106             cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );
107 
108             int descr_size = descr_width * nblocks_win_y;
109             cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
110         }
111 
112 
113         //----------------------------------------------------------------------------
114         // Histogram computation
115 
116 
117         template <int nblocks> // Number of histogram blocks processed by single GPU thread block
compute_hists_kernel_many_blocks(const int img_block_width,const PtrStepf grad,const PtrStepb qangle,float scale,float * block_hists)118         __global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad,
119                                                          const PtrStepb qangle, float scale, float* block_hists)
120         {
121             const int block_x = threadIdx.z;
122             const int cell_x = threadIdx.x / 16;
123             const int cell_y = threadIdx.y;
124             const int cell_thread_x = threadIdx.x & 0xF;
125 
126             if (blockIdx.x * blockDim.z + block_x >= img_block_width)
127                 return;
128 
129             extern __shared__ float smem[];
130             float* hists = smem;
131             float* final_hist = smem + cnbins * 48 * nblocks;
132 
133             const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x +
134                                  4 * cell_x + cell_thread_x;
135             const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;
136 
137             const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
138             const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;
139 
140             // 12 means that 12 pixels affect on block's cell (in one row)
141             if (cell_thread_x < 12)
142             {
143                 float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y +
144                                             cell_x + block_x * CELLS_PER_BLOCK_X) +
145                                            cell_thread_x;
146                 for (int bin_id = 0; bin_id < cnbins; ++bin_id)
147                     hist[bin_id * 48 * nblocks] = 0.f;
148 
149                 const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;
150 
151                 const int dist_y_begin = -4 - 4 * (int)threadIdx.y;
152                 for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
153                 {
154                     float2 vote = *(const float2*)grad_ptr;
155                     uchar2 bin = *(const uchar2*)qangle_ptr;
156 
157                     grad_ptr += grad.step/sizeof(float);
158                     qangle_ptr += qangle.step;
159 
160                     int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
161                     int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
162 
163                     float gaussian = ::expf(-(dist_center_y * dist_center_y +
164                                               dist_center_x * dist_center_x) * scale);
165                     float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
166                                           (8.f - ::fabs(dist_x + 0.5f)) / 64.f;
167 
168                     hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
169                     hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
170                 }
171 
172                 volatile float* hist_ = hist;
173                 for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)
174                 {
175                     if (cell_thread_x < 6) hist_[0] += hist_[6];
176                     if (cell_thread_x < 3) hist_[0] += hist_[3];
177                     if (cell_thread_x == 0)
178                         final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id]
179                             = hist_[0] + hist_[1] + hist_[2];
180                 }
181             }
182 
183             __syncthreads();
184 
185             float* block_hist = block_hists + (blockIdx.y * img_block_width +
186                                                blockIdx.x * blockDim.z + block_x) *
187                                               cblock_hist_size;
188 
189             int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
190             if (tid < cblock_hist_size)
191                 block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];
192         }
193 
194 
compute_hists(int nbins,int block_stride_x,int block_stride_y,int height,int width,const PtrStepSzf & grad,const PtrStepSzb & qangle,float sigma,float * block_hists)195         void compute_hists(int nbins, int block_stride_x, int block_stride_y,
196                            int height, int width, const PtrStepSzf& grad,
197                            const PtrStepSzb& qangle, float sigma, float* block_hists)
198         {
199             const int nblocks = 1;
200 
201             int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
202                                   block_stride_x;
203             int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) /
204                                    block_stride_y;
205 
206             dim3 grid(divUp(img_block_width, nblocks), img_block_height);
207             dim3 threads(32, 2, nblocks);
208 
209             cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>,
210                                                 cudaFuncCachePreferL1));
211 
212             // Precompute gaussian spatial window parameter
213             float scale = 1.f / (2.f * sigma * sigma);
214 
215             int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);
216             int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);
217             int smem = hists_size + final_hists_size;
218             compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
219                 img_block_width, grad, qangle, scale, block_hists);
220             cudaSafeCall( cudaGetLastError() );
221 
222             cudaSafeCall( cudaDeviceSynchronize() );
223         }
224 
225 
226         //-------------------------------------------------------------
227         //  Normalization of histograms via L2Hys_norm
228         //
229 
230 
231         template<int size>
reduce_smem(float * smem,float val)232         __device__ float reduce_smem(float* smem, float val)
233         {
234             unsigned int tid = threadIdx.x;
235             float sum = val;
236 
237             reduce<size>(smem, sum, tid, plus<float>());
238 
239             if (size == 32)
240             {
241             #if __CUDA_ARCH__ >= 300
242                 return shfl(sum, 0);
243             #else
244                 return smem[0];
245             #endif
246             }
247             else
248             {
249             #if __CUDA_ARCH__ >= 300
250                 if (threadIdx.x == 0)
251                     smem[0] = sum;
252             #endif
253 
254                 __syncthreads();
255 
256                 return smem[0];
257             }
258         }
259 
260 
261         template <int nthreads, // Number of threads which process one block historgam
262                   int nblocks> // Number of block hisograms processed by one GPU thread block
normalize_hists_kernel_many_blocks(const int block_hist_size,const int img_block_width,float * block_hists,float threshold)263         __global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
264                                                            const int img_block_width,
265                                                            float* block_hists, float threshold)
266         {
267             if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)
268                 return;
269 
270             float* hist = block_hists + (blockIdx.y * img_block_width +
271                                          blockIdx.x * blockDim.z + threadIdx.z) *
272                                         block_hist_size + threadIdx.x;
273 
274             __shared__ float sh_squares[nthreads * nblocks];
275             float* squares = sh_squares + threadIdx.z * nthreads;
276 
277             float elem = 0.f;
278             if (threadIdx.x < block_hist_size)
279                 elem = hist[0];
280 
281             float sum = reduce_smem<nthreads>(squares, elem * elem);
282 
283             float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
284             elem = ::min(elem * scale, threshold);
285 
286             sum = reduce_smem<nthreads>(squares, elem * elem);
287 
288             scale = 1.0f / (::sqrtf(sum) + 1e-3f);
289 
290             if (threadIdx.x < block_hist_size)
291                 hist[0] = elem * scale;
292         }
293 
294 
normalize_hists(int nbins,int block_stride_x,int block_stride_y,int height,int width,float * block_hists,float threshold)295         void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
296                              int height, int width, float* block_hists, float threshold)
297         {
298             const int nblocks = 1;
299 
300             int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
301             int nthreads = power_2up(block_hist_size);
302             dim3 threads(nthreads, 1, nblocks);
303 
304             int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
305             int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
306             dim3 grid(divUp(img_block_width, nblocks), img_block_height);
307 
308             if (nthreads == 32)
309                 normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
310             else if (nthreads == 64)
311                 normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
312             else if (nthreads == 128)
313                 normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
314             else if (nthreads == 256)
315                 normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
316             else if (nthreads == 512)
317                 normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
318             else
319                 CV_Error(cv::Error::StsBadArg, "normalize_hists: histogram's size is too big, try to decrease number of bins");
320 
321             cudaSafeCall( cudaGetLastError() );
322 
323             cudaSafeCall( cudaDeviceSynchronize() );
324         }
325 
326 
327         //---------------------------------------------------------------------
328         //  Linear SVM based classification
329         //
330 
331        // return confidence values not just positive location
332        template <int nthreads, // Number of threads per one histogram block
333                  int nblocks>  // Number of histogram block processed by single GPU thread block
compute_confidence_hists_kernel_many_blocks(const int img_win_width,const int img_block_width,const int win_block_stride_x,const int win_block_stride_y,const float * block_hists,const float * coefs,float free_coef,float threshold,float * confidences)334        __global__ void compute_confidence_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
335                                                                                                            const int win_block_stride_x, const int win_block_stride_y,
336                                                                                                            const float* block_hists, const float* coefs,
337                                                                                                            float free_coef, float threshold, float* confidences)
338        {
339            const int win_x = threadIdx.z;
340            if (blockIdx.x * blockDim.z + win_x >= img_win_width)
341                    return;
342 
343            const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
344                                                                                 blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
345                                                                                cblock_hist_size;
346 
347            float product = 0.f;
348            for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
349            {
350                    int offset_y = i / cdescr_width;
351                    int offset_x = i - offset_y * cdescr_width;
352                    product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
353            }
354 
355            __shared__ float products[nthreads * nblocks];
356 
357            const int tid = threadIdx.z * nthreads + threadIdx.x;
358 
359            reduce<nthreads>(products, product, tid, plus<float>());
360 
361            if (threadIdx.x == 0)
362                confidences[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = product + free_coef;
363 
364        }
365 
compute_confidence_hists(int win_height,int win_width,int block_stride_y,int block_stride_x,int win_stride_y,int win_stride_x,int height,int width,float * block_hists,float * coefs,float free_coef,float threshold,float * confidences)366        void compute_confidence_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
367                                                int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
368                                                float* coefs, float free_coef, float threshold, float *confidences)
369        {
370            const int nthreads = 256;
371            const int nblocks = 1;
372 
373            int win_block_stride_x = win_stride_x / block_stride_x;
374            int win_block_stride_y = win_stride_y / block_stride_y;
375            int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
376            int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
377 
378            dim3 threads(nthreads, 1, nblocks);
379            dim3 grid(divUp(img_win_width, nblocks), img_win_height);
380 
381            cudaSafeCall(cudaFuncSetCacheConfig(compute_confidence_hists_kernel_many_blocks<nthreads, nblocks>,
382                                                                                    cudaFuncCachePreferL1));
383 
384            int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) /
385                                                        block_stride_x;
386            compute_confidence_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
387                    img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
388                    block_hists, coefs, free_coef, threshold, confidences);
389            cudaSafeCall(cudaThreadSynchronize());
390        }
391 
392 
393 
394         template <int nthreads, // Number of threads per one histogram block
395                   int nblocks>  // Number of histogram block processed by single GPU thread block
classify_hists_kernel_many_blocks(const int img_win_width,const int img_block_width,const int win_block_stride_x,const int win_block_stride_y,const float * block_hists,const float * coefs,float free_coef,float threshold,unsigned char * labels)396         __global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width,
397                                                           const int win_block_stride_x, const int win_block_stride_y,
398                                                           const float* block_hists, const float* coefs,
399                                                           float free_coef, float threshold, unsigned char* labels)
400         {
401             const int win_x = threadIdx.z;
402             if (blockIdx.x * blockDim.z + win_x >= img_win_width)
403                 return;
404 
405             const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
406                                                blockIdx.x * win_block_stride_x * blockDim.z + win_x) *
407                                               cblock_hist_size;
408 
409             float product = 0.f;
410             for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
411             {
412                 int offset_y = i / cdescr_width;
413                 int offset_x = i - offset_y * cdescr_width;
414                 product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
415             }
416 
417             __shared__ float products[nthreads * nblocks];
418 
419             const int tid = threadIdx.z * nthreads + threadIdx.x;
420 
421             reduce<nthreads>(products, product, tid, plus<float>());
422 
423             if (threadIdx.x == 0)
424                 labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
425         }
426 
427 
classify_hists(int win_height,int win_width,int block_stride_y,int block_stride_x,int win_stride_y,int win_stride_x,int height,int width,float * block_hists,float * coefs,float free_coef,float threshold,unsigned char * labels)428         void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x,
429                             int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
430                             float* coefs, float free_coef, float threshold, unsigned char* labels)
431         {
432             const int nthreads = 256;
433             const int nblocks = 1;
434 
435             int win_block_stride_x = win_stride_x / block_stride_x;
436             int win_block_stride_y = win_stride_y / block_stride_y;
437             int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
438             int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
439 
440             dim3 threads(nthreads, 1, nblocks);
441             dim3 grid(divUp(img_win_width, nblocks), img_win_height);
442 
443             cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));
444 
445             int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
446             classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
447                 img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
448                 block_hists, coefs, free_coef, threshold, labels);
449             cudaSafeCall( cudaGetLastError() );
450 
451             cudaSafeCall( cudaDeviceSynchronize() );
452         }
453 
454         //----------------------------------------------------------------------------
455         // Extract descriptors
456 
457 
458         template <int nthreads>
extract_descrs_by_rows_kernel(const int img_block_width,const int win_block_stride_x,const int win_block_stride_y,const float * block_hists,PtrStepf descriptors)459         __global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
460                                                       const float* block_hists, PtrStepf descriptors)
461         {
462             // Get left top corner of the window in src
463             const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
464                                                blockIdx.x * win_block_stride_x) * cblock_hist_size;
465 
466             // Get left top corner of the window in dst
467             float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);
468 
469             // Copy elements from src to dst
470             for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
471             {
472                 int offset_y = i / cdescr_width;
473                 int offset_x = i - offset_y * cdescr_width;
474                 descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
475             }
476         }
477 
478 
extract_descrs_by_rows(int win_height,int win_width,int block_stride_y,int block_stride_x,int win_stride_y,int win_stride_x,int height,int width,float * block_hists,PtrStepSzf descriptors)479         void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
480                                     int height, int width, float* block_hists, PtrStepSzf descriptors)
481         {
482             const int nthreads = 256;
483 
484             int win_block_stride_x = win_stride_x / block_stride_x;
485             int win_block_stride_y = win_stride_y / block_stride_y;
486             int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
487             int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
488             dim3 threads(nthreads, 1);
489             dim3 grid(img_win_width, img_win_height);
490 
491             int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
492             extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
493                 img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
494             cudaSafeCall( cudaGetLastError() );
495 
496             cudaSafeCall( cudaDeviceSynchronize() );
497         }
498 
499 
500         template <int nthreads>
extract_descrs_by_cols_kernel(const int img_block_width,const int win_block_stride_x,const int win_block_stride_y,const float * block_hists,PtrStepf descriptors)501         __global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
502                                                       const int win_block_stride_y, const float* block_hists,
503                                                       PtrStepf descriptors)
504         {
505             // Get left top corner of the window in src
506             const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
507                                                blockIdx.x * win_block_stride_x) * cblock_hist_size;
508 
509             // Get left top corner of the window in dst
510             float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);
511 
512             // Copy elements from src to dst
513             for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
514             {
515                 int block_idx = i / cblock_hist_size;
516                 int idx_in_block = i - block_idx * cblock_hist_size;
517 
518                 int y = block_idx / cnblocks_win_x;
519                 int x = block_idx - y * cnblocks_win_x;
520 
521                 descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block]
522                     = hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
523             }
524         }
525 
526 
extract_descrs_by_cols(int win_height,int win_width,int block_stride_y,int block_stride_x,int win_stride_y,int win_stride_x,int height,int width,float * block_hists,PtrStepSzf descriptors)527         void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
528                                     int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
529                                     PtrStepSzf descriptors)
530         {
531             const int nthreads = 256;
532 
533             int win_block_stride_x = win_stride_x / block_stride_x;
534             int win_block_stride_y = win_stride_y / block_stride_y;
535             int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
536             int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
537             dim3 threads(nthreads, 1);
538             dim3 grid(img_win_width, img_win_height);
539 
540             int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
541             extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
542                 img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
543             cudaSafeCall( cudaGetLastError() );
544 
545             cudaSafeCall( cudaDeviceSynchronize() );
546         }
547 
548         //----------------------------------------------------------------------------
549         // Gradients computation
550 
551 
552         template <int nthreads, int correct_gamma>
compute_gradients_8UC4_kernel(int height,int width,const PtrStepb img,float angle_scale,PtrStepf grad,PtrStepb qangle)553         __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrStepb img,
554                                                       float angle_scale, PtrStepf grad, PtrStepb qangle)
555         {
556             const int x = blockIdx.x * blockDim.x + threadIdx.x;
557 
558             const uchar4* row = (const uchar4*)img.ptr(blockIdx.y);
559 
560             __shared__ float sh_row[(nthreads + 2) * 3];
561 
562             uchar4 val;
563             if (x < width)
564                 val = row[x];
565             else
566                 val = row[width - 2];
567 
568             sh_row[threadIdx.x + 1] = val.x;
569             sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y;
570             sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z;
571 
572             if (threadIdx.x == 0)
573             {
574                 val = row[::max(x - 1, 1)];
575                 sh_row[0] = val.x;
576                 sh_row[(nthreads + 2)] = val.y;
577                 sh_row[2 * (nthreads + 2)] = val.z;
578             }
579 
580             if (threadIdx.x == blockDim.x - 1)
581             {
582                 val = row[::min(x + 1, width - 2)];
583                 sh_row[blockDim.x + 1] = val.x;
584                 sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;
585                 sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;
586             }
587 
588             __syncthreads();
589             if (x < width)
590             {
591                 float3 a, b;
592 
593                 b.x = sh_row[threadIdx.x + 2];
594                 b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)];
595                 b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)];
596                 a.x = sh_row[threadIdx.x];
597                 a.y = sh_row[threadIdx.x + (nthreads + 2)];
598                 a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)];
599 
600                 float3 dx;
601                 if (correct_gamma)
602                     dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
603                 else
604                     dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
605 
606                 float3 dy = make_float3(0.f, 0.f, 0.f);
607 
608                 if (blockIdx.y > 0 && blockIdx.y < height - 1)
609                 {
610                     val = ((const uchar4*)img.ptr(blockIdx.y - 1))[x];
611                     a = make_float3(val.x, val.y, val.z);
612 
613                     val = ((const uchar4*)img.ptr(blockIdx.y + 1))[x];
614                     b = make_float3(val.x, val.y, val.z);
615 
616                     if (correct_gamma)
617                         dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
618                     else
619                         dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
620                 }
621 
622                 float best_dx = dx.x;
623                 float best_dy = dy.x;
624 
625                 float mag0 = dx.x * dx.x + dy.x * dy.x;
626                 float mag1 = dx.y * dx.y + dy.y * dy.y;
627                 if (mag0 < mag1)
628                 {
629                     best_dx = dx.y;
630                     best_dy = dy.y;
631                     mag0 = mag1;
632                 }
633 
634                 mag1 = dx.z * dx.z + dy.z * dy.z;
635                 if (mag0 < mag1)
636                 {
637                     best_dx = dx.z;
638                     best_dy = dy.z;
639                     mag0 = mag1;
640                 }
641 
642                 mag0 = ::sqrtf(mag0);
643 
644                 float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
645                 int hidx = (int)::floorf(ang);
646                 ang -= hidx;
647                 hidx = (hidx + cnbins) % cnbins;
648 
649                 ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
650                 ((float2*)grad.ptr(blockIdx.y))[x] = make_float2(mag0 * (1.f - ang), mag0 * ang);
651             }
652         }
653 
654 
compute_gradients_8UC4(int nbins,int height,int width,const PtrStepSzb & img,float angle_scale,PtrStepSzf grad,PtrStepSzb qangle,bool correct_gamma)655         void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
656                                     float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
657         {
658             (void)nbins;
659             const int nthreads = 256;
660 
661             dim3 bdim(nthreads, 1);
662             dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
663 
664             if (correct_gamma)
665                 compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
666             else
667                 compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
668 
669             cudaSafeCall( cudaGetLastError() );
670 
671             cudaSafeCall( cudaDeviceSynchronize() );
672         }
673 
674         template <int nthreads, int correct_gamma>
compute_gradients_8UC1_kernel(int height,int width,const PtrStepb img,float angle_scale,PtrStepf grad,PtrStepb qangle)675         __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrStepb img,
676                                                       float angle_scale, PtrStepf grad, PtrStepb qangle)
677         {
678             const int x = blockIdx.x * blockDim.x + threadIdx.x;
679 
680             const unsigned char* row = (const unsigned char*)img.ptr(blockIdx.y);
681 
682             __shared__ float sh_row[nthreads + 2];
683 
684             if (x < width)
685                 sh_row[threadIdx.x + 1] = row[x];
686             else
687                 sh_row[threadIdx.x + 1] = row[width - 2];
688 
689             if (threadIdx.x == 0)
690                 sh_row[0] = row[::max(x - 1, 1)];
691 
692             if (threadIdx.x == blockDim.x - 1)
693                 sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];
694 
695             __syncthreads();
696             if (x < width)
697             {
698                 float dx;
699 
700                 if (correct_gamma)
701                     dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);
702                 else
703                     dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];
704 
705                 float dy = 0.f;
706                 if (blockIdx.y > 0 && blockIdx.y < height - 1)
707                 {
708                     float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];
709                     float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];
710                     if (correct_gamma)
711                         dy = ::sqrtf(a) - ::sqrtf(b);
712                     else
713                         dy = a - b;
714                 }
715                 float mag = ::sqrtf(dx * dx + dy * dy);
716 
717                 float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
718                 int hidx = (int)::floorf(ang);
719                 ang -= hidx;
720                 hidx = (hidx + cnbins) % cnbins;
721 
722                 ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
723                 ((float2*)  grad.ptr(blockIdx.y))[x] = make_float2(mag * (1.f - ang), mag * ang);
724             }
725         }
726 
727 
compute_gradients_8UC1(int nbins,int height,int width,const PtrStepSzb & img,float angle_scale,PtrStepSzf grad,PtrStepSzb qangle,bool correct_gamma)728         void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
729                                     float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
730         {
731             (void)nbins;
732             const int nthreads = 256;
733 
734             dim3 bdim(nthreads, 1);
735             dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));
736 
737             if (correct_gamma)
738                 compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
739             else
740                 compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
741 
742             cudaSafeCall( cudaGetLastError() );
743 
744             cudaSafeCall( cudaDeviceSynchronize() );
745         }
746 
747 
748 
749         //-------------------------------------------------------------------
750         // Resize
751 
752         texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
753         texture<uchar,  2, cudaReadModeNormalizedFloat> resize8UC1_tex;
754 
resize_for_hog_kernel(float sx,float sy,PtrStepSz<uchar> dst,int colOfs)755         __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
756         {
757             unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
758             unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
759 
760             if (x < dst.cols && y < dst.rows)
761                 dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
762         }
763 
resize_for_hog_kernel(float sx,float sy,PtrStepSz<uchar4> dst,int colOfs)764         __global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
765         {
766             unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
767             unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
768 
769             if (x < dst.cols && y < dst.rows)
770             {
771                 float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
772                 dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
773             }
774         }
775 
776         template<class T, class TEX>
resize_for_hog(const PtrStepSzb & src,PtrStepSzb dst,TEX & tex)777         static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
778         {
779             tex.filterMode = cudaFilterModeLinear;
780 
781             size_t texOfs = 0;
782             int colOfs = 0;
783 
784             cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
785             cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
786 
787             if (texOfs != 0)
788             {
789                 colOfs = static_cast<int>( texOfs/sizeof(T) );
790                 cudaSafeCall( cudaUnbindTexture(tex) );
791                 cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
792             }
793 
794             dim3 threads(32, 8);
795             dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
796 
797             float sx = static_cast<float>(src.cols) / dst.cols;
798             float sy = static_cast<float>(src.rows) / dst.rows;
799 
800             resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
801             cudaSafeCall( cudaGetLastError() );
802 
803             cudaSafeCall( cudaDeviceSynchronize() );
804 
805             cudaSafeCall( cudaUnbindTexture(tex) );
806         }
807 
resize_8UC1(const PtrStepSzb & src,PtrStepSzb dst)808         void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
resize_8UC4(const PtrStepSzb & src,PtrStepSzb dst)809         void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
810     } // namespace hog
811 }}} // namespace cv { namespace cuda { namespace cudev
812 
813 
814 #endif /* CUDA_DISABLER */
815