/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace match_template
    {
        __device__ __forceinline__ float sum(float v) { return v; }
        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }

        __device__ __forceinline__ float first(float v) { return v; }
        __device__ __forceinline__ float first(float2 v) { return v.x; }
        __device__ __forceinline__ float first(float3 v) { return v.x; }
        __device__ __forceinline__ float first(float4 v) { return v.x; }

        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        //////////////////////////////////////////////////////////////////////
        // Naive_CCORR
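        //
        // For every result pixel (x, y) the kernel walks the whole w x h
        // template and accumulates the plain cross-correlation
        //
        //   R(x, y) = sum over (j, i) of I(x + j, y + i) * T(j, i)
        //
        // per channel, then reduces across channels with sum().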

        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }


        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Naive_SQDIFF
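        //
        // Same exhaustive walk as Naive_CCORR, but accumulating the sum of
        // squared differences
        //
        //   R(x, y) = sum over (j, i) of (I(x + j, y + i) - T(j, i))^2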

        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);
                Typef delta;

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                    {
                        delta = sub(image_ptr[x + j], templ_ptr[j]);
                        res = res + delta * delta;
                    }
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF
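        //
        // Uses the expansion
        //
        //   sum (I - T)^2 = sum I^2 - 2 * sum I*T + sum T^2
        //
        // so only precomputed data is needed per pixel: the window sum of I^2
        // comes from the squared integral image via four corner lookups,
        // sum I*T is the cross-correlation already stored in `result`, and
        // sum T^2 is the scalar templ_sqsum.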

        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<double> image_sqsum, double templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, int cn,
                                             cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF_NORMED

        // normAcc* are accurate normalization routines which make the CUDA
        // matchTemplate results consistent with the CPU implementation
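        //
        // Floating-point rounding can push |num| slightly past denum even when
        // the exact ratio is 1; ratios within 12.5% of 1 (the 1.125f factor)
        // are clamped to +/-1, while anything larger is treated as a
        // degenerate denominator.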

        __device__ float normAcc(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 0;
        }


        __device__ float normAcc_SQDIFF(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 1;
        }


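        // Normalized variant of Prepared_SQDIFF:
        //
        //   R(x, y) = sum (I - T)^2 / sqrt(sum I^2 * sum T^2)
        //
        // with the numerator assembled exactly as above and clamped through
        // normAcc_SQDIFF.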
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
                int w, int h, const PtrStep<double> image_sqsum,
                double templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
                                                  sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
                                                    PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }


        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
                                                    PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF
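        //
        // CCOEFF is cross-correlation of mean-shifted signals; expanding
        //
        //   sum (I - mean(I)) * (T - mean(T)) = sum I*T - sum(I) * mean(T)
        //
        // lets each kernel correct the CCORR value already in `result` by the
        // window sum of the image (four integral-image lookups per channel)
        // times the template mean, passed in as templ_sum_scale = templ_sum / (w * h).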

        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<int> image_sum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
            }
        }

        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
                const PtrStep<int> image_sum_r,
                const PtrStep<int> image_sum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC2(
                int w, int h,
                const PtrStepSz<int> image_sum_r,
                const PtrStepSz<int> image_sum_g,
                int templ_sum_r, int templ_sum_g,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
                    image_sum_r, image_sum_g, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                const PtrStep<int> image_sum_r,
                const PtrStep<int> image_sum_g,
                const PtrStep<int> image_sum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
                const PtrStepSz<int> image_sum_r,
                const PtrStepSz<int> image_sum_g,
                const PtrStepSz<int> image_sum_b,
                int templ_sum_r,
                int templ_sum_g,
                int templ_sum_b,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                float templ_sum_scale_a,
                const PtrStep<int> image_sum_r,
                const PtrStep<int> image_sum_g,
                const PtrStep<int> image_sum_b,
                const PtrStep<int> image_sum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b
                                         - image_sum_a_ * templ_sum_scale_a;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
                const PtrStepSz<int> image_sum_r,
                const PtrStepSz<int> image_sum_g,
                const PtrStepSz<int> image_sum_b,
                const PtrStepSz<int> image_sum_a,
                int templ_sum_r,
                int templ_sum_g,
                int templ_sum_b,
                int templ_sum_a,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    (float)templ_sum_a / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF_NORMED
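        //
        // Divides the mean-shifted correlation by the product of the
        // unnormalized window variances:
        //
        //   denum = sqrt((sum T^2 - (sum T)^2 / (w * h)) *
        //                (sum I^2 - (sum I)^2 / (w * h)))
        //
        // The template factor is folded into templ_sqsum_scale on the host;
        // the image factor is rebuilt per window from the sum and square-sum
        // integral images, with weight = 1 / (w * h).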

        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
                int w, int h, float weight,
                float templ_sum_scale, float templ_sqsum_scale,
                const PtrStep<int> image_sum,
                const PtrStep<double> image_sqsum,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float ccorr = result.ptr(y)[x];
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                        (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8U(
                    int w, int h, const PtrStepSz<int> image_sum,
                    const PtrStepSz<double> image_sqsum,
                    int templ_sum, double templ_sqsum,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale = templ_sum * weight;
            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;

            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
                    image_sum, image_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g,
                float templ_sqsum_scale,
                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                    int w, int h,
                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
                    int templ_sum_r, double templ_sqsum_r,
                    int templ_sum_g, double templ_sqsum_g,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                       + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sqsum_scale,
                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
                const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                    int w, int h,
                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
                    const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
                    int templ_sum_r, double templ_sqsum_r,
                    int templ_sum_g, double templ_sqsum_g,
                    int templ_sum_b, double templ_sqsum_b,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }



        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sum_scale_a, float templ_sqsum_scale,
                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
                const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
                const PtrStep<int> image_sum_a, const PtrStep<double> image_sqsum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float image_sqsum_a_ = (float)(
                        (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
                        (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
                                                         + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                    int w, int h,
                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
                    const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
                    const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
                    int templ_sum_r, double templ_sqsum_r,
                    int templ_sum_g, double templ_sqsum_g,
                    int templ_sum_b, double templ_sqsum_b,
                    int templ_sum_a, double templ_sqsum_a,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sum_scale_a = templ_sum_a * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
                                      + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    image_sum_a, image_sqsum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // normalize
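        //
        // Divides a precomputed CCORR response by the CCORR_NORMED denominator
        // sqrt(window sum of I^2 * sum of T^2), clamping through normAcc.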

        template <int cn>
        __global__ void normalizeKernel_8U(
                int w, int h, const PtrStep<double> image_sqsum,
                double templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
                          double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 2:
                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 3:
                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 4:
                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // extractFirstChannel
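        //
        // Copies channel 0 of a multi-channel 32F image into a single-channel
        // float result, discarding the remaining channels.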

        template <int cn>
        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
        {
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef val = ((const Typef*)image.ptr(y))[x];
                result.ptr(y)[x] = first(val);
            }
        }

        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 2:
                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 3:
                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 4:
                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
                break;
            }
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    } //namespace match_template
}}} // namespace cv { namespace cuda { namespace device


#endif /* CUDA_DISABLER */