/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
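// CUDA kernels behind cv::cuda::matchTemplate: naive CCORR and SQDIFF matchers
// plus "prepared" variants that turn a cross-correlation map into SQDIFF,
// CCOEFF and the normalized scores using precomputed integral sums.
//
// These routines are not meant to be called directly; a typical user-level
// invocation is roughly the following host-side sketch (a sketch only,
// assuming the cudaimgproc module is available):
//
//     cv::Ptr<cv::cuda::TemplateMatching> alg =
//         cv::cuda::createTemplateMatching(CV_8UC1, cv::TM_CCORR);
//     alg->match(d_image, d_templ, d_result);  // d_* are cv::cuda::GpuMat (illustrative names)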
#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace match_template
    {
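        // Per-channel arithmetic helpers. Each matcher kernel below is written
        // once and instantiated for 1-4 channels: mul()/sub() operate channel-wise,
        // sum() folds the per-channel accumulator into the single float stored in
        // the result map, and first() picks channel 0.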
        __device__ __forceinline__ float sum(float v) { return v; }
        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }

        __device__ __forceinline__ float first(float v) { return v; }
        __device__ __forceinline__ float first(float2 v) { return v.x; }
        __device__ __forceinline__ float first(float3 v) { return v.x; }
        __device__ __forceinline__ float first(float4 v) { return v.x; }

        __device__ __forceinline__ float  mul(float a, float b) { return a * b; }
        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float  mul(uchar a, uchar b) { return a * b; }
        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float  sub(float a, float b) { return a - b; }
        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        __device__ __forceinline__ float  sub(uchar a, uchar b) { return a - b; }
        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        //////////////////////////////////////////////////////////////////////
        // Naive_CCORR
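        // Naive cross-correlation: one thread per result pixel computes
        //
        //     R(x, y) = sum_{i,j} I(y + i, x + j) * T(i, j)
        //
        // by scanning the whole w x h template, i.e. O(w * h) work per output.
        // The dispatchers below pick the instantiation for the channel count
        // (index 0 of the caller table is unused since cn is 1-based).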
        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Naive_SQDIFF
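        // Naive squared-difference matcher: same layout as Naive_CCORR, but
        // accumulating R(x, y) = sum_{i,j} (I(y + i, x + j) - T(i, j))^2.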
        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);
                Typef delta;

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                    {
                        delta = sub(image_ptr[x + j], templ_ptr[j]);
                        res = res + delta * delta;
                    }
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        template <typename T, int cn>
        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
            };

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF
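        // "Prepared" SQDIFF: instead of re-scanning the template, expand
        //
        //     sum (I - T)^2 = sum I^2 - 2 * sum I*T + sum T^2
        //
        // so the result map, which already holds the cross-correlation sum I*T,
        // only needs the window sum of I^2 (from an integral image of squares)
        // and the scalar sum T^2. The window sum over a w x h rectangle at (x, y)
        // is the usual four-corner combination
        // ii(y+h, x+w) - ii(y, x+w) - ii(y+h, x) + ii(y, x), written below as two
        // column differences. The column indices are scaled by cn because the
        // squared-sum integral is evidently built over the image flattened to a
        // single channel of width cols * cn, so the difference covers every
        // channel of the window.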
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<double> image_sqsum, double templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, int cn,
                                             cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF_NORMED
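        // Normalized SQDIFF divides the prepared SQDIFF value by
        // sqrt(sum I^2 * sum T^2). normAcc / normAcc_SQDIFF guard that division:
        // when the numerator only slightly exceeds the denominator (within a
        // 12.5% tolerance) the score is clamped to +/-1, and beyond that a
        // degenerate denominator yields the method's neutral value, 0 for
        // correlation-style scores and 1 for SQDIFF.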
        // normAcc* are accurate normalization routines which make CUDA matchTemplate
        // consistent with the CPU implementation

        __device__ float normAcc(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 0;
        }

        __device__ float normAcc_SQDIFF(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 1;
        }


        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
                int w, int h, const PtrStep<double> image_sqsum,
                double templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
                                                  sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        template <int cn>
        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
                                                    PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
                                                    PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
            };

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF
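        // "Prepared" CCOEFF (spelled CCOFF in these names): the correlation
        // coefficient subtracts the template and window means, which after
        // expansion reduces per channel to
        //
        //     R(x, y) = ccorr(x, y) - sum_window(I) * mean(T)
        //
        // since the remaining cross terms cancel. templ_sum_scale passed to the
        // kernels is mean(T) = sum(T) / (w * h); window sums of I come from
        // per-channel integral images via the same four-corner trick as above.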
        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<int> image_sum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
            }
        }

        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }


        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
                const PtrStep<int> image_sum_r,
                const PtrStep<int> image_sum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC2(
                int w, int h,
                const PtrStepSz<int> image_sum_r,
                const PtrStepSz<int> image_sum_g,
                int templ_sum_r, int templ_sum_g,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
                    image_sum_r, image_sum_g, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }


        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                const PtrStep<int> image_sum_r,
                const PtrStep<int> image_sum_g,
                const PtrStep<int> image_sum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
                const PtrStepSz<int> image_sum_r,
                const PtrStepSz<int> image_sum_g,
                const PtrStepSz<int> image_sum_b,
                int templ_sum_r,
                int templ_sum_g,
                int templ_sum_b,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }


        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                float templ_sum_scale_a,
                const PtrStep<int> image_sum_r,
                const PtrStep<int> image_sum_g,
                const PtrStep<int> image_sum_b,
                const PtrStep<int> image_sum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b
                                         - image_sum_a_ * templ_sum_scale_a;
            }
        }

        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
                const PtrStepSz<int> image_sum_r,
                const PtrStepSz<int> image_sum_g,
                const PtrStepSz<int> image_sum_b,
                const PtrStepSz<int> image_sum_a,
                int templ_sum_r,
                int templ_sum_g,
                int templ_sum_b,
                int templ_sum_a,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    (float)templ_sum_a / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF_NORMED
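        // Normalized CCOEFF: the numerator is the prepared CCOEFF value and the
        // denominator is sqrt of the product of the template-side and window-side
        // variance terms, each of the form sum(X^2) - (sum X)^2 / (w * h),
        // accumulated over all channels. The launchers fold the template-side
        // terms into templ_sum_scale_* and templ_sqsum_scale on the host;
        // weight = 1 / (w * h).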
        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
                int w, int h, float weight,
                float templ_sum_scale, float templ_sqsum_scale,
                const PtrStep<int> image_sum,
                const PtrStep<double> image_sqsum,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float ccorr = result.ptr(y)[x];
                float image_sum_ = (float)(
                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                        (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8U(
                int w, int h, const PtrStepSz<int> image_sum,
                const PtrStepSz<double> image_sqsum,
                int templ_sum, double templ_sqsum,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale = templ_sum * weight;
            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;

            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
                    image_sum, image_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }


        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g,
                float templ_sqsum_scale,
                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                int w, int h,
                const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
                const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
                int templ_sum_r, double templ_sqsum_r,
                int templ_sum_g, double templ_sqsum_g,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sqsum_scale,
                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
                const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                int w, int h,
                const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
                const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
                const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
                int templ_sum_r, double templ_sqsum_r,
                int templ_sum_g, double templ_sqsum_g,
                int templ_sum_b, double templ_sqsum_b,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sum_scale_a, float templ_sqsum_scale,
                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
                const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
                const PtrStep<int> image_sum_a, const PtrStep<double> image_sqsum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = (float)(
                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
                float image_sqsum_r_ = (float)(
                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
                float image_sum_g_ = (float)(
                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
                float image_sqsum_g_ = (float)(
                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
                float image_sum_b_ = (float)(
                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
                float image_sqsum_b_ = (float)(
                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
                float image_sum_a_ = (float)(
                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
                float image_sqsum_a_ = (float)(
                        (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
                        (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
                                                         + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                int w, int h,
                const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
                const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
                const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
                const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
                int templ_sum_r, double templ_sqsum_r,
                int templ_sum_g, double templ_sqsum_g,
                int templ_sum_b, double templ_sqsum_b,
                int templ_sum_a, double templ_sqsum_a,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sum_scale_a = templ_sum_a * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
                                      + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    image_sum_a, image_sqsum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // normalize
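        // Normalization post-pass: divide each cross-correlation value by
        // sqrt(sum_window(I^2) * sum(T^2)), i.e. the TM_CCORR_NORMED denominator,
        // reading the window sum from the integral image of squares and clamping
        // through normAcc().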
        template <int cn>
        __global__ void normalizeKernel_8U(
                int w, int h, const PtrStep<double> image_sqsum,
                double templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = (float)(
                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
                          double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 2:
                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 3:
                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 4:
                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // extractFirstChannel
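        // Copies channel 0 of a cn-channel 32F buffer into the single-channel
        // result map (first() picks the .x component of the packed vector type).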
        template <int cn>
        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
        {
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef val = ((const Typef*)image.ptr(y))[x];
                result.ptr(y)[x] = first(val);
            }
        }

        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 2:
                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 3:
                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 4:
                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
                break;
            }
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    } // namespace match_template
}}} // namespace cv { namespace cuda { namespace device


#endif /* CUDA_DISABLER */