/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace bf_knnmatch
    {
        ///////////////////////////////////////////////////////////////////////////////
        // Reduction

        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                                      int& bestTrainIdx1, int& bestTrainIdx2,
                                      float* s_distance, int* s_trainIdx)
        {
        #if __CUDA_ARCH__ >= 300
            (void) s_distance;
            (void) s_trainIdx;

            float d1, d2;
            int i1, i2;

            #pragma unroll
            for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
            {
                d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
                d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
                i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
                i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);

                if (bestDistance1 < d1)
                {
                    if (d1 < bestDistance2)
                    {
                        bestDistance2 = d1;
                        bestTrainIdx2 = i1;
                    }
                }
                else
                {
                    bestDistance2 = bestDistance1;
                    bestTrainIdx2 = bestTrainIdx1;

                    bestDistance1 = d1;
                    bestTrainIdx1 = i1;

                    if (d2 < bestDistance2)
                    {
                        bestDistance2 = d2;
                        bestTrainIdx2 = i2;
                    }
                }
            }
        #else
            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;

            s_distance[threadIdx.x] = bestDistance1;
            s_trainIdx[threadIdx.x] = bestTrainIdx1;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance1)
                    {
                        myBestDistance2 = myBestDistance1;
                        myBestTrainIdx2 = myBestTrainIdx1;

                        myBestDistance1 = val;
                        myBestTrainIdx1 = s_trainIdx[i];
                    }
                    else if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                    }
                }
            }

            __syncthreads();

            s_distance[threadIdx.x] = bestDistance2;
            s_trainIdx[threadIdx.x] = bestTrainIdx2;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                    }
                }
            }

            bestDistance1 = myBestDistance1;
            bestDistance2 = myBestDistance2;

            bestTrainIdx1 = myBestTrainIdx1;
            bestTrainIdx2 = myBestTrainIdx2;
        #endif
        }
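
        // Note on the reduction above: each thread enters with its private best and
        // second-best (distance, trainIdx) candidates for one query row. On sm_30+
        // the candidate pairs are merged across the BLOCK_SIZE lanes of the row with
        // warp shuffles; on older architectures the same merge is staged through
        // shared memory, with lane 0 scanning the row twice (once over the best
        // candidates, once over the second-best ones). Either way, after the call
        // lane 0 of each row holds the row's best-2 result, which is why the callers
        // only write output from threadIdx.x == 0.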

        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
                                      int& bestTrainIdx1, int& bestTrainIdx2,
                                      int& bestImgIdx1, int& bestImgIdx2,
                                      float* s_distance, int* s_trainIdx, int* s_imgIdx)
        {
        #if __CUDA_ARCH__ >= 300
            (void) s_distance;
            (void) s_trainIdx;
            (void) s_imgIdx;

            float d1, d2;
            int i1, i2;
            int j1, j2;

            #pragma unroll
            for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
            {
                d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
                d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
                i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
                i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
                j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
                j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);

                if (bestDistance1 < d1)
                {
                    if (d1 < bestDistance2)
                    {
                        bestDistance2 = d1;
                        bestTrainIdx2 = i1;
                        bestImgIdx2 = j1;
                    }
                }
                else
                {
                    bestDistance2 = bestDistance1;
                    bestTrainIdx2 = bestTrainIdx1;
                    bestImgIdx2 = bestImgIdx1;

                    bestDistance1 = d1;
                    bestTrainIdx1 = i1;
                    bestImgIdx1 = j1;

                    if (d2 < bestDistance2)
                    {
                        bestDistance2 = d2;
                        bestTrainIdx2 = i2;
                        bestImgIdx2 = j2;
                    }
                }
            }
        #else
            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;
            s_imgIdx += threadIdx.y * BLOCK_SIZE;

            s_distance[threadIdx.x] = bestDistance1;
            s_trainIdx[threadIdx.x] = bestTrainIdx1;
            s_imgIdx[threadIdx.x] = bestImgIdx1;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance1)
                    {
                        myBestDistance2 = myBestDistance1;
                        myBestTrainIdx2 = myBestTrainIdx1;
                        myBestImgIdx2 = myBestImgIdx1;

                        myBestDistance1 = val;
                        myBestTrainIdx1 = s_trainIdx[i];
                        myBestImgIdx1 = s_imgIdx[i];
                    }
                    else if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                        myBestImgIdx2 = s_imgIdx[i];
                    }
                }
            }

            __syncthreads();

            s_distance[threadIdx.x] = bestDistance2;
            s_trainIdx[threadIdx.x] = bestTrainIdx2;
            s_imgIdx[threadIdx.x] = bestImgIdx2;

            __syncthreads();

            if (threadIdx.x == 0)
            {
                #pragma unroll
                for (int i = 0; i < BLOCK_SIZE; ++i)
                {
                    float val = s_distance[i];

                    if (val < myBestDistance2)
                    {
                        myBestDistance2 = val;
                        myBestTrainIdx2 = s_trainIdx[i];
                        myBestImgIdx2 = s_imgIdx[i];
                    }
                }
            }

            bestDistance1 = myBestDistance1;
            bestDistance2 = myBestDistance2;

            bestTrainIdx1 = myBestTrainIdx1;
            bestTrainIdx2 = myBestTrainIdx2;

            bestImgIdx1 = myBestImgIdx1;
            bestImgIdx2 = myBestImgIdx2;
        #endif
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled Cached
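
        // In the "cached" kernels below, each of the BLOCK_SIZE query rows handled
        // by a block is staged into shared memory once (up to MAX_DESC_LEN elements
        // per row) and reused for every train tile, so only the train descriptors
        // are re-read from global memory inside the loop.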

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
        {
            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;
                s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance1, float& bestDistance2,
                                           int& bestTrainIdx1, int& bestTrainIdx2,
                                           int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < train.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }
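
        // Each iteration of the outer loop above processes one BLOCK_SIZE x
        // BLOCK_SIZE tile of the (query, train) distance matrix: a tile of train
        // descriptors is loaded transposed into s_train, every thread accumulates
        // one partial distance via Dist::reduceIter, and the running best-2 for the
        // thread's query row is updated with the finished distance. Out-of-range
        // rows are clamped on load and then filtered out by the
        // (queryIdx, trainIdx) bounds check before the best-2 update.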

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            // The image-index slots are unused in the single-train case, so the
            // train-index variables are passed again as dummies.
            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
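
        // The dynamic shared-memory budget above covers the cached query rows
        // (BLOCK_SIZE * max(MAX_DESC_LEN, BLOCK_SIZE) ints) plus one train tile
        // (BLOCK_SIZE * BLOCK_SIZE ints); the max() term guarantees the same buffer
        // is large enough when it is re-partitioned for the final reduction. A
        // minimal host-side sketch (hypothetical call, assuming float descriptors of
        // at most 64 columns and no mask):
        //
        //     matchUnrolledCached<16, 64, L2Dist>(query, train, WithOutMask(),
        //                                         trainIdx, distance, stream);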

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                                 const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled
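
        // The "Match Unrolled" kernels below drop the cached query rows: both the
        // query tile and the train tile are reloaded from global memory on every
        // iteration. This keeps the shared-memory footprint at a flat
        // 2 * BLOCK_SIZE * BLOCK_SIZE elements, independent of MAX_DESC_LEN.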

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance1, float& bestDistance2,
                                     int& bestTrainIdx1, int& bestTrainIdx2,
                                     int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
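
        // For descriptor collections, the multi-image kernels below run the same
        // tile loop once per train matrix. Mask::next() advances a mask collection
        // to the current image's mask (it is a no-op for the mask-less case), and
        // the winning imgIdx is carried along with each best-2 candidate.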

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                           const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match
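
        // The plain "Match" kernels are the fallback for descriptor lengths the
        // unrolled variants do not cover: the inner loop over column tiles uses a
        // runtime bound, (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE, instead of the
        // compile-time MAX_DESC_LEN / BLOCK_SIZE, so it cannot be fully unrolled.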

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance1, float& bestDistance2,
                             int& bestTrainIdx1, int& bestTrainIdx2,
                             int& bestImgIdx1, int& bestImgIdx2)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
                {
                    if (distVal < bestDistance1)
                    {
                        bestImgIdx2 = bestImgIdx1;
                        bestDistance2 = bestDistance1;
                        bestTrainIdx2 = bestTrainIdx1;

                        bestImgIdx1 = imgIdx;
                        bestDistance1 = distVal;
                        bestTrainIdx1 = trainIdx;
                    }
                    else if (distVal < bestDistance2)
                    {
                        bestImgIdx2 = imgIdx;
                        bestDistance2 = distVal;
                        bestTrainIdx2 = trainIdx;
                    }
                }
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;

            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            float myBestDistance1 = numeric_limits<float>::max();
            float myBestDistance2 = numeric_limits<float>::max();
            int myBestTrainIdx1 = -1;
            int myBestTrainIdx2 = -1;
            int myBestImgIdx1 = -1;
            int myBestImgIdx2 = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
                bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
                bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                   const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // knnMatch 2 dispatcher

        template <typename Dist, typename T, typename Mask>
        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                              const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
                              cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }*/
            else
            {
                match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
        }
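
        // The k == 2 dispatchers pick a kernel by descriptor length: up to 64 or
        // 128 columns go to the cached, fully unrolled kernels with a 16x16 block,
        // while longer descriptors fall through to the generic match kernel. The
        // 256/512/1024 specializations are left commented out, presumably to keep
        // compile time and binary size down.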

        template <typename Dist, typename T, typename Mask>
        void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                              const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                              cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }*/
            else
            {
                match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
            }
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Calc distance kernel
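
        // For k > 2 the matcher switches strategy: the kernels below first
        // materialize the full query x train distance matrix in allDist, and the
        // selection kernels further down then extract the k smallest entries per
        // row.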

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Dist dist;

            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                if (loadX < query.cols)
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
                }
                else
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            if (queryIdx < query.rows && trainIdx < train.rows)
            {
                float distVal = numeric_limits<float>::max();

                if (mask(queryIdx, trainIdx))
                    distVal = (typename Dist::result_type)dist;

                allDist.ptr(queryIdx)[trainIdx] = distVal;
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Dist dist;

            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                if (loadX < query.cols)
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
                }
                else
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            if (queryIdx < query.rows && trainIdx < train.rows)
            {
                float distVal = numeric_limits<float>::max();

                if (mask(queryIdx, trainIdx))
                    distVal = (typename Dist::result_type)dist;

                allDist.ptr(queryIdx)[trainIdx] = distVal;
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
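
        // Masked-out (queryIdx, trainIdx) pairs are written as
        // numeric_limits<float>::max() rather than skipped, so the selection passes
        // below can treat the matrix as dense: such an entry can never win an
        // arg-min, and the findBestMatch kernel skips the output write entirely
        // once only max() entries remain in a row.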

        ///////////////////////////////////////////////////////////////////////////////
        // Calc Distance dispatcher

        template <typename Dist, typename T, typename Mask>
        void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                    const PtrStepSzf& allDist,
                                    cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 128)
            {
                calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
            }
            /*else if (query.cols <= 256)
            {
                calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 512)
            {
                calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
            }
            else if (query.cols <= 1024)
            {
                calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
            }*/
            else
            {
                calcDistance<16, Dist>(query, train, mask, allDist, stream);
            }
        }

        ///////////////////////////////////////////////////////////////////////////////
        // find knn match kernel

        template <int BLOCK_SIZE>
        __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
        {
            const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
            __shared__ float s_dist[SMEM_SIZE];
            __shared__ int s_trainIdx[SMEM_SIZE];

            const int queryIdx = blockIdx.x;

            float* allDistRow = allDist.ptr(queryIdx);

            float dist = numeric_limits<float>::max();
            int bestIdx = -1;

            // Note: the loop variable below shadows the kernel parameter i (the
            // pass index); after the loop, i again refers to the output column.
            for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
            {
                float reg = allDistRow[i];
                if (reg < dist)
                {
                    dist = reg;
                    bestIdx = i;
                }
            }

            s_dist[threadIdx.x] = dist;
            s_trainIdx[threadIdx.x] = bestIdx;
            __syncthreads();

            reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());

            if (threadIdx.x == 0)
            {
                if (dist < numeric_limits<float>::max())
                {
                    allDistRow[bestIdx] = numeric_limits<float>::max();
                    trainIdx.ptr(queryIdx)[i] = bestIdx;
                    distance.ptr(queryIdx)[i] = dist;
                }
            }
        }
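
        // findKnnMatch launches the kernel above k times: pass i finds the current
        // row minimum, records it as the i-th neighbor, and overwrites the winning
        // entry in allDist with numeric_limits<float>::max() so the next pass
        // selects the next-smallest distance. The k results therefore come out
        // already sorted by distance.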

        template <int BLOCK_SIZE>
        void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, 1, 1);
            const dim3 grid(trainIdx.rows, 1, 1);

            for (int i = 0; i < k; ++i)
            {
                findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
                cudaSafeCall( cudaGetLastError() );
            }

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
        {
            findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
        }

        ///////////////////////////////////////////////////////////////////////////////
        // knn match Dispatcher

        template <typename Dist, typename T, typename Mask>
        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
                             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                             cudaStream_t stream)
        {
            if (k == 2)
            {
                match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
            }
            else
            {
                calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
                findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
            }
        }

        ///////////////////////////////////////////////////////////////////////////////
        // knn match caller

        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                               const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                               cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
            else
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
        }

        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
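
        // matchL1_gpu above and the matchL2_gpu / matchHamming_gpu callers that
        // follow are the module-facing entry points: each fixes a distance functor
        // (L1Dist<T>, L2Dist, HammingDist) and wraps the optional mask, while the
        // explicit instantiations pin down which element types are actually
        // compiled; the commented-out instantiations (e.g. schar) are deliberately
        // omitted.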

        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                               const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                               cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
            else
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
        }

        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);

        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
                                                    const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
                                                    cudaStream_t stream)
        {
            if (mask.data)
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
            else
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
        }

        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
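
        // The match2*_gpu callers below are the multi-image (descriptor collection)
        // counterparts: trains holds one descriptor matrix per image, and masks,
        // when present, is wrapped in a MaskCollection that the kernels advance
        // per image with Mask::next().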

        template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                                cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
            else
                match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
        }

        template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

        template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                                cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
            else
                match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
        }

        //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);

        template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                     const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
                                                     cudaStream_t stream)
        {
            if (masks.data)
                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
            else
                match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
        }

        template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
        template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
    } // namespace bf_knnmatch
}}} // namespace cv { namespace cuda { namespace device {


#endif /* CUDA_DISABLER */