/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
//
// The algorithm and code are explained in the upcoming GPU Computing Gems
// chapter in detail:
//
//   Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
//   PDF URL placeholder
//   email: aobukhov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
//
////////////////////////////////////////////////////////////////////////////////

#include <algorithm>
#include <cstdio>

#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_OBJDETECT
# include "opencv2/objdetect.hpp"
# include "opencv2/objdetect/objdetect_c.h"
#endif

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"

#include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp"


//==============================================================================
//
// BlockScan file
//
//==============================================================================


NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive


//Almost the same as the naive scan1Inclusive, but doesn't need __syncthreads(),
//assuming size <= WARP_SIZE and size is a power of 2
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
#if __CUDA_ARCH__ >= 300
    const unsigned int laneId = cv::cuda::device::Warp::laneId();

    // scan using shuffle functions
    #pragma unroll
    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
    {
        const Ncv32u n = cv::cuda::device::shfl_up(idata, i);
        if (laneId >= i)
            idata += n;
    }

    return idata;
#else
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;
    pos += K_WARP_SIZE;
    s_Data[pos] = idata;

    s_Data[pos] += s_Data[pos - 1];
    s_Data[pos] += s_Data[pos - 2];
    s_Data[pos] += s_Data[pos - 4];
    s_Data[pos] += s_Data[pos - 8];
    s_Data[pos] += s_Data[pos - 16];

    return s_Data[pos];
#endif
}
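
//A minimal host-side reference of the inclusive scan computed above, handy
//when validating the warp primitive. This helper is illustrative only and is
//not called anywhere in this file; the name hostWarpScanInclusiveRef is ours.
//For the all-ones input {1, 1, 1, ...} it produces {1, 2, 3, ...}.
static void hostWarpScanInclusiveRef(const Ncv32u *in, Ncv32u *out, Ncv32u n)
{
    Ncv32u runningSum = 0;
    for (Ncv32u i = 0; i < n; i++)
    {
        runningSum += in[i]; //out[i] = in[0] + in[1] + ... + in[i]
        out[i] = runningSum;
    }
}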

__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
    return warpScanInclusive(idata, s_Data) - idata;
}

template <Ncv32u tiNumScanThreads>
__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
    if (tiNumScanThreads > K_WARP_SIZE)
    {
        //Bottom-level inclusive warp scan
        Ncv32u warpResult = warpScanInclusive(idata, s_Data);

        //Save top elements of each warp for exclusive warp scan
        //sync to wait for warp scans to complete (because s_Data is being overwritten)
        __syncthreads();
        if ((threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1))
        {
            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
        }

        //wait for warp scans to complete
        __syncthreads();

        if (threadIdx.x < (tiNumScanThreads / K_WARP_SIZE))
        {
            //grab top warp elements
            Ncv32u val = s_Data[threadIdx.x];
            //calculate exclusive scan and write back to shared memory
            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
        }

        //return updated warp scans with exclusive scan results
        __syncthreads();
        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
    }
    else
    {
        return warpScanInclusive(idata, s_Data);
    }
}
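
//Worked example of the two-level composition above, assuming
//tiNumScanThreads = 128 (four warps) and every thread contributing 1:
//each warp scan yields 1..32 within its warp; the saved top elements
//{32, 32, 32, 32} scan exclusively to {0, 32, 64, 96}; adding each warp's
//base back to its warpResult gives the block-wide inclusive scan 1..128.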


//==============================================================================
//
// HaarClassifierCascade file
//
//==============================================================================


const Ncv32u MAX_GRID_DIM = 65535;


const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;


#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2 6
#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)


/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;


/** \internal
* Haar classifiers flattened trees container.
* Two parts: the first contains root nodes, the second - nodes referred to by root nodes.
* Drawback: breaks tree locality (might cause more cache misses).
* Advantage: no need to introduce an additional 32-bit field to index root node offsets.
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;


texture<Ncv32u, 1, cudaReadModeElementType> texIImage;


__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
    return d_Stages[iStage];
}


template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
    HaarClassifierNode128 tmpNode;
    if (tbCacheTextureCascade)
    {
        tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
    }
    else
    {
        tmpNode = d_ClassifierNodes[iNode];
    }
    return tmpNode;
}


template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
                           Ncv32f *weight,
                           Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{
    HaarFeature64 feature;
    if (tbCacheTextureCascade)
    {
        feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
    }
    else
    {
        feature = d_Features[iFeature];
    }
    feature.getRect(rectX, rectY, rectWidth, rectHeight);
    *weight = feature.getWeight();
}


template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
{
    if (tbCacheTextureIImg)
    {
        return tex1Dfetch(texIImage, x);
    }
    else
    {
        return d_IImg[x];
    }
}


__device__ Ncv32u d_outMaskPosition;


__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;

    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
    __syncthreads();

    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
    {
        numPassed = incScan;
        outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
    }

    if (threadPassFlag)
    {
        Ncv32u excScan = incScan - threadPassFlag;
        shmem[excScan] = threadElem;
    }

    __syncthreads();

    if (threadIdx.x < numPassed)
    {
        vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];
    }
#endif
}
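
//Example of the compaction above: for threadPassFlag = {1, 0, 1, 1, 0, ...}
//the inclusive scan is {1, 1, 2, 3, 3, ...}, so the passing threads write
//their elements to shmem[incScan - 1] = shmem[0], shmem[1], shmem[2], ...,
//packing the survivors densely; the last thread's incScan equals the number
//of survivors, and the atomicAdd on d_outMaskPosition reserves a contiguous
//range for them in vectorOut.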


template <NcvBool tbInitMaskPositively,
          NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbReadPixelIndexFromVector,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u y_offs;
    Ncv32u x_offs;
    Ncv32u maskOffset;
    Ncv32u outMaskVal;

    NcvBool bInactiveThread = false;

    if (tbReadPixelIndexFromVector)
    {
        maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (maskOffset >= mask1Dlen)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            outMaskVal = d_inMask[maskOffset];
            y_offs = outMaskVal >> 16;
            x_offs = outMaskVal & 0xFFFF;
        }
    }
    else
    {
        y_offs = blockIdx.y;
        x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (x_offs >= mask2Dstride)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            maskOffset = y_offs * mask2Dstride + x_offs;

            if ((x_offs >= anchorsRoi.width) ||
                (!tbInitMaskPositively &&
                 d_inMask != d_outMask &&
                 d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))
            {
                if (tbDoAtomicCompaction)
                {
                    bInactiveThread = true;
                }
                else
                {
                    d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;
                    return;
                }
            }

            outMaskVal = (y_offs << 16) | x_offs;
        }
    }

    NcvBool bPass = true;

    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
    {
        Ncv32f pixelStdDev = 0.0f;

        if (!bInactiveThread)
            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];

        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
        {
            Ncv32f curStageSum = 0.0f;

            HaarStage64 curStage = getStage(iStage, d_Stages);
            Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();
            Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();
            Ncv32f stageThreshold = curStage.getStageThreshold();

            while (numRootNodesInStage--)
            {
                NcvBool bMoreNodesToTraverse = true;
                Ncv32u iNode = curRootNodeOffset;

                if (bPass && !bInactiveThread)
                {
                    while (bMoreNodesToTraverse)
                    {
                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                        HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                        Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                        Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                        Ncv32f curNodeVal = 0.0f;

                        for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                        {
                            Ncv32f rectWeight;
                            Ncv32u rectX, rectY, rectWidth, rectHeight;
                            getFeature<tbCacheTextureCascade>
                                (iFeature + iRect, d_Features,
                                 &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                            Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                            Ncv32u iioffsTR = iioffsTL + rectWidth;
                            Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                            Ncv32u iioffsBR = iioffsBL + rectWidth;

                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                            curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                            curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                        }

                        HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                        HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                        Ncv32f nodeThreshold = curNode.getThreshold();

                        HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                        NcvBool nextNodeIsLeaf;

                        if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                        {
                            nextNodeDescriptor = nodeLeft;
                            nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                        }
                        else
                        {
                            nextNodeDescriptor = nodeRight;
                            nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                        }

                        if (nextNodeIsLeaf)
                        {
                            Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                            curStageSum += tmpLeafValue;
                            bMoreNodesToTraverse = false;
                        }
                        else
                        {
                            iNode = nextNodeDescriptor.getNextNodeOffset();
                        }
                    }
                }

                __syncthreads();
                curRootNodeOffset++;
            }

            if (curStageSum < stageThreshold)
            {
                bPass = false;
                outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    __syncthreads();

    if (!tbDoAtomicCompaction)
    {
        if (!tbReadPixelIndexFromVector ||
            (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))
        {
            d_outMask[maskOffset] = outMaskVal;
        }
    }
    else
    {
        compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,
                                           outMaskVal,
                                           d_outMask);
    }
}
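
//A minimal sketch of the rectangle sum used by the kernels in this file
//(hypothetical helper, not called anywhere): with an integral image where
//ii[y * stride + x] holds the sum of all pixels above and to the left of
//(x, y), any rectangle sum costs four fetches, matching the TL/TR/BL/BR
//getElemIImg reads above.
static Ncv32u hostRectSumRef(const Ncv32u *ii, Ncv32u stride,
                             Ncv32u x, Ncv32u y, Ncv32u w, Ncv32u h)
{
    Ncv32u tl = ii[ y      * stride +  x];
    Ncv32u tr = ii[ y      * stride + (x + w)];
    Ncv32u bl = ii[(y + h) * stride +  x];
    Ncv32u br = ii[(y + h) * stride + (x + w)];
    return br - bl + tl - tr; //same operand order as the kernel code
}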


template <NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;

    if (maskOffset >= mask1Dlen)
    {
        return;
    }

    Ncv32u outMaskVal = d_inMask[maskOffset];
    Ncv32u y_offs = outMaskVal >> 16;
    Ncv32u x_offs = outMaskVal & 0xFFFF;

    Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
    NcvBool bPass = true;

    for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
    {
        //this variable is subject to reduction
        Ncv32f curStageSum = 0.0f;

        HaarStage64 curStage = getStage(iStage, d_Stages);
        Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();
        Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;
        Ncv32f stageThreshold = curStage.getStageThreshold();

        Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;

        for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)
        {
            NcvBool bMoreNodesToTraverse = true;

            if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)
            {
                Ncv32u iNode = curRootNodeOffset;

                while (bMoreNodesToTraverse)
                {
                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                    Ncv32f curNodeVal = 0.0f;
                    //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce
                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                    {
                        Ncv32f rectWeight;
                        Ncv32u rectX, rectY, rectWidth, rectHeight;
                        getFeature<tbCacheTextureCascade>
                            (iFeature + iRect, d_Features,
                             &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                        Ncv32u iioffsTR = iioffsTL + rectWidth;
                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                        Ncv32u iioffsBR = iioffsBL + rectWidth;

                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                        curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                    }

                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                    Ncv32f nodeThreshold = curNode.getThreshold();

                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                    NcvBool nextNodeIsLeaf;

                    if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                    {
                        nextNodeDescriptor = nodeLeft;
                        nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                    }
                    else
                    {
                        nextNodeDescriptor = nodeRight;
                        nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                    }

                    if (nextNodeIsLeaf)
                    {
                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                        curStageSum += tmpLeafValue;
                        bMoreNodesToTraverse = false;
                    }
                    else
                    {
                        iNode = nextNodeDescriptor.getNextNodeOffset();
                    }
                }
            }
            __syncthreads();

            curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
        }

        Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues<Ncv32f>, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);

        if (finalStageSum < stageThreshold)
        {
            bPass = false;
            outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            break;
        }
    }

    if (!tbDoAtomicCompaction)
    {
        if (!bPass || d_inMask != d_outMask)
        {
            if (!threadIdx.x)
            {
                d_outMask[maskOffset] = outMaskVal;
            }
        }
    }
    else
    {
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
        if (bPass && !threadIdx.x)
        {
            Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);
            d_outMask[outMaskOffset] = outMaskVal;
        }
#endif
    }
}


template <NcvBool tbMaskByInmask,
          NcvBool tbDoAtomicCompaction>
__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    Ncv32u y_offs = blockIdx.y;
    Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
    Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;

    Ncv32u y_offs_upsc = step * y_offs;
    Ncv32u x_offs_upsc = step * x_offs;
    Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;

    Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;

    if (x_offs_upsc < anchorsRoi.width &&
        (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))
    {
        outElem = (y_offs_upsc << 16) | x_offs_upsc;
    }

    if (!tbDoAtomicCompaction)
    {
        d_outMask[outMaskOffset] = outElem;
    }
    else
    {
        compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,
                                           outElem,
                                           d_outMask);
    }
}
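
//Mask element layout used throughout this file: the anchor's y coordinate
//occupies the high 16 bits and x the low 16 bits, so (x=5, y=3) packs to
//0x00030005 and unpacks via (v >> 16) and (v & 0xFFFF);
//OBJDET_MASK_ELEMENT_INVALID_32U marks anchors that are rejected or skipped.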


struct applyHaarClassifierAnchorParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members;
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                             Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                             Ncv32f *_d_weights, Ncv32u _weightsStride,
                                             HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                             Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                             Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                             NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                             Ncv32u _endStageExc, Ncv32f _scaleArea) :
        gridConf(_gridConf),
        blockConf(_blockConf),
        cuStream(_cuStream),
        d_IImg(_d_IImg),
        IImgStride(_IImgStride),
        d_weights(_d_weights),
        weightsStride(_weightsStride),
        d_Features(_d_Features),
        d_ClassifierNodes(_d_ClassifierNodes),
        d_Stages(_d_Stages),
        d_inMask(_d_inMask),
        d_outMask(_d_outMask),
        mask1Dlen(_mask1Dlen),
        mask2Dstride(_mask2Dstride),
        anchorsRoi(_anchorsRoi),
        startStageInc(_startStageInc),
        endStageExc(_endStageExc),
        scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        applyHaarClassifierAnchorParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value,
            Loki::TL::TypeAt<TList, 3>::Result::value,
            Loki::TL::TypeAt<TList, 4>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
             d_weights, weightsStride,
             d_Features, d_ClassifierNodes, d_Stages,
             d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, startStageInc,
             endStageExc, scaleArea);
    }
};


void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
                                                  NcvBool tbCacheTextureIImg,
                                                  NcvBool tbCacheTextureCascade,
                                                  NcvBool tbReadPixelIndexFromVector,
                                                  NcvBool tbDoAtomicCompaction,

                                                  dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                  Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                  Ncv32u endStageExc, Ncv32f scaleArea)
{

    applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
                                                     d_IImg, IImgStride,
                                                     d_weights, weightsStride,
                                                     d_Features, d_ClassifierNodes, d_Stages,
                                                     d_inMask, d_outMask,
                                                     mask1Dlen, mask2Dstride,
                                                     anchorsRoi, startStageInc,
                                                     endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
        ::call( &functor,
                tbInitMaskPositively,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbReadPixelIndexFromVector,
                tbDoAtomicCompaction);
}
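
//Sketch of the dispatch idea behind KernelCaller, simplified to two flags
//(the real implementation in NCVRuntimeTemplates.hpp generalizes it with
//Loki typelists): each runtime NcvBool is peeled off into a compile-time
//template argument, so every boolean combination of the kernel above gets
//its own specialization and the matching one is launched directly.
//
//  template<bool A, bool B> void kern();
//  void dispatch(bool a, bool b)
//  {
//      if (a) { if (b) kern<true,  true>(); else kern<true,  false>(); }
//      else   { if (b) kern<false, true>(); else kern<false, false>(); }
//  }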


struct applyHaarClassifierClassifierParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members;
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                                 Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                                 Ncv32f *_d_weights, Ncv32u _weightsStride,
                                                 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                                 Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                                 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                                 NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                                 Ncv32u _endStageExc, Ncv32f _scaleArea) :
        gridConf(_gridConf),
        blockConf(_blockConf),
        cuStream(_cuStream),
        d_IImg(_d_IImg),
        IImgStride(_IImgStride),
        d_weights(_d_weights),
        weightsStride(_weightsStride),
        d_Features(_d_Features),
        d_ClassifierNodes(_d_ClassifierNodes),
        d_Stages(_d_Stages),
        d_inMask(_d_inMask),
        d_outMask(_d_outMask),
        mask1Dlen(_mask1Dlen),
        mask2Dstride(_mask2Dstride),
        anchorsRoi(_anchorsRoi),
        startStageInc(_startStageInc),
        endStageExc(_endStageExc),
        scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        applyHaarClassifierClassifierParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
             d_weights, weightsStride,
             d_Features, d_ClassifierNodes, d_Stages,
             d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, startStageInc,
             endStageExc, scaleArea);
    }
};


void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
                                                      NcvBool tbCacheTextureCascade,
                                                      NcvBool tbDoAtomicCompaction,

                                                      dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                      Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                      Ncv32u endStageExc, Ncv32f scaleArea)
{
    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
                                                         d_IImg, IImgStride,
                                                         d_weights, weightsStride,
                                                         d_Features, d_ClassifierNodes, d_Stages,
                                                         d_inMask, d_outMask,
                                                         mask1Dlen, mask2Dstride,
                                                         anchorsRoi, startStageInc,
                                                         endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
        ::call( &functor,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbDoAtomicCompaction);
}


struct initializeMaskVectorFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u step;

    //Arguments are passed through the constructor
    initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                NcvSize32u _anchorsRoi, Ncv32u _step) :
        gridConf(_gridConf),
        blockConf(_blockConf),
        cuStream(_cuStream),
        d_inMask(_d_inMask),
        d_outMask(_d_outMask),
        mask1Dlen(_mask1Dlen),
        mask2Dstride(_mask2Dstride),
        anchorsRoi(_anchorsRoi),
        step(_step)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        initializeMaskVector <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, step);
    }
};


void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
                                     NcvBool tbDoAtomicCompaction,

                                     dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                     Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    initializeMaskVectorFunctor functor(gridConf, blockConf, cuStream,
                                        d_inMask, d_outMask,
                                        mask1Dlen, mask2Dstride,
                                        anchorsRoi, step);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
        ::call( &functor,
                tbMaskByInmask,
                tbDoAtomicCompaction);
}


Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,
                                              NCVVector<HaarStage64> &h_HaarStages)
{
    Ncv32u i = 0;
    for (; i < haar.NumStages; i++)
    {
        if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)
        {
            break;
        }
    }
    return i;
}
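
//Example (stage sizes are illustrative only): with
//NUM_THREADS_CLASSIFIERPARALLEL = 64 and a cascade whose stages hold
//{9, 16, 27, 32, 52, 53, 62, 72, ...} classifiers, the function returns 7:
//stages 0..6 run in the anchor-parallel kernels, and from stage 7 onward
//there are enough classifiers to occupy a 64-thread classifier-parallel block.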


NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
                                               NCVMatrix<Ncv32f> &d_weights,
                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
                                               Ncv32u &numDetections,
                                               HaarClassifierCascadeDescriptor &haar,
                                               NCVVector<HaarStage64> &h_HaarStages,
                                               NCVVector<HaarStage64> &d_HaarStages,
                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                               NCVVector<HaarFeature64> &d_HaarFeatures,
                                               NcvBool bMaskElements,
                                               NcvSize32u anchorsRoi,
                                               Ncv32u pixelStep,
                                               Ncv32f scaleArea,
                                               INCVMemAllocator &gpuAllocator,
                                               INCVMemAllocator &cpuAllocator,
                                               cudaDeviceProp &devProp,
                                               cudaStream_t cuStream)
{
    ncvAssertReturn(integral.memType() == d_weights.memType() &&
                    integral.memType() == d_pixelMask.memType() &&
                    integral.memType() == gpuAllocator.memType() &&
                    (integral.memType() == NCVMemoryTypeDevice ||
                     integral.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                    (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                     d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((integral.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);

    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&
                    d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&
                    integral.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    integral.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

#if defined _SELF_TEST_

    NCVStatus ncvStat;

    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, integral.width(), integral.height(), integral.pitch());
    ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width(), d_weights.height(), d_weights.pitch());
    ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());
    ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length());
    ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length());
    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());
    ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = integral.copySolid(h_integralImage, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_weights.copySolid(h_weights, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);

    for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<d_pixelMask.stride(); j++)
        {
            if ((i % pixelStep == 0) && (j % pixelStep == 0) && (j < (Ncv32u)anchorsRoi.width))
            {
                if (!bMaskElements || h_pixelMask.ptr()[i*d_pixelMask.stride()+j] != OBJDET_MASK_ELEMENT_INVALID_32U)
                {
                    h_pixelMask.ptr()[i*d_pixelMask.stride()+j] = (i << 16) | j;
                }
            }
            else
            {
                h_pixelMask.ptr()[i*d_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    NCV_SKIP_COND_END

#endif

    NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());
    ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);

    NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, static_cast<Ncv32u>(d_vecPixelMask.length()));
    ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);
    ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    Ncv32u *hp_zero = &hp_pool32u.ptr()[0];
    Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];

    NCV_SKIP_COND_BEGIN
    *hp_zero = 0;
    *hp_numDet = 0;
    NCV_SKIP_COND_END

    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));

    NcvBool bTexCacheCascade = devProp.major < 2;
    NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);

    NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
    NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;

    Ncv32u szNppCompactTmpBuf;
    nppsStCompactGetSize_32u(static_cast<Ncv32u>(d_vecPixelMask.length()), &szNppCompactTmpBuf, devProp);
    if (bDoAtomicCompaction)
    {
        szNppCompactTmpBuf = 0;
    }
    NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);

    NCV_SKIP_COND_BEGIN

    if (bTexCacheIImg)
    {
        cudaChannelFormatDesc cfdTexIImage;
        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage,
            (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    if (bTexCacheCascade)
    {
        cudaChannelFormatDesc cfdTexHaarFeatures;
        cudaChannelFormatDesc cfdTexHaarClassifierNodes;
        cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
        cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
            d_HaarFeatures.ptr(), cfdTexHaarFeatures, sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
            d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    Ncv32u stageStartAnchorParallel = 0;
    Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,
        haar, h_HaarStages);
    Ncv32u stageEndClassifierParallel = haar.NumStages;
    if (stageMiddleSwitch == 0)
    {
        stageMiddleSwitch = 1;
    }

    //create stages subdivision for pixel-parallel processing
    const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;
    Ncv32u curStop = stageStartAnchorParallel;
    std::vector<Ncv32u> pixParallelStageStops;
    while (curStop < stageMiddleSwitch)
    {
        pixParallelStageStops.push_back(curStop);
        curStop += compactEveryNstage;
    }
    if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)
    {
        pixParallelStageStops[pixParallelStageStops.size()-1] =
            (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2;
    }
    pixParallelStageStops.push_back(stageMiddleSwitch);
    Ncv32u pixParallelStageStopsIndex = 0;

    if (pixelStep != 1 || bMaskElements)
    {
        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                      (anchorsRoi.height + pixelStep - 1) / pixelStep);
        dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);

        if (gridInit.x == 0 || gridInit.y == 0)
        {
            numDetections = 0;
            return NCV_SUCCESS;
        }

        initializeMaskVectorDynTemplate(bMaskElements,
                                        bDoAtomicCompaction,
                                        gridInit, blockInit, cuStream,
                                        d_ptrNowData->ptr(),
                                        d_ptrNowTmp->ptr(),
                                        static_cast<Ncv32u>(d_vecPixelMask.length()), d_pixelMask.stride(),
                                        anchorsRoi, pixelStep);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            swap(d_ptrNowData, d_ptrNowTmp);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
        }
        numDetections = *hp_numDet;
    }
    else
    {
        //
        // 1. Run the first pixel-input pixel-parallel classifier for a few stages
        //

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                   anchorsRoi.height);
        dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
        applyHaarClassifierAnchorParallelDynTemplate(
            true,                                                   //tbInitMaskPositively
            bTexCacheIImg,                                          //tbCacheTextureIImg
            bTexCacheCascade,                                       //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0, //tbReadPixelIndexFromVector
            bDoAtomicCompaction,                                    //tbDoAtomicCompaction
            grid1,
            block1,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            0,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;

        pixParallelStageStopsIndex++;
    }

    //
    // 2. Run pixel-parallel stages
    //

    for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)
    {
        if (numDetections == 0)
        {
            break;
        }

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);
        if (numDetections > MAX_GRID_DIM)
        {
            grid2.x = MAX_GRID_DIM;
            grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
        dim3 block2(NUM_THREADS_ANCHORSPARALLEL);

        applyHaarClassifierAnchorParallelDynTemplate(
            false,                                                  //tbInitMaskPositively
            bTexCacheIImg,                                          //tbCacheTextureIImg
            bTexCacheCascade,                                       //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, //tbReadPixelIndexFromVector
            bDoAtomicCompaction,                                    //tbDoAtomicCompaction
            grid2,
            block2,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            numDetections,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;
    }

    //
    // 3. Run all remaining stages in one stage-parallel kernel
    //

    if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)
    {
        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid3(numDetections);
        if (numDetections > MAX_GRID_DIM)
        {
            grid3.x = MAX_GRID_DIM;
            grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
        dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);

        applyHaarClassifierClassifierParallelDynTemplate(
            bTexCacheIImg,       //tbCacheTextureIImg
            bTexCacheCascade,    //tbCacheTextureCascade
            bDoAtomicCompaction, //tbDoAtomicCompaction
            grid3,
            block3,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            numDetections,
            d_pixelMask.stride(),
            anchorsRoi,
            stageMiddleSwitch,
            stageEndClassifierParallel,
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;
    }

    if (d_ptrNowData != &d_vecPixelMask)
    {
        d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
    }

#if defined _SELF_TEST_

    ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

    if (bDoAtomicCompaction)
    {
        std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + numDetections);
    }

    Ncv32u fpu_oldcw, fpu_cw;
    _controlfp_s(&fpu_cw, 0, 0);
    fpu_oldcw = fpu_cw;
    _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);
    Ncv32u numDetGold;
    ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,
                                                 h_HaarStages, h_HaarNodes, h_HaarFeatures,
                                                 bMaskElements, anchorsRoi, pixelStep, scaleArea);
    ncvAssertReturnNcvStat(ncvStat);
    _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);

    bool bPass = true;

    if (numDetGold != numDetections)
    {
        printf("NCVHaarClassifierCascade::applyHaarClassifierCascade numdetections don't match: cpu=%d, gpu=%d\n", numDetGold, numDetections);
        bPass = false;
    }
    else
    {
        for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)
        {
            if (h_pixelMask.ptr()[i] != h_pixelMask_d.ptr()[i])
            {
                printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%d, cpu=%d, gpu=%d\n", i, h_pixelMask.ptr()[i], h_pixelMask_d.ptr()[i]);
                bPass = false;
            }
        }
    }

    printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass ? "PASSED" : "FAILED");
#endif

    NCV_SKIP_COND_END

    return NCV_SUCCESS;
}


//==============================================================================
//
// HypothesesOperations file
//
//==============================================================================


const Ncv32u NUM_GROW_THREADS = 128;


__device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)
{
    NcvRect32u res;
    res.x = (Ncv32u)(scale * (pixel & 0xFFFF));
    res.y = (Ncv32u)(scale * (pixel >> 16));
    res.width = (Ncv32u)(scale * width);
    res.height = (Ncv32u)(scale * height);
    return res;
}
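
//Example: pixel = 0x00030005 (y=3, x=5) at scale 2.0f with a 20x20
//classifier window maps to the rectangle {x=10, y=6, width=40, height=40}.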


__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,
                                     NcvRect32u *hypotheses,
                                     Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)
{
    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
    Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;
    if (elemAddr >= numElements)
    {
        return;
    }
    hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);
}


NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                         Ncv32u numPixelMaskDetections,
                                         NCVVector<NcvRect32u> &hypotheses,
                                         Ncv32u &totalDetections,
                                         Ncv32u totalMaxDetections,
                                         Ncv32u rectWidth,
                                         Ncv32u rectHeight,
                                         Ncv32f curScale,
                                         cudaStream_t cuStream)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);

    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(rectWidth > 0 && rectHeight > 0 && curScale > 0, NCV_INVALID_ROI);

    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    dim3 block(NUM_GROW_THREADS);
    dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }
    growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,
                                                       hypotheses.ptr() + totalDetections,
                                                       rectWidth, rectHeight, curScale);
    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

    totalDetections += numDetsToCopy;
    return ncvStat;
}
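
//Example of the grid clamp above: with NUM_GROW_THREADS = 128,
//numDetsToCopy = 10000000 needs 78125 blocks, exceeding the 65535 limit on
//grid.x; the launch then becomes grid = {65535, 2} and growDetectionsKernel
//recovers the linear block index as blockIdx.y * 65535 + blockIdx.x.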
1535
1536
1537 //==============================================================================
1538 //
1539 // Pipeline file
1540 //
1541 //==============================================================================
1542
1543
ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> & d_srcImg,NcvSize32u srcRoi,NCVVector<NcvRect32u> & d_dstRects,Ncv32u & dstNumRects,HaarClassifierCascadeDescriptor & haar,NCVVector<HaarStage64> & h_HaarStages,NCVVector<HaarStage64> & d_HaarStages,NCVVector<HaarClassifierNode128> & d_HaarNodes,NCVVector<HaarFeature64> & d_HaarFeatures,NcvSize32u minObjSize,Ncv32u minNeighbors,Ncv32f scaleStep,Ncv32u pixelStep,Ncv32u flags,INCVMemAllocator & gpuAllocator,INCVMemAllocator & cpuAllocator,cudaDeviceProp & devProp,cudaStream_t cuStream)1544 NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                                            NcvSize32u srcRoi,
                                            NCVVector<NcvRect32u> &d_dstRects,
                                            Ncv32u &dstNumRects,

                                            HaarClassifierCascadeDescriptor &haar,
                                            NCVVector<HaarStage64> &h_HaarStages,
                                            NCVVector<HaarStage64> &d_HaarStages,
                                            NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                            NCVVector<HaarFeature64> &d_HaarFeatures,

                                            NcvSize32u minObjSize,
                                            Ncv32u minNeighbors,  //default 4
                                            Ncv32f scaleStep,     //default 1.2f
                                            Ncv32u pixelStep,     //default 1
                                            Ncv32u flags,         //default NCVPipeObjDet_Default

                                            INCVMemAllocator &gpuAllocator,
                                            INCVMemAllocator &cpuAllocator,
                                            cudaDeviceProp &devProp,
                                            cudaStream_t cuStream)
{
    ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
                    d_srcImg.memType() == gpuAllocator.memType() &&
                    (d_srcImg.memType() == NCVMemoryTypeDevice ||
                     d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                    (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                     d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&
                    d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&
                    srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&
                    d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    //TODO: set NPP active stream to cuStream

    NCVStatus ncvStat;
    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

    Ncv32u integralWidth = d_srcImg.width() + 1;
    Ncv32u integralHeight = d_srcImg.height() + 1;

    NCVMatrixAlloc<Ncv32u> integral(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(integral.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVStatus nppStat;
    Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
    nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
    ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                       integral.ptr(), integral.pitch(),
                                       NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                       d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                          d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
                                          NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    NCV_SKIP_COND_END

    dstNumRects = 0;

    Ncv32u lastCheckedScale = 0;
    NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);
    std::vector<Ncv32u> scalesVector;

    NcvBool bFoundLargestFace = false;

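    //enumerate integer decimation factors: scaleIter grows geometrically by
    //scaleStep, truncation to Ncv32u collapses duplicates, and enumeration stops
    //once the scaled search ROI can no longer contain the classifier window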
    for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)
    {
        Ncv32u scale = (Ncv32u)scaleIter;
        if (lastCheckedScale == scale)
        {
            continue;
        }
        lastCheckedScale = scale;

        if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||
            haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)
        {
            continue;
        }

        NcvSize32s srcRoi_, srcIIRoi, scaledIIRoi, searchRoi;

        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();

        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;

        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;

        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        if (searchRoi.width <= 0 || searchRoi.height <= 0)
        {
            break;
        }

        scalesVector.push_back(scale);

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

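    //in FindLargestObject mode scales are visited from coarsest to finest so the
    //loop below can terminate as soon as the first (largest) object is confirmed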
    if (bReverseTraverseScale)
    {
        std::reverse(scalesVector.begin(), scalesVector.end());
    }

    //TODO: handle _fair_scale_ flag
    for (Ncv32u i=0; i<scalesVector.size(); i++)
    {
        Ncv32u scale = scalesVector[i];

        NcvSize32u srcRoi_, scaledIIRoi, searchRoi;
        NcvSize32u srcIIRoi;
        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();
        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;
        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;
        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        NCV_SKIP_COND_BEGIN

        nppStat = nppiStDecimate_32u_C1R(
            integral.ptr(), integral.pitch(),
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        nppStat = nppiStDecimate_64u_C1R(
            d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

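        //per-window standard deviation over the (border-trimmed) classifier
        //window, used to normalize feature responses; the scale*scale factor
        //compensates for the area change introduced by decimation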
        const NcvRect32u rect(
            HAAR_STDDEV_BORDER,
            HAAR_STDDEV_BORDER,
            haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
            haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);
        nppStat = nppiStRectStdDev_32f_C1R(
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            d_rectStdDev.ptr(), d_rectStdDev.pitch(),
            NcvSize32u(searchRoi.width, searchRoi.height), rect,
            (Ncv32f)scale*scale, true);
        ncvAssertReturnNcvStat(nppStat);

        NCV_SKIP_COND_END

        Ncv32u detectionsOnThisScale;
        ncvStat = ncvApplyHaarClassifierCascade_device(
            d_scaledIntegralImage, d_rectStdDev, d_pixelMask,
            detectionsOnThisScale,
            haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
            searchRoi, pixelStep, (Ncv32f)scale*scale,
            gpuAllocator, cpuAllocator, devProp, cuStream);
        ncvAssertReturnNcvStat(ncvStat);

        NCV_SKIP_COND_BEGIN

        NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());
        ncvStat = ncvGrowDetectionsVector_device(
            d_vecPixelMask,
            detectionsOnThisScale,
            d_hypothesesIntermediate,
            dstNumRects,
            static_cast<Ncv32u>(d_hypothesesIntermediate.length()),
            haar.ClassifierSize.width,
            haar.ClassifierSize.height,
            (Ncv32f)scale,
            cuStream);
        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);

        if (flags & NCVPipeObjDet_FindLargestObject)
        {
            if (dstNumRects == 0)
            {
                continue;
            }

            //dstNumRects is guaranteed to be non-zero here
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

            Ncv32u numStrongHypothesesNow = dstNumRects;
            ncvStat = ncvGroupRectangles_host(
                h_hypothesesIntermediate,
                numStrongHypothesesNow,
                minNeighbors,
                RECT_SIMILARITY_PROPORTION,
                NULL);
            ncvAssertReturnNcvStat(ncvStat);

            if (numStrongHypothesesNow > 0)
            {
                //keep only the widest grouped hypothesis and stop scanning further scales
                NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];
                for (Ncv32u j=1; j<numStrongHypothesesNow; j++)
                {
                    if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)
                    {
                        maxRect = h_hypothesesIntermediate.ptr()[j];
                    }
                }

                h_hypothesesIntermediate.ptr()[0] = maxRect;
                dstNumRects = 1;

                ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));
                ncvAssertReturnNcvStat(ncvStat);

                bFoundLargestFace = true;

                break;
            }
        }

        NCV_SKIP_COND_END

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

    NCVStatus ncvRetCode = NCV_SUCCESS;

    NCV_SKIP_COND_BEGIN

    if (flags & NCVPipeObjDet_FindLargestObject)
    {
        if (!bFoundLargestFace)
        {
            dstNumRects = 0;
        }
    }
    else
    {
        //TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left)
        if (dstNumRects != 0)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        ncvStat = ncvGroupRectangles_host(
            h_hypothesesIntermediate,
            dstNumRects,
            minNeighbors,
            RECT_SIMILARITY_PROPORTION,
            NULL);
        ncvAssertReturnNcvStat(ncvStat);

        if (dstNumRects > d_dstRects.length())
        {
            ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
            dstNumRects = static_cast<Ncv32u>(d_dstRects.length());
        }

        if (dstNumRects != 0)
        {
            ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
        }
    }

    if (flags & NCVPipeObjDet_VisualizeInPlace)
    {
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),
                               d_srcImg.width(), d_srcImg.height(),
                               d_dstRects.ptr(), dstNumRects, 255, cuStream);
    }

    NCV_SKIP_COND_END

    return ncvRetCode;
}


//==============================================================================
//
// Purely Host code: classifier IO, mock-ups
//
//==============================================================================


#ifdef _SELF_TEST_
#include <float.h>
#endif


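//host-side reference implementation of the classifier cascade; it mirrors the
//device pipeline stage for stage and is mainly useful for validation and debugging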
NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                             NCVMatrix<Ncv32f> &h_weights,
                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,
                                             Ncv32u &numDetections,
                                             HaarClassifierCascadeDescriptor &haar,
                                             NCVVector<HaarStage64> &h_HaarStages,
                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                             NCVVector<HaarFeature64> &h_HaarFeatures,
                                             NcvBool bMaskElements,
                                             NcvSize32u anchorsRoi,
                                             Ncv32u pixelStep,
                                             Ncv32f scaleArea)
{
    ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&
                    h_integralImage.memType() == h_pixelMask.memType() &&
                    (h_integralImage.memType() == NCVMemoryTypeHostPageable ||
                     h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&
                    h_HaarStages.memType() == h_HaarFeatures.memType() &&
                    (h_HaarStages.memType() == NCVMemoryTypeHostPageable ||
                     h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&
                    h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&
                    h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&
                    h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&
                    h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    h_HaarFeatures.length() >= haar.NumFeatures &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

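    //node thresholds are trained for a unit window area: scale them by the
    //effective (border-trimmed) window area so feature sums can be compared directly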
    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));

    for (Ncv32u i=0; i<anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<h_pixelMask.stride(); j++)
        {
            if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)
            {
                h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
            else
            {
                for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)
                {
                    Ncv32f curStageSum = 0.0f;
                    Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();
                    Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();

                    if (iStage == 0)
                    {
                        if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                        {
                            break;
                        }
                        else
                        {
                            h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);
                        }
                    }
                    else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                    {
                        break;
                    }

                    while (numRootNodesInStage--)
                    {
                        NcvBool bMoreNodesToTraverse = true;
                        Ncv32u curNodeOffset = curRootNodeOffset;

                        while (bMoreNodesToTraverse)
                        {
                            HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];
                            HaarFeatureDescriptor32 curFeatDesc = curNode.getFeatureDesc();
                            Ncv32u curNodeFeaturesNum = curFeatDesc.getNumFeatures();
                            Ncv32u curNodeFeaturesOffs = curFeatDesc.getFeaturesOffset();

                            Ncv32f curNodeVal = 0.f;
                            for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                            {
                                HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];
                                Ncv32u rectX, rectY, rectWidth, rectHeight;
                                feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);
                                Ncv32f rectWeight = feature.getWeight();
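                                //sum over the feature rectangle via the integral image:
                                //sum = II(BR) - II(BL) - II(TR) + II(TL)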
                                Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);
                                Ncv32u iioffsTR = iioffsTL + rectWidth;
                                Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();
                                Ncv32u iioffsBR = iioffsBL + rectWidth;

                                Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];
                                Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];
                                Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];
                                Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];
                                Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;
                                curNodeVal += (Ncv32f)rectSum * rectWeight;
                            }

                            HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                            HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                            Ncv32f nodeThreshold = curNode.getThreshold();

                            HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                            NcvBool nextNodeIsLeaf;

                            if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)
                            {
                                nextNodeDescriptor = nodeLeft;
                                nextNodeIsLeaf = curFeatDesc.isLeftNodeLeaf();
                            }
                            else
                            {
                                nextNodeDescriptor = nodeRight;
                                nextNodeIsLeaf = curFeatDesc.isRightNodeLeaf();
                            }

                            if (nextNodeIsLeaf)
                            {
                                Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();
                                curStageSum += tmpLeafValue;
                                bMoreNodesToTraverse = false;
                            }
                            else
                            {
                                curNodeOffset = nextNodeDescriptor.getNextNodeOffset();
                            }
                        }

                        curRootNodeOffset++;
                    }

                    Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();
                    if (curStageSum < tmpStageThreshold)
                    {
                        //drop
                        h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
                        break;
                    }
                }
            }
        }
    }

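    //compaction: surviving anchors were written as ((i << 16) | j), which is always
    //smaller than OBJDET_MASK_ELEMENT_INVALID_32U, so a plain sort moves every
    //valid detection to the front of the mask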
    std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());
    Ncv32u i = 0;
    for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)
    {
        if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)
        {
            break;
        }
    }
    numDetections = i;

    return NCV_SUCCESS;
}

NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                       Ncv32u numPixelMaskDetections,
                                       NCVVector<NcvRect32u> &hypotheses,
                                       Ncv32u &totalDetections,
                                       Ncv32u totalMaxDetections,
                                       Ncv32u rectWidth,
                                       Ncv32u rectHeight,
                                       Ncv32f curScale)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);
    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    for (Ncv32u i=0; i<numDetsToCopy; i++)
    {
        hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);
    }

    totalDetections += numDetsToCopy;
    return ncvStat;
}

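//converts a legacy OpenCV CvHaarClassifierCascade (loaded via cvLoad) into the
//flat NCV representation: stages reference contiguous runs of root nodes; root
//and non-root nodes are kept in separate lists during parsing and merged at the end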
static NCVStatus loadFromXML(const cv::String &filename,
                             HaarClassifierCascadeDescriptor &haar,
                             std::vector<HaarStage64> &haarStages,
                             std::vector<HaarClassifierNode128> &haarClassifierNodes,
                             std::vector<HaarFeature64> &haarFeatures)
{
#ifndef HAVE_OPENCV_OBJDETECT
    (void) filename;
    (void) haar;
    (void) haarStages;
    (void) haarClassifierNodes;
    (void) haarFeatures;
    CV_Error(cv::Error::StsNotImplemented, "This functionality requires objdetect module");
    return NCV_HAAR_XML_LOADING_EXCEPTION;
#else
    NCVStatus ncvStat;

    haar.NumStages = 0;
    haar.NumClassifierRootNodes = 0;
    haar.NumClassifierTotalNodes = 0;
    haar.NumFeatures = 0;
    haar.ClassifierSize.width = 0;
    haar.ClassifierSize.height = 0;
    haar.bHasStumpsOnly = true;
    haar.bNeedsTiltedII = false;
    Ncv32u curMaxTreeDepth = 0;

    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
    haarStages.resize(0);
    haarClassifierNodes.resize(0);
    haarFeatures.resize(0);

    cv::Ptr<CvHaarClassifierCascade> oldCascade((CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0));
    if (!oldCascade)
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    haar.ClassifierSize.width = oldCascade->orig_window_size.width;
    haar.ClassifierSize.height = oldCascade->orig_window_size.height;

    int stagesCount = oldCascade->count;
    for(int s = 0; s < stagesCount; ++s) // by stages
    {
        HaarStage64 curStage;
        curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));

        curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);

        int treesCount = oldCascade->stage_classifier[s].count;
        for(int t = 0; t < treesCount; ++t) // by trees
        {
            Ncv32u nodeId = 0;
            CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];

            int nodesCount = tree->count;
            for(int n = 0; n < nodesCount; ++n) //by features
            {
                CvHaarFeature* feature = &tree->haar_feature[n];

                HaarClassifierNode128 curNode;
                curNode.setThreshold(tree->threshold[n]);

                NcvBool bIsLeftNodeLeaf = false;
                NcvBool bIsRightNodeLeaf = false;

                HaarClassifierNodeDescriptor32 nodeLeft;
                if ( tree->left[n] <= 0 )
                {
                    //a non-positive index encodes a leaf: -index points into the alpha array
                    Ncv32f leftVal = tree->alpha[-tree->left[n]];
                    ncvStat = nodeLeft.create(leftVal);
                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                    bIsLeftNodeLeaf = true;
                }
                else
                {
                    Ncv32u leftNodeOffset = tree->left[n];
                    nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
                    haar.bHasStumpsOnly = false;
                }
                curNode.setLeftNodeDesc(nodeLeft);

                HaarClassifierNodeDescriptor32 nodeRight;
                if ( tree->right[n] <= 0 )
                {
                    Ncv32f rightVal = tree->alpha[-tree->right[n]];
                    ncvStat = nodeRight.create(rightVal);
                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                    bIsRightNodeLeaf = true;
                }
                else
                {
                    Ncv32u rightNodeOffset = tree->right[n];
                    nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
                    haar.bHasStumpsOnly = false;
                }
                curNode.setRightNodeDesc(nodeRight);

                Ncv32u tiltedVal = feature->tilted;
                NcvBool bFeatureTilted = (tiltedVal != 0);
                //a single tilted feature anywhere in the cascade requires the tilted
                //integral image, so accumulate rather than overwrite the flag
                haar.bNeedsTiltedII = haar.bNeedsTiltedII || bFeatureTilted;

                Ncv32u featureId = 0;
                for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
                {
                    Ncv32u rectX = feature->rect[l].r.x;
                    Ncv32u rectY = feature->rect[l].r.y;
                    Ncv32u rectWidth = feature->rect[l].r.width;
                    Ncv32u rectHeight = feature->rect[l].r.height;

                    Ncv32f rectWeight = feature->rect[l].weight;

                    if (rectWeight == 0/* && rectX == 0 && rectY == 0 && rectWidth == 0 && rectHeight == 0*/)
                        break;

                    HaarFeature64 curFeature;
                    ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
                    curFeature.setWeight(rectWeight);
                    ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
                    haarFeatures.push_back(curFeature);

                    featureId++;
                }

                HaarFeatureDescriptor32 tmpFeatureDesc;
                ncvStat = tmpFeatureDesc.create(bFeatureTilted, bIsLeftNodeLeaf, bIsRightNodeLeaf,
                    featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
                curNode.setFeatureDesc(tmpFeatureDesc);

                if (!nodeId)
                {
                    //root node
                    haarClassifierNodes.push_back(curNode);
                    curMaxTreeDepth = 1;
                }
                else
                {
                    //other node
                    h_TmpClassifierNotRootNodes.push_back(curNode);
                    curMaxTreeDepth++;
                }

                nodeId++;
            }
        }

        curStage.setNumClassifierRootNodes(treesCount);
        haarStages.push_back(curStage);
    }

    //fill in cascade stats
    haar.NumStages = static_cast<Ncv32u>(haarStages.size());
    haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
    haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
    haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());

    //merge root and leaf nodes in one classifiers array
    Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        haarClassifierNodes[i].setRightNodeDesc(nodeRight);
    }

    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);

        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
    }

    return NCV_SUCCESS;
#endif
}


#define NVBIN_HAAR_SIZERESERVED 16
#define NVBIN_HAAR_VERSION 0x1


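//NVBIN layout (see the writer further below): a NVBIN_HAAR_SIZERESERVED-byte header
//holding the format version and total file size, then the cascade counts, window
//size and flags, followed by the raw stage, node and feature arrays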
static NCVStatus loadFromNVBIN(const cv::String &filename,
                               HaarClassifierCascadeDescriptor &haar,
                               std::vector<HaarStage64> &haarStages,
                               std::vector<HaarClassifierNode128> &haarClassifierNodes,
                               std::vector<HaarFeature64> &haarFeatures)
{
    size_t readCount;
    FILE *fp = fopen(filename.c_str(), "rb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    Ncv32u fileVersion;
    readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
    Ncv32u fsize;
    readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fseek(fp, 0, SEEK_END);
    Ncv32u fsizeActual = static_cast<Ncv32u>(ftell(fp));
    ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);

    std::vector<unsigned char> fdata;
    fdata.resize(fsize);
    Ncv32u dataOffset = 0;
    fseek(fp, 0, SEEK_SET);
    readCount = fread(&fdata[0], fsize, 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fclose(fp);

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvSize32u);
    haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);
    haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);

    haarStages.resize(haar.NumStages);
    haarClassifierNodes.resize(haar.NumClassifierTotalNodes);
    haarFeatures.resize(haar.NumFeatures);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);
    dataOffset += szStages;
    memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);
    dataOffset += szFeatures;

    return NCV_SUCCESS;
}


NCVStatus ncvHaarGetClassifierSize(const cv::String &filename, Ncv32u &numStages,
                                   Ncv32u &numNodes, Ncv32u &numFeatures)
{
    size_t readCount;
    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    if (fext == "nvbin")
    {
        FILE *fp = fopen(filename.c_str(), "rb");
        ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
        Ncv32u fileVersion;
        readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
        fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
        Ncv32u tmp;
        readCount = fread(&numStages, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&tmp, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numNodes, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        fclose(fp);
    }
    else if (fext == "xml")
    {
        HaarClassifierCascadeDescriptor haar;
        std::vector<HaarStage64> haarStages;
        std::vector<HaarClassifierNode128> haarNodes;
        std::vector<HaarFeature64> haarFeatures;

        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);

        numStages = haar.NumStages;
        numNodes = haar.NumClassifierTotalNodes;
        numFeatures = haar.NumFeatures;
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    return NCV_SUCCESS;
}


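//loads a cascade (XML or NVBIN, selected by file extension) into pinned host
//vectors; a minimal usage sketch (illustrative only -- 'pinnedAlloc' stands for
//any pinned-memory INCVMemAllocator, and error handling is elided):
//
//    Ncv32u nStages, nNodes, nFeatures;
//    ncvHaarGetClassifierSize("cascade.xml", nStages, nNodes, nFeatures);
//    NCVVectorAlloc<HaarStage64> h_stages(pinnedAlloc, nStages);
//    NCVVectorAlloc<HaarClassifierNode128> h_nodes(pinnedAlloc, nNodes);
//    NCVVectorAlloc<HaarFeature64> h_features(pinnedAlloc, nFeatures);
//    HaarClassifierCascadeDescriptor haar;
//    ncvHaarLoadFromFile_host("cascade.xml", haar, h_stages, h_nodes, h_features);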
NCVStatus ncvHaarLoadFromFile_host(const cv::String &filename,
                                   HaarClassifierCascadeDescriptor &haar,
                                   NCVVector<HaarStage64> &h_HaarStages,
                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                   NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    std::vector<HaarStage64> haarStages;
    std::vector<HaarClassifierNode128> haarNodes;
    std::vector<HaarFeature64> haarFeatures;

    if (fext == "nvbin")
    {
        ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else if (fext == "xml")
    {
        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);

    memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));
    memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));
    memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));

    return NCV_SUCCESS;
}


NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename,
                                 HaarClassifierCascadeDescriptor haar,
                                 NCVVector<HaarStage64> &h_HaarStages,
                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                 NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    Ncv32u dataOffset = 0;
    std::vector<unsigned char> fdata;
    fdata.resize(szStages+szClassifiers+szFeatures+1024, 0);

    //header
    *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;
    dataOffset += sizeof(Ncv32u);
    *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;
    dataOffset += sizeof(NcvSize32u);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;
    dataOffset += sizeof(NcvBool);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;
    dataOffset += sizeof(NcvBool);

    memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);
    dataOffset += szStages;
    memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);
    dataOffset += szFeatures;
    Ncv32u fsize = dataOffset;

    //TODO: CRC32 here

    //update header
    dataOffset = sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = fsize;

    FILE *fp = fopen(filename.c_str(), "wb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    size_t writeCount = fwrite(&fdata[0], fsize, 1, fp);
    fclose(fp);
    ncvAssertReturn(1 == writeCount, NCV_FILE_ERROR);
    return NCV_SUCCESS;
}