/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
//
// The algorithm and code are explained in detail in the upcoming
// GPU Computing Gems chapter:
//
//   Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
//   PDF URL placeholder
//   email: aobukhov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
//
////////////////////////////////////////////////////////////////////////////////

#include <algorithm>
#include <cstdio>

#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_OBJDETECT
#  include "opencv2/objdetect.hpp"
#  include "opencv2/objdetect/objdetect_c.h"
#endif

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"

#include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp"


//==============================================================================
//
// BlockScan file
//
//==============================================================================


NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive


//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
//assuming size <= WARP_SIZE and size is power of 2
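//e.g. if every lane of a full warp contributes idata = 1, lane k returns k + 1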
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
#if __CUDA_ARCH__ >= 300
    const unsigned int laneId = cv::cuda::device::Warp::laneId();

    // scan using shuffle functions
    #pragma unroll
    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
    {
        const Ncv32u n = cv::cuda::device::shfl_up(idata, i);
        if (laneId >= i)
            idata += n;
    }

    return idata;
#else
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;
    pos += K_WARP_SIZE;
    s_Data[pos] = idata;

    s_Data[pos] += s_Data[pos - 1];
    s_Data[pos] += s_Data[pos - 2];
    s_Data[pos] += s_Data[pos - 4];
    s_Data[pos] += s_Data[pos - 8];
    s_Data[pos] += s_Data[pos - 16];

    return s_Data[pos];
#endif
}

__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
    return warpScanInclusive(idata, s_Data) - idata;
}

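//Block-wide inclusive scan composed of warp scans: each warp scans its own
//elements, the per-warp totals are scanned in shared memory, and the resulting
//warp offsets are added back to the warp-local results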
template <Ncv32u tiNumScanThreads>
__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
    if (tiNumScanThreads > K_WARP_SIZE)
    {
        //Bottom-level inclusive warp scan
        Ncv32u warpResult = warpScanInclusive(idata, s_Data);

        //Save top elements of each warp for exclusive warp scan
        //sync to wait for warp scans to complete (because s_Data is being overwritten)
        __syncthreads();
        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
        {
            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
        }

        //wait for warp scans to complete
        __syncthreads();

        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
        {
            //grab top warp elements
            Ncv32u val = s_Data[threadIdx.x];
            //calculate exclusive scan and write back to shared memory
            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
        }

        //return updated warp scans with exclusive scan results
        __syncthreads();
        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
    }
    else
    {
        return warpScanInclusive(idata, s_Data);
    }
}


//==============================================================================
//
// HaarClassifierCascade file
//
//==============================================================================


const Ncv32u MAX_GRID_DIM = 65535;


const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;


#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2     6
#define NUM_THREADS_CLASSIFIERPARALLEL          (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)


/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;


/** \internal
* Haar classifiers flattened trees container.
* Two parts: the first holds root nodes; the second holds nodes referred to by the root nodes.
* Drawback: breaks tree locality (might cause more cache misses).
* Advantage: no need to introduce an additional 32-bit field to index root node offsets.
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;


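/** \internal
* Integral image array, fetched through the texture unit when tbCacheTextureIImg is set.
*/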
texture<Ncv32u, 1, cudaReadModeElementType> texIImage;


__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
    return d_Stages[iStage];
}


template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
    HaarClassifierNode128 tmpNode;
    if (tbCacheTextureCascade)
    {
        tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
    }
    else
    {
        tmpNode = d_ClassifierNodes[iNode];
    }
    return tmpNode;
}


template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
                           Ncv32f *weight,
                           Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{
    HaarFeature64 feature;
    if (tbCacheTextureCascade)
    {
        feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
    }
    else
    {
        feature = d_Features[iFeature];
    }
    feature.getRect(rectX, rectY, rectWidth, rectHeight);
    *weight = feature.getWeight();
}


template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
{
    if (tbCacheTextureIImg)
    {
        return tex1Dfetch(texIImage, x);
    }
    else
    {
        return d_IImg[x];
    }
}


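/** \internal
* Global write cursor for compacted output; blocks reserve contiguous output
* ranges from it with atomicAdd.
*/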
__device__ Ncv32u d_outMaskPosition;


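//Block-level stream compaction: an inclusive scan of the pass flags gives each
//passing thread its slot within the block, the last thread reserves a contiguous
//range in the global output with a single atomicAdd on d_outMaskPosition, and the
//passing elements are staged in shared memory before the coalesced write-out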
__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;

    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
    __syncthreads();

    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
    {
        numPassed = incScan;
        outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
    }

    if (threadPassFlag)
    {
        Ncv32u excScan = incScan - threadPassFlag;
        shmem[excScan] = threadElem;
    }

    __syncthreads();

    if (threadIdx.x < numPassed)
    {
        vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];
    }
#endif
}


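/** \internal
* Anchor-parallel cascade evaluation: one thread per search-window anchor.
* Each thread walks the stages [startStageInc, endStageExc), traversing the
* classifier trees of every stage and accumulating the stage sum; an anchor
* that falls below a stage threshold is marked invalid (or dropped entirely
* when atomic compaction is enabled).
*/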
template <NcvBool tbInitMaskPositively,
          NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbReadPixelIndexFromVector,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u y_offs;
    Ncv32u x_offs;
    Ncv32u maskOffset;
    Ncv32u outMaskVal;

    NcvBool bInactiveThread = false;

    if (tbReadPixelIndexFromVector)
    {
        maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (maskOffset >= mask1Dlen)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            outMaskVal = d_inMask[maskOffset];
            y_offs = outMaskVal >> 16;
            x_offs = outMaskVal & 0xFFFF;
        }
    }
    else
    {
        y_offs = blockIdx.y;
        x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (x_offs >= mask2Dstride)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            maskOffset = y_offs * mask2Dstride + x_offs;

            if ((x_offs >= anchorsRoi.width) ||
                (!tbInitMaskPositively &&
                 d_inMask != d_outMask &&
                 d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))
            {
                if (tbDoAtomicCompaction)
                {
                    bInactiveThread = true;
                }
                else
                {
                    d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;
                    return;
                }
            }

            outMaskVal = (y_offs << 16) | x_offs;
        }
    }

    NcvBool bPass = true;

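    //the condition below is always true: every thread, active or not, must enter
    //the stage loop so the __syncthreads() barriers inside are reached uniformly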
    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
    {
        Ncv32f pixelStdDev = 0.0f;

        if (!bInactiveThread)
            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];

        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
        {
            Ncv32f curStageSum = 0.0f;

            HaarStage64 curStage = getStage(iStage, d_Stages);
            Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();
            Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();
            Ncv32f stageThreshold = curStage.getStageThreshold();

            while (numRootNodesInStage--)
            {
                NcvBool bMoreNodesToTraverse = true;
                Ncv32u iNode = curRootNodeOffset;

                if (bPass && !bInactiveThread)
                {
                    while (bMoreNodesToTraverse)
                    {
                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                        HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                        Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                        Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                        Ncv32f curNodeVal = 0.0f;

                        for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                        {
                            Ncv32f rectWeight;
                            Ncv32u rectX, rectY, rectWidth, rectHeight;
                            getFeature<tbCacheTextureCascade>
                                (iFeature + iRect, d_Features,
                                &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                            Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                            Ncv32u iioffsTR = iioffsTL + rectWidth;
                            Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                            Ncv32u iioffsBR = iioffsBL + rectWidth;

                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                            curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                            curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                        }

                        HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                        HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                        Ncv32f nodeThreshold = curNode.getThreshold();

                        HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                        NcvBool nextNodeIsLeaf;

                        if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                        {
                            nextNodeDescriptor = nodeLeft;
                            nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                        }
                        else
                        {
                            nextNodeDescriptor = nodeRight;
                            nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                        }

                        if (nextNodeIsLeaf)
                        {
                            Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                            curStageSum += tmpLeafValue;
                            bMoreNodesToTraverse = false;
                        }
                        else
                        {
                            iNode = nextNodeDescriptor.getNextNodeOffset();
                        }
                    }
                }

                __syncthreads();
                curRootNodeOffset++;
            }

            if (curStageSum < stageThreshold)
            {
                bPass = false;
                outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    __syncthreads();

    if (!tbDoAtomicCompaction)
    {
        if (!tbReadPixelIndexFromVector ||
            (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))
        {
            d_outMask[maskOffset] = outMaskVal;
        }
    }
    else
    {
        compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,
                                           outMaskVal,
                                           d_outMask);
    }
}


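/** \internal
* Classifier-parallel cascade evaluation: one block per surviving anchor.
* The threads of a block sweep the classifier root nodes of each stage in
* chunks of NUM_THREADS_CLASSIFIERPARALLEL, and the per-thread partial sums
* are combined by a block reduction before the stage threshold test.
*/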
template <NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;

    if (maskOffset >= mask1Dlen)
    {
        return;
    }

    Ncv32u outMaskVal = d_inMask[maskOffset];
    Ncv32u y_offs = outMaskVal >> 16;
    Ncv32u x_offs = outMaskVal & 0xFFFF;

    Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
    NcvBool bPass = true;

    for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)
    {
        //this variable is subject to reduction
        Ncv32f curStageSum = 0.0f;

        HaarStage64 curStage = getStage(iStage, d_Stages);
        Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();
        Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;
        Ncv32f stageThreshold = curStage.getStageThreshold();

        Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;

        for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)
        {
            NcvBool bMoreNodesToTraverse = true;

            if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)
            {
                Ncv32u iNode = curRootNodeOffset;

                while (bMoreNodesToTraverse)
                {
                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                    Ncv32f curNodeVal = 0.0f;
                    //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce
                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                    {
                        Ncv32f rectWeight;
                        Ncv32u rectX, rectY, rectWidth, rectHeight;
                        getFeature<tbCacheTextureCascade>
                            (iFeature + iRect, d_Features,
                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                        Ncv32u iioffsTR = iioffsTL + rectWidth;
                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                        Ncv32u iioffsBR = iioffsBL + rectWidth;

                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                        curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                    }

                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                    Ncv32f nodeThreshold = curNode.getThreshold();

                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                    NcvBool nextNodeIsLeaf;

                    if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                    {
                        nextNodeDescriptor = nodeLeft;
                        nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                    }
                    else
                    {
                        nextNodeDescriptor = nodeRight;
                        nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                    }

                    if (nextNodeIsLeaf)
                    {
                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                        curStageSum += tmpLeafValue;
                        bMoreNodesToTraverse = false;
                    }
                    else
                    {
                        iNode = nextNodeDescriptor.getNextNodeOffset();
                    }
                }
            }
            __syncthreads();

            curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
        }

        Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues<Ncv32f>, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);

        if (finalStageSum < stageThreshold)
        {
            bPass = false;
            outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            break;
        }
    }

    if (!tbDoAtomicCompaction)
    {
        if (!bPass || d_inMask != d_outMask)
        {
            if (!threadIdx.x)
            {
                d_outMask[maskOffset] = outMaskVal;
            }
        }
    }
    else
    {
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
        if (bPass && !threadIdx.x)
        {
            Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);
            d_outMask[outMaskOffset] = outMaskVal;
        }
#endif
    }
}


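/** \internal
* Builds the initial anchor vector: every step-th anchor inside the ROI is
* packed as (y << 16) | x, the rest are marked invalid (or filtered out by
* the block-level compaction).
*/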
template <NcvBool tbMaskByInmask,
          NcvBool tbDoAtomicCompaction>
__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    Ncv32u y_offs = blockIdx.y;
    Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
    Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;

    Ncv32u y_offs_upsc = step * y_offs;
    Ncv32u x_offs_upsc = step * x_offs;
    Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;

    Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;

    if (x_offs_upsc < anchorsRoi.width &&
        (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))
    {
        outElem = (y_offs_upsc << 16) | x_offs_upsc;
    }

    if (!tbDoAtomicCompaction)
    {
        d_outMask[outMaskOffset] = outElem;
    }
    else
    {
        compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,
                                           outElem,
                                           d_outMask);
    }
}


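//The functors below capture kernel launch configurations and arguments so that
//NCVRuntimeTemplateBool::KernelCaller (built on Loki typelists) can translate
//run-time boolean flags into the compile-time template arguments of the kernels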
struct applyHaarClassifierAnchorParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                             Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                             Ncv32f *_d_weights, Ncv32u _weightsStride,
                                             HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                             Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                             Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                             NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                             Ncv32u _endStageExc, Ncv32f _scaleArea) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_IImg(_d_IImg),
    IImgStride(_IImgStride),
    d_weights(_d_weights),
    weightsStride(_weightsStride),
    d_Features(_d_Features),
    d_ClassifierNodes(_d_ClassifierNodes),
    d_Stages(_d_Stages),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    startStageInc(_startStageInc),
    endStageExc(_endStageExc),
    scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        applyHaarClassifierAnchorParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value,
            Loki::TL::TypeAt<TList, 3>::Result::value,
            Loki::TL::TypeAt<TList, 4>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
            d_weights, weightsStride,
            d_Features, d_ClassifierNodes, d_Stages,
            d_inMask, d_outMask,
            mask1Dlen, mask2Dstride,
            anchorsRoi, startStageInc,
            endStageExc, scaleArea);
    }
};


void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
                                                  NcvBool tbCacheTextureIImg,
                                                  NcvBool tbCacheTextureCascade,
                                                  NcvBool tbReadPixelIndexFromVector,
                                                  NcvBool tbDoAtomicCompaction,

                                                  dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                  Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                  Ncv32u endStageExc, Ncv32f scaleArea)
{

    applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
                                                     d_IImg, IImgStride,
                                                     d_weights, weightsStride,
                                                     d_Features, d_ClassifierNodes, d_Stages,
                                                     d_inMask, d_outMask,
                                                     mask1Dlen, mask2Dstride,
                                                     anchorsRoi, startStageInc,
                                                     endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
        ::call( &functor,
                tbInitMaskPositively,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbReadPixelIndexFromVector,
                tbDoAtomicCompaction);
}


struct applyHaarClassifierClassifierParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                                 Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                                 Ncv32f *_d_weights, Ncv32u _weightsStride,
                                                 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                                 Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                                 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                                 NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                                 Ncv32u _endStageExc, Ncv32f _scaleArea) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_IImg(_d_IImg),
    IImgStride(_IImgStride),
    d_weights(_d_weights),
    weightsStride(_weightsStride),
    d_Features(_d_Features),
    d_ClassifierNodes(_d_ClassifierNodes),
    d_Stages(_d_Stages),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    startStageInc(_startStageInc),
    endStageExc(_endStageExc),
    scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        applyHaarClassifierClassifierParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
            d_weights, weightsStride,
            d_Features, d_ClassifierNodes, d_Stages,
            d_inMask, d_outMask,
            mask1Dlen, mask2Dstride,
            anchorsRoi, startStageInc,
            endStageExc, scaleArea);
    }
};


void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
                                                      NcvBool tbCacheTextureCascade,
                                                      NcvBool tbDoAtomicCompaction,

                                                      dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                      Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                      Ncv32u endStageExc, Ncv32f scaleArea)
{
    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
                                                         d_IImg, IImgStride,
                                                         d_weights, weightsStride,
                                                         d_Features, d_ClassifierNodes, d_Stages,
                                                         d_inMask, d_outMask,
                                                         mask1Dlen, mask2Dstride,
                                                         anchorsRoi, startStageInc,
                                                         endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
        ::call( &functor,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbDoAtomicCompaction);
}


struct initializeMaskVectorFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u step;

    //Arguments are passed through the constructor
    initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                NcvSize32u _anchorsRoi, Ncv32u _step) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    step(_step)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        initializeMaskVector <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, step);
    }
};


void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
                                     NcvBool tbDoAtomicCompaction,

                                     dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                     Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    initializeMaskVectorFunctor functor(gridConf, blockConf, cuStream,
                                        d_inMask, d_outMask,
                                        mask1Dlen, mask2Dstride,
                                        anchorsRoi, step);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
        ::call( &functor,
                tbMaskByInmask,
                tbDoAtomicCompaction);
}


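//Returns the index of the first stage containing at least N classifiers: used
//to pick the stage at which processing switches from the anchor-parallel to
//the classifier-parallel kernel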
Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,
                                              NCVVector<HaarStage64> &h_HaarStages)
{
    Ncv32u i = 0;
    for (; i<haar.NumStages; i++)
    {
        if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)
        {
            break;
        }
    }
    return i;
}


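/** \internal
* Host-side cascade driver: validates inputs, optionally binds textures over
* the integral image and the cascade, seeds the anchor vector, then applies
* groups of early stages with the anchor-parallel kernel (compacting the
* surviving anchors in between) up to stageMiddleSwitch, where the remaining
* stages run classifier-parallel.
*/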
NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
                                               NCVMatrix<Ncv32f> &d_weights,
                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
                                               Ncv32u &numDetections,
                                               HaarClassifierCascadeDescriptor &haar,
                                               NCVVector<HaarStage64> &h_HaarStages,
                                               NCVVector<HaarStage64> &d_HaarStages,
                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                               NCVVector<HaarFeature64> &d_HaarFeatures,
                                               NcvBool bMaskElements,
                                               NcvSize32u anchorsRoi,
                                               Ncv32u pixelStep,
                                               Ncv32f scaleArea,
                                               INCVMemAllocator &gpuAllocator,
                                               INCVMemAllocator &cpuAllocator,
                                               cudaDeviceProp &devProp,
                                               cudaStream_t cuStream)
{
    ncvAssertReturn(integral.memType() == d_weights.memType() &&
                    integral.memType() == d_pixelMask.memType() &&
                    integral.memType() == gpuAllocator.memType() &&
                   (integral.memType() == NCVMemoryTypeDevice ||
                    integral.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((integral.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);

    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&
                    d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&
                    integral.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    integral.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

#if defined _SELF_TEST_

    NCVStatus ncvStat;

    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, integral.width(), integral.height(), integral.pitch());
    ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width(), d_weights.height(), d_weights.pitch());
    ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());
    ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length());
    ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length());
    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());
    ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = integral.copySolid(h_integralImage, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_weights.copySolid(h_weights, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);

    for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<d_pixelMask.stride(); j++)
        {
            if ((i % pixelStep == 0) && (j % pixelStep == 0) && (j < (Ncv32u)anchorsRoi.width))
            {
                if (!bMaskElements || h_pixelMask.ptr()[i * d_pixelMask.stride() + j] != OBJDET_MASK_ELEMENT_INVALID_32U)
                {
                    h_pixelMask.ptr()[i * d_pixelMask.stride() + j] = (i << 16) | j;
                }
            }
            else
            {
                h_pixelMask.ptr()[i * d_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    NCV_SKIP_COND_END

#endif

    NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());
    ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);

    NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, static_cast<Ncv32u>(d_vecPixelMask.length()));
    ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);
    ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    Ncv32u *hp_zero = &hp_pool32u.ptr()[0];
    Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];

    NCV_SKIP_COND_BEGIN
    *hp_zero = 0;
    *hp_numDet = 0;
    NCV_SKIP_COND_END

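    //Area (in pixels) of the stddev-normalization window at the current scale;
    //in the kernels, node thresholds are compared against feature sums scaled
    //by this area times the per-anchor standard deviation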
    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));

    NcvBool bTexCacheCascade = devProp.major < 2;
    NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);

    NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
    NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;

    Ncv32u szNppCompactTmpBuf;
    nppsStCompactGetSize_32u(static_cast<Ncv32u>(d_vecPixelMask.length()), &szNppCompactTmpBuf, devProp);
    if (bDoAtomicCompaction)
    {
        szNppCompactTmpBuf = 0;
    }
    NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);

    NCV_SKIP_COND_BEGIN

    if (bTexCacheIImg)
    {
        cudaChannelFormatDesc cfdTexIImage;
        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage,
            (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    if (bTexCacheCascade)
    {
        cudaChannelFormatDesc cfdTexHaarFeatures;
        cudaChannelFormatDesc cfdTexHaarClassifierNodes;
        cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
        cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
            d_HaarFeatures.ptr(), cfdTexHaarFeatures, sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
            d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    Ncv32u stageStartAnchorParallel = 0;
    Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,
        haar, h_HaarStages);
    Ncv32u stageEndClassifierParallel = haar.NumStages;
    if (stageMiddleSwitch == 0)
    {
        stageMiddleSwitch = 1;
    }

    //create stages subdivision for pixel-parallel processing
    const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;
    Ncv32u curStop = stageStartAnchorParallel;
    std::vector<Ncv32u> pixParallelStageStops;
    while (curStop < stageMiddleSwitch)
    {
        pixParallelStageStops.push_back(curStop);
        curStop += compactEveryNstage;
    }
    if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)
    {
        pixParallelStageStops[pixParallelStageStops.size()-1] =
            (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2;
    }
    pixParallelStageStops.push_back(stageMiddleSwitch);
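    //e.g. with compactEveryNstage == 7 and stageMiddleSwitch == 10 this yields
    //stops {0, 5, 10}: the last boundary is rebalanced so that the final group
    //of pixel-parallel stages is not disproportionately small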
1155     Ncv32u pixParallelStageStopsIndex = 0;
1156 
1157     if (pixelStep != 1 || bMaskElements)
1158     {
1159         if (bDoAtomicCompaction)
1160         {
1161             ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
1162                                                         0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
1163             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1164         }
1165 
1166         dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
1167                         (anchorsRoi.height + pixelStep - 1) / pixelStep);
1168         dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);
1169 
1170         if (gridInit.x == 0 || gridInit.y == 0)
1171         {
1172             numDetections = 0;
1173             return NCV_SUCCESS;
1174         }
1175 
1176         initializeMaskVectorDynTemplate(bMaskElements,
1177                                         bDoAtomicCompaction,
1178                                         gridInit, blockInit, cuStream,
1179                                         d_ptrNowData->ptr(),
1180                                         d_ptrNowTmp->ptr(),
1181                                         static_cast<Ncv32u>(d_vecPixelMask.length()), d_pixelMask.stride(),
1182                                         anchorsRoi, pixelStep);
1183         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
1184 
1185         if (bDoAtomicCompaction)
1186         {
1187             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1188             ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
1189                                                           0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
1190             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1191             swap(d_ptrNowData, d_ptrNowTmp);
1192         }
1193         else
1194         {
1195             NCVStatus nppSt;
1196             nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
1197                                       d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
1198                                       d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
1199             ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
1200         }
1201         numDetections = *hp_numDet;
1202     }
1203     else
1204     {
        //
        // 1. Run the first pixel-parallel classifier kernel for a few stages
        //

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                   anchorsRoi.height);
        dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
        applyHaarClassifierAnchorParallelDynTemplate(
            true,                         //tbInitMaskPositively
            bTexCacheIImg,                //tbCacheTextureIImg
            bTexCacheCascade,             //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
            bDoAtomicCompaction,          //tbDoAtomicCompaction
            grid1,
            block1,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            0,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;

        pixParallelStageStopsIndex++;
    }

    //
    // 2. Run pixel-parallel stages
    //

    for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)
    {
        if (numDetections == 0)
        {
            break;
        }

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);
        if (numDetections > MAX_GRID_DIM)
        {
            grid2.x = MAX_GRID_DIM;
            grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
        dim3 block2(NUM_THREADS_ANCHORSPARALLEL);

        applyHaarClassifierAnchorParallelDynTemplate(
            false,                        //tbInitMaskPositively
            bTexCacheIImg,                //tbCacheTextureIImg
            bTexCacheCascade,             //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
            bDoAtomicCompaction,          //tbDoAtomicCompaction
            grid2,
            block2,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            numDetections,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;
    }

    //
    // 3. Run all remaining stages in one classifier-parallel kernel
    //

    if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)
    {
        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid3(numDetections);
        if (numDetections > MAX_GRID_DIM)
        {
            grid3.x = MAX_GRID_DIM;
            grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
        dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);

        applyHaarClassifierClassifierParallelDynTemplate(
            bTexCacheIImg,                //tbCacheTextureIImg
            bTexCacheCascade,             //tbCacheTextureCascade
            bDoAtomicCompaction,          //tbDoAtomicCompaction
            grid3,
            block3,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            numDetections,
            d_pixelMask.stride(),
            anchorsRoi,
            stageMiddleSwitch,
            stageEndClassifierParallel,
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;
    }

    if (d_ptrNowData != &d_vecPixelMask)
    {
        d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
    }

#if defined _SELF_TEST_

    ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

    if (bDoAtomicCompaction)
    {
        std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + numDetections);
    }

    Ncv32u fpu_oldcw, fpu_cw;
    _controlfp_s(&fpu_cw, 0, 0);
    fpu_oldcw = fpu_cw;
    _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);
    Ncv32u numDetGold;
    ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,
                                                 h_HaarStages, h_HaarNodes, h_HaarFeatures,
                                                 bMaskElements, anchorsRoi, pixelStep, scaleArea);
    ncvAssertReturnNcvStat(ncvStat);
    _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);

    bool bPass = true;

    if (numDetGold != numDetections)
    {
        printf("NCVHaarClassifierCascade::applyHaarClassifierCascade: number of detections doesn't match: cpu=%d, gpu=%d\n", numDetGold, numDetections);
        bPass = false;
    }
    else
    {
        for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)
        {
            if (h_pixelMask.ptr()[i] != h_pixelMask_d.ptr()[i])
            {
                printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%d, cpu=%d, gpu=%d\n", i, h_pixelMask.ptr()[i], h_pixelMask_d.ptr()[i]);
                bPass = false;
            }
        }
    }

    printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED");
#endif

    NCV_SKIP_COND_END

    return NCV_SUCCESS;
}


//==============================================================================
//
// HypothesesOperations file
//
//==============================================================================


const Ncv32u NUM_GROW_THREADS = 128;

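//
// Pixel-mask entries produced by the cascade encode an anchor position as
// ((y << 16) | x) -- see ncvApplyHaarClassifierCascade_host() below. The helper
// pixelToRect() unpacks such a value and scales it back to source-image
// coordinates. Illustrative example: pixel 0x000A0014 (y = 10, x = 20) with a
// 20x20 classifier window and scale 2.0f yields the rect {40, 20, 40, 40}.
//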

__device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)
{
    NcvRect32u res;
    res.x = (Ncv32u)(scale * (pixel & 0xFFFF));
    res.y = (Ncv32u)(scale * (pixel >> 16));
    res.width = (Ncv32u)(scale * width);
    res.height = (Ncv32u)(scale * height);
    return res;
}


__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,
                                     NcvRect32u *hypotheses,
                                     Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)
{
    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
    Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;
    if (elemAddr >= numElements)
    {
        return;
    }
    hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);
}
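
//
// Note on launch geometry: devices of compute capability below 3.0 limit
// gridDim.x to 65535 blocks, so ncvGrowDetectionsVector_device() below folds
// oversized launches into a 2-D grid, and the kernel linearizes the block
// index back with blockIdx.y * 65535 + blockIdx.x.
//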


NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                         Ncv32u numPixelMaskDetections,
                                         NCVVector<NcvRect32u> &hypotheses,
                                         Ncv32u &totalDetections,
                                         Ncv32u totalMaxDetections,
                                         Ncv32u rectWidth,
                                         Ncv32u rectHeight,
                                         Ncv32f curScale,
                                         cudaStream_t cuStream)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);

    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);

    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    dim3 block(NUM_GROW_THREADS);
    dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }
    growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,
                                                       hypotheses.ptr() + totalDetections,
                                                       rectWidth, rectHeight, curScale);
    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

    totalDetections += numDetsToCopy;
    return ncvStat;
}


//==============================================================================
//
// Pipeline file
//
//==============================================================================

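//
// ncvDetectObjectsMultiScale_device() implements the full detection pipeline:
// compute the integral and squared-integral images once, then for every feasible
// scale decimate both, compute the per-window standard deviation, run the
// classifier cascade, and grow the surviving pixel-mask entries into rectangles.
// Hypotheses are then grouped on the host with ncvGroupRectangles_host().
//
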
NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                                            NcvSize32u srcRoi,
                                            NCVVector<NcvRect32u> &d_dstRects,
                                            Ncv32u &dstNumRects,

                                            HaarClassifierCascadeDescriptor &haar,
                                            NCVVector<HaarStage64> &h_HaarStages,
                                            NCVVector<HaarStage64> &d_HaarStages,
                                            NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                            NCVVector<HaarFeature64> &d_HaarFeatures,

                                            NcvSize32u minObjSize,
                                            Ncv32u minNeighbors,      //default 4
                                            Ncv32f scaleStep,         //default 1.2f
                                            Ncv32u pixelStep,         //default 1
                                            Ncv32u flags,             //default NCVPipeObjDet_Default

                                            INCVMemAllocator &gpuAllocator,
                                            INCVMemAllocator &cpuAllocator,
                                            cudaDeviceProp &devProp,
                                            cudaStream_t cuStream)
{
    ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
                    d_srcImg.memType() == gpuAllocator.memType() &&
                     (d_srcImg.memType() == NCVMemoryTypeDevice ||
                      d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&
                    d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&
                    srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&
                    d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    //TODO: set NPP active stream to cuStream

    NCVStatus ncvStat;
    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

    Ncv32u integralWidth = d_srcImg.width() + 1;
    Ncv32u integralHeight = d_srcImg.height() + 1;

    NCVMatrixAlloc<Ncv32u> integral(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(integral.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVStatus nppStat;
    Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
    nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
    ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                       integral.ptr(), integral.pitch(),
                                       NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                       d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                          d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
                                          NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    NCV_SKIP_COND_END

    dstNumRects = 0;

    Ncv32u lastCheckedScale = 0;
    NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);
    std::vector<Ncv32u> scalesVector;

    NcvBool bFoundLargestFace = false;

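    //
    // First pass: enumerate the integer decimation factors produced by the
    // geometric progression scaleIter *= scaleStep (duplicates are skipped via
    // lastCheckedScale), stopping once the search ROI for the classifier window
    // becomes empty. In allocator-counting mode a single scale is enough.
    //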
    for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)
    {
        Ncv32u scale = (Ncv32u)scaleIter;
        if (lastCheckedScale == scale)
        {
            continue;
        }
        lastCheckedScale = scale;

        if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||
            haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)
        {
            continue;
        }

        NcvSize32s srcRoi_, srcIIRo_i, scaledIIRoi, searchRoi;

        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();

        srcIIRo_i.width = srcRoi_.width + 1;
        srcIIRo_i.height = srcRoi_.height + 1;

        scaledIIRoi.width = srcIIRo_i.width / scale;
        scaledIIRoi.height = srcIIRo_i.height / scale;

        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        if (searchRoi.width <= 0 || searchRoi.height <= 0)
        {
            break;
        }

        scalesVector.push_back(scale);

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

    if (bReverseTraverseScale)
    {
        std::reverse(scalesVector.begin(), scalesVector.end());
    }

    //TODO: handle _fair_scale_ flag
    for (Ncv32u i=0; i<scalesVector.size(); i++)
    {
        Ncv32u scale = scalesVector[i];

        NcvSize32u srcRoi_, scaledIIRoi, searchRoi;
        NcvSize32u srcIIRoi;
        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();
        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;
        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;
        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

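        //
        // Per-scale processing: decimate both integral images by `scale`,
        // compute the standard deviation of each candidate window (normalized
        // by scale*scale), then run the cascade over the scaled integral image.
        //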
        NCV_SKIP_COND_BEGIN

        nppStat = nppiStDecimate_32u_C1R(
            integral.ptr(), integral.pitch(),
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        nppStat = nppiStDecimate_64u_C1R(
            d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        const NcvRect32u rect(
            HAAR_STDDEV_BORDER,
            HAAR_STDDEV_BORDER,
            haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
            haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);
        nppStat = nppiStRectStdDev_32f_C1R(
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            d_rectStdDev.ptr(), d_rectStdDev.pitch(),
            NcvSize32u(searchRoi.width, searchRoi.height), rect,
            (Ncv32f)scale*scale, true);
        ncvAssertReturnNcvStat(nppStat);

        NCV_SKIP_COND_END

        Ncv32u detectionsOnThisScale;
        ncvStat = ncvApplyHaarClassifierCascade_device(
            d_scaledIntegralImage, d_rectStdDev, d_pixelMask,
            detectionsOnThisScale,
            haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
            searchRoi, pixelStep, (Ncv32f)scale*scale,
            gpuAllocator, cpuAllocator, devProp, cuStream);
        ncvAssertReturnNcvStat(ncvStat);

        NCV_SKIP_COND_BEGIN

        NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());
        ncvStat = ncvGrowDetectionsVector_device(
            d_vecPixelMask,
            detectionsOnThisScale,
            d_hypothesesIntermediate,
            dstNumRects,
            static_cast<Ncv32u>(d_hypothesesIntermediate.length()),
            haar.ClassifierSize.width,
            haar.ClassifierSize.height,
            (Ncv32f)scale,
            cuStream);
        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);

        if (flags & NCVPipeObjDet_FindLargestObject)
        {
            if (dstNumRects == 0)
            {
                continue;
            }

            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

            Ncv32u numStrongHypothesesNow = dstNumRects;
            ncvStat = ncvGroupRectangles_host(
                h_hypothesesIntermediate,
                numStrongHypothesesNow,
                minNeighbors,
                RECT_SIMILARITY_PROPORTION,
                NULL);
            ncvAssertReturnNcvStat(ncvStat);

            if (numStrongHypothesesNow > 0)
            {
                NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];
                for (Ncv32u j=1; j<numStrongHypothesesNow; j++)
                {
                    if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)
                    {
                        maxRect = h_hypothesesIntermediate.ptr()[j];
                    }
                }

                h_hypothesesIntermediate.ptr()[0] = maxRect;
                dstNumRects = 1;

                ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));
                ncvAssertReturnNcvStat(ncvStat);

                bFoundLargestFace = true;

                break;
            }
        }

        NCV_SKIP_COND_END

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

    NCVStatus ncvRetCode = NCV_SUCCESS;

    NCV_SKIP_COND_BEGIN

    if (flags & NCVPipeObjDet_FindLargestObject)
    {
        if (!bFoundLargestFace)
        {
            dstNumRects = 0;
        }
    }
    else
    {
        //TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left)
        if (dstNumRects != 0)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        ncvStat = ncvGroupRectangles_host(
            h_hypothesesIntermediate,
            dstNumRects,
            minNeighbors,
            RECT_SIMILARITY_PROPORTION,
            NULL);
        ncvAssertReturnNcvStat(ncvStat);

        if (dstNumRects > d_dstRects.length())
        {
            ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
            dstNumRects = static_cast<Ncv32u>(d_dstRects.length());
        }

        if (dstNumRects != 0)
        {
            ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
        }
    }

    if (flags & NCVPipeObjDet_VisualizeInPlace)
    {
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),
                               d_srcImg.width(), d_srcImg.height(),
                               d_dstRects.ptr(), dstNumRects, 255, cuStream);
    }

    NCV_SKIP_COND_END

    return ncvRetCode;
}


//==============================================================================
//
// Purely Host code: classifier IO, mock-ups
//
//==============================================================================


#ifdef _SELF_TEST_
#include <float.h>
#endif


NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                             NCVMatrix<Ncv32f> &h_weights,
                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,
                                             Ncv32u &numDetections,
                                             HaarClassifierCascadeDescriptor &haar,
                                             NCVVector<HaarStage64> &h_HaarStages,
                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                             NCVVector<HaarFeature64> &h_HaarFeatures,
                                             NcvBool bMaskElements,
                                             NcvSize32u anchorsRoi,
                                             Ncv32u pixelStep,
                                             Ncv32f scaleArea)
{
    ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&
                    h_integralImage.memType() == h_pixelMask.memType() &&
                     (h_integralImage.memType() == NCVMemoryTypeHostPageable ||
                      h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&
                    h_HaarStages.memType() == h_HaarFeatures.memType() &&
                     (h_HaarStages.memType() == NCVMemoryTypeHostPageable ||
                      h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&
                    h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&
                    h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&
                    h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&
                    h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    h_HaarFeatures.length() >= haar.NumFeatures &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));
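
    // Illustrative example (assuming HAAR_STDDEV_BORDER is 1, as defined earlier
    // in this file): for the common 20x20 classifier window the normalization
    // area is scaleArea * 18 * 18.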

    for (Ncv32u i=0; i<anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<h_pixelMask.stride(); j++)
        {
            if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)
            {
                h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
            else
            {
                for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)
                {
                    Ncv32f curStageSum = 0.0f;
                    Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();
                    Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();

                    if (iStage == 0)
                    {
                        if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                        {
                            break;
                        }
                        else
                        {
                            h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);
                        }
                    }
                    else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                    {
                        break;
                    }

                    while (numRootNodesInStage--)
                    {
                        NcvBool bMoreNodesToTraverse = true;
                        Ncv32u curNodeOffset = curRootNodeOffset;

                        while (bMoreNodesToTraverse)
                        {
                            HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];
                            HaarFeatureDescriptor32 curFeatDesc = curNode.getFeatureDesc();
                            Ncv32u curNodeFeaturesNum = curFeatDesc.getNumFeatures();
                            Ncv32u curNodeFeaturesOffs = curFeatDesc.getFeaturesOffset();

                            Ncv32f curNodeVal = 0.f;
                            for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                            {
                                HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];
                                Ncv32u rectX, rectY, rectWidth, rectHeight;
                                feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);
                                Ncv32f rectWeight = feature.getWeight();
                                Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);
                                Ncv32u iioffsTR = iioffsTL + rectWidth;
                                Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();
                                Ncv32u iioffsBR = iioffsBL + rectWidth;

                                Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];
                                Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];
                                Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];
                                Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];
                                Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;
                                curNodeVal += (Ncv32f)rectSum * rectWeight;
                            }

                            HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                            HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                            Ncv32f nodeThreshold = curNode.getThreshold();

                            HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                            NcvBool nextNodeIsLeaf;

                            if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)
                            {
                                nextNodeDescriptor = nodeLeft;
                                nextNodeIsLeaf = curFeatDesc.isLeftNodeLeaf();
                            }
                            else
                            {
                                nextNodeDescriptor = nodeRight;
                                nextNodeIsLeaf = curFeatDesc.isRightNodeLeaf();
                            }

                            if (nextNodeIsLeaf)
                            {
                                Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();
                                curStageSum += tmpLeafValue;
                                bMoreNodesToTraverse = false;
                            }
                            else
                            {
                                curNodeOffset = nextNodeDescriptor.getNextNodeOffset();
                            }
                        }

                        curRootNodeOffset++;
                    }

                    Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();
                    if (curStageSum < tmpStageThreshold)
                    {
                        //drop
                        h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
                        break;
                    }
                }
            }
        }
    }

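    // Surviving anchors hold ((y << 16) | x); rejected ones hold
    // OBJDET_MASK_ELEMENT_INVALID_32U, which (presumably being the all-ones
    // value) compares greater than any packed position, so a full sort moves
    // all detections into a contiguous prefix whose length is counted below.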
    std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());
    Ncv32u i = 0;
    for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)
    {
        if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)
        {
            break;
        }
    }
    numDetections = i;

    return NCV_SUCCESS;
}


NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                       Ncv32u numPixelMaskDetections,
                                       NCVVector<NcvRect32u> &hypotheses,
                                       Ncv32u &totalDetections,
                                       Ncv32u totalMaxDetections,
                                       Ncv32u rectWidth,
                                       Ncv32u rectHeight,
                                       Ncv32f curScale)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);
    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    for (Ncv32u i=0; i<numDetsToCopy; i++)
    {
        hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);
    }

    totalDetections += numDetsToCopy;
    return ncvStat;
}

static NCVStatus loadFromXML(const cv::String &filename,
                             HaarClassifierCascadeDescriptor &haar,
                             std::vector<HaarStage64> &haarStages,
                             std::vector<HaarClassifierNode128> &haarClassifierNodes,
                             std::vector<HaarFeature64> &haarFeatures)
{
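    //
    // Parses an old-format (CvHaarClassifierCascade) XML cascade via cvLoad and
    // flattens it: stages into haarStages, decision nodes into
    // haarClassifierNodes (root nodes first, non-root nodes appended afterwards
    // with their offsets fixed up), and per-node rectangles into haarFeatures.
    //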
#ifndef HAVE_OPENCV_OBJDETECT
    (void) filename;
    (void) haar;
    (void) haarStages;
    (void) haarClassifierNodes;
    (void) haarFeatures;
    CV_Error(cv::Error::StsNotImplemented, "This functionality requires objdetect module");
    return NCV_HAAR_XML_LOADING_EXCEPTION;
#else
    NCVStatus ncvStat;

    haar.NumStages = 0;
    haar.NumClassifierRootNodes = 0;
    haar.NumClassifierTotalNodes = 0;
    haar.NumFeatures = 0;
    haar.ClassifierSize.width = 0;
    haar.ClassifierSize.height = 0;
    haar.bHasStumpsOnly = true;
    haar.bNeedsTiltedII = false;
    Ncv32u curMaxTreeDepth = 0;

    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
    haarStages.resize(0);
    haarClassifierNodes.resize(0);
    haarFeatures.resize(0);

    cv::Ptr<CvHaarClassifierCascade> oldCascade((CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0));
    if (!oldCascade)
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    haar.ClassifierSize.width = oldCascade->orig_window_size.width;
    haar.ClassifierSize.height = oldCascade->orig_window_size.height;

    int stagesCount = oldCascade->count;
    for(int s = 0; s < stagesCount; ++s) // by stages
    {
        HaarStage64 curStage;
        curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));

        curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);

        int treesCount = oldCascade->stage_classifier[s].count;
        for(int t = 0; t < treesCount; ++t) // by trees
        {
            Ncv32u nodeId = 0;
            CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];

            int nodesCount = tree->count;
            for(int n = 0; n < nodesCount; ++n)  //by features
            {
                CvHaarFeature* feature = &tree->haar_feature[n];

                HaarClassifierNode128 curNode;
                curNode.setThreshold(tree->threshold[n]);

                NcvBool bIsLeftNodeLeaf = false;
                NcvBool bIsRightNodeLeaf = false;

                HaarClassifierNodeDescriptor32 nodeLeft;
                if ( tree->left[n] <= 0 )
                {
                    Ncv32f leftVal = tree->alpha[-tree->left[n]];
                    ncvStat = nodeLeft.create(leftVal);
                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                    bIsLeftNodeLeaf = true;
                }
                else
                {
                    Ncv32u leftNodeOffset = tree->left[n];
                    nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
                    haar.bHasStumpsOnly = false;
                }
                curNode.setLeftNodeDesc(nodeLeft);

                HaarClassifierNodeDescriptor32 nodeRight;
                if ( tree->right[n] <= 0 )
                {
                    Ncv32f rightVal = tree->alpha[-tree->right[n]];
                    ncvStat = nodeRight.create(rightVal);
                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                    bIsRightNodeLeaf = true;
                }
                else
                {
                    Ncv32u rightNodeOffset = tree->right[n];
                    nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
                    haar.bHasStumpsOnly = false;
                }
                curNode.setRightNodeDesc(nodeRight);

                Ncv32u tiltedVal = feature->tilted;
                haar.bNeedsTiltedII = haar.bNeedsTiltedII || (tiltedVal != 0); //sticky: any tilted feature marks the cascade

                Ncv32u featureId = 0;
                for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
                {
                    Ncv32u rectX = feature->rect[l].r.x;
                    Ncv32u rectY = feature->rect[l].r.y;
                    Ncv32u rectWidth = feature->rect[l].r.width;
                    Ncv32u rectHeight = feature->rect[l].r.height;

                    Ncv32f rectWeight = feature->rect[l].weight;

                    if (rectWeight == 0/* && rectX == 0 && rectY == 0 && rectWidth == 0 && rectHeight == 0*/)
                        break;

                    HaarFeature64 curFeature;
                    ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
                    curFeature.setWeight(rectWeight);
                    ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
                    haarFeatures.push_back(curFeature);

                    featureId++;
                }

                HaarFeatureDescriptor32 tmpFeatureDesc;
                ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf,
                    featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
                curNode.setFeatureDesc(tmpFeatureDesc);

                if (!nodeId)
                {
                    //root node
                    haarClassifierNodes.push_back(curNode);
                    curMaxTreeDepth = 1;
                }
                else
                {
                    //other node
                    h_TmpClassifierNotRootNodes.push_back(curNode);
                    curMaxTreeDepth++;
                }

                nodeId++;
            }
        }

        curStage.setNumClassifierRootNodes(treesCount);
        haarStages.push_back(curStage);
    }

    //fill in cascade stats
    haar.NumStages = static_cast<Ncv32u>(haarStages.size());
    haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
    haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
    haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());

    //merge root and non-root nodes into one classifier array
    Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        haarClassifierNodes[i].setRightNodeDesc(nodeRight);
    }

    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);

        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
    }

    return NCV_SUCCESS;
#endif
}


#define NVBIN_HAAR_SIZERESERVED     16
#define NVBIN_HAAR_VERSION          0x1

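//
// NVBIN on-disk layout (as read back below): a NVBIN_HAAR_SIZERESERVED-byte
// header whose first two Ncv32u words are the file version and total file size,
// followed by NumStages, NumClassifierRootNodes, NumClassifierTotalNodes,
// NumFeatures, ClassifierSize, bNeedsTiltedII, bHasStumpsOnly, and then the
// packed HaarStage64, HaarClassifierNode128 and HaarFeature64 arrays.
//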

static NCVStatus loadFromNVBIN(const cv::String &filename,
                               HaarClassifierCascadeDescriptor &haar,
                               std::vector<HaarStage64> &haarStages,
                               std::vector<HaarClassifierNode128> &haarClassifierNodes,
                               std::vector<HaarFeature64> &haarFeatures)
{
    size_t readCount;
    FILE *fp = fopen(filename.c_str(), "rb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    Ncv32u fileVersion;
    readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
    Ncv32u fsize;
    readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fseek(fp, 0, SEEK_END);
    Ncv32u fsizeActual = static_cast<Ncv32u>(ftell(fp));
    ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);

    std::vector<unsigned char> fdata;
    fdata.resize(fsize);
    Ncv32u dataOffset = 0;
    fseek(fp, 0, SEEK_SET);
    readCount = fread(&fdata[0], fsize, 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fclose(fp);

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvSize32u);
    haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);
    haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);

    haarStages.resize(haar.NumStages);
    haarClassifierNodes.resize(haar.NumClassifierTotalNodes);
    haarFeatures.resize(haar.NumFeatures);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);
    dataOffset += szStages;
    memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);
    dataOffset += szFeatures;

    return NCV_SUCCESS;
}


// Reports the stage/node/feature counts of a cascade file so the caller can
// size its buffers. For NVBIN input only the header counters are read; XML
// input is fully parsed via loadFromXML.
NCVStatus ncvHaarGetClassifierSize(const cv::String &filename, Ncv32u &numStages,
                                   Ncv32u &numNodes, Ncv32u &numFeatures)
{
    size_t readCount;
    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    if (fext == "nvbin")
    {
        FILE *fp = fopen(filename.c_str(), "rb");
        ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
        Ncv32u fileVersion;
        readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
        fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
        Ncv32u tmp;
        readCount = fread(&numStages,   sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&tmp,         sizeof(Ncv32u), 1, fp); //skip NumClassifierRootNodes
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numNodes,    sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        fclose(fp);
    }
    else if (fext == "xml")
    {
        HaarClassifierCascadeDescriptor haar;
        std::vector<HaarStage64> haarStages;
        std::vector<HaarClassifierNode128> haarNodes;
        std::vector<HaarFeature64> haarFeatures;

        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);

        numStages = haar.NumStages;
        numNodes = haar.NumClassifierTotalNodes;
        numFeatures = haar.NumFeatures;
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    return NCV_SUCCESS;
}
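
// Usage sketch (illustrative only, not compiled here): query the cascade
// dimensions before allocating storage. The cascade path is a placeholder.
//
//     Ncv32u numStages, numNodes, numFeatures;
//     NCVStatus ncvStat = ncvHaarGetClassifierSize("haarcascade_frontalface_alt.xml",
//                                                  numStages, numNodes, numFeatures);
//     ncvAssertReturnNcvStat(ncvStat);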


// Loads a cascade (NVBIN or XML) into caller-provided pinned host vectors,
// which must be large enough to hold the stages, nodes, and features.
NCVStatus ncvHaarLoadFromFile_host(const cv::String &filename,
                                   HaarClassifierCascadeDescriptor &haar,
                                   NCVVector<HaarStage64> &h_HaarStages,
                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                   NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    std::vector<HaarStage64> haarStages;
    std::vector<HaarClassifierNode128> haarNodes;
    std::vector<HaarFeature64> haarFeatures;

    if (fext == "nvbin")
    {
        ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else if (fext == "xml")
    {
        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);

    memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));
    memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));
    memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));

    return NCV_SUCCESS;
}
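
// Usage sketch (illustrative only, not compiled here): size pinned host
// vectors with ncvHaarGetClassifierSize, then load. NCVMemNativeAllocator and
// NCVVectorAlloc usage follows the NCV sample conventions; treat the exact
// alignment argument as an assumption.
//
//     Ncv32u numStages, numNodes, numFeatures;
//     ncvAssertReturnNcvStat(ncvHaarGetClassifierSize(filename, numStages, numNodes, numFeatures));
//
//     NCVMemNativeAllocator cpuAllocator(NCVMemoryTypeHostPinned, 32);
//     NCVVectorAlloc<HaarStage64> h_HaarStages(cpuAllocator, numStages);
//     NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, numNodes);
//     NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, numFeatures);
//
//     HaarClassifierCascadeDescriptor haar;
//     ncvAssertReturnNcvStat(ncvHaarLoadFromFile_host(filename, haar,
//                                                     h_HaarStages, h_HaarNodes, h_HaarFeatures));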


// Serializes a cascade held in pinned host memory into the NVBIN format
// described above: header, reserved area, descriptor fields, then the bulk
// stage/node/feature arrays.
NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename,
                                 HaarClassifierCascadeDescriptor haar,
                                 NCVVector<HaarStage64> &h_HaarStages,
                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                 NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    Ncv32u dataOffset = 0;
    std::vector<unsigned char> fdata;
    fdata.resize(szStages+szClassifiers+szFeatures+1024, 0); //slack for header and descriptor

    //header
    *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;
    dataOffset += sizeof(Ncv32u);
    *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;
    dataOffset += sizeof(NcvSize32u);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;
    dataOffset += sizeof(NcvBool);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;
    dataOffset += sizeof(NcvBool);

    memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);
    dataOffset += szStages;
    memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);
    dataOffset += szFeatures;
    Ncv32u fsize = dataOffset;

    //TODO: CRC32 here

    //update header with the final file size
    dataOffset = sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = fsize;

    FILE *fp = fopen(filename.c_str(), "wb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    fwrite(&fdata[0], fsize, 1, fp);
    fclose(fp);
    return NCV_SUCCESS;
}
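
// Usage sketch (illustrative only, not compiled here): after loading an XML
// cascade as in the sketch above, write an NVBIN cache so later runs can skip
// XML parsing. "cascade.nvbin" is a placeholder path.
//
//     ncvAssertReturnNcvStat(ncvHaarStoreNVBIN_host("cascade.nvbin", haar,
//                                                   h_HaarStages, h_HaarNodes, h_HaarFeatures));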