/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
//
// The algorithm and code are explained in the upcoming GPU Computing Gems
// chapter in detail:
//
//   Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
//   PDF URL placeholder
//   email: aobukhov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
//
////////////////////////////////////////////////////////////////////////////////
#include <algorithm>
#include <cstdio>

#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_OBJDETECT
#  include "opencv2/objdetect.hpp"
#  include "opencv2/objdetect/objdetect_c.h"
#endif

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"

#include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp"


//==============================================================================
//
// BlockScan file
//
//==============================================================================

NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive


//Almost the same as the naive scan1Inclusive, but doesn't need __syncthreads(),
//assuming size <= WARP_SIZE and size is a power of 2
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
#if __CUDA_ARCH__ >= 300
    const unsigned int laneId = cv::cuda::device::Warp::laneId();

    // scan using shuffle functions
    #pragma unroll
    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
    {
        const Ncv32u n = cv::cuda::device::shfl_up(idata, i);
        if (laneId >= i)
            idata += n;
    }

    return idata;
#else
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;
    pos += K_WARP_SIZE;
    s_Data[pos] = idata;

    s_Data[pos] += s_Data[pos - 1];
    s_Data[pos] += s_Data[pos - 2];
    s_Data[pos] += s_Data[pos - 4];
    s_Data[pos] += s_Data[pos - 8];
    s_Data[pos] += s_Data[pos - 16];

    return s_Data[pos];
#endif
}
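
//Hedged usage sketch (illustrative, not part of the original code): how
//warpScanInclusive can be driven from a kernel. Launched as
//demoWarpScanKernel<<<1, K_WARP_SIZE>>>(d_in, d_out), d_out[i] receives
//d_in[0] + ... + d_in[i]. The kernel name and buffers are hypothetical.
__global__ void demoWarpScanKernel(const Ncv32u *d_in, Ncv32u *d_out)
{
    //the pre-Kepler shared-memory fallback above indexes up to 2*K_WARP_SIZE-1
    __shared__ Ncv32u s_demoScanBuf[2 * K_WARP_SIZE];
    Ncv32u val = d_in[threadIdx.x];
    d_out[threadIdx.x] = warpScanInclusive(val, s_demoScanBuf);
}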

__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
    return warpScanInclusive(idata, s_Data) - idata;
}

template <Ncv32u tiNumScanThreads>
__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
    if (tiNumScanThreads > K_WARP_SIZE)
    {
        //Bottom-level inclusive warp scan
        Ncv32u warpResult = warpScanInclusive(idata, s_Data);

        //Save top elements of each warp for exclusive warp scan
        //sync to wait for warp scans to complete (because s_Data is being overwritten)
        __syncthreads();
        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
        {
            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
        }

        //wait for warp scans to complete
        __syncthreads();

        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
        {
            //grab top warp elements
            Ncv32u val = s_Data[threadIdx.x];
            //calculate exclusive scan and write back to shared memory
            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
        }

        //return updated warp scans with exclusive scan results
        __syncthreads();
        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
    }
    else
    {
        return warpScanInclusive(idata, s_Data);
    }
}
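
//Hedged usage sketch (hypothetical kernel): a block-wide inclusive scan with
//scan1Inclusive. The shared-memory fallback of warpScanInclusive indexes up
//to 2*N-1, so the buffer must hold at least 2*tiNumScanThreads words.
template <Ncv32u tiNumScanThreads>
__global__ void demoBlockScanKernel(const Ncv32u *d_in, Ncv32u *d_out)
{
    __shared__ Ncv32u s_demoBuf[2 * tiNumScanThreads];
    Ncv32u idx = blockIdx.x * blockDim.x + threadIdx.x;
    d_out[idx] = scan1Inclusive<tiNumScanThreads>(d_in[idx], s_demoBuf);
}
//e.g. demoBlockScanKernel<256><<<gridSize, 256>>>(d_in, d_out);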


//==============================================================================
//
// HaarClassifierCascade file
//
//==============================================================================


const Ncv32u MAX_GRID_DIM = 65535;


const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;


#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2     6
#define NUM_THREADS_CLASSIFIERPARALLEL          (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)


/** \internal
* Solid array of Haar features.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;


/** \internal
* Flattened-tree container for the Haar classifiers.
* Two parts: the first holds the root nodes; the second holds the nodes referred to by the root nodes.
* Drawback: breaks tree locality (might cause more cache misses).
* Advantage: no need to introduce an additional 32-bit field to index root node offsets.
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;

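/** \internal
* Integral image cache texture; bound by the host code below when the
* integral-image texture path (bTexCacheIImg) is enabled.
*/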
texture<Ncv32u, 1, cudaReadModeElementType> texIImage;


__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
    return d_Stages[iStage];
}


template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
    HaarClassifierNode128 tmpNode;
    if (tbCacheTextureCascade)
    {
        tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
    }
    else
    {
        tmpNode = d_ClassifierNodes[iNode];
    }
    return tmpNode;
}


template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
                           Ncv32f *weight,
                           Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{
    HaarFeature64 feature;
    if (tbCacheTextureCascade)
    {
        feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
    }
    else
    {
        feature = d_Features[iFeature];
    }
    feature.getRect(rectX, rectY, rectWidth, rectHeight);
    *weight = feature.getWeight();
}


template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
{
    if (tbCacheTextureIImg)
    {
        return tex1Dfetch(texIImage, x);
    }
    else
    {
        return d_IImg[x];
    }
}


__device__ Ncv32u d_outMaskPosition;


__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;

    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
    __syncthreads();

    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
    {
        numPassed = incScan;
        outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
    }

    if (threadPassFlag)
    {
        Ncv32u excScan = incScan - threadPassFlag;
        shmem[excScan] = threadElem;
    }

    __syncthreads();

    if (threadIdx.x < numPassed)
    {
        vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];
    }
#endif
}
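
//Hedged host-side sketch (comments only, names illustrative): the pattern the
//host code below uses around any kernel that calls
//compactBlockWriteOutAnchorParallel. The global counter is zeroed before the
//launch and read back afterwards to obtain the number of surviving elements:
//
//  Ncv32u zero = 0, numPassed;
//  cudaMemcpyToSymbol(d_outMaskPosition, &zero, sizeof(Ncv32u));
//  someCompactingKernel<<<grid, NUM_THREADS_ANCHORSPARALLEL, 0, stream>>>(...);
//  cudaMemcpyFromSymbol(&numPassed, d_outMaskPosition, sizeof(Ncv32u));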


template <NcvBool tbInitMaskPositively,
          NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbReadPixelIndexFromVector,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u y_offs;
    Ncv32u x_offs;
    Ncv32u maskOffset;
    Ncv32u outMaskVal;

    NcvBool bInactiveThread = false;

    if (tbReadPixelIndexFromVector)
    {
        maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (maskOffset >= mask1Dlen)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            outMaskVal = d_inMask[maskOffset];
            y_offs = outMaskVal >> 16;
            x_offs = outMaskVal & 0xFFFF;
        }
    }
    else
    {
        y_offs = blockIdx.y;
        x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (x_offs >= mask2Dstride)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            maskOffset = y_offs * mask2Dstride + x_offs;

            if ((x_offs >= anchorsRoi.width) ||
                (!tbInitMaskPositively &&
                 d_inMask != d_outMask &&
                 d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))
            {
                if (tbDoAtomicCompaction)
                {
                    bInactiveThread = true;
                }
                else
                {
                    d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;
                    return;
                }
            }

            outMaskVal = (y_offs << 16) | x_offs;
        }
    }

    NcvBool bPass = true;

    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
    {
        Ncv32f pixelStdDev = 0.0f;

        if (!bInactiveThread)
            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];

        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
        {
            Ncv32f curStageSum = 0.0f;

            HaarStage64 curStage = getStage(iStage, d_Stages);
            Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();
            Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();
            Ncv32f stageThreshold = curStage.getStageThreshold();

            while (numRootNodesInStage--)
            {
                NcvBool bMoreNodesToTraverse = true;
                Ncv32u iNode = curRootNodeOffset;

                if (bPass && !bInactiveThread)
                {
                    while (bMoreNodesToTraverse)
                    {
                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                        HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                        Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                        Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                        Ncv32f curNodeVal = 0.0f;

                        for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                        {
                            Ncv32f rectWeight;
                            Ncv32u rectX, rectY, rectWidth, rectHeight;
                            getFeature<tbCacheTextureCascade>
                                (iFeature + iRect, d_Features,
                                &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

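                            //Four-corner lookup of the rectangle sum in the
                            //integral image: sum = II(BR) - II(BL) - II(TR) + II(TL)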
                            Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                            Ncv32u iioffsTR = iioffsTL + rectWidth;
                            Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                            Ncv32u iioffsBR = iioffsBL + rectWidth;

                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

    #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                            curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
    #else
                            curNodeVal += (Ncv32f)rectSum * rectWeight;
    #endif
                        }

                        HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                        HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                        Ncv32f nodeThreshold = curNode.getThreshold();

                        HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                        NcvBool nextNodeIsLeaf;

                        if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                        {
                            nextNodeDescriptor = nodeLeft;
                            nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                        }
                        else
                        {
                            nextNodeDescriptor = nodeRight;
                            nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                        }

                        if (nextNodeIsLeaf)
                        {
                            Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                            curStageSum += tmpLeafValue;
                            bMoreNodesToTraverse = false;
                        }
                        else
                        {
                            iNode = nextNodeDescriptor.getNextNodeOffset();
                        }
                    }
                }

                __syncthreads();
                curRootNodeOffset++;
            }

            if (curStageSum < stageThreshold)
            {
                bPass = false;
                outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    __syncthreads();

    if (!tbDoAtomicCompaction)
    {
        if (!tbReadPixelIndexFromVector ||
            (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))
        {
            d_outMask[maskOffset] = outMaskVal;
        }
    }
    else
    {
        compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,
                                           outMaskVal,
                                           d_outMask);
    }
}


template <NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;

    if (maskOffset >= mask1Dlen)
    {
        return;
    }

    Ncv32u outMaskVal = d_inMask[maskOffset];
    Ncv32u y_offs = outMaskVal >> 16;
    Ncv32u x_offs = outMaskVal & 0xFFFF;

    Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
    NcvBool bPass = true;

    for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)
    {
        //this variable is subject to reduction
        Ncv32f curStageSum = 0.0f;

        HaarStage64 curStage = getStage(iStage, d_Stages);
        Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();
        Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;
        Ncv32f stageThreshold = curStage.getStageThreshold();

        Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;

        for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)
        {
            NcvBool bMoreNodesToTraverse = true;

            if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)
            {
                Ncv32u iNode = curRootNodeOffset;

                while (bMoreNodesToTraverse)
                {
                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                    Ncv32f curNodeVal = 0.0f;
                    //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce
                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                    {
                        Ncv32f rectWeight;
                        Ncv32u rectX, rectY, rectWidth, rectHeight;
                        getFeature<tbCacheTextureCascade>
                            (iFeature + iRect, d_Features,
                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

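                        //Four-corner lookup of the rectangle sum in the
                        //integral image: sum = II(BR) - II(BL) - II(TR) + II(TL)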
                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                        Ncv32u iioffsTR = iioffsTL + rectWidth;
                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                        Ncv32u iioffsBR = iioffsBL + rectWidth;

                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                        curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                    }

                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                    Ncv32f nodeThreshold = curNode.getThreshold();

                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                    NcvBool nextNodeIsLeaf;

                    if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                    {
                        nextNodeDescriptor = nodeLeft;
                        nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                    }
                    else
                    {
                        nextNodeDescriptor = nodeRight;
                        nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                    }

                    if (nextNodeIsLeaf)
                    {
                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                        curStageSum += tmpLeafValue;
                        bMoreNodesToTraverse = false;
                    }
                    else
                    {
                        iNode = nextNodeDescriptor.getNextNodeOffset();
                    }
                }
            }
            __syncthreads();

            curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
        }

        Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues<Ncv32f>, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);

        if (finalStageSum < stageThreshold)
        {
            bPass = false;
            outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            break;
        }
    }

    if (!tbDoAtomicCompaction)
    {
        if (!bPass || d_inMask != d_outMask)
        {
            if (!threadIdx.x)
            {
                d_outMask[maskOffset] = outMaskVal;
            }
        }
    }
    else
    {
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
        if (bPass && !threadIdx.x)
        {
            Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);
            d_outMask[outMaskOffset] = outMaskVal;
        }
#endif
    }
}


template <NcvBool tbMaskByInmask,
          NcvBool tbDoAtomicCompaction>
__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    Ncv32u y_offs = blockIdx.y;
    Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
    Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;

    Ncv32u y_offs_upsc = step * y_offs;
    Ncv32u x_offs_upsc = step * x_offs;
    Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;

    Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;

    if (x_offs_upsc < anchorsRoi.width &&
        (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))
    {
        outElem = (y_offs_upsc << 16) | x_offs_upsc;
    }

    if (!tbDoAtomicCompaction)
    {
        d_outMask[outMaskOffset] = outElem;
    }
    else
    {
        compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,
                                           outElem,
                                           d_outMask);
    }
}


struct applyHaarClassifierAnchorParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                             Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                             Ncv32f *_d_weights, Ncv32u _weightsStride,
                                             HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                             Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                             Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                             NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                             Ncv32u _endStageExc, Ncv32f _scaleArea) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_IImg(_d_IImg),
    IImgStride(_IImgStride),
    d_weights(_d_weights),
    weightsStride(_weightsStride),
    d_Features(_d_Features),
    d_ClassifierNodes(_d_ClassifierNodes),
    d_Stages(_d_Stages),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    startStageInc(_startStageInc),
    endStageExc(_endStageExc),
    scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        applyHaarClassifierAnchorParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value,
            Loki::TL::TypeAt<TList, 3>::Result::value,
            Loki::TL::TypeAt<TList, 4>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
            d_weights, weightsStride,
            d_Features, d_ClassifierNodes, d_Stages,
            d_inMask, d_outMask,
            mask1Dlen, mask2Dstride,
            anchorsRoi, startStageInc,
            endStageExc, scaleArea);
    }
};


void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
                                                  NcvBool tbCacheTextureIImg,
                                                  NcvBool tbCacheTextureCascade,
                                                  NcvBool tbReadPixelIndexFromVector,
                                                  NcvBool tbDoAtomicCompaction,

                                                  dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                  Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                  Ncv32u endStageExc, Ncv32f scaleArea)
{
    applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
                                                     d_IImg, IImgStride,
                                                     d_weights, weightsStride,
                                                     d_Features, d_ClassifierNodes, d_Stages,
                                                     d_inMask, d_outMask,
                                                     mask1Dlen, mask2Dstride,
                                                     anchorsRoi, startStageInc,
                                                     endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
        ::call( &functor,
                tbInitMaskPositively,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbReadPixelIndexFromVector,
                tbDoAtomicCompaction);
}
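
//For reference, a hedged sketch (hypothetical helpers, unused here) of what
//the KernelCaller machinery above performs at run time: nested branching that
//turns each run-time NcvBool into a compile-time template argument.
template <NcvBool b0>
static void demoDispatchInner(NcvBool b1)
{
    if (b1) { /* kernel<b0, true ><<<...>>>(...); */ }
    else    { /* kernel<b0, false><<<...>>>(...); */ }
}
static void demoDispatch(NcvBool b0, NcvBool b1)
{
    if (b0) demoDispatchInner<true >(b1);
    else    demoDispatchInner<false>(b1);
}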


struct applyHaarClassifierClassifierParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                                 Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                                 Ncv32f *_d_weights, Ncv32u _weightsStride,
                                                 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                                 Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                                 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                                 NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                                 Ncv32u _endStageExc, Ncv32f _scaleArea) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_IImg(_d_IImg),
    IImgStride(_IImgStride),
    d_weights(_d_weights),
    weightsStride(_weightsStride),
    d_Features(_d_Features),
    d_ClassifierNodes(_d_ClassifierNodes),
    d_Stages(_d_Stages),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    startStageInc(_startStageInc),
    endStageExc(_endStageExc),
    scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        applyHaarClassifierClassifierParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
            d_weights, weightsStride,
            d_Features, d_ClassifierNodes, d_Stages,
            d_inMask, d_outMask,
            mask1Dlen, mask2Dstride,
            anchorsRoi, startStageInc,
            endStageExc, scaleArea);
    }
};


void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
                                                      NcvBool tbCacheTextureCascade,
                                                      NcvBool tbDoAtomicCompaction,

                                                      dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                      Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                      Ncv32u endStageExc, Ncv32f scaleArea)
{
    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
                                                         d_IImg, IImgStride,
                                                         d_weights, weightsStride,
                                                         d_Features, d_ClassifierNodes, d_Stages,
                                                         d_inMask, d_outMask,
                                                         mask1Dlen, mask2Dstride,
                                                         anchorsRoi, startStageInc,
                                                         endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
        ::call( &functor,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbDoAtomicCompaction);
}


struct initializeMaskVectorFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u step;

    //Arguments are passed through the constructor
    initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                NcvSize32u _anchorsRoi, Ncv32u _step) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    step(_step)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        initializeMaskVector <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, step);
    }
};


void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
                                     NcvBool tbDoAtomicCompaction,

                                     dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                     Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    initializeMaskVectorFunctor functor(gridConf, blockConf, cuStream,
                                        d_inMask, d_outMask,
                                        mask1Dlen, mask2Dstride,
                                        anchorsRoi, step);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
        ::call( &functor,
                tbMaskByInmask,
                tbDoAtomicCompaction);
}

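/** \internal
* Returns the index of the first stage that contains at least N classifier
* root nodes. The caller uses it (with N == NUM_THREADS_CLASSIFIERPARALLEL)
* to find the stage at which to switch from the anchor-parallel kernel to the
* classifier-parallel one.
*/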
Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,
                                              NCVVector<HaarStage64> &h_HaarStages)
{
    Ncv32u i = 0;
    for (; i<haar.NumStages; i++)
    {
        if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)
        {
            break;
        }
    }
    return i;
}


NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
                                               NCVMatrix<Ncv32f> &d_weights,
                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
                                               Ncv32u &numDetections,
                                               HaarClassifierCascadeDescriptor &haar,
                                               NCVVector<HaarStage64> &h_HaarStages,
                                               NCVVector<HaarStage64> &d_HaarStages,
                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                               NCVVector<HaarFeature64> &d_HaarFeatures,
                                               NcvBool bMaskElements,
                                               NcvSize32u anchorsRoi,
                                               Ncv32u pixelStep,
                                               Ncv32f scaleArea,
                                               INCVMemAllocator &gpuAllocator,
                                               INCVMemAllocator &cpuAllocator,
                                               cudaDeviceProp &devProp,
                                               cudaStream_t cuStream)
{
    ncvAssertReturn(integral.memType() == d_weights.memType() &&
                    integral.memType() == d_pixelMask.memType() &&
                    integral.memType() == gpuAllocator.memType() &&
                   (integral.memType() == NCVMemoryTypeDevice ||
                    integral.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((integral.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);

    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&
                    d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&
                    integral.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    integral.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

#if defined _SELF_TEST_

    NCVStatus ncvStat;

    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, integral.width(), integral.height(), integral.pitch());
    ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width(), d_weights.height(), d_weights.pitch());
    ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());
    ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, static_cast<Ncv32u>(d_HaarNodes.length()));
    ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, static_cast<Ncv32u>(d_HaarFeatures.length()));
    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());
    ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = integral.copySolid(h_integralImage, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_weights.copySolid(h_weights, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);

    for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<d_pixelMask.stride(); j++)
        {
            if ((i % pixelStep == 0) && (j % pixelStep == 0) && (j < (Ncv32u)anchorsRoi.width))
            {
                if (!bMaskElements || h_pixelMask.ptr()[i * d_pixelMask.stride() + j] != OBJDET_MASK_ELEMENT_INVALID_32U)
                {
                    h_pixelMask.ptr()[i * d_pixelMask.stride() + j] = (i << 16) | j;
                }
            }
            else
            {
                h_pixelMask.ptr()[i * d_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    NCV_SKIP_COND_END

#endif

    NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());
    ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);

    NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, static_cast<Ncv32u>(d_vecPixelMask.length()));
    ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);
    ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    Ncv32u *hp_zero = &hp_pool32u.ptr()[0];
    Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];

    NCV_SKIP_COND_BEGIN
    *hp_zero = 0;
    *hp_numDet = 0;
    NCV_SKIP_COND_END

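    //Normalization constant: area of the classifier window interior, consumed
    //by the kernels as the scaleArea factor in the node threshold comparison
    //(curNodeVal < scaleArea * pixelStdDev * nodeThreshold).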
    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));

    NcvBool bTexCacheCascade = devProp.major < 2;
    NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);

    NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
    NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;

    Ncv32u szNppCompactTmpBuf;
    nppsStCompactGetSize_32u(static_cast<Ncv32u>(d_vecPixelMask.length()), &szNppCompactTmpBuf, devProp);
    if (bDoAtomicCompaction)
    {
        szNppCompactTmpBuf = 0;
    }
    NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);

    NCV_SKIP_COND_BEGIN

    if (bTexCacheIImg)
    {
        cudaChannelFormatDesc cfdTexIImage;
        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage,
            (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    if (bTexCacheCascade)
    {
        cudaChannelFormatDesc cfdTexHaarFeatures;
        cudaChannelFormatDesc cfdTexHaarClassifierNodes;
        cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
        cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
            d_HaarFeatures.ptr(), cfdTexHaarFeatures, sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
            d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    Ncv32u stageStartAnchorParallel = 0;
    Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,
        haar, h_HaarStages);
    Ncv32u stageEndClassifierParallel = haar.NumStages;
    if (stageMiddleSwitch == 0)
    {
        stageMiddleSwitch = 1;
    }

    //create stages subdivision for pixel-parallel processing
    const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;
    Ncv32u curStop = stageStartAnchorParallel;
    std::vector<Ncv32u> pixParallelStageStops;
    while (curStop < stageMiddleSwitch)
    {
        pixParallelStageStops.push_back(curStop);
        curStop += compactEveryNstage;
    }
    if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)
    {
        pixParallelStageStops[pixParallelStageStops.size()-1] =
            (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2;
    }
    pixParallelStageStops.push_back(stageMiddleSwitch);
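    //Worked example: with compactEveryNstage == 7 and stageMiddleSwitch == 20,
    //the loop above yields stops {0, 7, 14} and the push_back adds 20, so
    //compaction runs after every seventh stage and once more at the switch point.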
    Ncv32u pixParallelStageStopsIndex = 0;

    if (pixelStep != 1 || bMaskElements)
    {
        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                        (anchorsRoi.height + pixelStep - 1) / pixelStep);
        dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);

        if (gridInit.x == 0 || gridInit.y == 0)
        {
            numDetections = 0;
            return NCV_SUCCESS;
        }

        initializeMaskVectorDynTemplate(bMaskElements,
                                        bDoAtomicCompaction,
                                        gridInit, blockInit, cuStream,
                                        d_ptrNowData->ptr(),
                                        d_ptrNowTmp->ptr(),
                                        static_cast<Ncv32u>(d_vecPixelMask.length()), d_pixelMask.stride(),
                                        anchorsRoi, pixelStep);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            swap(d_ptrNowData, d_ptrNowTmp);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
        }
        numDetections = *hp_numDet;
    }
    else
    {
        //
        // 1. Run the first pixel-input pixel-parallel classifier for a few stages
        //

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                   anchorsRoi.height);
        dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
        applyHaarClassifierAnchorParallelDynTemplate(
            true,                         //tbInitMaskPositively
            bTexCacheIImg,                //tbCacheTextureIImg
            bTexCacheCascade,             //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0, //tbReadPixelIndexFromVector
            bDoAtomicCompaction,          //tbDoAtomicCompaction
            grid1,
            block1,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            0,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;

        pixParallelStageStopsIndex++;
    }

    //
    // 2. Run pixel-parallel stages
    //

    for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)
    {
        if (numDetections == 0)
        {
            break;
        }

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);
        if (numDetections > MAX_GRID_DIM)
        {
            grid2.x = MAX_GRID_DIM;
            grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
        dim3 block2(NUM_THREADS_ANCHORSPARALLEL);

        applyHaarClassifierAnchorParallelDynTemplate(
            false,                        //tbInitMaskPositively
            bTexCacheIImg,                //tbCacheTextureIImg
            bTexCacheCascade,             //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, //tbReadPixelIndexFromVector
            bDoAtomicCompaction,          //tbDoAtomicCompaction
            grid2,
            block2,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            numDetections,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;
    }

1331    //
1332    // 3. Run all left stages in one stage-parallel kernel
1333    //
1334
1335    if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)
1336    {
1337        if (bDoAtomicCompaction)
1338        {
1339            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
1340                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
1341            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1342        }
1343
1344        dim3 grid3(numDetections);
1345        if (numDetections > MAX_GRID_DIM)
1346        {
1347            grid3.x = MAX_GRID_DIM;
1348            grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
1349        }
1350        dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);
1351
1352        applyHaarClassifierClassifierParallelDynTemplate(
1353            bTexCacheIImg,                //tbCacheTextureIImg
1354            bTexCacheCascade,             //tbCacheTextureCascade
1355            bDoAtomicCompaction,          //tbDoAtomicCompaction
1356            grid3,
1357            block3,
1358            cuStream,
1359            integral.ptr(), integral.stride(),
1360            d_weights.ptr(), d_weights.stride(),
1361            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
1362            d_ptrNowData->ptr(),
1363            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
1364            numDetections,
1365            d_pixelMask.stride(),
1366            anchorsRoi,
1367            stageMiddleSwitch,
1368            stageEndClassifierParallel,
1369            scaleAreaPixels);
1370        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
1371
1372        if (bDoAtomicCompaction)
1373        {
1374            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1375            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
1376                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
1377            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1378        }
1379        else
1380        {
1381            NCVStatus nppSt;
1382            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
1383                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
1384                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
1385            ncvAssertReturnNcvStat(nppSt);
1386        }
1387
1388        swap(d_ptrNowData, d_ptrNowTmp);
1389        numDetections = *hp_numDet;
1390    }
1391
1392    if (d_ptrNowData != &d_vecPixelMask)
1393    {
1394        d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);
1395        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1396    }
1397
#if defined _SELF_TEST_

    ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

    if (bDoAtomicCompaction)
    {
        std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + numDetections);
    }

    Ncv32u fpu_oldcw, fpu_cw;
    _controlfp_s(&fpu_cw, 0, 0);
    fpu_oldcw = fpu_cw;
    _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);
    Ncv32u numDetGold;
    ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,
                                                 h_HaarStages, h_HaarNodes, h_HaarFeatures,
                                                 bMaskElements, anchorsRoi, pixelStep, scaleArea);
    ncvAssertReturnNcvStat(ncvStat);
    _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);

    bool bPass = true;

    if (numDetGold != numDetections)
    {
        printf("NCVHaarClassifierCascade::applyHaarClassifierCascade number of detections doesn't match: cpu=%u, gpu=%u\n", numDetGold, numDetections);
        bPass = false;
    }
    else
    {
        for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)
        {
            if (h_pixelMask.ptr()[i] != h_pixelMask_d.ptr()[i])
            {
                printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%u, cpu=%u, gpu=%u\n", i, h_pixelMask.ptr()[i], h_pixelMask_d.ptr()[i]);
                bPass = false;
            }
        }
    }

    printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED");
#endif

    NCV_SKIP_COND_END

    return NCV_SUCCESS;
}
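

//Note on the structure above: each of the three phases interleaves a detection kernel with stream
//compaction. Every pass writes OBJDET_MASK_ELEMENT_INVALID_32U into rejected mask slots, and the
//compaction step (atomic on-the-fly, or nppsStCompact_32u afterwards) squeezes the surviving
//packed anchors together while preserving their order. A CPU analogue of those compaction
//semantics, for illustration only (the GPU path does not use std::copy_if):
static Ncv32u exampleHostCompactReference(const Ncv32u *src, Ncv32u srcLen, Ncv32u *dst)
{
    Ncv32u *dstEnd = std::copy_if(src, src + srcLen, dst,
        [](Ncv32u v) { return v != OBJDET_MASK_ELEMENT_INVALID_32U; });
    return static_cast<Ncv32u>(dstEnd - dst);
}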


//==============================================================================
//
// HypothesesOperations file
//
//==============================================================================


const Ncv32u NUM_GROW_THREADS = 128;


__device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)
{
    //pixel packs the anchor coordinates as ((y << 16) | x); unpack both halves and scale everything to image space
    NcvRect32u res;
    res.x = (Ncv32u)(scale * (pixel & 0xFFFF));
    res.y = (Ncv32u)(scale * (pixel >> 16));
    res.width = (Ncv32u)(scale * width);
    res.height = (Ncv32u)(scale * height);
    return res;
}
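

//Illustrative round trip for the packed anchor format above; hypothetical helper, not part of the
//NCV API, shown only to make the ((y << 16) | x) encoding concrete:
static NcvRect32u examplePackedAnchorRoundTrip()
{
    Ncv32u x = 40, y = 25;
    Ncv32u packed = (y << 16) | x;            //encode: row in the high half-word, column in the low one
    return pixelToRect(packed, 20, 20, 2.0f); //decode at scale 2: expect x=80, y=50, width=40, height=40
}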


__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,
                                     NcvRect32u *hypotheses,
                                     Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)
{
    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
    Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;
    if (elemAddr >= numElements)
    {
        return;
    }
    hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);
}


NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                         Ncv32u numPixelMaskDetections,
                                         NCVVector<NcvRect32u> &hypotheses,
                                         Ncv32u &totalDetections,
                                         Ncv32u totalMaxDetections,
                                         Ncv32u rectWidth,
                                         Ncv32u rectHeight,
                                         Ncv32f curScale,
                                         cudaStream_t cuStream)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);

    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);

    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    dim3 block(NUM_GROW_THREADS);
    dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }
    growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,
                                                       hypotheses.ptr() + totalDetections,
                                                       rectWidth, rectHeight, curScale);
    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

    totalDetections += numDetsToCopy;
    return ncvStat;
}
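

//The launch configuration above folds an oversized 1D grid into 2D: gridDim.x is clamped to 65535
//and the overflow moves into gridDim.y, while growDetectionsKernel linearizes with
//blockId = blockIdx.y * 65535 + blockIdx.x, matching the clamp exactly. The same computation as a
//standalone helper (hypothetical, included only to make the invariant explicit):
static dim3 exampleClampedGrid1D(Ncv32u numBlocks)
{
    dim3 grid(numBlocks);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535; //ceil(numBlocks / 65535) rows
        grid.x = 65535;                    //65535 columns per row; the kernel's bounds check trims the tail
    }
    return grid;
}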


//==============================================================================
//
// Pipeline file
//
//==============================================================================


NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                                            NcvSize32u srcRoi,
                                            NCVVector<NcvRect32u> &d_dstRects,
                                            Ncv32u &dstNumRects,

                                            HaarClassifierCascadeDescriptor &haar,
                                            NCVVector<HaarStage64> &h_HaarStages,
                                            NCVVector<HaarStage64> &d_HaarStages,
                                            NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                            NCVVector<HaarFeature64> &d_HaarFeatures,

                                            NcvSize32u minObjSize,
                                            Ncv32u minNeighbors,      //default 4
                                            Ncv32f scaleStep,         //default 1.2f
                                            Ncv32u pixelStep,         //default 1
                                            Ncv32u flags,             //default NCVPipeObjDet_Default

                                            INCVMemAllocator &gpuAllocator,
                                            INCVMemAllocator &cpuAllocator,
                                            cudaDeviceProp &devProp,
                                            cudaStream_t cuStream)
{
    ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
                    d_srcImg.memType() == gpuAllocator.memType() &&
                     (d_srcImg.memType() == NCVMemoryTypeDevice ||
                      d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&
                    d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&
                    srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&
                    d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    //TODO: set NPP active stream to cuStream

    NCVStatus ncvStat;
    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

    Ncv32u integralWidth = d_srcImg.width() + 1;
    Ncv32u integralHeight = d_srcImg.height() + 1;

    NCVMatrixAlloc<Ncv32u> integral(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(integral.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVStatus nppStat;
    Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
    nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
    ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                       integral.ptr(), integral.pitch(),
                                       NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                       d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                          d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
                                          NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    NCV_SKIP_COND_END

    dstNumRects = 0;

    Ncv32u lastCheckedScale = 0;
    NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);
    std::vector<Ncv32u> scalesVector;

    NcvBool bFoundLargestFace = false;

    for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)
    {
        Ncv32u scale = (Ncv32u)scaleIter;
        if (lastCheckedScale == scale)
        {
            continue;
        }
        lastCheckedScale = scale;

        if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||
            haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)
        {
            continue;
        }

        NcvSize32s srcRoi_, srcIIRoi, scaledIIRoi, searchRoi;

        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();

        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;

        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;

        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        if (searchRoi.width <= 0 || searchRoi.height <= 0)
        {
            break;
        }

        scalesVector.push_back(scale);

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

    if (bReverseTraverseScale)
    {
        std::reverse(scalesVector.begin(), scalesVector.end());
    }

    //TODO: handle _fair_scale_ flag
    for (Ncv32u i=0; i<scalesVector.size(); i++)
    {
        Ncv32u scale = scalesVector[i];

        NcvSize32u srcRoi_, scaledIIRoi, searchRoi;
        NcvSize32u srcIIRoi;
        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();
        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;
        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;
        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        NCV_SKIP_COND_BEGIN

        nppStat = nppiStDecimate_32u_C1R(
            integral.ptr(), integral.pitch(),
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        nppStat = nppiStDecimate_64u_C1R(
            d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        const NcvRect32u rect(
            HAAR_STDDEV_BORDER,
            HAAR_STDDEV_BORDER,
            haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
            haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);
        nppStat = nppiStRectStdDev_32f_C1R(
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            d_rectStdDev.ptr(), d_rectStdDev.pitch(),
            NcvSize32u(searchRoi.width, searchRoi.height), rect,
            (Ncv32f)scale*scale, true);
        ncvAssertReturnNcvStat(nppStat);

        NCV_SKIP_COND_END

        Ncv32u detectionsOnThisScale;
        ncvStat = ncvApplyHaarClassifierCascade_device(
            d_scaledIntegralImage, d_rectStdDev, d_pixelMask,
            detectionsOnThisScale,
            haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
            searchRoi, pixelStep, (Ncv32f)scale*scale,
            gpuAllocator, cpuAllocator, devProp, cuStream);
        ncvAssertReturnNcvStat(ncvStat); //check the cascade call's status, not the stale NPP status

        NCV_SKIP_COND_BEGIN

        NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());
        ncvStat = ncvGrowDetectionsVector_device(
            d_vecPixelMask,
            detectionsOnThisScale,
            d_hypothesesIntermediate,
            dstNumRects,
            static_cast<Ncv32u>(d_hypothesesIntermediate.length()),
            haar.ClassifierSize.width,
            haar.ClassifierSize.height,
            (Ncv32f)scale,
            cuStream);
        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);

        if (flags & NCVPipeObjDet_FindLargestObject)
        {
            if (dstNumRects == 0)
            {
                continue;
            }

            //dstNumRects is known to be non-zero here
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

            Ncv32u numStrongHypothesesNow = dstNumRects;
            ncvStat = ncvGroupRectangles_host(
                h_hypothesesIntermediate,
                numStrongHypothesesNow,
                minNeighbors,
                RECT_SIMILARITY_PROPORTION,
                NULL);
            ncvAssertReturnNcvStat(ncvStat);

            if (numStrongHypothesesNow > 0)
            {
                NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];
                for (Ncv32u j=1; j<numStrongHypothesesNow; j++)
                {
                    if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)
                    {
                        maxRect = h_hypothesesIntermediate.ptr()[j];
                    }
                }

                h_hypothesesIntermediate.ptr()[0] = maxRect;
                dstNumRects = 1;

                ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));
                ncvAssertReturnNcvStat(ncvStat);

                bFoundLargestFace = true;

                break;
            }
        }

        NCV_SKIP_COND_END

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

    NCVStatus ncvRetCode = NCV_SUCCESS;

    NCV_SKIP_COND_BEGIN

    if (flags & NCVPipeObjDet_FindLargestObject)
    {
        if (!bFoundLargestFace)
        {
            dstNumRects = 0;
        }
    }
    else
    {
        //TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left)
        if (dstNumRects != 0)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        ncvStat = ncvGroupRectangles_host(
            h_hypothesesIntermediate,
            dstNumRects,
            minNeighbors,
            RECT_SIMILARITY_PROPORTION,
            NULL);
        ncvAssertReturnNcvStat(ncvStat);

        if (dstNumRects > d_dstRects.length())
        {
            ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
            dstNumRects = static_cast<Ncv32u>(d_dstRects.length());
        }

        if (dstNumRects != 0)
        {
            ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
        }
    }

    if (flags & NCVPipeObjDet_VisualizeInPlace)
    {
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),
                               d_srcImg.width(), d_srcImg.height(),
                               d_dstRects.ptr(), dstNumRects, 255, cuStream);
    }

    NCV_SKIP_COND_END

    return ncvRetCode;
}
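

//A minimal host-side sketch of driving the pipeline above. It assumes the
//NCVMemNativeAllocator(memType, alignment) constructor from NCV.hpp, a grayscale image already
//uploaded to d_srcImg, and a cascade already loaded into the host/device vectors; the fixed
//parameters (minNeighbors=4, scaleStep=1.2f, pixelStep=1) are just the documented defaults.
//Illustration of the calling convention only, not a tested sample:
static NCVStatus exampleRunDetectionPipeline(NCVMatrix<Ncv8u> &d_srcImg,
                                             HaarClassifierCascadeDescriptor &haar,
                                             NCVVector<HaarStage64> &h_HaarStages,
                                             NCVVector<HaarStage64> &d_HaarStages,
                                             NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                             NCVVector<HaarFeature64> &d_HaarFeatures,
                                             NCVVector<NcvRect32u> &d_dstRects,
                                             Ncv32u &dstNumRects,
                                             cudaDeviceProp &devProp)
{
    Ncv32u align = static_cast<Ncv32u>(devProp.textureAlignment);
    NCVMemNativeAllocator gpuAllocator(NCVMemoryTypeDevice, align);
    NCVMemNativeAllocator cpuAllocator(NCVMemoryTypeHostPinned, align);

    return ncvDetectObjectsMultiScale_device(
        d_srcImg, NcvSize32u(d_srcImg.width(), d_srcImg.height()), d_dstRects, dstNumRects,
        haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures,
        NcvSize32u(haar.ClassifierSize.width, haar.ClassifierSize.height), //no detections below the training window size
        4, 1.2f, 1, NCVPipeObjDet_Default,
        gpuAllocator, cpuAllocator, devProp, 0);
}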


//==============================================================================
//
// Purely Host code: classifier IO, mock-ups
//
//==============================================================================


#ifdef _SELF_TEST_
#include <float.h>
#endif


NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                             NCVMatrix<Ncv32f> &h_weights,
                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,
                                             Ncv32u &numDetections,
                                             HaarClassifierCascadeDescriptor &haar,
                                             NCVVector<HaarStage64> &h_HaarStages,
                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                             NCVVector<HaarFeature64> &h_HaarFeatures,
                                             NcvBool bMaskElements,
                                             NcvSize32u anchorsRoi,
                                             Ncv32u pixelStep,
                                             Ncv32f scaleArea)
{
    ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&
                    h_integralImage.memType() == h_pixelMask.memType() &&
                     (h_integralImage.memType() == NCVMemoryTypeHostPageable ||
                      h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&
                    h_HaarStages.memType() == h_HaarFeatures.memType() &&
                     (h_HaarStages.memType() == NCVMemoryTypeHostPageable ||
                      h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&
                    h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&
                    h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&
                    h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&
                    h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    h_HaarFeatures.length() >= haar.NumFeatures &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));

    for (Ncv32u i=0; i<anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<h_pixelMask.stride(); j++)
        {
            if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)
            {
                h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
            else
            {
                for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)
                {
                    Ncv32f curStageSum = 0.0f;
                    Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();
                    Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();

                    if (iStage == 0)
                    {
                        if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                        {
                            break;
                        }
                        else
                        {
                            h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);
                        }
                    }
                    else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                    {
                        break;
                    }

                    while (numRootNodesInStage--)
                    {
                        NcvBool bMoreNodesToTraverse = true;
                        Ncv32u curNodeOffset = curRootNodeOffset;

                        while (bMoreNodesToTraverse)
                        {
                            HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];
                            HaarFeatureDescriptor32 curFeatDesc = curNode.getFeatureDesc();
                            Ncv32u curNodeFeaturesNum = curFeatDesc.getNumFeatures();
                            Ncv32u curNodeFeaturesOffs = curFeatDesc.getFeaturesOffset();

                            Ncv32f curNodeVal = 0.f;
                            for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                            {
                                HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];
                                Ncv32u rectX, rectY, rectWidth, rectHeight;
                                feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);
                                Ncv32f rectWeight = feature.getWeight();
                                Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);
                                Ncv32u iioffsTR = iioffsTL + rectWidth;
                                Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();
                                Ncv32u iioffsBR = iioffsBL + rectWidth;

                                Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];
                                Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];
                                Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];
                                Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];
                                Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;
                                curNodeVal += (Ncv32f)rectSum * rectWeight;
                            }

                            HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                            HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                            Ncv32f nodeThreshold = curNode.getThreshold();

                            HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                            NcvBool nextNodeIsLeaf;

                            if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)
                            {
                                nextNodeDescriptor = nodeLeft;
                                nextNodeIsLeaf = curFeatDesc.isLeftNodeLeaf();
                            }
                            else
                            {
                                nextNodeDescriptor = nodeRight;
                                nextNodeIsLeaf = curFeatDesc.isRightNodeLeaf();
                            }

                            if (nextNodeIsLeaf)
                            {
                                Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();
                                curStageSum += tmpLeafValue;
                                bMoreNodesToTraverse = false;
                            }
                            else
                            {
                                curNodeOffset = nextNodeDescriptor.getNextNodeOffset();
                            }
                        }

                        curRootNodeOffset++;
                    }

                    Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();
                    if (curStageSum < tmpStageThreshold)
                    {
                        //stage threshold not reached: reject this anchor and stop evaluating further stages
                        h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
                        break;
                    }
                }
            }
        }
    }

    std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());
    Ncv32u i = 0;
    for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)
    {
        if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)
        {
            break;
        }
    }
    numDetections = i;

    return NCV_SUCCESS;
}
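

//The stage loop above evaluates every feature rectangle with the standard integral-image identity:
//the pixel sum over [x, x+w) x [y, y+h) is II[y+h][x+w] - II[y+h][x] - II[y][x+w] + II[y][x],
//which is exactly the iivalBR - iivalBL + iivalTL - iivalTR expression. A standalone restatement
//(hypothetical helper mirroring the iioffs* arithmetic, shown for clarity only):
static Ncv32u exampleRectSumFromII(const Ncv32u *ii, Ncv32u iiStride,
                                   Ncv32u x, Ncv32u y, Ncv32u w, Ncv32u h)
{
    Ncv32u tl = ii[ y      * iiStride + x    ];
    Ncv32u tr = ii[ y      * iiStride + x + w];
    Ncv32u bl = ii[(y + h) * iiStride + x    ];
    Ncv32u br = ii[(y + h) * iiStride + x + w];
    return br - bl + tl - tr; //any unsigned wraparound in the intermediates cancels out
}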


NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                       Ncv32u numPixelMaskDetections,
                                       NCVVector<NcvRect32u> &hypotheses,
                                       Ncv32u &totalDetections,
                                       Ncv32u totalMaxDetections,
                                       Ncv32u rectWidth,
                                       Ncv32u rectHeight,
                                       Ncv32f curScale)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);
    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    for (Ncv32u i=0; i<numDetsToCopy; i++)
    {
        hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);
    }

    totalDetections += numDetsToCopy;
    return ncvStat;
}

static NCVStatus loadFromXML(const cv::String &filename,
                      HaarClassifierCascadeDescriptor &haar,
                      std::vector<HaarStage64> &haarStages,
                      std::vector<HaarClassifierNode128> &haarClassifierNodes,
                      std::vector<HaarFeature64> &haarFeatures)
{
#ifndef HAVE_OPENCV_OBJDETECT
    (void) filename;
    (void) haar;
    (void) haarStages;
    (void) haarClassifierNodes;
    (void) haarFeatures;
    CV_Error(cv::Error::StsNotImplemented, "This functionality requires objdetect module");
    return NCV_HAAR_XML_LOADING_EXCEPTION;
#else
    NCVStatus ncvStat;

    haar.NumStages = 0;
    haar.NumClassifierRootNodes = 0;
    haar.NumClassifierTotalNodes = 0;
    haar.NumFeatures = 0;
    haar.ClassifierSize.width = 0;
    haar.ClassifierSize.height = 0;
    haar.bHasStumpsOnly = true;
    haar.bNeedsTiltedII = false;
    Ncv32u curMaxTreeDepth = 0;

    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
    haarStages.resize(0);
    haarClassifierNodes.resize(0);
    haarFeatures.resize(0);

    cv::Ptr<CvHaarClassifierCascade> oldCascade((CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0));
    if (!oldCascade)
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    haar.ClassifierSize.width = oldCascade->orig_window_size.width;
    haar.ClassifierSize.height = oldCascade->orig_window_size.height;

    int stagesCount = oldCascade->count;
    for(int s = 0; s < stagesCount; ++s) // by stages
    {
        HaarStage64 curStage;
        curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));

        curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);

        int treesCount = oldCascade->stage_classifier[s].count;
        for(int t = 0; t < treesCount; ++t) // by trees
        {
            Ncv32u nodeId = 0;
            CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];

            int nodesCount = tree->count;
            for(int n = 0; n < nodesCount; ++n)  //by features
            {
                CvHaarFeature* feature = &tree->haar_feature[n];

                HaarClassifierNode128 curNode;
                curNode.setThreshold(tree->threshold[n]);

                NcvBool bIsLeftNodeLeaf = false;
                NcvBool bIsRightNodeLeaf = false;

                HaarClassifierNodeDescriptor32 nodeLeft;
                if ( tree->left[n] <= 0 )
                {
                    Ncv32f leftVal = tree->alpha[-tree->left[n]];
                    ncvStat = nodeLeft.create(leftVal);
                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                    bIsLeftNodeLeaf = true;
                }
                else
                {
                    Ncv32u leftNodeOffset = tree->left[n];
                    nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
                    haar.bHasStumpsOnly = false;
                }
                curNode.setLeftNodeDesc(nodeLeft);

                HaarClassifierNodeDescriptor32 nodeRight;
                if ( tree->right[n] <= 0 )
                {
                    Ncv32f rightVal = tree->alpha[-tree->right[n]];
                    ncvStat = nodeRight.create(rightVal);
                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                    bIsRightNodeLeaf = true;
                }
                else
                {
                    Ncv32u rightNodeOffset = tree->right[n];
                    nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
                    haar.bHasStumpsOnly = false;
                }
                curNode.setRightNodeDesc(nodeRight);

                NcvBool bNodeTilted = (feature->tilted != 0);
                //sticky cascade-level flag: a single tilted feature anywhere requires the tilted integral image
                haar.bNeedsTiltedII = haar.bNeedsTiltedII || bNodeTilted;

                Ncv32u featureId = 0;
                for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
                {
                    Ncv32u rectX = feature->rect[l].r.x;
                    Ncv32u rectY = feature->rect[l].r.y;
                    Ncv32u rectWidth = feature->rect[l].r.width;
                    Ncv32u rectHeight = feature->rect[l].r.height;

                    Ncv32f rectWeight = feature->rect[l].weight;

                    if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/)
                        break;

                    HaarFeature64 curFeature;
                    ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
                    curFeature.setWeight(rectWeight);
                    ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
                    haarFeatures.push_back(curFeature);

                    featureId++;
                }

                HaarFeatureDescriptor32 tmpFeatureDesc;
                ncvStat = tmpFeatureDesc.create(bNodeTilted, bIsLeftNodeLeaf, bIsRightNodeLeaf,
                    featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
                curNode.setFeatureDesc(tmpFeatureDesc);

                if (!nodeId)
                {
                    //root node
                    haarClassifierNodes.push_back(curNode);
                    curMaxTreeDepth = 1;
                }
                else
                {
                    //other node
                    h_TmpClassifierNotRootNodes.push_back(curNode);
                    curMaxTreeDepth++;
                }

                nodeId++;
            }
        }

        curStage.setNumClassifierRootNodes(treesCount);
        haarStages.push_back(curStage);
    }

    //fill in cascade stats
    haar.NumStages = static_cast<Ncv32u>(haarStages.size());
    haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
    haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
    haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());

    //merge root and non-root nodes into one classifier array
    Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        haarClassifierNodes[i].setRightNodeDesc(nodeRight);
    }

    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);

        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
    }

    return NCV_SUCCESS;
#endif
}
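

//Offset bookkeeping in loadFromXML, by example: suppose parsing produced three root nodes
//(haarClassifierNodes[0..2]) and two non-root nodes (h_TmpClassifierNotRootNodes[0..1]). While
//parsing, every non-leaf child descriptor stores an index into the temporary array (0 or 1 here).
//After the merge the temporary nodes land at indices offsetRoot+0 and offsetRoot+1 of the final
//array (offsetRoot == 3 in this example), which is why both fix-up loops above rebase every
//non-leaf child offset by +offsetRoot before the arrays are concatenated.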


#define NVBIN_HAAR_SIZERESERVED     16
#define NVBIN_HAAR_VERSION          0x1


static NCVStatus loadFromNVBIN(const cv::String &filename,
                               HaarClassifierCascadeDescriptor &haar,
                               std::vector<HaarStage64> &haarStages,
                               std::vector<HaarClassifierNode128> &haarClassifierNodes,
                               std::vector<HaarFeature64> &haarFeatures)
{
    size_t readCount;
    FILE *fp = fopen(filename.c_str(), "rb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    Ncv32u fileVersion;
    readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
    Ncv32u fsize;
    readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fseek(fp, 0, SEEK_END);
    Ncv32u fsizeActual = static_cast<Ncv32u>(ftell(fp));
    ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);

    std::vector<unsigned char> fdata;
    fdata.resize(fsize);
    Ncv32u dataOffset = 0;
    fseek(fp, 0, SEEK_SET);
    readCount = fread(&fdata[0], fsize, 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fclose(fp);

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvSize32u);
    haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);
    haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);

    haarStages.resize(haar.NumStages);
    haarClassifierNodes.resize(haar.NumClassifierTotalNodes);
    haarFeatures.resize(haar.NumFeatures);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);
    dataOffset += szStages;
    memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);
    dataOffset += szFeatures;

    return NCV_SUCCESS;
}
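

//On-disk NVBIN layout, as implied by the reader above and the writer in ncvHaarStoreNVBIN_host
//(all fields are raw host-memory images written via memcpy, so the format is only portable
//between hosts with identical endianness and struct layout):
//
//  offset 0             Ncv32u     fileVersion (NVBIN_HAAR_VERSION)
//  offset 4             Ncv32u     fsize (total payload size, checked against the file length)
//  offset 8..15         reserved (header is padded to NVBIN_HAAR_SIZERESERVED = 16 bytes)
//  offset 16            Ncv32u     NumStages
//                       Ncv32u     NumClassifierRootNodes
//                       Ncv32u     NumClassifierTotalNodes
//                       Ncv32u     NumFeatures
//                       NcvSize32u ClassifierSize
//                       NcvBool    bNeedsTiltedII
//                       NcvBool    bHasStumpsOnly
//  then, back to back   NumStages * sizeof(HaarStage64) bytes of stages,
//                       NumClassifierTotalNodes * sizeof(HaarClassifierNode128) bytes of nodes,
//                       NumFeatures * sizeof(HaarFeature64) bytes of features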


NCVStatus ncvHaarGetClassifierSize(const cv::String &filename, Ncv32u &numStages,
                                   Ncv32u &numNodes, Ncv32u &numFeatures)
{
    size_t readCount;
    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    if (fext == "nvbin")
    {
        FILE *fp = fopen(filename.c_str(), "rb");
        ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
        Ncv32u fileVersion;
        readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
        fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
        Ncv32u tmp;
        readCount = fread(&numStages,   sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&tmp,         sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numNodes,    sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        fclose(fp);
    }
    else if (fext == "xml")
    {
        HaarClassifierCascadeDescriptor haar;
        std::vector<HaarStage64> haarStages;
        std::vector<HaarClassifierNode128> haarNodes;
        std::vector<HaarFeature64> haarFeatures;

        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);

        numStages = haar.NumStages;
        numNodes = haar.NumClassifierTotalNodes;
        numFeatures = haar.NumFeatures;
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    return NCV_SUCCESS;
}


NCVStatus ncvHaarLoadFromFile_host(const cv::String &filename,
                                   HaarClassifierCascadeDescriptor &haar,
                                   NCVVector<HaarStage64> &h_HaarStages,
                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                   NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    std::vector<HaarStage64> haarStages;
    std::vector<HaarClassifierNode128> haarNodes;
    std::vector<HaarFeature64> haarFeatures;

    if (fext == "nvbin")
    {
        ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else if (fext == "xml")
    {
        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);

    memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));
    memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));
    memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));

    return NCV_SUCCESS;
}


NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename,
                                 HaarClassifierCascadeDescriptor haar,
                                 NCVVector<HaarStage64> &h_HaarStages,
                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                 NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    Ncv32u dataOffset = 0;
    std::vector<unsigned char> fdata;
    fdata.resize(szStages+szClassifiers+szFeatures+1024, 0);

    //header
    *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;
    dataOffset += sizeof(Ncv32u);
    *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;
    dataOffset += sizeof(NcvSize32u);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;
    dataOffset += sizeof(NcvBool);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;
    dataOffset += sizeof(NcvBool);

    memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);
    dataOffset += szStages;
    memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);
    dataOffset += szFeatures;
    Ncv32u fsize = dataOffset;

    //TODO: CRC32 here

    //update header
    dataOffset = sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = fsize;

    FILE *fp = fopen(filename.c_str(), "wb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    fwrite(&fdata[0], fsize, 1, fp);
    fclose(fp);
    return NCV_SUCCESS;
}
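
//Round-trip sketch: convert an OpenCV XML cascade to NVBIN using only the loaders and the writer
//defined in this file. It assumes the NCVMemNativeAllocator(memType, alignment) constructor from
//NCV.hpp (the loaders require pinned host memory; the alignment of 32 is arbitrary). A plausible
//usage pattern, not a tested tool:
static NCVStatus exampleConvertXMLToNVBIN(const cv::String &xmlPath, const cv::String &nvbinPath)
{
    Ncv32u numStages = 0, numNodes = 0, numFeatures = 0;
    NCVStatus ncvStat = ncvHaarGetClassifierSize(xmlPath, numStages, numNodes, numFeatures);
    ncvAssertReturnNcvStat(ncvStat);

    NCVMemNativeAllocator cpuAllocator(NCVMemoryTypeHostPinned, 32);
    NCVVectorAlloc<HaarStage64> h_HaarStages(cpuAllocator, numStages);
    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, numNodes);
    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, numFeatures);
    ncvAssertReturn(h_HaarStages.isMemAllocated() && h_HaarNodes.isMemAllocated() &&
                    h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    HaarClassifierCascadeDescriptor haar;
    ncvStat = ncvHaarLoadFromFile_host(xmlPath, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures);
    ncvAssertReturnNcvStat(ncvStat);

    return ncvHaarStoreNVBIN_host(nvbinPath, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures);
}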