/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "lbp.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace lbp
    {
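        // Local binary pattern feature evaluated on an integral image. The operator
        // sums a 3x3 grid of cells whose corners are fw elements apart horizontally and
        // fh elements apart vertically in the integral buffer (starting at offset ty),
        // forms the difference of every surrounding cell sum against the centre cell
        // sum, and packs the sign bits of those differences: the return value carries
        // the bits for s0..s2, while `shift` carries the bits for the remaining
        // neighbours (s3 and s5..s8). Cascade::operator() below uses the return value
        // to pick a word of a node's subset table and `shift` to pick a bit within it.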
        struct LBP
        {
            __host__ __device__ __forceinline__ LBP() {}

            __device__ __forceinline__ int operator() (const int* integral, int ty, int fh, int fw, int& shift) const
            {
                int anchors[9];

                anchors[0]  = integral[ty];
                anchors[1]  = integral[ty + fw];
                anchors[0] -= anchors[1];
                anchors[2]  = integral[ty + fw * 2];
                anchors[1] -= anchors[2];
                anchors[2] -= integral[ty + fw * 3];

                ty += fh;
                anchors[3]  = integral[ty];
                anchors[4]  = integral[ty + fw];
                anchors[3] -= anchors[4];
                anchors[5]  = integral[ty + fw * 2];
                anchors[4] -= anchors[5];
                anchors[5] -= integral[ty + fw * 3];

                anchors[0] -= anchors[3];
                anchors[1] -= anchors[4];
                anchors[2] -= anchors[5];
                // 0 - 2 contains s0 - s2

                ty += fh;
                anchors[6]  = integral[ty];
                anchors[7]  = integral[ty + fw];
                anchors[6] -= anchors[7];
                anchors[8]  = integral[ty + fw * 2];
                anchors[7] -= anchors[8];
                anchors[8] -= integral[ty + fw * 3];

                anchors[3] -= anchors[6];
                anchors[4] -= anchors[7];
                anchors[5] -= anchors[8];
                // 3 - 5 contains s3 - s5

                anchors[0] -= anchors[4];
                anchors[1] -= anchors[4];
                anchors[2] -= anchors[4];
                anchors[3] -= anchors[4];
                anchors[5] -= anchors[4];

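                // anchors[0..3] and anchors[5] now hold the differences between each
                // surrounding cell and the centre cell (anchors[4]); the sign bits are
                // packed below into `response` (s0..s2) and `shift` (s3, s5..s8).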
                int response = (~(anchors[0] >> 31)) & 4;
                response |= (~(anchors[1] >> 31)) & 2;
                response |= (~(anchors[2] >> 31)) & 1;

                shift = (~(anchors[5] >> 31)) & 16;
                shift |= (~(anchors[3] >> 31)) & 1;

                ty += fh;
                anchors[0]  = integral[ty];
                anchors[1]  = integral[ty + fw];
                anchors[0] -= anchors[1];
                anchors[2]  = integral[ty + fw * 2];
                anchors[1] -= anchors[2];
                anchors[2] -= integral[ty + fw * 3];

                anchors[6] -= anchors[0];
                anchors[7] -= anchors[1];
                anchors[8] -= anchors[2];
                // 6 - 8 contains s6 - s8

                anchors[6] -= anchors[4];
                anchors[7] -= anchors[4];
                anchors[8] -= anchors[4];

                shift |= (~(anchors[6] >> 31)) & 2;
                shift |= (~(anchors[7] >> 31)) & 4;
                shift |= (~(anchors[8] >> 31)) & 8;
                return response;
            }
        };

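        // Groups the candidate rectangles found by the cascade. Launched as a single
        // block with one thread per candidate: partition() labels each candidate with
        // an equivalence class according to Pr, the class members' coordinates are
        // summed in shared memory and averaged, and every class with at least
        // groupThreshold members is written to `objects`; *nclasses receives the
        // number of groups kept.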
        template<typename Pr>
        __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            unsigned int tid = threadIdx.x;
            extern __shared__ int sbuff[];

            int* labels = sbuff;
            int* rrects = sbuff + n;

            Pr predicate(grouping_eps);
            partition(candidates, n, labels, predicate);

            rrects[tid * 4 + 0] = 0;
            rrects[tid * 4 + 1] = 0;
            rrects[tid * 4 + 2] = 0;
            rrects[tid * 4 + 3] = 0;
            __syncthreads();

            int cls = labels[tid];
            Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);

            __syncthreads();
            labels[tid] = 0;

            __syncthreads();
            Emulation::smem::atomicInc((unsigned int*)labels + cls, n);

            __syncthreads();
            *nclasses = 0;

            int active = labels[tid];
            if (active)
            {
                int* r1 = rrects + tid * 4;
                float s = 1.f / active;
                r1[0] = saturate_cast<int>(r1[0] * s);
                r1[1] = saturate_cast<int>(r1[1] * s);
                r1[2] = saturate_cast<int>(r1[2] * s);
                r1[3] = saturate_cast<int>(r1[3] * s);
            }
            __syncthreads();

            if (active && active >= groupThreshold)
            {
                int* r1 = rrects + tid * 4;
                int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);

                int aidx = Emulation::smem::atomicInc(nclasses, n);
                objects[aidx] = r_out;
            }
        }

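        // Host-side launcher for disjoin: a single block of ncandidates threads with
        // shared memory holding ncandidates labels followed by ncandidates rectangle
        // accumulators (4 ints each).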
        void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            if (!ncandidates) return;
            int block = ncandidates;
            int smem  = block * ( sizeof(int) + sizeof(int4) );
            disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
            cudaSafeCall( cudaGetLastError() );
        }

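        // Cascade of LBP decision stumps. Every tree contributes one node and two
        // leaves; each stage sums the leaf values selected by its trees and rejects the
        // window as soon as that sum falls below the stage threshold. A window is
        // accepted only if it passes all nstages stages.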
        struct Cascade
        {
            __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
                const int* _subsets, const uchar4* _features, int _subsetSize)
            : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize) {}

            __device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch) const
            {
                int current_node = 0;
                int current_leave = 0;

                for (int s = 0; s < nstages; ++s)
                {
                    float sum = 0;
                    Stage stage = stages[s];
                    for (int t = 0; t < stage.ntrees; t++)
                    {
                        ClNode node = nodes[current_node];
                        uchar4 feature = features[node.featureIdx];

                        int shift;
                        int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
                        int idx = (subsets[current_node * subsetSize + c] & (1 << shift)) ? current_leave : current_leave + 1;
                        sum += leaves[idx];

                        current_node += 1;
                        current_leave += 2;
                    }

                    if (sum < stage.threshold)
                        return false;
                }

                return true;
            }

            const Stage* stages;
            const int nstages;

            const ClNode* nodes;
            const float* leaves;
            const int* subsets;
            const uchar4* features;

            const int subsetSize;
            const LBP evaluator;
        };

        // Maps the flat thread id to a window: after subtracting the window counts of
        // the preceding scales, y = scaleTid / windowsForLine and
        // x = scaleTid - y * windowsForLine, both then shifted left by `step`.
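        // One thread per sliding window over the whole scale pyramid. The per-scale
        // integral images are packed side by side in `integral`, and wshift accumulates
        // the horizontal offset of the current scale. While scale <= 2 windows are
        // sampled with a stride of 2 (step == 1), otherwise at every pixel. Accepted
        // windows are rescaled to frame coordinates and appended to `objects` through
        // the atomic counter `classified`.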
        __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
            const int total, int* integral, const int pitch, PtrStepSz<int4> objects, unsigned int* classified)
        {
            int ftid = blockIdx.x * blockDim.x + threadIdx.x;
            if (ftid >= total) return;

            int step = (scale <= 2.f);

            int windowsForLine = (__float2int_rn( __fdividef(frameW, scale)) - windowW) >> step;
            int stotal = windowsForLine * ( (__float2int_rn( __fdividef(frameH, scale)) - windowH) >> step);
            int wshift = 0;

            int scaleTid = ftid;

            while (scaleTid >= stotal)
            {
                scaleTid -= stotal;
                wshift += __float2int_rn(__fdividef(frameW, scale)) + 1;
                scale *= factor;
                step = (scale <= 2.f);
                windowsForLine = (__float2int_rn(__fdividef(frameW, scale)) - windowW) >> step;
                stotal = windowsForLine * ( (__float2int_rn(__fdividef(frameH, scale)) - windowH) >> step);
            }

            int y = __fdividef(scaleTid, windowsForLine);
            int x = scaleTid - y * windowsForLine;

            x <<= step;
            y <<= step;

            if (cascade(y, x + wshift, integral, pitch))
            {
                if (x >= __float2int_rn(__fdividef(frameW, scale)) - windowW) return;

                int4 rect;
                rect.x = __float2int_rn(x * scale);
                rect.y = __float2int_rn(y * scale);
                rect.z = __float2int_rn(windowW * scale);
                rect.w = __float2int_rn(windowH * scale);

                int res = atomicInc(classified, (unsigned int)objects.cols);
                objects(0, res) = rect;
            }
        }

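        // Host-side launcher for lbp_cascade: one thread per window over all scales
        // (workAmount in total), with the kernel configured to prefer L1 cache. The raw
        // stage/node/leaf/subset/feature buffers are reinterpreted as their device-side
        // types and bundled into a Cascade before launch.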
        void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
            const PtrStepSzb& mstages, const int nstages, const PtrStepSzi& mnodes, const PtrStepSzf& mleaves, const PtrStepSzi& msubsets, const PtrStepSzb& mfeatures,
            const int subsetSize, PtrStepSz<int4> objects, unsigned int* classified, PtrStepSzi integral)
        {
            const int block = 128;
            int grid = divUp(workAmount, block);
            cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
            Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
            lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
            cudaSafeCall( cudaGetLastError() );
        }
    }
}}}

#endif /* CUDA_DISABLER */