1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                        Intel License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2000, Intel Corporation, all rights reserved.
14// Copyright (C) 2014, Itseez Inc., all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// Redistribution and use in source and binary forms, with or without modification,
18// are permitted provided that the following conditions are met:
19//
20//   * Redistribution's of source code must retain the above copyright notice,
21//     this list of conditions and the following disclaimer.
22//
23//   * Redistribution's in binary form must reproduce the above copyright notice,
24//     this list of conditions and the following disclaimer in the documentation
25//     and/or other materials provided with the distribution.
26//
27//   * The name of Intel Corporation may not be used to endorse or promote products
28//     derived from this software without specific prior written permission.
29//
30// This software is provided by the copyright holders and contributors "as is" and
31// any express or implied warranties, including, but not limited to, the implied
32// warranties of merchantability and fitness for a particular purpose are disclaimed.
33// In no event shall the Intel Corporation or contributors be liable for any direct,
34// indirect, incidental, special, exemplary, or consequential damages
35// (including, but not limited to, procurement of substitute goods or services;
36// loss of use, data, or profits; or business interruption) however caused
37// and on any theory of liability, whether in contract, strict liability,
38// or tort (including negligence or otherwise) arising in any way out of
39// the use of this software, even if advised of the possibility of such damage.
40//
41//M*/
42
43#include "precomp.hpp"
44#include "opencl_kernels_imgproc.hpp"
45
46
47#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
48#define USE_IPP_CANNY 1
49#else
50#undef USE_IPP_CANNY
51#endif
52
53
54namespace cv
55{
56
57#ifdef USE_IPP_CANNY
58static bool ippCanny(const Mat& _src, Mat& _dst, float low,  float high)
59{
60    int size = 0, size1 = 0;
61    IppiSize roi = { _src.cols, _src.rows };
62
63    if (ippiFilterSobelNegVertGetBufferSize_8u16s_C1R(roi, ippMskSize3x3, &size) < 0)
64        return false;
65    if (ippiFilterSobelHorizGetBufferSize_8u16s_C1R(roi, ippMskSize3x3, &size1) < 0)
66        return false;
67    size = std::max(size, size1);
68
69    if (ippiCannyGetSize(roi, &size1) < 0)
70        return false;
71    size = std::max(size, size1);
72
73    AutoBuffer<uchar> buf(size + 64);
74    uchar* buffer = alignPtr((uchar*)buf, 32);
75
76    Mat _dx(_src.rows, _src.cols, CV_16S);
77    if( ippiFilterSobelNegVertBorder_8u16s_C1R(_src.ptr(), (int)_src.step,
78                    _dx.ptr<short>(), (int)_dx.step, roi,
79                    ippMskSize3x3, ippBorderRepl, 0, buffer) < 0 )
80        return false;
81
82    Mat _dy(_src.rows, _src.cols, CV_16S);
83    if( ippiFilterSobelHorizBorder_8u16s_C1R(_src.ptr(), (int)_src.step,
84                    _dy.ptr<short>(), (int)_dy.step, roi,
85                    ippMskSize3x3, ippBorderRepl, 0, buffer) < 0 )
86        return false;
87
88    if( ippiCanny_16s8u_C1R(_dx.ptr<short>(), (int)_dx.step,
89                               _dy.ptr<short>(), (int)_dy.step,
90                              _dst.ptr(), (int)_dst.step, roi, low, high, buffer) < 0 )
91        return false;
92    return true;
93}
94#endif
95
96#ifdef HAVE_OPENCL
97
98static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float high_thresh,
99                      int aperture_size, bool L2gradient, int cn, const Size & size)
100{
101    UMat map;
102
103    const ocl::Device &dev = ocl::Device::getDefault();
104    int max_wg_size = (int)dev.maxWorkGroupSize();
105
106    int lSizeX = 32;
107    int lSizeY = max_wg_size / 32;
108
109    if (lSizeY == 0)
110    {
111        lSizeX = 16;
112        lSizeY = max_wg_size / 16;
113    }
114    if (lSizeY == 0)
115    {
116        lSizeY = 1;
117    }
118
119    if (L2gradient)
120    {
121        low_thresh = std::min(32767.0f, low_thresh);
122        high_thresh = std::min(32767.0f, high_thresh);
123
124        if (low_thresh > 0)
125            low_thresh *= low_thresh;
126        if (high_thresh > 0)
127            high_thresh *= high_thresh;
128    }
129    int low = cvFloor(low_thresh), high = cvFloor(high_thresh);
130
131    if (aperture_size == 3 && !_src.isSubmatrix())
132    {
133        /*
134            stage1_with_sobel:
135                Sobel operator
136                Calc magnitudes
137                Non maxima suppression
138                Double thresholding
139        */
140        char cvt[40];
141        ocl::Kernel with_sobel("stage1_with_sobel", ocl::imgproc::canny_oclsrc,
142                               format("-D WITH_SOBEL -D cn=%d -D TYPE=%s -D convert_floatN=%s -D floatN=%s -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s",
143                                      cn, ocl::memopTypeToStr(_src.depth()),
144                                      ocl::convertTypeStr(_src.depth(), CV_32F, cn, cvt),
145                                      ocl::typeToStr(CV_MAKE_TYPE(CV_32F, cn)),
146                                      lSizeX, lSizeY,
147                                      L2gradient ? " -D L2GRAD" : ""));
148        if (with_sobel.empty())
149            return false;
150
151        UMat src = _src.getUMat();
152        map.create(size, CV_32S);
153        with_sobel.args(ocl::KernelArg::ReadOnly(src),
154                        ocl::KernelArg::WriteOnlyNoSize(map),
155                        (float) low, (float) high);
156
157        size_t globalsize[2] = { size.width, size.height },
158                localsize[2] = { lSizeX, lSizeY };
159
160        if (!with_sobel.run(2, globalsize, localsize, false))
161            return false;
162    }
163    else
164    {
165        /*
166            stage1_without_sobel:
167                Calc magnitudes
168                Non maxima suppression
169                Double thresholding
170        */
171        UMat dx, dy;
172        Sobel(_src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
173        Sobel(_src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
174
175        ocl::Kernel without_sobel("stage1_without_sobel", ocl::imgproc::canny_oclsrc,
176                                    format("-D WITHOUT_SOBEL -D cn=%d -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s",
177                                           cn, lSizeX, lSizeY, L2gradient ? " -D L2GRAD" : ""));
178        if (without_sobel.empty())
179            return false;
180
181        map.create(size, CV_32S);
182        without_sobel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
183                           ocl::KernelArg::WriteOnly(map),
184                           low, high);
185
186        size_t globalsize[2] = { size.width, size.height },
187                localsize[2] = { lSizeX, lSizeY };
188
189        if (!without_sobel.run(2, globalsize, localsize, false))
190            return false;
191    }
192
193    int PIX_PER_WI = 8;
194    /*
195        stage2:
196            hysteresis (add weak edges if they are connected with strong edges)
197    */
198
199    int sizey = lSizeY / PIX_PER_WI;
200    if (sizey == 0)
201        sizey = 1;
202
203    size_t globalsize[2] = { size.width, (size.height + PIX_PER_WI - 1) / PIX_PER_WI }, localsize[2] = { lSizeX, sizey };
204
205    ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc,
206                                format("-D STAGE2 -D PIX_PER_WI=%d -D LOCAL_X=%d -D LOCAL_Y=%d",
207                                PIX_PER_WI, lSizeX, sizey));
208
209    if (edgesHysteresis.empty())
210        return false;
211
212    edgesHysteresis.args(ocl::KernelArg::ReadWrite(map));
213    if (!edgesHysteresis.run(2, globalsize, localsize, false))
214        return false;
215
216    // get edges
217
218    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc,
219                                format("-D GET_EDGES -D PIX_PER_WI=%d", PIX_PER_WI));
220    if (getEdgesKernel.empty())
221        return false;
222
223    _dst.create(size, CV_8UC1);
224    UMat dst = _dst.getUMat();
225
226    getEdgesKernel.args(ocl::KernelArg::ReadOnly(map), ocl::KernelArg::WriteOnlyNoSize(dst));
227
228    return getEdgesKernel.run(2, globalsize, NULL, false);
229}
230
231#endif
232
233#ifdef HAVE_TBB
234
235// Queue with peaks that will processed serially.
236static tbb::concurrent_queue<uchar*> borderPeaks;
237
238class tbbCanny
239{
240public:
241    tbbCanny(const Range _boundaries, const Mat& _src, uchar* _map, int _low,
242            int _high, int _aperture_size, bool _L2gradient)
243        : boundaries(_boundaries), src(_src), map(_map), low(_low), high(_high),
244          aperture_size(_aperture_size), L2gradient(_L2gradient)
245    {}
246
247    // This parallel version of Canny algorithm splits the src image in threadsNumber horizontal slices.
248    // The first row of each slice contains the last row of the previous slice and
249    // the last row of each slice contains the first row of the next slice
250    // so that each slice is independent and no mutexes are required.
251    void operator()() const
252    {
253#if CV_SSE2
254        bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
255#endif
256
257        const int type = src.type(), cn = CV_MAT_CN(type);
258
259        Mat dx, dy;
260
261        ptrdiff_t mapstep = src.cols + 2;
262
263        // In sobel transform we calculate ksize2 extra lines for the first and last rows of each slice
264        // because IPPDerivSobel expects only isolated ROIs, in contrast with the opencv version which
265        // uses the pixels outside of the ROI to form a border.
266        uchar ksize2 = aperture_size / 2;
267
268        if (boundaries.start == 0 && boundaries.end == src.rows)
269        {
270            Mat tempdx(boundaries.end - boundaries.start + 2, src.cols, CV_16SC(cn));
271            Mat tempdy(boundaries.end - boundaries.start + 2, src.cols, CV_16SC(cn));
272
273            memset(tempdx.ptr<short>(0), 0, cn * src.cols*sizeof(short));
274            memset(tempdy.ptr<short>(0), 0, cn * src.cols*sizeof(short));
275            memset(tempdx.ptr<short>(tempdx.rows - 1), 0, cn * src.cols*sizeof(short));
276            memset(tempdy.ptr<short>(tempdy.rows - 1), 0, cn * src.cols*sizeof(short));
277
278            Sobel(src, tempdx.rowRange(1, tempdx.rows - 1), CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
279            Sobel(src, tempdy.rowRange(1, tempdy.rows - 1), CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
280
281            dx = tempdx;
282            dy = tempdy;
283        }
284        else if (boundaries.start == 0)
285        {
286            Mat tempdx(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
287            Mat tempdy(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
288
289            memset(tempdx.ptr<short>(0), 0, cn * src.cols*sizeof(short));
290            memset(tempdy.ptr<short>(0), 0, cn * src.cols*sizeof(short));
291
292            Sobel(src.rowRange(boundaries.start, boundaries.end + 1 + ksize2), tempdx.rowRange(1, tempdx.rows),
293                    CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
294            Sobel(src.rowRange(boundaries.start, boundaries.end + 1 + ksize2), tempdy.rowRange(1, tempdy.rows),
295                    CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
296
297            dx = tempdx.rowRange(0, tempdx.rows - ksize2);
298            dy = tempdy.rowRange(0, tempdy.rows - ksize2);
299        }
300        else if (boundaries.end == src.rows)
301        {
302            Mat tempdx(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
303            Mat tempdy(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
304
305            memset(tempdx.ptr<short>(tempdx.rows - 1), 0, cn * src.cols*sizeof(short));
306            memset(tempdy.ptr<short>(tempdy.rows - 1), 0, cn * src.cols*sizeof(short));
307
308            Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end), tempdx.rowRange(0, tempdx.rows - 1),
309                    CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
310            Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end), tempdy.rowRange(0, tempdy.rows - 1),
311                    CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
312
313            dx = tempdx.rowRange(ksize2, tempdx.rows);
314            dy = tempdy.rowRange(ksize2, tempdy.rows);
315        }
316        else
317        {
318            Mat tempdx(boundaries.end - boundaries.start + 2 + 2*ksize2, src.cols, CV_16SC(cn));
319            Mat tempdy(boundaries.end - boundaries.start + 2 + 2*ksize2, src.cols, CV_16SC(cn));
320
321            Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end + 1 + ksize2), tempdx,
322                    CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
323            Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end + 1 + ksize2), tempdy,
324                    CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
325
326            dx = tempdx.rowRange(ksize2, tempdx.rows - ksize2);
327            dy = tempdy.rowRange(ksize2, tempdy.rows - ksize2);
328        }
329
330        int maxsize = std::max(1 << 10, src.cols * (boundaries.end - boundaries.start) / 10);
331        std::vector<uchar*> stack(maxsize);
332        uchar **stack_top = &stack[0];
333        uchar **stack_bottom = &stack[0];
334
335        AutoBuffer<uchar> buffer(cn * mapstep * 3 * sizeof(int));
336
337        int* mag_buf[3];
338        mag_buf[0] = (int*)(uchar*)buffer;
339        mag_buf[1] = mag_buf[0] + mapstep*cn;
340        mag_buf[2] = mag_buf[1] + mapstep*cn;
341
342        // calculate magnitude and angle of gradient, perform non-maxima suppression.
343        // fill the map with one of the following values:
344        //   0 - the pixel might belong to an edge
345        //   1 - the pixel can not belong to an edge
346        //   2 - the pixel does belong to an edge
347        for (int i = boundaries.start - 1; i <= boundaries.end; i++)
348        {
349            int* _norm = mag_buf[(i > boundaries.start) - (i == boundaries.start - 1) + 1] + 1;
350
351            short* _dx = dx.ptr<short>(i - boundaries.start + 1);
352            short* _dy = dy.ptr<short>(i - boundaries.start + 1);
353
354            if (!L2gradient)
355            {
356                int j = 0, width = src.cols * cn;
357#if CV_SSE2
358                if (haveSSE2)
359                {
360                    __m128i v_zero = _mm_setzero_si128();
361                    for ( ; j <= width - 8; j += 8)
362                    {
363                        __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
364                        __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
365                        v_dx = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
366                        v_dy = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
367
368                        __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx, v_zero), _mm_unpacklo_epi16(v_dy, v_zero));
369                        _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
370
371                        v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx, v_zero), _mm_unpackhi_epi16(v_dy, v_zero));
372                        _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
373                    }
374                }
375#elif CV_NEON
376                for ( ; j <= width - 8; j += 8)
377                {
378                    int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
379                    vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
380                                                   vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
381                    vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
382                                                       vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
383                }
384#endif
385                for ( ; j < width; ++j)
386                    _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
387            }
388            else
389            {
390                int j = 0, width = src.cols * cn;
391#if CV_SSE2
392                if (haveSSE2)
393                {
394                    for ( ; j <= width - 8; j += 8)
395                    {
396                        __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
397                        __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
398
399                        __m128i v_dx_ml = _mm_mullo_epi16(v_dx, v_dx), v_dx_mh = _mm_mulhi_epi16(v_dx, v_dx);
400                        __m128i v_dy_ml = _mm_mullo_epi16(v_dy, v_dy), v_dy_mh = _mm_mulhi_epi16(v_dy, v_dy);
401
402                        __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx_ml, v_dx_mh), _mm_unpacklo_epi16(v_dy_ml, v_dy_mh));
403                        _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
404
405                        v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx_ml, v_dx_mh), _mm_unpackhi_epi16(v_dy_ml, v_dy_mh));
406                        _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
407                    }
408                }
409#elif CV_NEON
410                for ( ; j <= width - 8; j += 8)
411                {
412                    int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
413                    int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
414                    int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
415                    vst1q_s32(_norm + j, v_dst);
416
417                    v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
418                    v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
419                    vst1q_s32(_norm + j + 4, v_dst);
420                }
421#endif
422                for ( ; j < width; ++j)
423                    _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
424            }
425
426            if (cn > 1)
427            {
428                for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
429                {
430                    int maxIdx = jn;
431                    for(int k = 1; k < cn; ++k)
432                        if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
433                    _norm[j] = _norm[maxIdx];
434                    _dx[j] = _dx[maxIdx];
435                    _dy[j] = _dy[maxIdx];
436                }
437            }
438            _norm[-1] = _norm[src.cols] = 0;
439
440            // at the very beginning we do not have a complete ring
441            // buffer of 3 magnitude rows for non-maxima suppression
442            if (i <= boundaries.start)
443                continue;
444
445            uchar* _map = map + mapstep*i + 1;
446            _map[-1] = _map[src.cols] = 1;
447
448            int* _mag = mag_buf[1] + 1; // take the central row
449            ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
450            ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
451
452            const short* _x = dx.ptr<short>(i - boundaries.start);
453            const short* _y = dy.ptr<short>(i - boundaries.start);
454
455            if ((stack_top - stack_bottom) + src.cols > maxsize)
456            {
457                int sz = (int)(stack_top - stack_bottom);
458                maxsize = std::max(maxsize * 3/2, sz + src.cols);
459                stack.resize(maxsize);
460                stack_bottom = &stack[0];
461                stack_top = stack_bottom + sz;
462            }
463
464#define CANNY_PUSH(d)    *(d) = uchar(2), *stack_top++ = (d)
465#define CANNY_POP(d)     (d) = *--stack_top
466
467            int prev_flag = 0;
468            bool canny_push = false;
469            for (int j = 0; j < src.cols; j++)
470            {
471                #define CANNY_SHIFT 15
472                const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
473
474                int m = _mag[j];
475
476                if (m > low)
477                {
478                    int xs = _x[j];
479                    int ys = _y[j];
480                    int x = std::abs(xs);
481                    int y = std::abs(ys) << CANNY_SHIFT;
482
483                    int tg22x = x * TG22;
484
485                    if (y < tg22x)
486                    {
487                        if (m > _mag[j-1] && m >= _mag[j+1]) canny_push = true;
488                    }
489                    else
490                    {
491                        int tg67x = tg22x + (x << (CANNY_SHIFT+1));
492                        if (y > tg67x)
493                        {
494                            if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) canny_push = true;
495                        }
496                        else
497                        {
498                            int s = (xs ^ ys) < 0 ? -1 : 1;
499                            if (m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) canny_push = true;
500                        }
501                    }
502                }
503                if (!canny_push)
504                {
505                    prev_flag = 0;
506                    _map[j] = uchar(1);
507                    continue;
508                }
509                else
510                {
511                    // _map[j-mapstep] is short-circuited at the start because previous thread is
512                    // responsible for initializing it.
513                    if (!prev_flag && m > high && (i <= boundaries.start+1 || _map[j-mapstep] != 2) )
514                    {
515                        CANNY_PUSH(_map + j);
516                        prev_flag = 1;
517                    }
518                    else
519                        _map[j] = 0;
520
521                    canny_push = false;
522                }
523            }
524
525            // scroll the ring buffer
526            _mag = mag_buf[0];
527            mag_buf[0] = mag_buf[1];
528            mag_buf[1] = mag_buf[2];
529            mag_buf[2] = _mag;
530        }
531
532        // now track the edges (hysteresis thresholding)
533        while (stack_top > stack_bottom)
534        {
535            if ((stack_top - stack_bottom) + 8 > maxsize)
536            {
537                int sz = (int)(stack_top - stack_bottom);
538                maxsize = maxsize * 3/2;
539                stack.resize(maxsize);
540                stack_bottom = &stack[0];
541                stack_top = stack_bottom + sz;
542            }
543
544            uchar* m;
545            CANNY_POP(m);
546
547            // Stops thresholding from expanding to other slices by sending pixels in the borders of each
548            // slice in a queue to be serially processed later.
549            if ( (m < map + (boundaries.start + 2) * mapstep) || (m >= map + boundaries.end * mapstep) )
550            {
551                borderPeaks.push(m);
552                continue;
553            }
554
555            if (!m[-1])         CANNY_PUSH(m - 1);
556            if (!m[1])          CANNY_PUSH(m + 1);
557            if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
558            if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
559            if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
560            if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
561            if (!m[mapstep])    CANNY_PUSH(m + mapstep);
562            if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
563        }
564    }
565
566private:
567    const Range boundaries;
568    const Mat& src;
569    uchar* map;
570    int low;
571    int high;
572    int aperture_size;
573    bool L2gradient;
574};
575
576#endif
577
578} // namespace cv
579
580void cv::Canny( InputArray _src, OutputArray _dst,
581                double low_thresh, double high_thresh,
582                int aperture_size, bool L2gradient )
583{
584    const int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
585    const Size size = _src.size();
586
587    CV_Assert( depth == CV_8U );
588    _dst.create(size, CV_8U);
589
590    if (!L2gradient && (aperture_size & CV_CANNY_L2_GRADIENT) == CV_CANNY_L2_GRADIENT)
591    {
592        // backward compatibility
593        aperture_size &= ~CV_CANNY_L2_GRADIENT;
594        L2gradient = true;
595    }
596
597    if ((aperture_size & 1) == 0 || (aperture_size != -1 && (aperture_size < 3 || aperture_size > 7)))
598        CV_Error(CV_StsBadFlag, "Aperture size should be odd");
599
600    if (low_thresh > high_thresh)
601        std::swap(low_thresh, high_thresh);
602
603    CV_OCL_RUN(_dst.isUMat() && (cn == 1 || cn == 3),
604               ocl_Canny(_src, _dst, (float)low_thresh, (float)high_thresh, aperture_size, L2gradient, cn, size))
605
606    Mat src = _src.getMat(), dst = _dst.getMat();
607
608#ifdef HAVE_TEGRA_OPTIMIZATION
609    if (tegra::useTegra() && tegra::canny(src, dst, low_thresh, high_thresh, aperture_size, L2gradient))
610        return;
611#endif
612
613#ifdef USE_IPP_CANNY
614    CV_IPP_CHECK()
615    {
616        if( aperture_size == 3 && !L2gradient && 1 == cn )
617        {
618            if (ippCanny(src, dst, (float)low_thresh, (float)high_thresh))
619            {
620                CV_IMPL_ADD(CV_IMPL_IPP);
621                return;
622            }
623            setIppErrorStatus();
624        }
625    }
626#endif
627
628#ifdef HAVE_TBB
629
630if (L2gradient)
631{
632    low_thresh = std::min(32767.0, low_thresh);
633    high_thresh = std::min(32767.0, high_thresh);
634
635    if (low_thresh > 0) low_thresh *= low_thresh;
636    if (high_thresh > 0) high_thresh *= high_thresh;
637}
638int low = cvFloor(low_thresh);
639int high = cvFloor(high_thresh);
640
641ptrdiff_t mapstep = src.cols + 2;
642AutoBuffer<uchar> buffer((src.cols+2)*(src.rows+2));
643
644uchar* map = (uchar*)buffer;
645memset(map, 1, mapstep);
646memset(map + mapstep*(src.rows + 1), 1, mapstep);
647
648int threadsNumber = tbb::task_scheduler_init::default_num_threads();
649int grainSize = src.rows / threadsNumber;
650
651// Make a fallback for pictures with too few rows.
652uchar ksize2 = aperture_size / 2;
653int minGrainSize = 1 + ksize2;
654int maxGrainSize = src.rows - 2 - 2*ksize2;
655if ( !( minGrainSize <= grainSize && grainSize <= maxGrainSize ) )
656{
657    threadsNumber = 1;
658    grainSize = src.rows;
659}
660
661tbb::task_group g;
662
663for (int i = 0; i < threadsNumber; ++i)
664{
665    if (i < threadsNumber - 1)
666        g.run(tbbCanny(Range(i * grainSize, (i + 1) * grainSize), src, map, low, high, aperture_size, L2gradient));
667    else
668        g.run(tbbCanny(Range(i * grainSize, src.rows), src, map, low, high, aperture_size, L2gradient));
669}
670
671g.wait();
672
673#define CANNY_PUSH_SERIAL(d)    *(d) = uchar(2), borderPeaks.push(d)
674
675// now track the edges (hysteresis thresholding)
676uchar* m;
677while (borderPeaks.try_pop(m))
678{
679    if (!m[-1])         CANNY_PUSH_SERIAL(m - 1);
680    if (!m[1])          CANNY_PUSH_SERIAL(m + 1);
681    if (!m[-mapstep-1]) CANNY_PUSH_SERIAL(m - mapstep - 1);
682    if (!m[-mapstep])   CANNY_PUSH_SERIAL(m - mapstep);
683    if (!m[-mapstep+1]) CANNY_PUSH_SERIAL(m - mapstep + 1);
684    if (!m[mapstep-1])  CANNY_PUSH_SERIAL(m + mapstep - 1);
685    if (!m[mapstep])    CANNY_PUSH_SERIAL(m + mapstep);
686    if (!m[mapstep+1])  CANNY_PUSH_SERIAL(m + mapstep + 1);
687}
688
689#else
690
691    Mat dx(src.rows, src.cols, CV_16SC(cn));
692    Mat dy(src.rows, src.cols, CV_16SC(cn));
693
694    Sobel(src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
695    Sobel(src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
696
697    if (L2gradient)
698    {
699        low_thresh = std::min(32767.0, low_thresh);
700        high_thresh = std::min(32767.0, high_thresh);
701
702        if (low_thresh > 0) low_thresh *= low_thresh;
703        if (high_thresh > 0) high_thresh *= high_thresh;
704    }
705    int low = cvFloor(low_thresh);
706    int high = cvFloor(high_thresh);
707
708    ptrdiff_t mapstep = src.cols + 2;
709    AutoBuffer<uchar> buffer((src.cols+2)*(src.rows+2) + cn * mapstep * 3 * sizeof(int));
710
711    int* mag_buf[3];
712    mag_buf[0] = (int*)(uchar*)buffer;
713    mag_buf[1] = mag_buf[0] + mapstep*cn;
714    mag_buf[2] = mag_buf[1] + mapstep*cn;
715    memset(mag_buf[0], 0, /* cn* */mapstep*sizeof(int));
716
717    uchar* map = (uchar*)(mag_buf[2] + mapstep*cn);
718    memset(map, 1, mapstep);
719    memset(map + mapstep*(src.rows + 1), 1, mapstep);
720
721    int maxsize = std::max(1 << 10, src.cols * src.rows / 10);
722    std::vector<uchar*> stack(maxsize);
723    uchar **stack_top = &stack[0];
724    uchar **stack_bottom = &stack[0];
725
726    /* sector numbers
727       (Top-Left Origin)
728
729        1   2   3
730         *  *  *
731          * * *
732        0*******0
733          * * *
734         *  *  *
735        3   2   1
736    */
737
738    #define CANNY_PUSH(d)    *(d) = uchar(2), *stack_top++ = (d)
739    #define CANNY_POP(d)     (d) = *--stack_top
740
741#if CV_SSE2
742    bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
743#endif
744
745    // calculate magnitude and angle of gradient, perform non-maxima suppression.
746    // fill the map with one of the following values:
747    //   0 - the pixel might belong to an edge
748    //   1 - the pixel can not belong to an edge
749    //   2 - the pixel does belong to an edge
750    for (int i = 0; i <= src.rows; i++)
751    {
752        int* _norm = mag_buf[(i > 0) + 1] + 1;
753        if (i < src.rows)
754        {
755            short* _dx = dx.ptr<short>(i);
756            short* _dy = dy.ptr<short>(i);
757
758            if (!L2gradient)
759            {
760                int j = 0, width = src.cols * cn;
761#if CV_SSE2
762                if (haveSSE2)
763                {
764                    __m128i v_zero = _mm_setzero_si128();
765                    for ( ; j <= width - 8; j += 8)
766                    {
767                        __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
768                        __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
769                        v_dx = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
770                        v_dy = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
771
772                        __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx, v_zero), _mm_unpacklo_epi16(v_dy, v_zero));
773                        _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
774
775                        v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx, v_zero), _mm_unpackhi_epi16(v_dy, v_zero));
776                        _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
777                    }
778                }
779#elif CV_NEON
780                for ( ; j <= width - 8; j += 8)
781                {
782                    int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
783                    vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
784                                                   vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
785                    vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
786                                                       vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
787                }
788#endif
789                for ( ; j < width; ++j)
790                    _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
791            }
792            else
793            {
794                int j = 0, width = src.cols * cn;
795#if CV_SSE2
796                if (haveSSE2)
797                {
798                    for ( ; j <= width - 8; j += 8)
799                    {
800                        __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
801                        __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
802
803                        __m128i v_dx_ml = _mm_mullo_epi16(v_dx, v_dx), v_dx_mh = _mm_mulhi_epi16(v_dx, v_dx);
804                        __m128i v_dy_ml = _mm_mullo_epi16(v_dy, v_dy), v_dy_mh = _mm_mulhi_epi16(v_dy, v_dy);
805
806                        __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx_ml, v_dx_mh), _mm_unpacklo_epi16(v_dy_ml, v_dy_mh));
807                        _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
808
809                        v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx_ml, v_dx_mh), _mm_unpackhi_epi16(v_dy_ml, v_dy_mh));
810                        _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
811                    }
812                }
813#elif CV_NEON
814                for ( ; j <= width - 8; j += 8)
815                {
816                    int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
817                    int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
818                    int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
819                    vst1q_s32(_norm + j, v_dst);
820
821                    v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
822                    v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
823                    vst1q_s32(_norm + j + 4, v_dst);
824                }
825#endif
826                for ( ; j < width; ++j)
827                    _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
828            }
829
830            if (cn > 1)
831            {
832                for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
833                {
834                    int maxIdx = jn;
835                    for(int k = 1; k < cn; ++k)
836                        if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
837                    _norm[j] = _norm[maxIdx];
838                    _dx[j] = _dx[maxIdx];
839                    _dy[j] = _dy[maxIdx];
840                }
841            }
842            _norm[-1] = _norm[src.cols] = 0;
843        }
844        else
845            memset(_norm-1, 0, /* cn* */mapstep*sizeof(int));
846
847        // at the very beginning we do not have a complete ring
848        // buffer of 3 magnitude rows for non-maxima suppression
849        if (i == 0)
850            continue;
851
852        uchar* _map = map + mapstep*i + 1;
853        _map[-1] = _map[src.cols] = 1;
854
855        int* _mag = mag_buf[1] + 1; // take the central row
856        ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
857        ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
858
859        const short* _x = dx.ptr<short>(i-1);
860        const short* _y = dy.ptr<short>(i-1);
861
862        if ((stack_top - stack_bottom) + src.cols > maxsize)
863        {
864            int sz = (int)(stack_top - stack_bottom);
865            maxsize = std::max(maxsize * 3/2, sz + src.cols);
866            stack.resize(maxsize);
867            stack_bottom = &stack[0];
868            stack_top = stack_bottom + sz;
869        }
870
871        int prev_flag = 0;
872        for (int j = 0; j < src.cols; j++)
873        {
874            #define CANNY_SHIFT 15
875            const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
876
877            int m = _mag[j];
878
879            if (m > low)
880            {
881                int xs = _x[j];
882                int ys = _y[j];
883                int x = std::abs(xs);
884                int y = std::abs(ys) << CANNY_SHIFT;
885
886                int tg22x = x * TG22;
887
888                if (y < tg22x)
889                {
890                    if (m > _mag[j-1] && m >= _mag[j+1]) goto __ocv_canny_push;
891                }
892                else
893                {
894                    int tg67x = tg22x + (x << (CANNY_SHIFT+1));
895                    if (y > tg67x)
896                    {
897                        if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __ocv_canny_push;
898                    }
899                    else
900                    {
901                        int s = (xs ^ ys) < 0 ? -1 : 1;
902                        if (m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __ocv_canny_push;
903                    }
904                }
905            }
906            prev_flag = 0;
907            _map[j] = uchar(1);
908            continue;
909__ocv_canny_push:
910            if (!prev_flag && m > high && _map[j-mapstep] != 2)
911            {
912                CANNY_PUSH(_map + j);
913                prev_flag = 1;
914            }
915            else
916                _map[j] = 0;
917        }
918
919        // scroll the ring buffer
920        _mag = mag_buf[0];
921        mag_buf[0] = mag_buf[1];
922        mag_buf[1] = mag_buf[2];
923        mag_buf[2] = _mag;
924    }
925
926    // now track the edges (hysteresis thresholding)
927    while (stack_top > stack_bottom)
928    {
929        uchar* m;
930        if ((stack_top - stack_bottom) + 8 > maxsize)
931        {
932            int sz = (int)(stack_top - stack_bottom);
933            maxsize = maxsize * 3/2;
934            stack.resize(maxsize);
935            stack_bottom = &stack[0];
936            stack_top = stack_bottom + sz;
937        }
938
939        CANNY_POP(m);
940
941        if (!m[-1])         CANNY_PUSH(m - 1);
942        if (!m[1])          CANNY_PUSH(m + 1);
943        if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
944        if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
945        if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
946        if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
947        if (!m[mapstep])    CANNY_PUSH(m + mapstep);
948        if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
949    }
950
951#endif
952
953    // the final pass, form the final image
954    const uchar* pmap = map + mapstep + 1;
955    uchar* pdst = dst.ptr();
956    for (int i = 0; i < src.rows; i++, pmap += mapstep, pdst += dst.step)
957    {
958        for (int j = 0; j < src.cols; j++)
959            pdst[j] = (uchar)-(pmap[j] >> 1);
960    }
961}
962
963void cvCanny( const CvArr* image, CvArr* edges, double threshold1,
964              double threshold2, int aperture_size )
965{
966    cv::Mat src = cv::cvarrToMat(image), dst = cv::cvarrToMat(edges);
967    CV_Assert( src.size == dst.size && src.depth() == CV_8U && dst.type() == CV_8U );
968
969    cv::Canny(src, dst, threshold1, threshold2, aperture_size & 255,
970              (aperture_size & CV_CANNY_L2_GRADIENT) != 0);
971}
972
973/* End of file. */
974