1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
15// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16// Third party copyrights are property of their respective owners.
17//
18// Redistribution and use in source and binary forms, with or without modification,
19// are permitted provided that the following conditions are met:
20//
21//   * Redistribution's of source code must retain the above copyright notice,
22//     this list of conditions and the following disclaimer.
23//
24//   * Redistribution's in binary form must reproduce the above copyright notice,
25//     this list of conditions and the following disclaimer in the documentation
26//     and/or other materials provided with the distribution.
27//
28//   * The name of the copyright holders may not be used to endorse or promote products
29//     derived from this software without specific prior written permission.
30//
31// This software is provided by the copyright holders and contributors "as is" and
32// any express or implied warranties, including, but not limited to, the implied
33// warranties of merchantability and fitness for a particular purpose are disclaimed.
34// In no event shall the Intel Corporation or contributors be liable for any direct,
35// indirect, incidental, special, exemplary, or consequential damages
36// (including, but not limited to, procurement of substitute goods or services;
37// loss of use, data, or profits; or business interruption) however caused
38// and on any theory of liability, whether in contract, strict liability,
39// or tort (including negligence or otherwise) arising in any way out of
40// the use of this software, even if advised of the possibility of such damage.
41//
42//M*/
43
44/********************************* COPYRIGHT NOTICE *******************************\
45  The function for RGB to Lab conversion is based on the MATLAB script
46  RGB2Lab.m translated by Mark Ruzon from C code by Yossi Rubner, 23 September 1997.
47  See the page [http://vision.stanford.edu/~ruzon/software/rgblab.html]
48\**********************************************************************************/
49
50/********************************* COPYRIGHT NOTICE *******************************\
51  Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
52  from MD-Mathematische Dienste GmbH. Below is the copyright notice:
53
54    IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
55    By downloading, copying, installing or using the software you agree
56    to this license. If you do not agree to this license, do not download,
57    install, copy or use the software.
58
59    Contributors License Agreement:
60
61      Copyright (c) 2002,
62      MD-Mathematische Dienste GmbH
63      Im Defdahl 5-10
64      44141 Dortmund
65      Germany
66      www.md-it.de
67
68    Redistribution and use in source and binary forms,
69    with or without modification, are permitted provided
70    that the following conditions are met:
71
72    Redistributions of source code must retain
73    the above copyright notice, this list of conditions and the following disclaimer.
74    Redistributions in binary form must reproduce the above copyright notice,
75    this list of conditions and the following disclaimer in the documentation
76    and/or other materials provided with the distribution.
77    The name of Contributor may not be used to endorse or promote products
78    derived from this software without specific prior written permission.
79
80    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
81    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
82    THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
83    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
84    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
85    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
86    OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
87    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
88    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
89    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
90    THE POSSIBILITY OF SUCH DAMAGE.
91\**********************************************************************************/
92
93#include "precomp.hpp"
94#include "opencl_kernels_imgproc.hpp"
95#include <limits>
96
// Rounding right shift: adds 2^(n-1) to x, then shifts the sum right by n bits.
#define CV_DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
98
#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
// Runs ippInit() once while this translation unit's statics are initialized.
static IppStatus sts = ippInit();

// Per-depth constants passed as the trailing argument of the
// ippiSwapChannels_*_C3C4R calls made by the wrappers in this file.
#define MAX_IPP8u   255
#define MAX_IPP16u  65535
#define MAX_IPP32f  1.0
#endif
105
106namespace cv
107{
108
// Computes cubic spline coefficients for a function sampled at integer
// abscissas: (xi = i, yi = f[i]), i = 0..n.
//
//   f   - input samples; elements f[0]..f[n] are read (n+1 values)
//   n   - number of spline segments; nothing is done when n <= 0
//   tab - output, 4 values per segment: tab[i*4 .. i*4+3] = {f[i], b, c, d},
//         i.e. the polynomial f[i] + b*t + c*t^2 + d*t^3 with t = x - i
//
// tab doubles as scratch storage for the forward elimination pass, so it
// must hold at least n*4 elements. Its previous contents are never read.
template<typename _Tp> static void splineBuild(const _Tp* f, int n, _Tp* tab)
{
    if( n <= 0 )
        return; // no segments: do not write to a possibly empty table

    _Tp cn = 0;
    int i;
    tab[0] = tab[1] = (_Tp)0;

    // forward pass: slot 0 keeps the pivot reciprocal, slot 1 the reduced right-hand side
    for(i = 1; i < n-1; i++)
    {
        _Tp t = 3*(f[i+1] - 2*f[i] + f[i-1]);
        _Tp l = 1/(4 - tab[(i-1)*4]);
        tab[i*4] = l; tab[i*4+1] = (t - tab[(i-1)*4+1])*l;
    }

    // The forward pass stops at segment n-2, yet the backward pass starts by
    // reading the scratch slots of segment n-1. Clear them explicitly instead
    // of relying on the caller to hand in a zero-filled table (the result is
    // identical for zero-filled tables).
    if( n > 1 )
        tab[(n-1)*4] = tab[(n-1)*4+1] = (_Tp)0;

    // backward pass: recover c, then derive b and d; cn is c of the next segment
    for(i = n-1; i >= 0; i--)
    {
        _Tp c = tab[i*4+1] - tab[i*4]*cn;
        _Tp b = f[i+1] - f[i] - (cn + c*2)*(_Tp)0.3333333333333333;
        _Tp d = (cn - c)*(_Tp)0.3333333333333333;
        tab[i*4] = f[i]; tab[i*4+1] = b;
        tab[i*4+2] = c; tab[i*4+3] = d;
        cn = c;
    }
}
133
// Evaluates a cubic spline at x (intended range 0 <= x <= n).
// tab stores 4 coefficients per segment and n is the number of segments;
// the segment index is int(x) clamped to [0, n-1].
template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab, int n)
{
    // keep this small: per the original note, some versions of gcc fail to inline it correctly
    int seg = int(x);
    if( seg < 0 )
        seg = 0;
    if( seg > n-1 )
        seg = n-1;
    const _Tp* coef = tab + seg*4;
    x -= seg; // offset inside the selected segment
    return ((coef[3]*x + coef[2])*x + coef[1])*x + coef[0];
}
143
144
// Per-channel-type constants: max() is the largest value of the type,
// half() is max()/2 + 1 converted back to the type.
template<typename _Tp> struct ColorChannel
{
    typedef float worktype_f;

    static _Tp max()
    {
        return std::numeric_limits<_Tp>::max();
    }

    static _Tp half()
    {
        return (_Tp)(max()/2 + 1);
    }
};

// float channels use 1.0 as the maximum and 0.5 as the half value.
template<> struct ColorChannel<float>
{
    typedef float worktype_f;

    static float max()
    {
        return 1.f;
    }

    static float half()
    {
        return 0.5f;
    }
};
158
159/*template<> struct ColorChannel<double>
160{
161    typedef double worktype_f;
162    static double max() { return 1.; }
163    static double half() { return 0.5; }
164};*/
165
166
167///////////////////////////// Top-level template function ////////////////////////////////
168
169template <typename Cvt>
170class CvtColorLoop_Invoker : public ParallelLoopBody
171{
172    typedef typename Cvt::channel_type _Tp;
173public:
174
175    CvtColorLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt) :
176        ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt)
177    {
178    }
179
180    virtual void operator()(const Range& range) const
181    {
182        const uchar* yS = src.ptr<uchar>(range.start);
183        uchar* yD = dst.ptr<uchar>(range.start);
184
185        for( int i = range.start; i < range.end; ++i, yS += src.step, yD += dst.step )
186            cvt((const _Tp*)yS, (_Tp*)yD, src.cols);
187    }
188
189private:
190    const Mat& src;
191    Mat& dst;
192    const Cvt& cvt;
193
194    const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
195};
196
197template <typename Cvt>
198void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
199{
200    parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
201}
202
203#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
204
205typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *);
206typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize);
207typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *);
208
209template <typename Cvt>
210class CvtColorIPPLoop_Invoker :
211        public ParallelLoopBody
212{
213public:
214
215    CvtColorIPPLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt, bool *_ok) :
216        ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt), ok(_ok)
217    {
218        *ok = true;
219    }
220
221    virtual void operator()(const Range& range) const
222    {
223        const void *yS = src.ptr<uchar>(range.start);
224        void *yD = dst.ptr<uchar>(range.start);
225        if( !cvt(yS, (int)src.step[0], yD, (int)dst.step[0], src.cols, range.end - range.start) )
226            *ok = false;
227        else
228        {
229            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
230        }
231    }
232
233private:
234    const Mat& src;
235    Mat& dst;
236    const Cvt& cvt;
237    bool *ok;
238
239    const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
240};
241
242template <typename Cvt>
243bool CvtColorIPPLoop(const Mat& src, Mat& dst, const Cvt& cvt)
244{
245    bool ok;
246    parallel_for_(Range(0, src.rows), CvtColorIPPLoop_Invoker<Cvt>(src, dst, cvt, &ok), src.total()/(double)(1<<16) );
247    return ok;
248}
249
250template <typename Cvt>
251bool CvtColorIPPLoopCopy(Mat& src, Mat& dst, const Cvt& cvt)
252{
253    Mat temp;
254    Mat &source = src;
255    if( src.data == dst.data )
256    {
257        src.copyTo(temp);
258        source = temp;
259    }
260    bool ok;
261    parallel_for_(Range(0, source.rows), CvtColorIPPLoop_Invoker<Cvt>(source, dst, cvt, &ok),
262                  source.total()/(double)(1<<16) );
263    return ok;
264}
265
266static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep,
267         IppiSize roiSize, const int *dstOrder)
268{
269    return ippiSwapChannels_8u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u);
270}
271
272static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep,
273         IppiSize roiSize, const int *dstOrder)
274{
275    return ippiSwapChannels_16u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u);
276}
277
278static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep,
279         IppiSize roiSize, const int *dstOrder)
280{
281    return ippiSwapChannels_32f_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f);
282}
283
284static ippiReorderFunc ippiSwapChannelsC3C4RTab[] =
285{
286    (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0,
287    0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0
288};
289
290static ippiGeneralFunc ippiCopyAC4C3RTab[] =
291{
292    (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0,
293    0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0
294};
295
296static ippiReorderFunc ippiSwapChannelsC4C3RTab[] =
297{
298    (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0,
299    0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0
300};
301
302static ippiReorderFunc ippiSwapChannelsC3RTab[] =
303{
304    (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0,
305    0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0
306};
307
308#if IPP_VERSION_X100 >= 801
309static ippiReorderFunc ippiSwapChannelsC4RTab[] =
310{
311    (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0,
312    0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0
313};
314#endif
315
316static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
317{
318    (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
319    0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
320};
321
322static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
323{
324    (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
325    0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
326};
327
328static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
329{
330    (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
331    0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
332};
333
334static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
335{
336    (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
337    0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
338};
339
340static ippiGeneralFunc ippiCopyP3C3RTab[] =
341{
342    (ippiGeneralFunc)ippiCopy_8u_P3C3R, 0, (ippiGeneralFunc)ippiCopy_16u_P3C3R, 0,
343    0, (ippiGeneralFunc)ippiCopy_32f_P3C3R, 0, 0
344};
345
346static ippiGeneralFunc ippiRGB2XYZTab[] =
347{
348    (ippiGeneralFunc)ippiRGBToXYZ_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToXYZ_16u_C3R, 0,
349    0, (ippiGeneralFunc)ippiRGBToXYZ_32f_C3R, 0, 0
350};
351
352static ippiGeneralFunc ippiXYZ2RGBTab[] =
353{
354    (ippiGeneralFunc)ippiXYZToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiXYZToRGB_16u_C3R, 0,
355    0, (ippiGeneralFunc)ippiXYZToRGB_32f_C3R, 0, 0
356};
357
358static ippiGeneralFunc ippiRGB2HSVTab[] =
359{
360    (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0,
361    0, 0, 0, 0
362};
363
364static ippiGeneralFunc ippiHSV2RGBTab[] =
365{
366    (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0,
367    0, 0, 0, 0
368};
369
370static ippiGeneralFunc ippiRGB2HLSTab[] =
371{
372    (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0,
373    0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0
374};
375
376static ippiGeneralFunc ippiHLS2RGBTab[] =
377{
378    (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0,
379    0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0
380};
381
382#if !defined(HAVE_IPP_ICV_ONLY) && 0
383static ippiGeneralFunc ippiRGBToLUVTab[] =
384{
385    (ippiGeneralFunc)ippiRGBToLUV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToLUV_16u_C3R, 0,
386    0, (ippiGeneralFunc)ippiRGBToLUV_32f_C3R, 0, 0
387};
388
389static ippiGeneralFunc ippiLUVToRGBTab[] =
390{
391    (ippiGeneralFunc)ippiLUVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiLUVToRGB_16u_C3R, 0,
392    0, (ippiGeneralFunc)ippiLUVToRGB_32f_C3R, 0, 0
393};
394#endif
395
396struct IPPGeneralFunctor
397{
398    IPPGeneralFunctor(ippiGeneralFunc _func) : func(_func){}
399    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
400    {
401        return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false;
402    }
403private:
404    ippiGeneralFunc func;
405};
406
407struct IPPReorderFunctor
408{
409    IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : func(_func)
410    {
411        order[0] = _order0;
412        order[1] = _order1;
413        order[2] = _order2;
414        order[3] = 3;
415    }
416    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
417    {
418        return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false;
419    }
420private:
421    ippiReorderFunc func;
422    int order[4];
423};
424
425struct IPPColor2GrayFunctor
426{
427    IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
428        func(_func)
429    {
430        coeffs[0] = 0.114f;
431        coeffs[1] = 0.587f;
432        coeffs[2] = 0.299f;
433    }
434    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
435    {
436        return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false;
437    }
438private:
439    ippiColor2GrayFunc func;
440    Ipp32f coeffs[3];
441};
442
443struct IPPGray2BGRFunctor
444{
445    IPPGray2BGRFunctor(ippiGeneralFunc _func) :
446        func(_func)
447    {
448    }
449
450    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
451    {
452        if (func == 0)
453            return false;
454
455        const void* srcarray[3] = { src, src, src };
456        return func(srcarray, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0;
457    }
458private:
459    ippiGeneralFunc func;
460};
461
462struct IPPGray2BGRAFunctor
463{
464    IPPGray2BGRAFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _depth) :
465        func1(_func1), func2(_func2), depth(_depth)
466    {
467    }
468
469    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
470    {
471        if (func1 == 0 || func2 == 0)
472            return false;
473
474        const void* srcarray[3] = { src, src, src };
475        Mat temp(rows, cols, CV_MAKETYPE(depth, 3));
476        if(func1(srcarray, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
477            return false;
478        int order[4] = {0, 1, 2, 3};
479        return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
480    }
481private:
482    ippiGeneralFunc func1;
483    ippiReorderFunc func2;
484    int depth;
485};
486
487struct IPPReorderGeneralFunctor
488{
489    IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
490        func1(_func1), func2(_func2), depth(_depth)
491    {
492        order[0] = _order0;
493        order[1] = _order1;
494        order[2] = _order2;
495        order[3] = 3;
496    }
497    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
498    {
499        if (func1 == 0 || func2 == 0)
500            return false;
501
502        Mat temp;
503        temp.create(rows, cols, CV_MAKETYPE(depth, 3));
504        if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0)
505            return false;
506        return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
507    }
508private:
509    ippiReorderFunc func1;
510    ippiGeneralFunc func2;
511    int order[4];
512    int depth;
513};
514
515struct IPPGeneralReorderFunctor
516{
517    IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
518        func1(_func1), func2(_func2), depth(_depth)
519    {
520        order[0] = _order0;
521        order[1] = _order1;
522        order[2] = _order2;
523        order[3] = 3;
524    }
525    bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
526    {
527        if (func1 == 0 || func2 == 0)
528            return false;
529
530        Mat temp;
531        temp.create(rows, cols, CV_MAKETYPE(depth, 3));
532        if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
533            return false;
534        return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
535    }
536private:
537    ippiGeneralFunc func1;
538    ippiReorderFunc func2;
539    int order[4];
540    int depth;
541};
542
543#endif
544
545////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
546
547template<typename _Tp> struct RGB2RGB
548{
549    typedef _Tp channel_type;
550
551    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}
552    void operator()(const _Tp* src, _Tp* dst, int n) const
553    {
554        int scn = srccn, dcn = dstcn, bidx = blueIdx;
555        if( dcn == 3 )
556        {
557            n *= 3;
558            for( int i = 0; i < n; i += 3, src += scn )
559            {
560                _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
561                dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
562            }
563        }
564        else if( scn == 3 )
565        {
566            n *= 3;
567            _Tp alpha = ColorChannel<_Tp>::max();
568            for( int i = 0; i < n; i += 3, dst += 4 )
569            {
570                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
571                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
572            }
573        }
574        else
575        {
576            n *= 4;
577            for( int i = 0; i < n; i += 4 )
578            {
579                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
580                dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
581            }
582        }
583    }
584
585    int srccn, dstcn, blueIdx;
586};
587
588#if CV_NEON
589
590template<> struct RGB2RGB<uchar>
591{
592    typedef uchar channel_type;
593
594    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
595        srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
596    {
597        v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
598        v_alpha2 = vget_low_u8(v_alpha);
599    }
600
601    void operator()(const uchar * src, uchar * dst, int n) const
602    {
603        int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
604        if (dcn == 3)
605        {
606            n *= 3;
607            if (scn == 3)
608            {
609                for ( ; i <= n - 48; i += 48, src += 48 )
610                {
611                    uint8x16x3_t v_src = vld3q_u8(src), v_dst;
612                    v_dst.val[0] = v_src.val[bidx];
613                    v_dst.val[1] = v_src.val[1];
614                    v_dst.val[2] = v_src.val[bidx ^ 2];
615                    vst3q_u8(dst + i, v_dst);
616                }
617                for ( ; i <= n - 24; i += 24, src += 24 )
618                {
619                    uint8x8x3_t v_src = vld3_u8(src), v_dst;
620                    v_dst.val[0] = v_src.val[bidx];
621                    v_dst.val[1] = v_src.val[1];
622                    v_dst.val[2] = v_src.val[bidx ^ 2];
623                    vst3_u8(dst + i, v_dst);
624                }
625                for ( ; i < n; i += 3, src += 3 )
626                {
627                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
628                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
629                }
630            }
631            else
632            {
633                for ( ; i <= n - 48; i += 48, src += 64 )
634                {
635                    uint8x16x4_t v_src = vld4q_u8(src);
636                    uint8x16x3_t v_dst;
637                    v_dst.val[0] = v_src.val[bidx];
638                    v_dst.val[1] = v_src.val[1];
639                    v_dst.val[2] = v_src.val[bidx ^ 2];
640                    vst3q_u8(dst + i, v_dst);
641                }
642                for ( ; i <= n - 24; i += 24, src += 32 )
643                {
644                    uint8x8x4_t v_src = vld4_u8(src);
645                    uint8x8x3_t v_dst;
646                    v_dst.val[0] = v_src.val[bidx];
647                    v_dst.val[1] = v_src.val[1];
648                    v_dst.val[2] = v_src.val[bidx ^ 2];
649                    vst3_u8(dst + i, v_dst);
650                }
651                for ( ; i < n; i += 3, src += 4 )
652                {
653                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
654                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
655                }
656            }
657        }
658        else if (scn == 3)
659        {
660            n *= 3;
661            for ( ; i <= n - 48; i += 48, dst += 64 )
662            {
663                uint8x16x3_t v_src = vld3q_u8(src + i);
664                uint8x16x4_t v_dst;
665                v_dst.val[bidx] = v_src.val[0];
666                v_dst.val[1] = v_src.val[1];
667                v_dst.val[bidx ^ 2] = v_src.val[2];
668                v_dst.val[3] = v_alpha;
669                vst4q_u8(dst, v_dst);
670            }
671            for ( ; i <= n - 24; i += 24, dst += 32 )
672            {
673                uint8x8x3_t v_src = vld3_u8(src + i);
674                uint8x8x4_t v_dst;
675                v_dst.val[bidx] = v_src.val[0];
676                v_dst.val[1] = v_src.val[1];
677                v_dst.val[bidx ^ 2] = v_src.val[2];
678                v_dst.val[3] = v_alpha2;
679                vst4_u8(dst, v_dst);
680            }
681            uchar alpha = ColorChannel<uchar>::max();
682            for (; i < n; i += 3, dst += 4 )
683            {
684                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
685                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
686            }
687        }
688        else
689        {
690            n *= 4;
691            for ( ; i <= n - 64; i += 64 )
692            {
693                uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
694                v_dst.val[0] = v_src.val[2];
695                v_dst.val[1] = v_src.val[1];
696                v_dst.val[2] = v_src.val[0];
697                v_dst.val[3] = v_src.val[3];
698                vst4q_u8(dst + i, v_dst);
699            }
700            for ( ; i <= n - 32; i += 32 )
701            {
702                uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
703                v_dst.val[0] = v_src.val[2];
704                v_dst.val[1] = v_src.val[1];
705                v_dst.val[2] = v_src.val[0];
706                v_dst.val[3] = v_src.val[3];
707                vst4_u8(dst + i, v_dst);
708            }
709            for ( ; i < n; i += 4)
710            {
711                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
712                dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
713            }
714        }
715    }
716
717    int srccn, dstcn, blueIdx;
718
719    uint8x16_t v_alpha;
720    uint8x8_t v_alpha2;
721};
722
723#endif
724
725/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
726
727struct RGB5x52RGB
728{
729    typedef uchar channel_type;
730
731    RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
732        : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits)
733    {
734        #if CV_NEON
735        v_n3 = vdupq_n_u16(~3);
736        v_n7 = vdupq_n_u16(~7);
737        v_255 = vdupq_n_u8(255);
738        v_0 = vdupq_n_u8(0);
739        v_mask = vdupq_n_u16(0x8000);
740        #endif
741    }
742
743    void operator()(const uchar* src, uchar* dst, int n) const
744    {
745        int dcn = dstcn, bidx = blueIdx, i = 0;
746        if( greenBits == 6 )
747        {
748            #if CV_NEON
749            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
750            {
751                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
752                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
753                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)),
754                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3)));
755                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)),
756                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7)));
757                if (dcn == 3)
758                {
759                    uint8x16x3_t v_dst;
760                    v_dst.val[bidx] = v_b;
761                    v_dst.val[1] = v_g;
762                    v_dst.val[bidx^2] = v_r;
763                    vst3q_u8(dst, v_dst);
764                }
765                else
766                {
767                    uint8x16x4_t v_dst;
768                    v_dst.val[bidx] = v_b;
769                    v_dst.val[1] = v_g;
770                    v_dst.val[bidx^2] = v_r;
771                    v_dst.val[3] = v_255;
772                    vst4q_u8(dst, v_dst);
773                }
774            }
775            #endif
776            for( ; i < n; i++, dst += dcn )
777            {
778                unsigned t = ((const ushort*)src)[i];
779                dst[bidx] = (uchar)(t << 3);
780                dst[1] = (uchar)((t >> 3) & ~3);
781                dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);
782                if( dcn == 4 )
783                    dst[3] = 255;
784            }
785        }
786        else
787        {
788            #if CV_NEON
789            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
790            {
791                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
792                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
793                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)),
794                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7)));
795                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)),
796                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7)));
797                if (dcn == 3)
798                {
799                    uint8x16x3_t v_dst;
800                    v_dst.val[bidx] = v_b;
801                    v_dst.val[1] = v_g;
802                    v_dst.val[bidx^2] = v_r;
803                    vst3q_u8(dst, v_dst);
804                }
805                else
806                {
807                    uint8x16x4_t v_dst;
808                    v_dst.val[bidx] = v_b;
809                    v_dst.val[1] = v_g;
810                    v_dst.val[bidx^2] = v_r;
811                    v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)),
812                                                        vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0);
813                    vst4q_u8(dst, v_dst);
814                }
815            }
816            #endif
817            for( ; i < n; i++, dst += dcn )
818            {
819                unsigned t = ((const ushort*)src)[i];
820                dst[bidx] = (uchar)(t << 3);
821                dst[1] = (uchar)((t >> 2) & ~7);
822                dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);
823                if( dcn == 4 )
824                    dst[3] = t & 0x8000 ? 255 : 0;
825            }
826        }
827    }
828
829    int dstcn, blueIdx, greenBits;
830    #if CV_NEON
831    uint16x8_t v_n3, v_n7, v_mask;
832    uint8x16_t v_255, v_0;
833    #endif
834};
835
836
837struct RGB2RGB5x5
838{
839    typedef uchar channel_type;
840
841    RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
842        : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits)
843    {
844        #if CV_NEON
845        v_n3 = vdup_n_u8(~3);
846        v_n7 = vdup_n_u8(~7);
847        v_mask = vdupq_n_u16(0x8000);
848        v_0 = vdupq_n_u16(0);
849        v_full = vdupq_n_u16(0xffff);
850        #endif
851    }
852
853    void operator()(const uchar* src, uchar* dst, int n) const
854    {
855        int scn = srccn, bidx = blueIdx, i = 0;
856        if (greenBits == 6)
857        {
858            if (scn == 3)
859            {
860                #if CV_NEON
861                for ( ; i <= n - 8; i += 8, src += 24 )
862                {
863                    uint8x8x3_t v_src = vld3_u8(src);
864                    uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
865                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
866                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
867                    vst1q_u16((ushort *)dst + i, v_dst);
868                }
869                #endif
870                for ( ; i < n; i++, src += 3 )
871                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
872            }
873            else
874            {
875                #if CV_NEON
876                for ( ; i <= n - 8; i += 8, src += 32 )
877                {
878                    uint8x8x4_t v_src = vld4_u8(src);
879                    uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
880                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
881                    v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
882                    vst1q_u16((ushort *)dst + i, v_dst);
883                }
884                #endif
885                for ( ; i < n; i++, src += 4 )
886                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
887            }
888        }
889        else if (scn == 3)
890        {
891            #if CV_NEON
892            for ( ; i <= n - 8; i += 8, src += 24 )
893            {
894                uint8x8x3_t v_src = vld3_u8(src);
895                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
896                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
897                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7));
898                vst1q_u16((ushort *)dst + i, v_dst);
899            }
900            #endif
901            for ( ; i < n; i++, src += 3 )
902                ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
903        }
904        else
905        {
906            #if CV_NEON
907            for ( ; i <= n - 8; i += 8, src += 32 )
908            {
909                uint8x8x4_t v_src = vld4_u8(src);
910                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
911                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
912                v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7),
913                                                   vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0)));
914                vst1q_u16((ushort *)dst + i, v_dst);
915            }
916            #endif
917            for ( ; i < n; i++, src += 4 )
918                ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
919                    ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
920        }
921    }
922
923    int srccn, blueIdx, greenBits;
924    #if CV_NEON
925    uint8x8_t v_n3, v_n7;
926    uint16x8_t v_mask, v_0, v_full;
927    #endif
928};
929
930///////////////////////////////// Color to/from Grayscale ////////////////////////////////
931
932template<typename _Tp>
933struct Gray2RGB
934{
935    typedef _Tp channel_type;
936
937    Gray2RGB(int _dstcn) : dstcn(_dstcn) {}
938    void operator()(const _Tp* src, _Tp* dst, int n) const
939    {
940        if( dstcn == 3 )
941            for( int i = 0; i < n; i++, dst += 3 )
942            {
943                dst[0] = dst[1] = dst[2] = src[i];
944            }
945        else
946        {
947            _Tp alpha = ColorChannel<_Tp>::max();
948            for( int i = 0; i < n; i++, dst += 4 )
949            {
950                dst[0] = dst[1] = dst[2] = src[i];
951                dst[3] = alpha;
952            }
953        }
954    }
955
956    int dstcn;
957};
958
959
960struct Gray2RGB5x5
961{
962    typedef uchar channel_type;
963
964    Gray2RGB5x5(int _greenBits) : greenBits(_greenBits)
965    {
966        #if CV_NEON
967        v_n7 = vdup_n_u8(~7);
968        v_n3 = vdup_n_u8(~3);
969        #elif CV_SSE2
970        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
971        v_n7 = _mm_set1_epi16(~7);
972        v_n3 = _mm_set1_epi16(~3);
973        v_zero = _mm_setzero_si128();
974        #endif
975    }
976
977    void operator()(const uchar* src, uchar* dst, int n) const
978    {
979        int i = 0;
980        if( greenBits == 6 )
981        {
982            #if CV_NEON
983            for ( ; i <= n - 8; i += 8 )
984            {
985                uint8x8_t v_src = vld1_u8(src + i);
986                uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3));
987                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3));
988                v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8));
989                vst1q_u16((ushort *)dst + i, v_dst);
990            }
991            #elif CV_SSE2
992            if (haveSIMD)
993            {
994                for ( ; i <= n - 16; i += 16 )
995                {
996                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));
997
998                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
999                    __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
1000                                    _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
1001                                                 _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
1002                    _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);
1003
1004                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
1005                    v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
1006                            _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
1007                                         _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
1008                    _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
1009                }
1010            }
1011            #endif
1012            for ( ; i < n; i++ )
1013            {
1014                int t = src[i];
1015                ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
1016            }
1017        }
1018        else
1019        {
1020            #if CV_NEON
1021            for ( ; i <= n - 8; i += 8 )
1022            {
1023                uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3));
1024                uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10));
1025                vst1q_u16((ushort *)dst + i, v_dst);
1026            }
1027            #elif CV_SSE2
1028            if (haveSIMD)
1029            {
1030                for ( ; i <= n - 16; i += 8 )
1031                {
1032                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));
1033
1034                    __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3);
1035                    __m128i v_dst = _mm_or_si128(v_src_p,
1036                                    _mm_or_si128(_mm_slli_epi32(v_src_p, 5),
1037                                                 _mm_slli_epi16(v_src_p, 10)));
1038                    _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);
1039
1040                    v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3);
1041                    v_dst = _mm_or_si128(v_src_p,
1042                            _mm_or_si128(_mm_slli_epi16(v_src_p, 5),
1043                                         _mm_slli_epi16(v_src_p, 10)));
1044                    _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
1045                }
1046            }
1047            #endif
1048            for( ; i < n; i++ )
1049            {
1050                int t = src[i] >> 3;
1051                ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
1052            }
1053        }
1054    }
1055    int greenBits;
1056
1057    #if CV_NEON
1058    uint8x8_t v_n7, v_n3;
1059    #elif CV_SSE2
1060    __m128i v_n7, v_n3, v_zero;
1061    bool haveSIMD;
1062    #endif
1063};
1064
1065
// Make sure R2Y / G2Y / B2Y are not macros, so they can be declared as
// enumerators below. (Presumably they may be defined by an earlier header;
// not visible from here.)
#undef B2Y
#undef G2Y
#undef R2Y

enum
{
    // Number of fractional bits of the fixed-point luma weights.
    yuv_shift = 14,

    // Fixed-point luma weights; R2Y + G2Y + B2Y == 1 << yuv_shift.
    R2Y = 4899,
    G2Y = 9617,
    B2Y = 1868,

    // Not referenced in this part of the file; by its name a fixed-point
    // shift for the XYZ conversions (verify at the point of use).
    xyz_shift = 12,

    // Not referenced in this part of the file (verify at the point of use).
    BLOCK_SIZE = 256
};
1079
1080
1081struct RGB5x52Gray
1082{
1083    typedef uchar channel_type;
1084
1085    RGB5x52Gray(int _greenBits) : greenBits(_greenBits)
1086    {
1087        #if CV_NEON
1088        v_b2y = vdup_n_u16(B2Y);
1089        v_g2y = vdup_n_u16(G2Y);
1090        v_r2y = vdup_n_u16(R2Y);
1091        v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
1092        v_f8 = vdupq_n_u16(0xf8);
1093        v_fc = vdupq_n_u16(0xfc);
1094        #elif CV_SSE2
1095        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
1096        v_b2y = _mm_set1_epi16(B2Y);
1097        v_g2y = _mm_set1_epi16(G2Y);
1098        v_r2y = _mm_set1_epi16(R2Y);
1099        v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
1100        v_f8 = _mm_set1_epi16(0xf8);
1101        v_fc = _mm_set1_epi16(0xfc);
1102        #endif
1103    }
1104
1105    void operator()(const uchar* src, uchar* dst, int n) const
1106    {
1107        int i = 0;
1108        if( greenBits == 6 )
1109        {
1110            #if CV_NEON
1111            for ( ; i <= n - 8; i += 8)
1112            {
1113                uint16x8_t v_src = vld1q_u16((ushort *)src + i);
1114                uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
1115                           v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc),
1116                           v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8);
1117
1118                uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
1119                                              vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
1120                uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
1121                                              vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
1122                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
1123                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
1124
1125                vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
1126            }
1127            #elif CV_SSE2
1128            if (haveSIMD)
1129            {
1130                __m128i v_zero = _mm_setzero_si128();
1131
1132                for ( ; i <= n - 8; i += 8)
1133                {
1134                    __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
1135                    __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8),
1136                            v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc),
1137                            v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8);
1138
1139                    __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y);
1140                    __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y);
1141                    __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y);
1142                    __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y);
1143                    __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y);
1144                    __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y);
1145
1146                    __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b),
1147                                                   _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
1148                    v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta),
1149                                           _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r));
1150
1151                    __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b),
1152                                                   _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
1153                    v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta),
1154                                           _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r));
1155
1156                    v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift);
1157                    v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift);
1158
1159                    __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1);
1160                    _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero));
1161                }
1162            }
1163            #endif
1164            for ( ; i < n; i++)
1165            {
1166                int t = ((ushort*)src)[i];
1167                dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
1168                                           ((t >> 3) & 0xfc)*G2Y +
1169                                           ((t >> 8) & 0xf8)*R2Y, yuv_shift);
1170            }
1171        }
1172        else
1173        {
1174            #if CV_NEON
1175            for ( ; i <= n - 8; i += 8)
1176            {
1177                uint16x8_t v_src = vld1q_u16((ushort *)src + i);
1178                uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
1179                           v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8),
1180                           v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8);
1181
1182                uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
1183                                              vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
1184                uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
1185                                              vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
1186                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
1187                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
1188
1189                vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
1190            }
1191            #elif CV_SSE2
1192            if (haveSIMD)
1193            {
1194                __m128i v_zero = _mm_setzero_si128();
1195
1196                for ( ; i <= n - 8; i += 8)
1197                {
1198                    __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
1199                    __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8),
1200                            v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8),
1201                            v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8);
1202
1203                    __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y);
1204                    __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y);
1205                    __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y);
1206                    __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y);
1207                    __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y);
1208                    __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y);
1209
1210                    __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b),
1211                                                   _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
1212                    v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta),
1213                                           _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r));
1214
1215                    __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b),
1216                                                   _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
1217                    v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta),
1218                                           _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r));
1219
1220                    v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift);
1221                    v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift);
1222
1223                    __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1);
1224                    _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero));
1225                }
1226            }
1227            #endif
1228            for ( ; i < n; i++)
1229            {
1230                int t = ((ushort*)src)[i];
1231                dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
1232                                           ((t >> 2) & 0xf8)*G2Y +
1233                                           ((t >> 7) & 0xf8)*R2Y, yuv_shift);
1234            }
1235        }
1236    }
1237    int greenBits;
1238
1239    #if CV_NEON
1240    uint16x4_t v_b2y, v_g2y, v_r2y;
1241    uint32x4_t v_delta;
1242    uint16x8_t v_f8, v_fc;
1243    #elif CV_SSE2
1244    bool haveSIMD;
1245    __m128i v_b2y, v_g2y, v_r2y;
1246    __m128i v_delta;
1247    __m128i v_f8, v_fc;
1248    #endif
1249};
1250
1251
1252template<typename _Tp> struct RGB2Gray
1253{
1254    typedef _Tp channel_type;
1255
1256    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1257    {
1258        static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1259        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1260        if(blueIdx == 0)
1261            std::swap(coeffs[0], coeffs[2]);
1262    }
1263
1264    void operator()(const _Tp* src, _Tp* dst, int n) const
1265    {
1266        int scn = srccn;
1267        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1268        for(int i = 0; i < n; i++, src += scn)
1269            dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
1270    }
1271    int srccn;
1272    float coeffs[3];
1273};
1274
1275template<> struct RGB2Gray<uchar>
1276{
1277    typedef uchar channel_type;
1278
1279    RGB2Gray(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn)
1280    {
1281        const int coeffs0[] = { R2Y, G2Y, B2Y };
1282        if(!coeffs) coeffs = coeffs0;
1283
1284        int b = 0, g = 0, r = (1 << (yuv_shift-1));
1285        int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx];
1286
1287        for( int i = 0; i < 256; i++, b += db, g += dg, r += dr )
1288        {
1289            tab[i] = b;
1290            tab[i+256] = g;
1291            tab[i+512] = r;
1292        }
1293    }
1294    void operator()(const uchar* src, uchar* dst, int n) const
1295    {
1296        int scn = srccn;
1297        const int* _tab = tab;
1298        for(int i = 0; i < n; i++, src += scn)
1299            dst[i] = (uchar)((_tab[src[0]] + _tab[src[1]+256] + _tab[src[2]+512]) >> yuv_shift);
1300    }
1301    int srccn;
1302    int tab[256*3];
1303};
1304
1305#if CV_NEON
1306
1307template <>
1308struct RGB2Gray<ushort>
1309{
1310    typedef ushort channel_type;
1311
1312    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
1313        srccn(_srccn)
1314    {
1315        static const int coeffs0[] = { R2Y, G2Y, B2Y };
1316        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1317        if( blueIdx == 0 )
1318            std::swap(coeffs[0], coeffs[2]);
1319
1320        v_cb = vdup_n_u16(coeffs[0]);
1321        v_cg = vdup_n_u16(coeffs[1]);
1322        v_cr = vdup_n_u16(coeffs[2]);
1323        v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
1324    }
1325
1326    void operator()(const ushort* src, ushort* dst, int n) const
1327    {
1328        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
1329
1330        for ( ; i <= n - 8; i += 8, src += scn * 8)
1331        {
1332            uint16x8_t v_b, v_r, v_g;
1333            if (scn == 3)
1334            {
1335                uint16x8x3_t v_src = vld3q_u16(src);
1336                v_b = v_src.val[0];
1337                v_g = v_src.val[1];
1338                v_r = v_src.val[2];
1339            }
1340            else
1341            {
1342                uint16x8x4_t v_src = vld4q_u16(src);
1343                v_b = v_src.val[0];
1344                v_g = v_src.val[1];
1345                v_r = v_src.val[2];
1346            }
1347
1348            uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
1349                                           vmull_u16(vget_low_u16(v_b), v_cb),
1350                                                     vget_low_u16(v_g), v_cg),
1351                                                     vget_low_u16(v_r), v_cr);
1352            uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
1353                                           vmull_u16(vget_high_u16(v_b), v_cb),
1354                                                     vget_high_u16(v_g), v_cg),
1355                                                     vget_high_u16(v_r), v_cr);
1356
1357            uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
1358            uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));
1359
1360            vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
1361        }
1362
1363        for ( ; i <= n - 4; i += 4, src += scn * 4)
1364        {
1365            uint16x4_t v_b, v_r, v_g;
1366            if (scn == 3)
1367            {
1368                uint16x4x3_t v_src = vld3_u16(src);
1369                v_b = v_src.val[0];
1370                v_g = v_src.val[1];
1371                v_r = v_src.val[2];
1372            }
1373            else
1374            {
1375                uint16x4x4_t v_src = vld4_u16(src);
1376                v_b = v_src.val[0];
1377                v_g = v_src.val[1];
1378                v_r = v_src.val[2];
1379            }
1380
1381            uint32x4_t v_dst = vmlal_u16(vmlal_u16(
1382                                         vmull_u16(v_b, v_cb),
1383                                                   v_g, v_cg),
1384                                                   v_r, v_cr);
1385
1386            vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
1387        }
1388
1389        for( ; i < n; i++, src += scn)
1390            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1391    }
1392
1393    int srccn, coeffs[3];
1394    uint16x4_t v_cb, v_cg, v_cr;
1395    uint32x4_t v_delta;
1396};
1397
1398template <>
1399struct RGB2Gray<float>
1400{
1401    typedef float channel_type;
1402
1403    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1404    {
1405        static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1406        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1407        if(blueIdx == 0)
1408            std::swap(coeffs[0], coeffs[2]);
1409
1410        v_cb = vdupq_n_f32(coeffs[0]);
1411        v_cg = vdupq_n_f32(coeffs[1]);
1412        v_cr = vdupq_n_f32(coeffs[2]);
1413    }
1414
1415    void operator()(const float * src, float * dst, int n) const
1416    {
1417        int scn = srccn, i = 0;
1418        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1419
1420        if (scn == 3)
1421        {
1422            for ( ; i <= n - 8; i += 8, src += scn * 8)
1423            {
1424                float32x4x3_t v_src = vld3q_f32(src);
1425                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1426
1427                v_src = vld3q_f32(src + scn * 4);
1428                vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1429            }
1430
1431            for ( ; i <= n - 4; i += 4, src += scn * 4)
1432            {
1433                float32x4x3_t v_src = vld3q_f32(src);
1434                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1435            }
1436        }
1437        else
1438        {
1439            for ( ; i <= n - 8; i += 8, src += scn * 8)
1440            {
1441                float32x4x4_t v_src = vld4q_f32(src);
1442                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1443
1444                v_src = vld4q_f32(src + scn * 4);
1445                vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1446            }
1447
1448            for ( ; i <= n - 4; i += 4, src += scn * 4)
1449            {
1450                float32x4x4_t v_src = vld4q_f32(src);
1451                vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1452            }
1453        }
1454
1455        for ( ; i < n; i++, src += scn)
1456            dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
1457    }
1458
1459    int srccn;
1460    float coeffs[3];
1461    float32x4_t v_cb, v_cg, v_cr;
1462};
1463
1464#elif CV_SSE2
1465
1466#if CV_SSE4_1
1467
1468template <>
1469struct RGB2Gray<ushort>
1470{
1471    typedef ushort channel_type;
1472
1473    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
1474        srccn(_srccn)
1475    {
1476        static const int coeffs0[] = { R2Y, G2Y, B2Y };
1477        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1478        if( blueIdx == 0 )
1479            std::swap(coeffs[0], coeffs[2]);
1480
1481        v_cb = _mm_set1_epi16((short)coeffs[0]);
1482        v_cg = _mm_set1_epi16((short)coeffs[1]);
1483        v_cr = _mm_set1_epi16((short)coeffs[2]);
1484        v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
1485
1486        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
1487    }
1488
1489    // 16s x 8
1490    void process(__m128i v_b, __m128i v_g, __m128i v_r,
1491                 __m128i & v_gray) const
1492    {
1493        __m128i v_mullo_r = _mm_mullo_epi16(v_r, v_cr);
1494        __m128i v_mullo_g = _mm_mullo_epi16(v_g, v_cg);
1495        __m128i v_mullo_b = _mm_mullo_epi16(v_b, v_cb);
1496        __m128i v_mulhi_r = _mm_mulhi_epu16(v_r, v_cr);
1497        __m128i v_mulhi_g = _mm_mulhi_epu16(v_g, v_cg);
1498        __m128i v_mulhi_b = _mm_mulhi_epu16(v_b, v_cb);
1499
1500        __m128i v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_r, v_mulhi_r),
1501                                        _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
1502        v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), v_gray0);
1503        v_gray0 = _mm_srli_epi32(_mm_add_epi32(v_gray0, v_delta), yuv_shift);
1504
1505        __m128i v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r),
1506                                        _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
1507        v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), v_gray1);
1508        v_gray1 = _mm_srli_epi32(_mm_add_epi32(v_gray1, v_delta), yuv_shift);
1509
1510        v_gray = _mm_packus_epi32(v_gray0, v_gray1);
1511    }
1512
1513    void operator()(const ushort* src, ushort* dst, int n) const
1514    {
1515        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
1516
1517        if (scn == 3 && haveSIMD)
1518        {
1519            for ( ; i <= n - 16; i += 16, src += scn * 16)
1520            {
1521                __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
1522                __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
1523                __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
1524                __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
1525                __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
1526                __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
1527
1528                _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
1529
1530                __m128i v_gray0;
1531                process(v_r0, v_g0, v_b0,
1532                        v_gray0);
1533
1534                __m128i v_gray1;
1535                process(v_r1, v_g1, v_b1,
1536                        v_gray1);
1537
1538                _mm_storeu_si128((__m128i *)(dst + i), v_gray0);
1539                _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
1540            }
1541        }
1542        else if (scn == 4 && haveSIMD)
1543        {
1544            for ( ; i <= n - 16; i += 16, src += scn * 16)
1545            {
1546                __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
1547                __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
1548                __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
1549                __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
1550                __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
1551                __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
1552                __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
1553                __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
1554
1555                _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
1556
1557                __m128i v_gray0;
1558                process(v_r0, v_g0, v_b0,
1559                        v_gray0);
1560
1561                __m128i v_gray1;
1562                process(v_r1, v_g1, v_b1,
1563                        v_gray1);
1564
1565                _mm_storeu_si128((__m128i *)(dst + i), v_gray0);
1566                _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
1567            }
1568        }
1569
1570        for( ; i < n; i++, src += scn)
1571            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1572    }
1573
1574    int srccn, coeffs[3];
1575    __m128i v_cb, v_cg, v_cr;
1576    __m128i v_delta;
1577    bool haveSIMD;
1578};
1579
1580#endif // CV_SSE4_1
1581
1582template <>
1583struct RGB2Gray<float>
1584{
1585    typedef float channel_type;
1586
1587    RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1588    {
1589        static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1590        memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1591        if(blueIdx == 0)
1592            std::swap(coeffs[0], coeffs[2]);
1593
1594        v_cb = _mm_set1_ps(coeffs[0]);
1595        v_cg = _mm_set1_ps(coeffs[1]);
1596        v_cr = _mm_set1_ps(coeffs[2]);
1597
1598        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
1599    }
1600
1601    void process(__m128 v_b, __m128 v_g, __m128 v_r,
1602                 __m128 & v_gray) const
1603    {
1604        v_gray = _mm_mul_ps(v_r, v_cr);
1605        v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg));
1606        v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb));
1607    }
1608
1609    void operator()(const float * src, float * dst, int n) const
1610    {
1611        int scn = srccn, i = 0;
1612        float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1613
1614        if (scn == 3 && haveSIMD)
1615        {
1616            for ( ; i <= n - 8; i += 8, src += scn * 8)
1617            {
1618                __m128 v_r0 = _mm_loadu_ps(src);
1619                __m128 v_r1 = _mm_loadu_ps(src + 4);
1620                __m128 v_g0 = _mm_loadu_ps(src + 8);
1621                __m128 v_g1 = _mm_loadu_ps(src + 12);
1622                __m128 v_b0 = _mm_loadu_ps(src + 16);
1623                __m128 v_b1 = _mm_loadu_ps(src + 20);
1624
1625                _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
1626
1627                __m128 v_gray0;
1628                process(v_r0, v_g0, v_b0,
1629                        v_gray0);
1630
1631                __m128 v_gray1;
1632                process(v_r1, v_g1, v_b1,
1633                        v_gray1);
1634
1635                _mm_storeu_ps(dst + i, v_gray0);
1636                _mm_storeu_ps(dst + i + 4, v_gray1);
1637            }
1638        }
1639        else if (scn == 4 && haveSIMD)
1640        {
1641            for ( ; i <= n - 8; i += 8, src += scn * 8)
1642            {
1643                __m128 v_r0 = _mm_loadu_ps(src);
1644                __m128 v_r1 = _mm_loadu_ps(src + 4);
1645                __m128 v_g0 = _mm_loadu_ps(src + 8);
1646                __m128 v_g1 = _mm_loadu_ps(src + 12);
1647                __m128 v_b0 = _mm_loadu_ps(src + 16);
1648                __m128 v_b1 = _mm_loadu_ps(src + 20);
1649                __m128 v_a0 = _mm_loadu_ps(src + 24);
1650                __m128 v_a1 = _mm_loadu_ps(src + 28);
1651
1652                _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
1653
1654                __m128 v_gray0;
1655                process(v_r0, v_g0, v_b0,
1656                        v_gray0);
1657
1658                __m128 v_gray1;
1659                process(v_r1, v_g1, v_b1,
1660                        v_gray1);
1661
1662                _mm_storeu_ps(dst + i, v_gray0);
1663                _mm_storeu_ps(dst + i + 4, v_gray1);
1664            }
1665        }
1666
1667        for ( ; i < n; i++, src += scn)
1668            dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
1669    }
1670
1671    int srccn;
1672    float coeffs[3];
1673    __m128 v_cb, v_cg, v_cr;
1674    bool haveSIMD;
1675};
1676
1677#else
1678
1679template<> struct RGB2Gray<ushort>
1680{
1681    typedef ushort channel_type;
1682
1683    RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
1684    {
1685        static const int coeffs0[] = { R2Y, G2Y, B2Y };
1686        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1687        if( blueIdx == 0 )
1688            std::swap(coeffs[0], coeffs[2]);
1689    }
1690
1691    void operator()(const ushort* src, ushort* dst, int n) const
1692    {
1693        int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1694        for(int i = 0; i < n; i++, src += scn)
1695            dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1696    }
1697    int srccn;
1698    int coeffs[3];
1699};
1700
1701#endif
1702
1703///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
1704
1705template<typename _Tp> struct RGB2YCrCb_f
1706{
1707    typedef _Tp channel_type;
1708
1709    RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : srccn(_srccn), blueIdx(_blueIdx)
1710    {
1711        static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1712        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1713        if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
1714    }
1715
1716    void operator()(const _Tp* src, _Tp* dst, int n) const
1717    {
1718        int scn = srccn, bidx = blueIdx;
1719        const _Tp delta = ColorChannel<_Tp>::half();
1720        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1721        n *= 3;
1722        for(int i = 0; i < n; i += 3, src += scn)
1723        {
1724            _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
1725            _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta);
1726            _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta);
1727            dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1728        }
1729    }
1730    int srccn, blueIdx;
1731    float coeffs[5];
1732};
1733
1734#if CV_NEON
1735
// NEON specialisation of the floating-point Y/Cr/Cb conversion for float channels.
template <>
struct RGB2YCrCb_f<float>
{
    typedef float channel_type;

    // _srccn   : number of interleaved channels per source pixel.
    // _blueIdx : source channel index used for Cb; (_blueIdx ^ 2) is used for Cr.
    // _coeffs  : optional array of 5 floats; built-in defaults are used when null.
    RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
        srccn(_srccn), blueIdx(_blueIdx)
    {
        static const float defaultCoeffs[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
        const float * chosen = _coeffs ? _coeffs : defaultCoeffs;
        memcpy(coeffs, chosen, 5*sizeof(coeffs[0]));
        if (blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);

        // Broadcast each scalar constant into all four lanes once, up front.
        v_c0 = vdupq_n_f32(coeffs[0]);
        v_c1 = vdupq_n_f32(coeffs[1]);
        v_c2 = vdupq_n_f32(coeffs[2]);
        v_c3 = vdupq_n_f32(coeffs[3]);
        v_c4 = vdupq_n_f32(coeffs[4]);
        v_delta = vdupq_n_f32(ColorChannel<float>::half());
    }

    // Converts four pixels given as per-channel registers ch[0..2] into
    // per-component registers (val[0] = Y, val[1] = Cr, val[2] = Cb).
    float32x4x3_t convert4(const float32x4_t * ch) const
    {
        float32x4x3_t out;
        out.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(ch[0], v_c0), ch[1], v_c1), ch[2], v_c2);
        out.val[1] = vmlaq_f32(v_delta, vsubq_f32(ch[blueIdx^2], out.val[0]), v_c3);
        out.val[2] = vmlaq_f32(v_delta, vsubq_f32(ch[blueIdx], out.val[0]), v_c4);
        return out;
    }

    // Converts n source pixels; dst receives 3*n floats laid out as Y, Cr, Cb.
    void operator()(const float * src, float * dst, int n) const
    {
        const int step = srccn;
        const int cbIdx = blueIdx, crIdx = blueIdx ^ 2;
        const float offset = ColorChannel<float>::half();
        const float wY0 = coeffs[0], wY1 = coeffs[1], wY2 = coeffs[2];
        const float wCr = coeffs[3], wCb = coeffs[4];
        const int total = n * 3;   // number of destination elements
        int k = 0;

        if (step == 3)
        {
            // Four 3-channel pixels (12 floats in, 12 floats out) per iteration.
            while (k <= total - 12)
            {
                const float32x4x3_t px = vld3q_f32(src);
                vst3q_f32(dst + k, convert4(px.val));
                k += 12;
                src += 12;
            }
        }
        else
        {
            // Any other channel count is loaded as 4-channel pixels (16 floats in);
            // the fourth channel is ignored.
            while (k <= total - 12)
            {
                const float32x4x4_t px = vld4q_f32(src);
                vst3q_f32(dst + k, convert4(px.val));
                k += 12;
                src += 16;
            }
        }

        // Scalar tail for the remaining pixels.
        while (k < total)
        {
            const float Y = src[0]*wY0 + src[1]*wY1 + src[2]*wY2;
            const float Cr = (src[crIdx] - Y)*wCr + offset;
            const float Cb = (src[cbIdx] - Y)*wCb + offset;
            dst[k] = Y;
            dst[k+1] = Cr;
            dst[k+2] = Cb;
            k += 3;
            src += step;
        }
    }
    int srccn, blueIdx;
    float coeffs[5];
    float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
};
1798
1799#elif CV_SSE2
1800
1801template <>
1802struct RGB2YCrCb_f<float>
1803{
1804    typedef float channel_type;
1805
1806    RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
1807        srccn(_srccn), blueIdx(_blueIdx)
1808    {
1809        static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1810        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1811        if (blueIdx==0)
1812            std::swap(coeffs[0], coeffs[2]);
1813
1814        v_c0 = _mm_set1_ps(coeffs[0]);
1815        v_c1 = _mm_set1_ps(coeffs[1]);
1816        v_c2 = _mm_set1_ps(coeffs[2]);
1817        v_c3 = _mm_set1_ps(coeffs[3]);
1818        v_c4 = _mm_set1_ps(coeffs[4]);
1819        v_delta = _mm_set1_ps(ColorChannel<float>::half());
1820
1821        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
1822    }
1823
1824    void process(__m128 v_r, __m128 v_g, __m128 v_b,
1825                 __m128 & v_y, __m128 & v_cr, __m128 & v_cb) const
1826    {
1827        v_y = _mm_mul_ps(v_r, v_c0);
1828        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c1));
1829        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c2));
1830
1831        v_cr = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 0 ? v_b : v_r, v_y), v_c3), v_delta);
1832        v_cb = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 2 ? v_b : v_r, v_y), v_c4), v_delta);
1833    }
1834
1835    void operator()(const float * src, float * dst, int n) const
1836    {
1837        int scn = srccn, bidx = blueIdx, i = 0;
1838        const float delta = ColorChannel<float>::half();
1839        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1840        n *= 3;
1841
1842        if (haveSIMD)
1843        {
1844            for ( ; i <= n - 24; i += 24, src += 8 * scn)
1845            {
1846                __m128 v_r0 = _mm_loadu_ps(src);
1847                __m128 v_r1 = _mm_loadu_ps(src + 4);
1848                __m128 v_g0 = _mm_loadu_ps(src + 8);
1849                __m128 v_g1 = _mm_loadu_ps(src + 12);
1850                __m128 v_b0 = _mm_loadu_ps(src + 16);
1851                __m128 v_b1 = _mm_loadu_ps(src + 20);
1852
1853                if (scn == 4)
1854                {
1855                    __m128 v_a0 = _mm_loadu_ps(src + 24);
1856                    __m128 v_a1 = _mm_loadu_ps(src + 28);
1857                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
1858                                        v_b0, v_b1, v_a0, v_a1);
1859                }
1860                else
1861                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
1862
1863                __m128 v_y0, v_cr0, v_cb0;
1864                process(v_r0, v_g0, v_b0,
1865                        v_y0, v_cr0, v_cb0);
1866
1867                __m128 v_y1, v_cr1, v_cb1;
1868                process(v_r1, v_g1, v_b1,
1869                        v_y1, v_cr1, v_cb1);
1870
1871                _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
1872
1873                _mm_storeu_ps(dst + i, v_y0);
1874                _mm_storeu_ps(dst + i + 4, v_y1);
1875                _mm_storeu_ps(dst + i + 8, v_cr0);
1876                _mm_storeu_ps(dst + i + 12, v_cr1);
1877                _mm_storeu_ps(dst + i + 16, v_cb0);
1878                _mm_storeu_ps(dst + i + 20, v_cb1);
1879            }
1880        }
1881
1882        for ( ; i < n; i += 3, src += scn)
1883        {
1884            float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
1885            float Cr = (src[bidx^2] - Y)*C3 + delta;
1886            float Cb = (src[bidx] - Y)*C4 + delta;
1887            dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1888        }
1889    }
1890    int srccn, blueIdx;
1891    float coeffs[5];
1892    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
1893    bool haveSIMD;
1894};
1895
1896#endif
1897
1898template<typename _Tp> struct RGB2YCrCb_i
1899{
1900    typedef _Tp channel_type;
1901
1902    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1903        : srccn(_srccn), blueIdx(_blueIdx)
1904    {
1905        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1906        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1907        if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
1908    }
1909    void operator()(const _Tp* src, _Tp* dst, int n) const
1910    {
1911        int scn = srccn, bidx = blueIdx;
1912        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1913        int delta = ColorChannel<_Tp>::half()*(1 << yuv_shift);
1914        n *= 3;
1915        for(int i = 0; i < n; i += 3, src += scn)
1916        {
1917            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
1918            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
1919            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
1920            dst[i] = saturate_cast<_Tp>(Y);
1921            dst[i+1] = saturate_cast<_Tp>(Cr);
1922            dst[i+2] = saturate_cast<_Tp>(Cb);
1923        }
1924    }
1925    int srccn, blueIdx;
1926    int coeffs[5];
1927};
1928
1929#if CV_NEON
1930
// NEON specialisation of the fixed-point Y/Cr/Cb conversion for 8-bit channels.
template <>
struct RGB2YCrCb_i<uchar>
{
    typedef uchar channel_type;

    // _srccn   : number of interleaved channels per source pixel.
    // _blueIdx : source channel index used for Cb; (_blueIdx ^ 2) is used for Cr.
    // _coeffs  : optional array of 5 ints; built-in defaults are used when null.
    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
        : srccn(_srccn), blueIdx(_blueIdx)
    {
        static const int defaultCoeffs[] = {R2Y, G2Y, B2Y, 11682, 9241};
        const int * chosen = _coeffs ? _coeffs : defaultCoeffs;
        memcpy(coeffs, chosen, 5*sizeof(coeffs[0]));
        if (blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);

        // Luma weights are held as 16-bit lanes (they feed the widening multiplies);
        // chroma scales and offsets are held as 32-bit lanes.
        v_c0 = vdup_n_s16(coeffs[0]);
        v_c1 = vdup_n_s16(coeffs[1]);
        v_c2 = vdup_n_s16(coeffs[2]);
        v_c3 = vdupq_n_s32(coeffs[3]);
        v_c4 = vdupq_n_s32(coeffs[4]);
        v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift));
        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
    }

    // Converts four pixels whose channels ch[0..2] are already widened to signed
    // 16-bit lanes, producing 32-bit Y, Cr and Cb lanes (shifted down by yuv_shift
    // after adding v_delta2).
    void convert4(const int16x4_t * ch,
                  int32x4_t & v_y, int32x4_t & v_cr, int32x4_t & v_cb) const
    {
        v_y = vmlal_s16(vmlal_s16(vmull_s16(ch[0], v_c0), ch[1], v_c1), ch[2], v_c2);
        v_y = vshrq_n_s32(vaddq_s32(v_y, v_delta2), yuv_shift);

        v_cr = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(ch[blueIdx^2]), v_y), v_c3);
        v_cr = vshrq_n_s32(vaddq_s32(v_cr, v_delta2), yuv_shift);

        v_cb = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(ch[blueIdx]), v_y), v_c4);
        v_cb = vshrq_n_s32(vaddq_s32(v_cb, v_delta2), yuv_shift);
    }

    // Converts n source pixels; dst receives 3*n bytes laid out as Y, Cr, Cb.
    void operator()(const uchar * src, uchar * dst, int n) const
    {
        const int step = srccn;
        const int cbIdx = blueIdx, crIdx = blueIdx ^ 2;
        const int wY0 = coeffs[0], wY1 = coeffs[1], wY2 = coeffs[2];
        const int wCr = coeffs[3], wCb = coeffs[4];
        const int offset = ColorChannel<uchar>::half()*(1 << yuv_shift);
        const int total = n * 3;   // number of destination elements
        int k = 0;

        // Eight pixels (24 output bytes) per iteration.
        while (k <= total - 24)
        {
            // De-interleaving load, then zero-extend each channel to 16 bits.
            int16x8_t v_wide[3];
            if (step == 3)
            {
                const uint8x8x3_t v_px = vld3_u8(src);
                for (int c = 0; c < 3; ++c)
                    v_wide[c] = vreinterpretq_s16_u16(vmovl_u8(v_px.val[c]));
            }
            else
            {
                // Loaded as 4-channel pixels; the fourth channel is not used.
                const uint8x8x4_t v_px = vld4_u8(src);
                for (int c = 0; c < 3; ++c)
                    v_wide[c] = vreinterpretq_s16_u16(vmovl_u8(v_px.val[c]));
            }

            int16x4_t v_part[3];

            // Pixels 0..3.
            for (int c = 0; c < 3; ++c)
                v_part[c] = vget_low_s16(v_wide[c]);
            int32x4_t v_yLo, v_crLo, v_cbLo;
            convert4(v_part, v_yLo, v_crLo, v_cbLo);

            // Pixels 4..7.
            for (int c = 0; c < 3; ++c)
                v_part[c] = vget_high_s16(v_wide[c]);
            int32x4_t v_yHi, v_crHi, v_cbHi;
            convert4(v_part, v_yHi, v_crHi, v_cbHi);

            // Saturating narrow 32 -> 16 -> 8 bits (unsigned at the last step).
            uint8x8x3_t v_out;
            v_out.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_yLo), vqmovn_s32(v_yHi)));
            v_out.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_crLo), vqmovn_s32(v_crHi)));
            v_out.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_cbLo), vqmovn_s32(v_cbHi)));

            vst3_u8(dst + k, v_out);

            k += 24;
            src += step * 8;
        }

        // Scalar tail for the remaining pixels.
        while (k < total)
        {
            const int Y = CV_DESCALE(src[0]*wY0 + src[1]*wY1 + src[2]*wY2, yuv_shift);
            const int Cr = CV_DESCALE((src[crIdx] - Y)*wCr + offset, yuv_shift);
            const int Cb = CV_DESCALE((src[cbIdx] - Y)*wCb + offset, yuv_shift);
            dst[k] = saturate_cast<uchar>(Y);
            dst[k+1] = saturate_cast<uchar>(Cr);
            dst[k+2] = saturate_cast<uchar>(Cb);
            k += 3;
            src += step;
        }
    }
    int srccn, blueIdx, coeffs[5];
    int16x4_t v_c0, v_c1, v_c2;
    int32x4_t v_c3, v_c4, v_delta, v_delta2;
};
2024
// NEON specialisation of the fixed-point Y/Cr/Cb conversion for 16-bit channels.
template <>
struct RGB2YCrCb_i<ushort>
{
    typedef ushort channel_type;

    // _srccn   : number of interleaved channels per source pixel.
    // _blueIdx : source channel index used for Cb; (_blueIdx ^ 2) is used for Cr.
    // _coeffs  : optional array of 5 ints; built-in defaults are used when null.
    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
        : srccn(_srccn), blueIdx(_blueIdx)
    {
        static const int defaultCoeffs[] = {R2Y, G2Y, B2Y, 11682, 9241};
        const int * chosen = _coeffs ? _coeffs : defaultCoeffs;
        memcpy(coeffs, chosen, 5*sizeof(coeffs[0]));
        if (blueIdx == 0)
            std::swap(coeffs[0], coeffs[2]);

        // All constants are broadcast as 32-bit lanes; the arithmetic is done in 32 bits.
        v_c0 = vdupq_n_s32(coeffs[0]);
        v_c1 = vdupq_n_s32(coeffs[1]);
        v_c2 = vdupq_n_s32(coeffs[2]);
        v_c3 = vdupq_n_s32(coeffs[3]);
        v_c4 = vdupq_n_s32(coeffs[4]);
        v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift));
        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
    }

    // Converts four pixels whose channels ch[0..2] are already widened to 32-bit
    // lanes, producing 32-bit Y, Cr and Cb lanes (shifted down by yuv_shift after
    // adding v_delta2).
    void convert4(const int32x4_t * ch,
                  int32x4_t & v_y, int32x4_t & v_cr, int32x4_t & v_cb) const
    {
        v_y = vmlaq_s32(vmlaq_s32(vmulq_s32(ch[0], v_c0), ch[1], v_c1), ch[2], v_c2);
        v_y = vshrq_n_s32(vaddq_s32(v_y, v_delta2), yuv_shift);

        v_cr = vmlaq_s32(v_delta, vsubq_s32(ch[blueIdx^2], v_y), v_c3);
        v_cr = vshrq_n_s32(vaddq_s32(v_cr, v_delta2), yuv_shift);

        v_cb = vmlaq_s32(v_delta, vsubq_s32(ch[blueIdx], v_y), v_c4);
        v_cb = vshrq_n_s32(vaddq_s32(v_cb, v_delta2), yuv_shift);
    }

    // Converts n source pixels; dst receives 3*n elements laid out as Y, Cr, Cb.
    void operator()(const ushort * src, ushort * dst, int n) const
    {
        const int step = srccn;
        const int cbIdx = blueIdx, crIdx = blueIdx ^ 2;
        const int wY0 = coeffs[0], wY1 = coeffs[1], wY2 = coeffs[2];
        const int wCr = coeffs[3], wCb = coeffs[4];
        const int offset = ColorChannel<ushort>::half()*(1 << yuv_shift);
        const int total = n * 3;   // number of destination elements
        int k = 0;

        // Eight pixels (24 output elements) per iteration.
        while (k <= total - 24)
        {
            uint16x8_t v_raw[3];
            if (step == 3)
            {
                const uint16x8x3_t v_px = vld3q_u16(src);
                for (int c = 0; c < 3; ++c)
                    v_raw[c] = v_px.val[c];
            }
            else
            {
                // Loaded as 4-channel pixels; the fourth channel is not used.
                const uint16x8x4_t v_px = vld4q_u16(src);
                for (int c = 0; c < 3; ++c)
                    v_raw[c] = v_px.val[c];
            }

            int32x4_t v_ch[3];

            // Pixels 0..3: zero-extend the low halves to 32 bits.
            for (int c = 0; c < 3; ++c)
                v_ch[c] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_raw[c])));
            int32x4_t v_yLo, v_crLo, v_cbLo;
            convert4(v_ch, v_yLo, v_crLo, v_cbLo);

            // Pixels 4..7: zero-extend the high halves to 32 bits.
            for (int c = 0; c < 3; ++c)
                v_ch[c] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_raw[c])));
            int32x4_t v_yHi, v_crHi, v_cbHi;
            convert4(v_ch, v_yHi, v_crHi, v_cbHi);

            // Saturating narrow back to unsigned 16 bits.
            uint16x8x3_t v_out;
            v_out.val[0] = vcombine_u16(vqmovun_s32(v_yLo), vqmovun_s32(v_yHi));
            v_out.val[1] = vcombine_u16(vqmovun_s32(v_crLo), vqmovun_s32(v_crHi));
            v_out.val[2] = vcombine_u16(vqmovun_s32(v_cbLo), vqmovun_s32(v_cbHi));

            vst3q_u16(dst + k, v_out);

            k += 24;
            src += step * 8;
        }

        // Four pixels (12 output elements) per iteration for what is left.
        while (k <= total - 12)
        {
            uint16x4_t v_raw[3];
            if (step == 3)
            {
                const uint16x4x3_t v_px = vld3_u16(src);
                for (int c = 0; c < 3; ++c)
                    v_raw[c] = v_px.val[c];
            }
            else
            {
                const uint16x4x4_t v_px = vld4_u16(src);
                for (int c = 0; c < 3; ++c)
                    v_raw[c] = v_px.val[c];
            }

            int32x4_t v_ch[3];
            for (int c = 0; c < 3; ++c)
                v_ch[c] = vreinterpretq_s32_u32(vmovl_u16(v_raw[c]));

            int32x4_t v_y, v_cr, v_cb;
            convert4(v_ch, v_y, v_cr, v_cb);

            uint16x4x3_t v_out;
            v_out.val[0] = vqmovun_s32(v_y);
            v_out.val[1] = vqmovun_s32(v_cr);
            v_out.val[2] = vqmovun_s32(v_cb);

            vst3_u16(dst + k, v_out);

            k += 12;
            src += step * 4;
        }

        // Scalar tail for the remaining pixels.
        while (k < total)
        {
            const int Y = CV_DESCALE(src[0]*wY0 + src[1]*wY1 + src[2]*wY2, yuv_shift);
            const int Cr = CV_DESCALE((src[crIdx] - Y)*wCr + offset, yuv_shift);
            const int Cb = CV_DESCALE((src[cbIdx] - Y)*wCb + offset, yuv_shift);
            dst[k] = saturate_cast<ushort>(Y);
            dst[k+1] = saturate_cast<ushort>(Cr);
            dst[k+2] = saturate_cast<ushort>(Cb);
            k += 3;
            src += step;
        }
    }
    int srccn, blueIdx, coeffs[5];
    int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
};
2145
2146#elif CV_SSE4_1
2147
2148template <>
2149struct RGB2YCrCb_i<uchar>
2150{
2151    typedef uchar channel_type;
2152
2153    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
2154        : srccn(_srccn), blueIdx(_blueIdx)
2155    {
2156        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
2157        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
2158        if (blueIdx==0)
2159            std::swap(coeffs[0], coeffs[2]);
2160
2161        v_c0 = _mm_set1_epi32(coeffs[0]);
2162        v_c1 = _mm_set1_epi32(coeffs[1]);
2163        v_c2 = _mm_set1_epi32(coeffs[2]);
2164        v_c3 = _mm_set1_epi32(coeffs[3]);
2165        v_c4 = _mm_set1_epi32(coeffs[4]);
2166        v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
2167        v_delta = _mm_set1_epi32(ColorChannel<uchar>::half()*(1 << yuv_shift));
2168        v_delta = _mm_add_epi32(v_delta, v_delta2);
2169        v_zero = _mm_setzero_si128();
2170
2171        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
2172    }
2173
2174    // 16u x 8
2175    void process(__m128i v_r, __m128i v_g, __m128i v_b,
2176                 __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const
2177    {
2178        __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero);
2179        __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero);
2180        __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero);
2181
2182        __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
2183                       _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
2184                                     _mm_mullo_epi32(v_b_p, v_c2)));
2185        v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift);
2186
2187        __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3);
2188        __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4);
2189        v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift);
2190        v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift);
2191
2192        v_r_p = _mm_unpackhi_epi16(v_r, v_zero);
2193        v_g_p = _mm_unpackhi_epi16(v_g, v_zero);
2194        v_b_p = _mm_unpackhi_epi16(v_b, v_zero);
2195
2196        __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
2197                       _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
2198                                     _mm_mullo_epi32(v_b_p, v_c2)));
2199        v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift);
2200
2201        __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3);
2202        __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4);
2203        v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift);
2204        v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift);
2205
2206        v_y = _mm_packs_epi32(v_y0, v_y1);
2207        v_cr = _mm_packs_epi32(v_cr0, v_cr1);
2208        v_cb = _mm_packs_epi32(v_cb0, v_cb1);
2209    }
2210
2211    void operator()(const uchar * src, uchar * dst, int n) const
2212    {
2213        int scn = srccn, bidx = blueIdx, i = 0;
2214        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
2215        int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
2216        n *= 3;
2217
2218        if (haveSIMD)
2219        {
2220            for ( ; i <= n - 96; i += 96, src += scn * 32)
2221            {
2222                __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
2223                __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 16));
2224                __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 32));
2225                __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 48));
2226                __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64));
2227                __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80));
2228
2229                if (scn == 4)
2230                {
2231                    __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 96));
2232                    __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 112));
2233                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1,
2234                                          v_b0, v_b1, v_a0, v_a1);
2235                }
2236                else
2237                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
2238
2239                __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
2240                process(_mm_unpacklo_epi8(v_r0, v_zero),
2241                        _mm_unpacklo_epi8(v_g0, v_zero),
2242                        _mm_unpacklo_epi8(v_b0, v_zero),
2243                        v_y0, v_cr0, v_cb0);
2244
2245                __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero;
2246                process(_mm_unpackhi_epi8(v_r0, v_zero),
2247                        _mm_unpackhi_epi8(v_g0, v_zero),
2248                        _mm_unpackhi_epi8(v_b0, v_zero),
2249                        v_y1, v_cr1, v_cb1);
2250
2251                __m128i v_y_0 = _mm_packus_epi16(v_y0, v_y1);
2252                __m128i v_cr_0 = _mm_packus_epi16(v_cr0, v_cr1);
2253                __m128i v_cb_0 = _mm_packus_epi16(v_cb0, v_cb1);
2254
2255                process(_mm_unpacklo_epi8(v_r1, v_zero),
2256                        _mm_unpacklo_epi8(v_g1, v_zero),
2257                        _mm_unpacklo_epi8(v_b1, v_zero),
2258                        v_y0, v_cr0, v_cb0);
2259
2260                process(_mm_unpackhi_epi8(v_r1, v_zero),
2261                        _mm_unpackhi_epi8(v_g1, v_zero),
2262                        _mm_unpackhi_epi8(v_b1, v_zero),
2263                        v_y1, v_cr1, v_cb1);
2264
2265                __m128i v_y_1 = _mm_packus_epi16(v_y0, v_y1);
2266                __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1);
2267                __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1);
2268
2269                _mm_interleave_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1);
2270
2271                _mm_storeu_si128((__m128i *)(dst + i), v_y_0);
2272                _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1);
2273                _mm_storeu_si128((__m128i *)(dst + i + 32), v_cr_0);
2274                _mm_storeu_si128((__m128i *)(dst + i + 48), v_cr_1);
2275                _mm_storeu_si128((__m128i *)(dst + i + 64), v_cb_0);
2276                _mm_storeu_si128((__m128i *)(dst + i + 80), v_cb_1);
2277            }
2278        }
2279
2280        for ( ; i < n; i += 3, src += scn)
2281        {
2282            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
2283            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
2284            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
2285            dst[i] = saturate_cast<uchar>(Y);
2286            dst[i+1] = saturate_cast<uchar>(Cr);
2287            dst[i+2] = saturate_cast<uchar>(Cb);
2288        }
2289    }
2290
2291    int srccn, blueIdx, coeffs[5];
2292    __m128i v_c0, v_c1, v_c2;
2293    __m128i v_c3, v_c4, v_delta, v_delta2;
2294    __m128i v_zero;
2295    bool haveSIMD;
2296};
2297
2298template <>
2299struct RGB2YCrCb_i<ushort>
2300{
2301    typedef ushort channel_type;
2302
2303    RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
2304        : srccn(_srccn), blueIdx(_blueIdx)
2305    {
2306        static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
2307        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
2308        if (blueIdx==0)
2309            std::swap(coeffs[0], coeffs[2]);
2310
2311        v_c0 = _mm_set1_epi32(coeffs[0]);
2312        v_c1 = _mm_set1_epi32(coeffs[1]);
2313        v_c2 = _mm_set1_epi32(coeffs[2]);
2314        v_c3 = _mm_set1_epi32(coeffs[3]);
2315        v_c4 = _mm_set1_epi32(coeffs[4]);
2316        v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
2317        v_delta = _mm_set1_epi32(ColorChannel<ushort>::half()*(1 << yuv_shift));
2318        v_delta = _mm_add_epi32(v_delta, v_delta2);
2319        v_zero = _mm_setzero_si128();
2320
2321        haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
2322    }
2323
2324    // 16u x 8
2325    void process(__m128i v_r, __m128i v_g, __m128i v_b,
2326                 __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const
2327    {
2328        __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero);
2329        __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero);
2330        __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero);
2331
2332        __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
2333                       _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
2334                                     _mm_mullo_epi32(v_b_p, v_c2)));
2335        v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift);
2336
2337        __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3);
2338        __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4);
2339        v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift);
2340        v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift);
2341
2342        v_r_p = _mm_unpackhi_epi16(v_r, v_zero);
2343        v_g_p = _mm_unpackhi_epi16(v_g, v_zero);
2344        v_b_p = _mm_unpackhi_epi16(v_b, v_zero);
2345
2346        __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
2347                       _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
2348                                     _mm_mullo_epi32(v_b_p, v_c2)));
2349        v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift);
2350
2351        __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3);
2352        __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4);
2353        v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift);
2354        v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift);
2355
2356        v_y = _mm_packus_epi32(v_y0, v_y1);
2357        v_cr = _mm_packus_epi32(v_cr0, v_cr1);
2358        v_cb = _mm_packus_epi32(v_cb0, v_cb1);
2359    }
2360
2361    void operator()(const ushort * src, ushort * dst, int n) const
2362    {
2363        int scn = srccn, bidx = blueIdx, i = 0;
2364        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
2365        int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
2366        n *= 3;
2367
2368        if (haveSIMD)
2369        {
2370            for ( ; i <= n - 48; i += 48, src += scn * 16)
2371            {
2372                __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
2373                __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
2374                __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
2375                __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
2376                __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
2377                __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
2378
2379                if (scn == 4)
2380                {
2381                    __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
2382                    __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
2383
2384                    _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1,
2385                                           v_b0, v_b1, v_a0, v_a1);
2386                }
2387                else
2388                    _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
2389
2390                __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
2391                process(v_r0, v_g0, v_b0,
2392                        v_y0, v_cr0, v_cb0);
2393
2394                __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero;
2395                process(v_r1, v_g1, v_b1,
2396                        v_y1, v_cr1, v_cb1);
2397
2398                _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
2399
2400                _mm_storeu_si128((__m128i *)(dst + i), v_y0);
2401                _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1);
2402                _mm_storeu_si128((__m128i *)(dst + i + 16), v_cr0);
2403                _mm_storeu_si128((__m128i *)(dst + i + 24), v_cr1);
2404                _mm_storeu_si128((__m128i *)(dst + i + 32), v_cb0);
2405                _mm_storeu_si128((__m128i *)(dst + i + 40), v_cb1);
2406            }
2407        }
2408
2409        for ( ; i < n; i += 3, src += scn)
2410        {
2411            int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
2412            int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
2413            int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
2414            dst[i] = saturate_cast<ushort>(Y);
2415            dst[i+1] = saturate_cast<ushort>(Cr);
2416            dst[i+2] = saturate_cast<ushort>(Cb);
2417        }
2418    }
2419
2420    int srccn, blueIdx, coeffs[5];
2421    __m128i v_c0, v_c1, v_c2;
2422    __m128i v_c3, v_c4, v_delta, v_delta2;
2423    __m128i v_zero;
2424    bool haveSIMD;
2425};
2426
2427#endif // CV_SSE4_1
2428
2429template<typename _Tp> struct YCrCb2RGB_f
2430{
2431    typedef _Tp channel_type;
2432
2433    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
2434        : dstcn(_dstcn), blueIdx(_blueIdx)
2435    {
2436        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
2437        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2438    }
2439    void operator()(const _Tp* src, _Tp* dst, int n) const
2440    {
2441        int dcn = dstcn, bidx = blueIdx;
2442        const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
2443        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2444        n *= 3;
2445        for(int i = 0; i < n; i += 3, dst += dcn)
2446        {
2447            _Tp Y = src[i];
2448            _Tp Cr = src[i+1];
2449            _Tp Cb = src[i+2];
2450
2451            _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
2452            _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
2453            _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0);
2454
2455            dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
2456            if( dcn == 4 )
2457                dst[3] = alpha;
2458        }
2459    }
2460    int dstcn, blueIdx;
2461    float coeffs[4];
2462};
2463
2464#if CV_NEON
2465
2466template <>
2467struct YCrCb2RGB_f<float>
2468{
2469    typedef float channel_type;
2470
2471    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
2472        : dstcn(_dstcn), blueIdx(_blueIdx)
2473    {
2474        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
2475        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2476
2477        v_c0 = vdupq_n_f32(coeffs[0]);
2478        v_c1 = vdupq_n_f32(coeffs[1]);
2479        v_c2 = vdupq_n_f32(coeffs[2]);
2480        v_c3 = vdupq_n_f32(coeffs[3]);
2481        v_delta = vdupq_n_f32(ColorChannel<float>::half());
2482        v_alpha = vdupq_n_f32(ColorChannel<float>::max());
2483    }
2484
2485    void operator()(const float* src, float* dst, int n) const
2486    {
2487        int dcn = dstcn, bidx = blueIdx, i = 0;
2488        const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
2489        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2490        n *= 3;
2491
2492        if (dcn == 3)
2493            for ( ; i <= n - 12; i += 12, dst += 12)
2494            {
2495                float32x4x3_t v_src = vld3q_f32(src + i), v_dst;
2496                float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
2497
2498                v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
2499                v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
2500                v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
2501
2502                vst3q_f32(dst, v_dst);
2503            }
2504        else
2505            for ( ; i <= n - 12; i += 12, dst += 16)
2506            {
2507                float32x4x3_t v_src = vld3q_f32(src + i);
2508                float32x4x4_t v_dst;
2509                float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
2510
2511                v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
2512                v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
2513                v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
2514                v_dst.val[3] = v_alpha;
2515
2516                vst4q_f32(dst, v_dst);
2517            }
2518
2519        for ( ; i < n; i += 3, dst += dcn)
2520        {
2521            float Y = src[i], Cr = src[i+1], Cb = src[i+2];
2522
2523            float b = Y + (Cb - delta)*C3;
2524            float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
2525            float r = Y + (Cr - delta)*C0;
2526
2527            dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
2528            if( dcn == 4 )
2529                dst[3] = alpha;
2530        }
2531    }
2532    int dstcn, blueIdx;
2533    float coeffs[4];
2534    float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
2535};
2536
2537#elif CV_SSE2
2538
2539template <>
2540struct YCrCb2RGB_f<float>
2541{
2542    typedef float channel_type;
2543
2544    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
2545        : dstcn(_dstcn), blueIdx(_blueIdx)
2546    {
2547        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
2548        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2549
2550        v_c0 = _mm_set1_ps(coeffs[0]);
2551        v_c1 = _mm_set1_ps(coeffs[1]);
2552        v_c2 = _mm_set1_ps(coeffs[2]);
2553        v_c3 = _mm_set1_ps(coeffs[3]);
2554        v_delta = _mm_set1_ps(ColorChannel<float>::half());
2555        v_alpha = _mm_set1_ps(ColorChannel<float>::max());
2556
2557        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
2558    }
2559
2560    void process(__m128 v_y, __m128 v_cr, __m128 v_cb,
2561                 __m128 & v_r, __m128 & v_g, __m128 & v_b) const
2562    {
2563        v_cb = _mm_sub_ps(v_cb, v_delta);
2564        v_cr = _mm_sub_ps(v_cr, v_delta);
2565
2566        v_b = _mm_mul_ps(v_cb, v_c3);
2567        v_g = _mm_add_ps(_mm_mul_ps(v_cb, v_c2), _mm_mul_ps(v_cr, v_c1));
2568        v_r = _mm_mul_ps(v_cr, v_c0);
2569
2570        v_b = _mm_add_ps(v_b, v_y);
2571        v_g = _mm_add_ps(v_g, v_y);
2572        v_r = _mm_add_ps(v_r, v_y);
2573
2574        if (blueIdx == 0)
2575            std::swap(v_b, v_r);
2576    }
2577
2578    void operator()(const float* src, float* dst, int n) const
2579    {
2580        int dcn = dstcn, bidx = blueIdx, i = 0;
2581        const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
2582        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2583        n *= 3;
2584
2585        if (haveSIMD)
2586        {
2587            for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
2588            {
2589                __m128 v_y0 = _mm_loadu_ps(src + i);
2590                __m128 v_y1 = _mm_loadu_ps(src + i + 4);
2591                __m128 v_cr0 = _mm_loadu_ps(src + i + 8);
2592                __m128 v_cr1 = _mm_loadu_ps(src + i + 12);
2593                __m128 v_cb0 = _mm_loadu_ps(src + i + 16);
2594                __m128 v_cb1 = _mm_loadu_ps(src + i + 20);
2595
2596                _mm_deinterleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
2597
2598                __m128 v_r0, v_g0, v_b0;
2599                process(v_y0, v_cr0, v_cb0,
2600                        v_r0, v_g0, v_b0);
2601
2602                __m128 v_r1, v_g1, v_b1;
2603                process(v_y1, v_cr1, v_cb1,
2604                        v_r1, v_g1, v_b1);
2605
2606                __m128 v_a0 = v_alpha, v_a1 = v_alpha;
2607
2608                if (dcn == 3)
2609                    _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
2610                else
2611                    _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1,
2612                                      v_b0, v_b1, v_a0, v_a1);
2613
2614                _mm_storeu_ps(dst, v_r0);
2615                _mm_storeu_ps(dst + 4, v_r1);
2616                _mm_storeu_ps(dst + 8, v_g0);
2617                _mm_storeu_ps(dst + 12, v_g1);
2618                _mm_storeu_ps(dst + 16, v_b0);
2619                _mm_storeu_ps(dst + 20, v_b1);
2620
2621                if (dcn == 4)
2622                {
2623                    _mm_storeu_ps(dst + 24, v_a0);
2624                    _mm_storeu_ps(dst + 28, v_a1);
2625                }
2626            }
2627        }
2628
2629        for ( ; i < n; i += 3, dst += dcn)
2630        {
2631            float Y = src[i], Cr = src[i+1], Cb = src[i+2];
2632
2633            float b = Y + (Cb - delta)*C3;
2634            float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
2635            float r = Y + (Cr - delta)*C0;
2636
2637            dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
2638            if( dcn == 4 )
2639                dst[3] = alpha;
2640        }
2641    }
2642    int dstcn, blueIdx;
2643    float coeffs[4];
2644
2645    __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
2646    bool haveSIMD;
2647};
2648
2649#endif
2650
2651template<typename _Tp> struct YCrCb2RGB_i
2652{
2653    typedef _Tp channel_type;
2654
2655    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2656        : dstcn(_dstcn), blueIdx(_blueIdx)
2657    {
2658        static const int coeffs0[] = {22987, -11698, -5636, 29049};
2659        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2660    }
2661
2662    void operator()(const _Tp* src, _Tp* dst, int n) const
2663    {
2664        int dcn = dstcn, bidx = blueIdx;
2665        const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
2666        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2667        n *= 3;
2668        for(int i = 0; i < n; i += 3, dst += dcn)
2669        {
2670            _Tp Y = src[i];
2671            _Tp Cr = src[i+1];
2672            _Tp Cb = src[i+2];
2673
2674            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
2675            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
2676            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
2677
2678            dst[bidx] = saturate_cast<_Tp>(b);
2679            dst[1] = saturate_cast<_Tp>(g);
2680            dst[bidx^2] = saturate_cast<_Tp>(r);
2681            if( dcn == 4 )
2682                dst[3] = alpha;
2683        }
2684    }
2685    int dstcn, blueIdx;
2686    int coeffs[4];
2687};
2688
2689#if CV_NEON
2690
2691template <>
2692struct YCrCb2RGB_i<uchar>
2693{
2694    typedef uchar channel_type;
2695
2696    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2697        : dstcn(_dstcn), blueIdx(_blueIdx)
2698    {
2699        static const int coeffs0[] = {22987, -11698, -5636, 29049};
2700        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2701
2702        v_c0 = vdupq_n_s32(coeffs[0]);
2703        v_c1 = vdupq_n_s32(coeffs[1]);
2704        v_c2 = vdupq_n_s32(coeffs[2]);
2705        v_c3 = vdupq_n_s32(coeffs[3]);
2706        v_delta = vdup_n_s16(ColorChannel<uchar>::half());
2707        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
2708        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
2709    }
2710
2711    void operator()(const uchar* src, uchar* dst, int n) const
2712    {
2713        int dcn = dstcn, bidx = blueIdx, i = 0;
2714        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
2715        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2716        n *= 3;
2717
2718        for ( ; i <= n - 24; i += 24, dst += dcn * 8)
2719        {
2720            uint8x8x3_t v_src = vld3_u8(src + i);
2721            int16x8x3_t v_src16;
2722            v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
2723            v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
2724            v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
2725
2726            int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
2727                      v_Cr = vget_low_s16(v_src16.val[1]),
2728                      v_Cb = vget_low_s16(v_src16.val[2]);
2729
2730            int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
2731            v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
2732            int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
2733            v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
2734            int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
2735            v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
2736
2737            v_Y = vget_high_s16(v_src16.val[0]);
2738            v_Cr = vget_high_s16(v_src16.val[1]);
2739            v_Cb = vget_high_s16(v_src16.val[2]);
2740
2741            int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
2742            v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
2743            int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
2744            v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
2745            int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
2746            v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
2747
2748            uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1)));
2749            uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1)));
2750            uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1)));
2751
2752            if (dcn == 3)
2753            {
2754                uint8x8x3_t v_dst;
2755                v_dst.val[bidx] = v_b;
2756                v_dst.val[1] = v_g;
2757                v_dst.val[bidx^2] = v_r;
2758                vst3_u8(dst, v_dst);
2759            }
2760            else
2761            {
2762                uint8x8x4_t v_dst;
2763                v_dst.val[bidx] = v_b;
2764                v_dst.val[1] = v_g;
2765                v_dst.val[bidx^2] = v_r;
2766                v_dst.val[3] = v_alpha;
2767                vst4_u8(dst, v_dst);
2768            }
2769        }
2770
2771        for ( ; i < n; i += 3, dst += dcn)
2772        {
2773            uchar Y = src[i];
2774            uchar Cr = src[i+1];
2775            uchar Cb = src[i+2];
2776
2777            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
2778            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
2779            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
2780
2781            dst[bidx] = saturate_cast<uchar>(b);
2782            dst[1] = saturate_cast<uchar>(g);
2783            dst[bidx^2] = saturate_cast<uchar>(r);
2784            if( dcn == 4 )
2785                dst[3] = alpha;
2786        }
2787    }
2788    int dstcn, blueIdx;
2789    int coeffs[4];
2790
2791    int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2;
2792    int16x4_t v_delta;
2793    uint8x8_t v_alpha;
2794};
2795
2796template <>
2797struct YCrCb2RGB_i<ushort>
2798{
2799    typedef ushort channel_type;
2800
2801    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2802        : dstcn(_dstcn), blueIdx(_blueIdx)
2803    {
2804        static const int coeffs0[] = {22987, -11698, -5636, 29049};
2805        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2806
2807        v_c0 = vdupq_n_s32(coeffs[0]);
2808        v_c1 = vdupq_n_s32(coeffs[1]);
2809        v_c2 = vdupq_n_s32(coeffs[2]);
2810        v_c3 = vdupq_n_s32(coeffs[3]);
2811        v_delta = vdupq_n_s32(ColorChannel<ushort>::half());
2812        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
2813        v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
2814        v_alpha2 = vget_low_u16(v_alpha);
2815    }
2816
2817    void operator()(const ushort* src, ushort* dst, int n) const
2818    {
2819        int dcn = dstcn, bidx = blueIdx, i = 0;
2820        const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
2821        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2822        n *= 3;
2823
2824        for ( ; i <= n - 24; i += 24, dst += dcn * 8)
2825        {
2826            uint16x8x3_t v_src = vld3q_u16(src + i);
2827
2828            int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
2829                      v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
2830                      v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
2831
2832            int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
2833            v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
2834            int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
2835            v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
2836            int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
2837            v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
2838
2839            v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))),
2840            v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))),
2841            v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
2842
2843            int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
2844            v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
2845            int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
2846            v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
2847            int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
2848            v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
2849
2850            uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1));
2851            uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1));
2852            uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1));
2853
2854            if (dcn == 3)
2855            {
2856                uint16x8x3_t v_dst;
2857                v_dst.val[bidx] = v_b;
2858                v_dst.val[1] = v_g;
2859                v_dst.val[bidx^2] = v_r;
2860                vst3q_u16(dst, v_dst);
2861            }
2862            else
2863            {
2864                uint16x8x4_t v_dst;
2865                v_dst.val[bidx] = v_b;
2866                v_dst.val[1] = v_g;
2867                v_dst.val[bidx^2] = v_r;
2868                v_dst.val[3] = v_alpha;
2869                vst4q_u16(dst, v_dst);
2870            }
2871        }
2872
2873        for ( ; i <= n - 12; i += 12, dst += dcn * 4)
2874        {
2875            uint16x4x3_t v_src = vld3_u16(src + i);
2876
2877            int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
2878                      v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
2879                      v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
2880
2881            int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
2882            v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y);
2883            int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
2884            v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y);
2885            int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0);
2886            v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y);
2887
2888            uint16x4_t v_bd = vqmovun_s32(v_b);
2889            uint16x4_t v_gd = vqmovun_s32(v_g);
2890            uint16x4_t v_rd = vqmovun_s32(v_r);
2891
2892            if (dcn == 3)
2893            {
2894                uint16x4x3_t v_dst;
2895                v_dst.val[bidx] = v_bd;
2896                v_dst.val[1] = v_gd;
2897                v_dst.val[bidx^2] = v_rd;
2898                vst3_u16(dst, v_dst);
2899            }
2900            else
2901            {
2902                uint16x4x4_t v_dst;
2903                v_dst.val[bidx] = v_bd;
2904                v_dst.val[1] = v_gd;
2905                v_dst.val[bidx^2] = v_rd;
2906                v_dst.val[3] = v_alpha2;
2907                vst4_u16(dst, v_dst);
2908            }
2909        }
2910
2911        for ( ; i < n; i += 3, dst += dcn)
2912        {
2913            ushort Y = src[i];
2914            ushort Cr = src[i+1];
2915            ushort Cb = src[i+2];
2916
2917            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
2918            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
2919            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
2920
2921            dst[bidx] = saturate_cast<ushort>(b);
2922            dst[1] = saturate_cast<ushort>(g);
2923            dst[bidx^2] = saturate_cast<ushort>(r);
2924            if( dcn == 4 )
2925                dst[3] = alpha;
2926        }
2927    }
2928    int dstcn, blueIdx;
2929    int coeffs[4];
2930
2931    int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta;
2932    uint16x8_t v_alpha;
2933    uint16x4_t v_alpha2;
2934};
2935
2936#elif CV_SSE2
2937
2938template <>
2939struct YCrCb2RGB_i<uchar>
2940{
2941    typedef uchar channel_type;
2942
2943    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2944        : dstcn(_dstcn), blueIdx(_blueIdx)
2945    {
2946        static const int coeffs0[] = {22987, -11698, -5636, 29049};
2947        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2948
2949        v_c0 = _mm_set1_epi16((short)coeffs[0]);
2950        v_c1 = _mm_set1_epi16((short)coeffs[1]);
2951        v_c2 = _mm_set1_epi16((short)coeffs[2]);
2952        v_c3 = _mm_set1_epi16((short)coeffs[3]);
2953        v_delta = _mm_set1_epi16(ColorChannel<uchar>::half());
2954        v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
2955        v_zero = _mm_setzero_si128();
2956
2957        uchar alpha = ColorChannel<uchar>::max();
2958        v_alpha = _mm_set1_epi8(*(char *)&alpha);
2959
2960        useSSE = coeffs[0] <= std::numeric_limits<short>::max();
2961        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
2962    }
2963
2964    // 16s x 8
2965    void process(__m128i v_y, __m128i v_cr, __m128i v_cb,
2966                 __m128i & v_r, __m128i & v_g, __m128i & v_b) const
2967    {
2968        v_cr = _mm_sub_epi16(v_cr, v_delta);
2969        v_cb = _mm_sub_epi16(v_cb, v_delta);
2970
2971        __m128i v_y_p = _mm_unpacklo_epi16(v_y, v_zero);
2972
2973        __m128i v_mullo_3 = _mm_mullo_epi16(v_cb, v_c3);
2974        __m128i v_mullo_2 = _mm_mullo_epi16(v_cb, v_c2);
2975        __m128i v_mullo_1 = _mm_mullo_epi16(v_cr, v_c1);
2976        __m128i v_mullo_0 = _mm_mullo_epi16(v_cr, v_c0);
2977
2978        __m128i v_mulhi_3 = _mm_mulhi_epi16(v_cb, v_c3);
2979        __m128i v_mulhi_2 = _mm_mulhi_epi16(v_cb, v_c2);
2980        __m128i v_mulhi_1 = _mm_mulhi_epi16(v_cr, v_c1);
2981        __m128i v_mulhi_0 = _mm_mulhi_epi16(v_cr, v_c0);
2982
2983        __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
2984        __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2),
2985                                                                  _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
2986                                      yuv_shift);
2987        __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);
2988
2989        v_r0 = _mm_add_epi32(v_r0, v_y_p);
2990        v_g0 = _mm_add_epi32(v_g0, v_y_p);
2991        v_b0 = _mm_add_epi32(v_b0, v_y_p);
2992
2993        v_y_p = _mm_unpackhi_epi16(v_y, v_zero);
2994
2995        __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
2996        __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2),
2997                                                                  _mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
2998                                      yuv_shift);
2999        __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);
3000
3001        v_r1 = _mm_add_epi32(v_r1, v_y_p);
3002        v_g1 = _mm_add_epi32(v_g1, v_y_p);
3003        v_b1 = _mm_add_epi32(v_b1, v_y_p);
3004
3005        v_r = _mm_packs_epi32(v_r0, v_r1);
3006        v_g = _mm_packs_epi32(v_g0, v_g1);
3007        v_b = _mm_packs_epi32(v_b0, v_b1);
3008    }
3009
3010    void operator()(const uchar* src, uchar* dst, int n) const
3011    {
3012        int dcn = dstcn, bidx = blueIdx, i = 0;
3013        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
3014        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
3015        n *= 3;
3016
3017        if (haveSIMD && useSSE)
3018        {
3019            for ( ; i <= n - 96; i += 96, dst += dcn * 32)
3020            {
3021                __m128i v_y0 = _mm_loadu_si128((__m128i const *)(src + i));
3022                __m128i v_y1 = _mm_loadu_si128((__m128i const *)(src + i + 16));
3023                __m128i v_cr0 = _mm_loadu_si128((__m128i const *)(src + i + 32));
3024                __m128i v_cr1 = _mm_loadu_si128((__m128i const *)(src + i + 48));
3025                __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64));
3026                __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80));
3027
3028                _mm_deinterleave_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
3029
3030                __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero;
3031                process(_mm_unpacklo_epi8(v_y0, v_zero),
3032                        _mm_unpacklo_epi8(v_cr0, v_zero),
3033                        _mm_unpacklo_epi8(v_cb0, v_zero),
3034                        v_r_0, v_g_0, v_b_0);
3035
3036                __m128i v_r_1 = v_zero, v_g_1 = v_zero, v_b_1 = v_zero;
3037                process(_mm_unpackhi_epi8(v_y0, v_zero),
3038                        _mm_unpackhi_epi8(v_cr0, v_zero),
3039                        _mm_unpackhi_epi8(v_cb0, v_zero),
3040                        v_r_1, v_g_1, v_b_1);
3041
3042                __m128i v_r0 = _mm_packus_epi16(v_r_0, v_r_1);
3043                __m128i v_g0 = _mm_packus_epi16(v_g_0, v_g_1);
3044                __m128i v_b0 = _mm_packus_epi16(v_b_0, v_b_1);
3045
3046                process(_mm_unpacklo_epi8(v_y1, v_zero),
3047                        _mm_unpacklo_epi8(v_cr1, v_zero),
3048                        _mm_unpacklo_epi8(v_cb1, v_zero),
3049                        v_r_0, v_g_0, v_b_0);
3050
3051                process(_mm_unpackhi_epi8(v_y1, v_zero),
3052                        _mm_unpackhi_epi8(v_cr1, v_zero),
3053                        _mm_unpackhi_epi8(v_cb1, v_zero),
3054                        v_r_1, v_g_1, v_b_1);
3055
3056                __m128i v_r1 = _mm_packus_epi16(v_r_0, v_r_1);
3057                __m128i v_g1 = _mm_packus_epi16(v_g_0, v_g_1);
3058                __m128i v_b1 = _mm_packus_epi16(v_b_0, v_b_1);
3059
3060                if (bidx == 0)
3061                {
3062                    std::swap(v_r0, v_b0);
3063                    std::swap(v_r1, v_b1);
3064                }
3065
3066                __m128i v_a0 = v_alpha, v_a1 = v_alpha;
3067
3068                if (dcn == 3)
3069                    _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
3070                else
3071                    _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1,
3072                                        v_b0, v_b1, v_a0, v_a1);
3073
3074                _mm_storeu_si128((__m128i *)(dst), v_r0);
3075                _mm_storeu_si128((__m128i *)(dst + 16), v_r1);
3076                _mm_storeu_si128((__m128i *)(dst + 32), v_g0);
3077                _mm_storeu_si128((__m128i *)(dst + 48), v_g1);
3078                _mm_storeu_si128((__m128i *)(dst + 64), v_b0);
3079                _mm_storeu_si128((__m128i *)(dst + 80), v_b1);
3080
3081                if (dcn == 4)
3082                {
3083                    _mm_storeu_si128((__m128i *)(dst + 96), v_a0);
3084                    _mm_storeu_si128((__m128i *)(dst + 112), v_a1);
3085                }
3086            }
3087        }
3088
3089        for ( ; i < n; i += 3, dst += dcn)
3090        {
3091            uchar Y = src[i];
3092            uchar Cr = src[i+1];
3093            uchar Cb = src[i+2];
3094
3095            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
3096            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
3097            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
3098
3099            dst[bidx] = saturate_cast<uchar>(b);
3100            dst[1] = saturate_cast<uchar>(g);
3101            dst[bidx^2] = saturate_cast<uchar>(r);
3102            if( dcn == 4 )
3103                dst[3] = alpha;
3104        }
3105    }
3106    int dstcn, blueIdx;
3107    int coeffs[4];
3108    bool useSSE, haveSIMD;
3109
3110    __m128i v_c0, v_c1, v_c2, v_c3, v_delta2;
3111    __m128i v_delta, v_alpha, v_zero;
3112};
3113
3114#endif // CV_SSE2
3115
3116////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
3117
// Default RGB -> XYZ coefficients: a 3x3 matrix stored row by row.
// RGB2XYZ_f copies these nine values when it is given no custom coefficients.
// Rows produce X, Y and Z; columns multiply source channels 0, 1 and 2 in
// red-first order (RGB2XYZ_f swaps columns 0 and 2 when blueIdx == 0).
// The "D65" suffix presumably refers to the D65 reference white - confirm
// against the colour-space documentation.
static const float sRGB2XYZ_D65[] =
{
    0.412453f, 0.357580f, 0.180423f,    // X row
    0.212671f, 0.715160f, 0.072169f,    // Y row
    0.019334f, 0.119193f, 0.950227f     // Z row
};
3124
// Nine coefficients laid out as three rows of three. Going by the name, this
// is the table for the reverse (XYZ -> sRGB) direction; its consumer lies
// outside this part of the file, so the row/column meaning should be checked
// there.
static const float XYZ2sRGB_D65[] =
{
    3.240479f, -1.53715f, -0.498535f,   // row 0
    -0.969256f, 1.875991f, 0.041556f,   // row 1
    0.055648f, -0.204043f, 1.057311f    // row 2
};
3131
3132template<typename _Tp> struct RGB2XYZ_f
3133{
3134    typedef _Tp channel_type;
3135
3136    RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3137    {
3138        memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
3139        if(blueIdx == 0)
3140        {
3141            std::swap(coeffs[0], coeffs[2]);
3142            std::swap(coeffs[3], coeffs[5]);
3143            std::swap(coeffs[6], coeffs[8]);
3144        }
3145    }
3146    void operator()(const _Tp* src, _Tp* dst, int n) const
3147    {
3148        int scn = srccn;
3149        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3150              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3151              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3152
3153        n *= 3;
3154        for(int i = 0; i < n; i += 3, src += scn)
3155        {
3156            _Tp X = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
3157            _Tp Y = saturate_cast<_Tp>(src[0]*C3 + src[1]*C4 + src[2]*C5);
3158            _Tp Z = saturate_cast<_Tp>(src[0]*C6 + src[1]*C7 + src[2]*C8);
3159            dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
3160        }
3161    }
3162    int srccn;
3163    float coeffs[9];
3164};
3165
3166#if CV_NEON
3167
// NEON specialization of RGB2XYZ_f for float data: four pixels are converted
// per vector iteration and a scalar loop handles whatever remains.
template <>
struct RGB2XYZ_f<float>
{
    typedef float channel_type;

    // _srccn  - element stride between consecutive source pixels.
    // blueIdx - when 0, columns 0 and 2 of the matrix are exchanged.
    // _coeffs - optional row-major 3x3 matrix (9 floats); a null pointer
    //           selects sRGB2XYZ_D65.
    RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
        const float* const matrix = _coeffs ? _coeffs : sRGB2XYZ_D65;
        memcpy(coeffs, matrix, 9*sizeof(coeffs[0]));

        if (blueIdx == 0)
        {
            for (int row = 0; row < 3; ++row)
                std::swap(coeffs[3*row], coeffs[3*row + 2]);
        }

        // Broadcast every coefficient into its own vector register image.
        v_c0 = vdupq_n_f32(coeffs[0]);
        v_c1 = vdupq_n_f32(coeffs[1]);
        v_c2 = vdupq_n_f32(coeffs[2]);
        v_c3 = vdupq_n_f32(coeffs[3]);
        v_c4 = vdupq_n_f32(coeffs[4]);
        v_c5 = vdupq_n_f32(coeffs[5]);
        v_c6 = vdupq_n_f32(coeffs[6]);
        v_c7 = vdupq_n_f32(coeffs[7]);
        v_c8 = vdupq_n_f32(coeffs[8]);
    }

    // Converts n pixels read from src and writes 3*n floats to dst.
    void operator()(const float* src, float* dst, int n) const
    {
        const int scn = srccn;
        const float k0 = coeffs[0], k1 = coeffs[1], k2 = coeffs[2],
                    k3 = coeffs[3], k4 = coeffs[4], k5 = coeffs[5],
                    k6 = coeffs[6], k7 = coeffs[7], k8 = coeffs[8];

        const int total = n * 3;
        // A three-channel source uses a 3-way de-interleaving load of 12
        // floats; anything else uses the 4-way load of 16 floats.
        const bool threeChannels = scn == 3;
        const int vecStep = threeChannels ? 12 : 16;
        int i = 0;

        for ( ; i <= total - 12; i += 12, src += vecStep)
        {
            float32x4_t v_ch0, v_ch1, v_ch2;
            if (threeChannels)
            {
                const float32x4x3_t v_px = vld3q_f32(src);
                v_ch0 = v_px.val[0];
                v_ch1 = v_px.val[1];
                v_ch2 = v_px.val[2];
            }
            else
            {
                // The fourth de-interleaved plane is loaded but never used.
                const float32x4x4_t v_px = vld4q_f32(src);
                v_ch0 = v_px.val[0];
                v_ch1 = v_px.val[1];
                v_ch2 = v_px.val[2];
            }

            float32x4_t v_x = vmulq_f32(v_ch0, v_c0);
            v_x = vmlaq_f32(v_x, v_ch1, v_c1);
            v_x = vmlaq_f32(v_x, v_ch2, v_c2);

            float32x4_t v_y = vmulq_f32(v_ch0, v_c3);
            v_y = vmlaq_f32(v_y, v_ch1, v_c4);
            v_y = vmlaq_f32(v_y, v_ch2, v_c5);

            float32x4_t v_z = vmulq_f32(v_ch0, v_c6);
            v_z = vmlaq_f32(v_z, v_ch1, v_c7);
            v_z = vmlaq_f32(v_z, v_ch2, v_c8);

            float32x4x3_t v_xyz;
            v_xyz.val[0] = v_x;
            v_xyz.val[1] = v_y;
            v_xyz.val[2] = v_z;
            vst3q_f32(dst + i, v_xyz);
        }

        // Scalar tail for the pixels the vector loop did not cover.
        for ( ; i < total; i += 3, src += scn)
        {
            const float X = saturate_cast<float>(src[0]*k0 + src[1]*k1 + src[2]*k2);
            const float Y = saturate_cast<float>(src[0]*k3 + src[1]*k4 + src[2]*k5);
            const float Z = saturate_cast<float>(src[0]*k6 + src[1]*k7 + src[2]*k8);

            dst[i] = X;
            dst[i+1] = Y;
            dst[i+2] = Z;
        }
    }

    int srccn;
    float coeffs[9];
    float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
};
3236
3237#elif CV_SSE2
3238
3239template <>
3240struct RGB2XYZ_f<float>
3241{
3242    typedef float channel_type;
3243
3244    RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3245    {
3246        memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
3247        if(blueIdx == 0)
3248        {
3249            std::swap(coeffs[0], coeffs[2]);
3250            std::swap(coeffs[3], coeffs[5]);
3251            std::swap(coeffs[6], coeffs[8]);
3252        }
3253
3254        v_c0 = _mm_set1_ps(coeffs[0]);
3255        v_c1 = _mm_set1_ps(coeffs[1]);
3256        v_c2 = _mm_set1_ps(coeffs[2]);
3257        v_c3 = _mm_set1_ps(coeffs[3]);
3258        v_c4 = _mm_set1_ps(coeffs[4]);
3259        v_c5 = _mm_set1_ps(coeffs[5]);
3260        v_c6 = _mm_set1_ps(coeffs[6]);
3261        v_c7 = _mm_set1_ps(coeffs[7]);
3262        v_c8 = _mm_set1_ps(coeffs[8]);
3263
3264        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
3265    }
3266
3267    void process(__m128 v_r, __m128 v_g, __m128 v_b,
3268                 __m128 & v_x, __m128 & v_y, __m128 & v_z) const
3269    {
3270        v_x = _mm_mul_ps(v_r, v_c0);
3271        v_x = _mm_add_ps(v_x, _mm_mul_ps(v_g, v_c1));
3272        v_x = _mm_add_ps(v_x, _mm_mul_ps(v_b, v_c2));
3273
3274        v_y = _mm_mul_ps(v_r, v_c3);
3275        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c4));
3276        v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c5));
3277
3278        v_z = _mm_mul_ps(v_r, v_c6);
3279        v_z = _mm_add_ps(v_z, _mm_mul_ps(v_g, v_c7));
3280        v_z = _mm_add_ps(v_z, _mm_mul_ps(v_b, v_c8));
3281    }
3282
3283    void operator()(const float* src, float* dst, int n) const
3284    {
3285        int scn = srccn, i = 0;
3286        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3287              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3288              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3289
3290        n *= 3;
3291
3292        if (haveSIMD)
3293        {
3294            for ( ; i <= n - 24; i += 24, src += 8 * scn)
3295            {
3296                __m128 v_r0 = _mm_loadu_ps(src);
3297                __m128 v_r1 = _mm_loadu_ps(src + 4);
3298                __m128 v_g0 = _mm_loadu_ps(src + 8);
3299                __m128 v_g1 = _mm_loadu_ps(src + 12);
3300                __m128 v_b0 = _mm_loadu_ps(src + 16);
3301                __m128 v_b1 = _mm_loadu_ps(src + 20);
3302
3303                if (scn == 4)
3304                {
3305                    __m128 v_a0 = _mm_loadu_ps(src + 24);
3306                    __m128 v_a1 = _mm_loadu_ps(src + 28);
3307
3308                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
3309                                        v_b0, v_b1, v_a0, v_a1);
3310                }
3311                else
3312                    _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
3313
3314                __m128 v_x0, v_y0, v_z0;
3315                process(v_r0, v_g0, v_b0,
3316                        v_x0, v_y0, v_z0);
3317
3318                __m128 v_x1, v_y1, v_z1;
3319                process(v_r1, v_g1, v_b1,
3320                        v_x1, v_y1, v_z1);
3321
3322                _mm_interleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
3323
3324                _mm_storeu_ps(dst + i, v_x0);
3325                _mm_storeu_ps(dst + i + 4, v_x1);
3326                _mm_storeu_ps(dst + i + 8, v_y0);
3327                _mm_storeu_ps(dst + i + 12, v_y1);
3328                _mm_storeu_ps(dst + i + 16, v_z0);
3329                _mm_storeu_ps(dst + i + 20, v_z1);
3330            }
3331        }
3332
3333        for ( ; i < n; i += 3, src += scn)
3334        {
3335            float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
3336            float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
3337            float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
3338            dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
3339        }
3340    }
3341
3342    int srccn;
3343    float coeffs[9];
3344    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
3345    bool haveSIMD;
3346};
3347
3348
3349#endif
3350
3351template<typename _Tp> struct RGB2XYZ_i
3352{
3353    typedef _Tp channel_type;
3354
3355    RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3356    {
3357        static const int coeffs0[] =
3358        {
3359            1689,    1465,    739,
3360            871,     2929,    296,
3361            79,      488,     3892
3362        };
3363        for( int i = 0; i < 9; i++ )
3364            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3365        if(blueIdx == 0)
3366        {
3367            std::swap(coeffs[0], coeffs[2]);
3368            std::swap(coeffs[3], coeffs[5]);
3369            std::swap(coeffs[6], coeffs[8]);
3370        }
3371    }
3372    void operator()(const _Tp* src, _Tp* dst, int n) const
3373    {
3374        int scn = srccn;
3375        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3376            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3377            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3378        n *= 3;
3379
3380        for(int i = 0; i < n; i += 3, src += scn)
3381        {
3382            int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
3383            int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
3384            int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
3385            dst[i] = saturate_cast<_Tp>(X); dst[i+1] = saturate_cast<_Tp>(Y);
3386            dst[i+2] = saturate_cast<_Tp>(Z);
3387        }
3388    }
3389    int srccn;
3390    int coeffs[9];
3391};
3392
3393#if CV_NEON
3394
// NEON specialization of RGB2XYZ_i for 8-bit data: eight pixels are converted
// per vector iteration in 32-bit fixed point, and a scalar loop handles
// whatever remains.
template <>
struct RGB2XYZ_i<uchar>
{
    typedef uchar channel_type;

    // _srccn  - element stride between consecutive source pixels.
    // blueIdx - when 0, columns 0 and 2 of the matrix are exchanged.
    // _coeffs - optional row-major 3x3 float matrix, converted with
    //           cvRound(c * (1 << xyz_shift)); a null pointer selects the
    //           built-in integer table.
    RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
        static const int coeffs0[] =
        {
            1689,    1465,    739,
            871,     2929,    296,
            79,      488,     3892
        };

        if (_coeffs)
        {
            for (int k = 0; k < 9; ++k)
                coeffs[k] = cvRound(_coeffs[k]*(1 << xyz_shift));
        }
        else
        {
            for (int k = 0; k < 9; ++k)
                coeffs[k] = coeffs0[k];
        }

        if (blueIdx == 0)
        {
            for (int row = 0; row < 3; ++row)
                std::swap(coeffs[3*row], coeffs[3*row + 2]);
        }

        // NOTE(review): the int coefficients are narrowed to unsigned 16-bit
        // lanes here, whereas the scalar tail keeps the full int values.
        v_c0 = vdup_n_u16(coeffs[0]);
        v_c1 = vdup_n_u16(coeffs[1]);
        v_c2 = vdup_n_u16(coeffs[2]);
        v_c3 = vdup_n_u16(coeffs[3]);
        v_c4 = vdup_n_u16(coeffs[4]);
        v_c5 = vdup_n_u16(coeffs[5]);
        v_c6 = vdup_n_u16(coeffs[6]);
        v_c7 = vdup_n_u16(coeffs[7]);
        v_c8 = vdup_n_u16(coeffs[8]);

        // Rounding term added before the right shift by xyz_shift.
        v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
    }

    // Converts n pixels read from src and writes 3*n bytes to dst.
    void operator()(const uchar * src, uchar * dst, int n) const
    {
        const int scn = srccn;
        const int k0 = coeffs[0], k1 = coeffs[1], k2 = coeffs[2],
                  k3 = coeffs[3], k4 = coeffs[4], k5 = coeffs[5],
                  k6 = coeffs[6], k7 = coeffs[7], k8 = coeffs[8];

        const int total = n * 3;
        int i = 0;

        for ( ; i <= total - 24; i += 24, src += scn * 8)
        {
            // De-interleave eight pixels; a fourth channel, if any, is ignored.
            uint8x8_t v_in0, v_in1, v_in2;
            if (scn == 3)
            {
                const uint8x8x3_t v_px = vld3_u8(src);
                v_in0 = v_px.val[0];
                v_in1 = v_px.val[1];
                v_in2 = v_px.val[2];
            }
            else
            {
                const uint8x8x4_t v_px = vld4_u8(src);
                v_in0 = v_px.val[0];
                v_in1 = v_px.val[1];
                v_in2 = v_px.val[2];
            }

            // Widen to 16 bits so the widening multiply-accumulates apply.
            const uint16x8_t v_w0 = vmovl_u8(v_in0);
            const uint16x8_t v_w1 = vmovl_u8(v_in1);
            const uint16x8_t v_w2 = vmovl_u8(v_in2);

            // Lower four lanes.
            uint16x4_t v_s0 = vget_low_u16(v_w0);
            uint16x4_t v_s1 = vget_low_u16(v_w1);
            uint16x4_t v_s2 = vget_low_u16(v_w2);

            uint32x4_t v_X0 = vmull_u16(v_s0, v_c0);
            v_X0 = vmlal_u16(v_X0, v_s1, v_c1);
            v_X0 = vmlal_u16(v_X0, v_s2, v_c2);
            uint32x4_t v_Y0 = vmull_u16(v_s0, v_c3);
            v_Y0 = vmlal_u16(v_Y0, v_s1, v_c4);
            v_Y0 = vmlal_u16(v_Y0, v_s2, v_c5);
            uint32x4_t v_Z0 = vmull_u16(v_s0, v_c6);
            v_Z0 = vmlal_u16(v_Z0, v_s1, v_c7);
            v_Z0 = vmlal_u16(v_Z0, v_s2, v_c8);

            v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
            v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
            v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);

            // Upper four lanes.
            v_s0 = vget_high_u16(v_w0);
            v_s1 = vget_high_u16(v_w1);
            v_s2 = vget_high_u16(v_w2);

            uint32x4_t v_X1 = vmull_u16(v_s0, v_c0);
            v_X1 = vmlal_u16(v_X1, v_s1, v_c1);
            v_X1 = vmlal_u16(v_X1, v_s2, v_c2);
            uint32x4_t v_Y1 = vmull_u16(v_s0, v_c3);
            v_Y1 = vmlal_u16(v_Y1, v_s1, v_c4);
            v_Y1 = vmlal_u16(v_Y1, v_s2, v_c5);
            uint32x4_t v_Z1 = vmull_u16(v_s0, v_c6);
            v_Z1 = vmlal_u16(v_Z1, v_s1, v_c7);
            v_Z1 = vmlal_u16(v_Z1, v_s2, v_c8);

            v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
            v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
            v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);

            // 32 -> 16 bits by plain narrowing, 16 -> 8 bits with saturation.
            uint8x8x3_t v_xyz;
            v_xyz.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
            v_xyz.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
            v_xyz.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));

            vst3_u8(dst + i, v_xyz);
        }

        // Scalar tail for the pixels the vector loop did not cover.
        for ( ; i < total; i += 3, src += scn)
        {
            const int X = CV_DESCALE(src[0]*k0 + src[1]*k1 + src[2]*k2, xyz_shift);
            const int Y = CV_DESCALE(src[0]*k3 + src[1]*k4 + src[2]*k5, xyz_shift);
            const int Z = CV_DESCALE(src[0]*k6 + src[1]*k7 + src[2]*k8, xyz_shift);

            dst[i] = saturate_cast<uchar>(X);
            dst[i+1] = saturate_cast<uchar>(Y);
            dst[i+2] = saturate_cast<uchar>(Z);
        }
    }

    int srccn, coeffs[9];
    uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
    uint32x4_t v_delta;
};
3500
// NEON specialization of RGB2XYZ_i for 16-bit data: an eight-pixel vector
// loop, then a four-pixel vector loop, then a scalar loop for the remainder.
template <>
struct RGB2XYZ_i<ushort>
{
    typedef ushort channel_type;

    // _srccn  - element stride between consecutive source pixels.
    // blueIdx - when 0, columns 0 and 2 of the matrix are exchanged.
    // _coeffs - optional row-major 3x3 float matrix, converted with
    //           cvRound(c * (1 << xyz_shift)); a null pointer selects the
    //           built-in integer table.
    RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
    {
        static const int coeffs0[] =
        {
            1689,    1465,    739,
            871,     2929,    296,
            79,      488,     3892
        };

        if (_coeffs)
        {
            for (int k = 0; k < 9; ++k)
                coeffs[k] = cvRound(_coeffs[k]*(1 << xyz_shift));
        }
        else
        {
            for (int k = 0; k < 9; ++k)
                coeffs[k] = coeffs0[k];
        }

        if (blueIdx == 0)
        {
            for (int row = 0; row < 3; ++row)
                std::swap(coeffs[3*row], coeffs[3*row + 2]);
        }

        // NOTE(review): the int coefficients are narrowed to unsigned 16-bit
        // lanes here, whereas the scalar tail keeps the full int values.
        v_c0 = vdup_n_u16(coeffs[0]);
        v_c1 = vdup_n_u16(coeffs[1]);
        v_c2 = vdup_n_u16(coeffs[2]);
        v_c3 = vdup_n_u16(coeffs[3]);
        v_c4 = vdup_n_u16(coeffs[4]);
        v_c5 = vdup_n_u16(coeffs[5]);
        v_c6 = vdup_n_u16(coeffs[6]);
        v_c7 = vdup_n_u16(coeffs[7]);
        v_c8 = vdup_n_u16(coeffs[8]);

        // Rounding term added before the right shift by xyz_shift.
        v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
    }

    // Converts n pixels read from src and writes 3*n values to dst.
    void operator()(const ushort * src, ushort * dst, int n) const
    {
        const int scn = srccn;
        const int k0 = coeffs[0], k1 = coeffs[1], k2 = coeffs[2],
                  k3 = coeffs[3], k4 = coeffs[4], k5 = coeffs[5],
                  k6 = coeffs[6], k7 = coeffs[7], k8 = coeffs[8];

        const int total = n * 3;
        int i = 0;

        // Eight pixels per iteration.
        for ( ; i <= total - 24; i += 24, src += scn * 8)
        {
            // De-interleave; a fourth channel, if any, is ignored.
            uint16x8_t v_in0, v_in1, v_in2;
            if (scn == 3)
            {
                const uint16x8x3_t v_px = vld3q_u16(src);
                v_in0 = v_px.val[0];
                v_in1 = v_px.val[1];
                v_in2 = v_px.val[2];
            }
            else
            {
                const uint16x8x4_t v_px = vld4q_u16(src);
                v_in0 = v_px.val[0];
                v_in1 = v_px.val[1];
                v_in2 = v_px.val[2];
            }

            // Lower four lanes.
            uint16x4_t v_s0 = vget_low_u16(v_in0);
            uint16x4_t v_s1 = vget_low_u16(v_in1);
            uint16x4_t v_s2 = vget_low_u16(v_in2);

            uint32x4_t v_X0 = vmull_u16(v_s0, v_c0);
            v_X0 = vmlal_u16(v_X0, v_s1, v_c1);
            v_X0 = vmlal_u16(v_X0, v_s2, v_c2);
            uint32x4_t v_Y0 = vmull_u16(v_s0, v_c3);
            v_Y0 = vmlal_u16(v_Y0, v_s1, v_c4);
            v_Y0 = vmlal_u16(v_Y0, v_s2, v_c5);
            uint32x4_t v_Z0 = vmull_u16(v_s0, v_c6);
            v_Z0 = vmlal_u16(v_Z0, v_s1, v_c7);
            v_Z0 = vmlal_u16(v_Z0, v_s2, v_c8);

            v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
            v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
            v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);

            // Upper four lanes.
            v_s0 = vget_high_u16(v_in0);
            v_s1 = vget_high_u16(v_in1);
            v_s2 = vget_high_u16(v_in2);

            uint32x4_t v_X1 = vmull_u16(v_s0, v_c0);
            v_X1 = vmlal_u16(v_X1, v_s1, v_c1);
            v_X1 = vmlal_u16(v_X1, v_s2, v_c2);
            uint32x4_t v_Y1 = vmull_u16(v_s0, v_c3);
            v_Y1 = vmlal_u16(v_Y1, v_s1, v_c4);
            v_Y1 = vmlal_u16(v_Y1, v_s2, v_c5);
            uint32x4_t v_Z1 = vmull_u16(v_s0, v_c6);
            v_Z1 = vmlal_u16(v_Z1, v_s1, v_c7);
            v_Z1 = vmlal_u16(v_Z1, v_s2, v_c8);

            v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
            v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
            v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);

            // Saturating 32 -> 16 bit narrowing, then re-interleave and store.
            uint16x8x3_t v_xyz;
            v_xyz.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
            v_xyz.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
            v_xyz.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));

            vst3q_u16(dst + i, v_xyz);
        }

        // Four pixels per iteration.
        for ( ; i <= total - 12; i += 12, src += scn * 4)
        {
            uint16x4_t v_s0, v_s1, v_s2;
            if (scn == 3)
            {
                const uint16x4x3_t v_px = vld3_u16(src);
                v_s0 = v_px.val[0];
                v_s1 = v_px.val[1];
                v_s2 = v_px.val[2];
            }
            else
            {
                const uint16x4x4_t v_px = vld4_u16(src);
                v_s0 = v_px.val[0];
                v_s1 = v_px.val[1];
                v_s2 = v_px.val[2];
            }

            uint32x4_t v_X = vmull_u16(v_s0, v_c0);
            v_X = vmlal_u16(v_X, v_s1, v_c1);
            v_X = vmlal_u16(v_X, v_s2, v_c2);
            uint32x4_t v_Y = vmull_u16(v_s0, v_c3);
            v_Y = vmlal_u16(v_Y, v_s1, v_c4);
            v_Y = vmlal_u16(v_Y, v_s2, v_c5);
            uint32x4_t v_Z = vmull_u16(v_s0, v_c6);
            v_Z = vmlal_u16(v_Z, v_s1, v_c7);
            v_Z = vmlal_u16(v_Z, v_s2, v_c8);

            v_X = vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift);
            v_Y = vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift);
            v_Z = vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift);

            uint16x4x3_t v_xyz;
            v_xyz.val[0] = vqmovn_u32(v_X);
            v_xyz.val[1] = vqmovn_u32(v_Y);
            v_xyz.val[2] = vqmovn_u32(v_Z);

            vst3_u16(dst + i, v_xyz);
        }

        // Scalar tail for the pixels the vector loops did not cover.
        for ( ; i < total; i += 3, src += scn)
        {
            const int X = CV_DESCALE(src[0]*k0 + src[1]*k1 + src[2]*k2, xyz_shift);
            const int Y = CV_DESCALE(src[0]*k3 + src[1]*k4 + src[2]*k5, xyz_shift);
            const int Z = CV_DESCALE(src[0]*k6 + src[1]*k7 + src[2]*k8, xyz_shift);

            dst[i] = saturate_cast<ushort>(X);
            dst[i+1] = saturate_cast<ushort>(Y);
            dst[i+2] = saturate_cast<ushort>(Z);
        }
    }

    int srccn, coeffs[9];
    uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
    uint32x4_t v_delta;
};
3632
3633#endif
3634
3635template<typename _Tp> struct XYZ2RGB_f
3636{
3637    typedef _Tp channel_type;
3638
3639    XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
3640    : dstcn(_dstcn), blueIdx(_blueIdx)
3641    {
3642        memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
3643        if(blueIdx == 0)
3644        {
3645            std::swap(coeffs[0], coeffs[6]);
3646            std::swap(coeffs[1], coeffs[7]);
3647            std::swap(coeffs[2], coeffs[8]);
3648        }
3649    }
3650
3651    void operator()(const _Tp* src, _Tp* dst, int n) const
3652    {
3653        int dcn = dstcn;
3654        _Tp alpha = ColorChannel<_Tp>::max();
3655        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3656              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3657              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3658        n *= 3;
3659        for(int i = 0; i < n; i += 3, dst += dcn)
3660        {
3661            _Tp B = saturate_cast<_Tp>(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2);
3662            _Tp G = saturate_cast<_Tp>(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5);
3663            _Tp R = saturate_cast<_Tp>(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8);
3664            dst[0] = B; dst[1] = G; dst[2] = R;
3665            if( dcn == 4 )
3666                dst[3] = alpha;
3667        }
3668    }
3669    int dstcn, blueIdx;
3670    float coeffs[9];
3671};
3672
3673#if CV_SSE2
3674
3675template <>
3676struct XYZ2RGB_f<float>
3677{
3678    typedef float channel_type;
3679
3680    XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
3681    : dstcn(_dstcn), blueIdx(_blueIdx)
3682    {
3683        memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
3684        if(blueIdx == 0)
3685        {
3686            std::swap(coeffs[0], coeffs[6]);
3687            std::swap(coeffs[1], coeffs[7]);
3688            std::swap(coeffs[2], coeffs[8]);
3689        }
3690
3691        v_c0 = _mm_set1_ps(coeffs[0]);
3692        v_c1 = _mm_set1_ps(coeffs[1]);
3693        v_c2 = _mm_set1_ps(coeffs[2]);
3694        v_c3 = _mm_set1_ps(coeffs[3]);
3695        v_c4 = _mm_set1_ps(coeffs[4]);
3696        v_c5 = _mm_set1_ps(coeffs[5]);
3697        v_c6 = _mm_set1_ps(coeffs[6]);
3698        v_c7 = _mm_set1_ps(coeffs[7]);
3699        v_c8 = _mm_set1_ps(coeffs[8]);
3700
3701        v_alpha = _mm_set1_ps(ColorChannel<float>::max());
3702
3703        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
3704    }
3705
3706    void process(__m128 v_x, __m128 v_y, __m128 v_z,
3707                 __m128 & v_r, __m128 & v_g, __m128 & v_b) const
3708    {
3709        v_b = _mm_mul_ps(v_x, v_c0);
3710        v_b = _mm_add_ps(v_b, _mm_mul_ps(v_y, v_c1));
3711        v_b = _mm_add_ps(v_b, _mm_mul_ps(v_z, v_c2));
3712
3713        v_g = _mm_mul_ps(v_x, v_c3);
3714        v_g = _mm_add_ps(v_g, _mm_mul_ps(v_y, v_c4));
3715        v_g = _mm_add_ps(v_g, _mm_mul_ps(v_z, v_c5));
3716
3717        v_r = _mm_mul_ps(v_x, v_c6);
3718        v_r = _mm_add_ps(v_r, _mm_mul_ps(v_y, v_c7));
3719        v_r = _mm_add_ps(v_r, _mm_mul_ps(v_z, v_c8));
3720    }
3721
3722    void operator()(const float* src, float* dst, int n) const
3723    {
3724        int dcn = dstcn;
3725        float alpha = ColorChannel<float>::max();
3726        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3727              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3728              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3729        n *= 3;
3730        int i = 0;
3731
3732        if (haveSIMD)
3733        {
3734            for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
3735            {
3736                __m128 v_x0 = _mm_loadu_ps(src + i);
3737                __m128 v_x1 = _mm_loadu_ps(src + i + 4);
3738                __m128 v_y0 = _mm_loadu_ps(src + i + 8);
3739                __m128 v_y1 = _mm_loadu_ps(src + i + 12);
3740                __m128 v_z0 = _mm_loadu_ps(src + i + 16);
3741                __m128 v_z1 = _mm_loadu_ps(src + i + 20);
3742
3743                _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
3744
3745                __m128 v_r0, v_g0, v_b0;
3746                process(v_x0, v_y0, v_z0,
3747                        v_r0, v_g0, v_b0);
3748
3749                __m128 v_r1, v_g1, v_b1;
3750                process(v_x1, v_y1, v_z1,
3751                        v_r1, v_g1, v_b1);
3752
3753                __m128 v_a0 = v_alpha, v_a1 = v_alpha;
3754
3755                if (dcn == 4)
3756                    _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1,
3757                                      v_r0, v_r1, v_a0, v_a1);
3758                else
3759                    _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
3760
3761                _mm_storeu_ps(dst, v_b0);
3762                _mm_storeu_ps(dst + 4, v_b1);
3763                _mm_storeu_ps(dst + 8, v_g0);
3764                _mm_storeu_ps(dst + 12, v_g1);
3765                _mm_storeu_ps(dst + 16, v_r0);
3766                _mm_storeu_ps(dst + 20, v_r1);
3767
3768                if (dcn == 4)
3769                {
3770                    _mm_storeu_ps(dst + 24, v_a0);
3771                    _mm_storeu_ps(dst + 28, v_a1);
3772                }
3773            }
3774
3775        }
3776
3777        for( ; i < n; i += 3, dst += dcn)
3778        {
3779            float B = src[i]*C0 + src[i+1]*C1 + src[i+2]*C2;
3780            float G = src[i]*C3 + src[i+1]*C4 + src[i+2]*C5;
3781            float R = src[i]*C6 + src[i+1]*C7 + src[i+2]*C8;
3782            dst[0] = B; dst[1] = G; dst[2] = R;
3783            if( dcn == 4 )
3784                dst[3] = alpha;
3785        }
3786    }
3787    int dstcn, blueIdx;
3788    float coeffs[9];
3789
3790    __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
3791    __m128 v_alpha;
3792    bool haveSIMD;
3793};
3794
3795#endif // CV_SSE2
3796
3797
3798template<typename _Tp> struct XYZ2RGB_i
3799{
3800    typedef _Tp channel_type;
3801
3802    XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
3803    : dstcn(_dstcn), blueIdx(_blueIdx)
3804    {
3805        static const int coeffs0[] =
3806        {
3807            13273,  -6296,  -2042,
3808            -3970,   7684,    170,
3809              228,   -836,   4331
3810        };
3811        for(int i = 0; i < 9; i++)
3812            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3813
3814        if(blueIdx == 0)
3815        {
3816            std::swap(coeffs[0], coeffs[6]);
3817            std::swap(coeffs[1], coeffs[7]);
3818            std::swap(coeffs[2], coeffs[8]);
3819        }
3820    }
3821    void operator()(const _Tp* src, _Tp* dst, int n) const
3822    {
3823        int dcn = dstcn;
3824        _Tp alpha = ColorChannel<_Tp>::max();
3825        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3826            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3827            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3828        n *= 3;
3829        for(int i = 0; i < n; i += 3, dst += dcn)
3830        {
3831            int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
3832            int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
3833            int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
3834            dst[0] = saturate_cast<_Tp>(B); dst[1] = saturate_cast<_Tp>(G);
3835            dst[2] = saturate_cast<_Tp>(R);
3836            if( dcn == 4 )
3837                dst[3] = alpha;
3838        }
3839    }
3840    int dstcn, blueIdx;
3841    int coeffs[9];
3842};
3843
3844#if CV_NEON
3845
// NEON specialization of XYZ2RGB_i for 8-bit data: eight pixels are converted
// per vector iteration in signed 32-bit fixed point, and a scalar loop
// handles whatever remains.
template <>
struct XYZ2RGB_i<uchar>
{
    typedef uchar channel_type;

    // _dstcn   - element stride between consecutive destination pixels.
    // _blueIdx - when 0, rows 0 and 2 of the matrix are exchanged.
    // _coeffs  - optional row-major 3x3 integer matrix; each entry goes
    //            through cvRound(c * (1 << xyz_shift)). A null pointer
    //            selects the built-in table.
    XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
    : dstcn(_dstcn), blueIdx(_blueIdx)
    {
        static const int coeffs0[] =
        {
            13273,  -6296,  -2042,
            -3970,   7684,    170,
              228,   -836,   4331
        };

        if (_coeffs)
        {
            for (int k = 0; k < 9; ++k)
                coeffs[k] = cvRound(_coeffs[k]*(1 << xyz_shift));
        }
        else
        {
            for (int k = 0; k < 9; ++k)
                coeffs[k] = coeffs0[k];
        }

        if (blueIdx == 0)
        {
            // Exchange the first and the last row.
            for (int col = 0; col < 3; ++col)
                std::swap(coeffs[col], coeffs[6 + col]);
        }

        // NOTE(review): the int coefficients are narrowed to signed 16-bit
        // lanes here, whereas the scalar tail keeps the full int values.
        v_c0 = vdup_n_s16(coeffs[0]);
        v_c1 = vdup_n_s16(coeffs[1]);
        v_c2 = vdup_n_s16(coeffs[2]);
        v_c3 = vdup_n_s16(coeffs[3]);
        v_c4 = vdup_n_s16(coeffs[4]);
        v_c5 = vdup_n_s16(coeffs[5]);
        v_c6 = vdup_n_s16(coeffs[6]);
        v_c7 = vdup_n_s16(coeffs[7]);
        v_c8 = vdup_n_s16(coeffs[8]);

        // Rounding term added before the right shift by xyz_shift.
        v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
        v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max()));
    }

    // Converts n XYZ triples read from src and writes n pixels to dst.
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        const int dcn = dstcn;
        const uchar opaque = ColorChannel<uchar>::max();
        const int k0 = coeffs[0], k1 = coeffs[1], k2 = coeffs[2],
                  k3 = coeffs[3], k4 = coeffs[4], k5 = coeffs[5],
                  k6 = coeffs[6], k7 = coeffs[7], k8 = coeffs[8];

        const int total = n * 3;
        int i = 0;

        for ( ; i <= total - 24; i += 24, dst += dcn * 8)
        {
            // De-interleave eight XYZ triples and widen them to signed 16 bit.
            const uint8x8x3_t v_xyz = vld3_u8(src + i);
            const int16x8_t v_w0 = vreinterpretq_s16_u16(vmovl_u8(v_xyz.val[0]));
            const int16x8_t v_w1 = vreinterpretq_s16_u16(vmovl_u8(v_xyz.val[1]));
            const int16x8_t v_w2 = vreinterpretq_s16_u16(vmovl_u8(v_xyz.val[2]));

            // Lower four lanes; v_dK_* accumulates destination channel K.
            int16x4_t v_s0 = vget_low_s16(v_w0);
            int16x4_t v_s1 = vget_low_s16(v_w1);
            int16x4_t v_s2 = vget_low_s16(v_w2);

            int32x4_t v_d0_lo = vmull_s16(v_s0, v_c0);
            v_d0_lo = vmlal_s16(v_d0_lo, v_s1, v_c1);
            v_d0_lo = vmlal_s16(v_d0_lo, v_s2, v_c2);
            int32x4_t v_d1_lo = vmull_s16(v_s0, v_c3);
            v_d1_lo = vmlal_s16(v_d1_lo, v_s1, v_c4);
            v_d1_lo = vmlal_s16(v_d1_lo, v_s2, v_c5);
            int32x4_t v_d2_lo = vmull_s16(v_s0, v_c6);
            v_d2_lo = vmlal_s16(v_d2_lo, v_s1, v_c7);
            v_d2_lo = vmlal_s16(v_d2_lo, v_s2, v_c8);

            v_d0_lo = vshrq_n_s32(vaddq_s32(v_d0_lo, v_delta), xyz_shift);
            v_d1_lo = vshrq_n_s32(vaddq_s32(v_d1_lo, v_delta), xyz_shift);
            v_d2_lo = vshrq_n_s32(vaddq_s32(v_d2_lo, v_delta), xyz_shift);

            // Upper four lanes.
            v_s0 = vget_high_s16(v_w0);
            v_s1 = vget_high_s16(v_w1);
            v_s2 = vget_high_s16(v_w2);

            int32x4_t v_d0_hi = vmull_s16(v_s0, v_c0);
            v_d0_hi = vmlal_s16(v_d0_hi, v_s1, v_c1);
            v_d0_hi = vmlal_s16(v_d0_hi, v_s2, v_c2);
            int32x4_t v_d1_hi = vmull_s16(v_s0, v_c3);
            v_d1_hi = vmlal_s16(v_d1_hi, v_s1, v_c4);
            v_d1_hi = vmlal_s16(v_d1_hi, v_s2, v_c5);
            int32x4_t v_d2_hi = vmull_s16(v_s0, v_c6);
            v_d2_hi = vmlal_s16(v_d2_hi, v_s1, v_c7);
            v_d2_hi = vmlal_s16(v_d2_hi, v_s2, v_c8);

            v_d0_hi = vshrq_n_s32(vaddq_s32(v_d0_hi, v_delta), xyz_shift);
            v_d1_hi = vshrq_n_s32(vaddq_s32(v_d1_hi, v_delta), xyz_shift);
            v_d2_hi = vshrq_n_s32(vaddq_s32(v_d2_hi, v_delta), xyz_shift);

            // Saturating narrowing: signed 32 -> signed 16 -> unsigned 8 bit.
            const uint8x8_t v_out0 = vqmovun_s16(vcombine_s16(vqmovn_s32(v_d0_lo), vqmovn_s32(v_d0_hi)));
            const uint8x8_t v_out1 = vqmovun_s16(vcombine_s16(vqmovn_s32(v_d1_lo), vqmovn_s32(v_d1_hi)));
            const uint8x8_t v_out2 = vqmovun_s16(vcombine_s16(vqmovn_s32(v_d2_lo), vqmovn_s32(v_d2_hi)));

            if (dcn != 3)
            {
                // Four-channel output: append the constant alpha plane.
                uint8x8x4_t v_px;
                v_px.val[0] = v_out0;
                v_px.val[1] = v_out1;
                v_px.val[2] = v_out2;
                v_px.val[3] = v_alpha;
                vst4_u8(dst, v_px);
            }
            else
            {
                uint8x8x3_t v_px;
                v_px.val[0] = v_out0;
                v_px.val[1] = v_out1;
                v_px.val[2] = v_out2;
                vst3_u8(dst, v_px);
            }
        }

        // Scalar tail for the pixels the vector loop did not cover.
        for ( ; i < total; i += 3, dst += dcn)
        {
            const int c0 = CV_DESCALE(src[i]*k0 + src[i+1]*k1 + src[i+2]*k2, xyz_shift);
            const int c1 = CV_DESCALE(src[i]*k3 + src[i+1]*k4 + src[i+2]*k5, xyz_shift);
            const int c2 = CV_DESCALE(src[i]*k6 + src[i+1]*k7 + src[i+2]*k8, xyz_shift);

            dst[0] = saturate_cast<uchar>(c0);
            dst[1] = saturate_cast<uchar>(c1);
            dst[2] = saturate_cast<uchar>(c2);
            if (dcn == 4)
                dst[3] = opaque;
        }
    }

    int dstcn, blueIdx;
    int coeffs[9];

    int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
    uint8x8_t v_alpha;
    int32x4_t v_delta;
};
3963
3964template <>
3965struct XYZ2RGB_i<ushort>
3966{
3967    typedef ushort channel_type;
3968
3969    XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
3970    : dstcn(_dstcn), blueIdx(_blueIdx)
3971    {
3972        static const int coeffs0[] =
3973        {
3974            13273,  -6296,  -2042,
3975            -3970,   7684,    170,
3976              228,   -836,   4331
3977        };
3978        for(int i = 0; i < 9; i++)
3979            coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3980
3981        if(blueIdx == 0)
3982        {
3983            std::swap(coeffs[0], coeffs[6]);
3984            std::swap(coeffs[1], coeffs[7]);
3985            std::swap(coeffs[2], coeffs[8]);
3986        }
3987
3988        v_c0 = vdupq_n_s32(coeffs[0]);
3989        v_c1 = vdupq_n_s32(coeffs[1]);
3990        v_c2 = vdupq_n_s32(coeffs[2]);
3991        v_c3 = vdupq_n_s32(coeffs[3]);
3992        v_c4 = vdupq_n_s32(coeffs[4]);
3993        v_c5 = vdupq_n_s32(coeffs[5]);
3994        v_c6 = vdupq_n_s32(coeffs[6]);
3995        v_c7 = vdupq_n_s32(coeffs[7]);
3996        v_c8 = vdupq_n_s32(coeffs[8]);
3997        v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
3998        v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
3999        v_alpha2 = vget_low_u16(v_alpha);
4000    }
4001
4002    void operator()(const ushort* src, ushort* dst, int n) const
4003    {
4004        int dcn = dstcn, i = 0;
4005        ushort alpha = ColorChannel<ushort>::max();
4006        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
4007            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
4008            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
4009        n *= 3;
4010
4011        for ( ; i <= n - 24; i += 24, dst += dcn * 8)
4012        {
4013            uint16x8x3_t v_src = vld3q_u16(src + i);
4014            int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
4015                      v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
4016                      v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
4017
4018            int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
4019            int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
4020            int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
4021            v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
4022            v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
4023            v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
4024
4025            v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
4026            v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
4027            v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
4028
4029            int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
4030            int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
4031            int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
4032            v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
4033            v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
4034            v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
4035
4036            uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
4037            uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
4038            uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
4039
4040            if (dcn == 3)
4041            {
4042                uint16x8x3_t v_dst;
4043                v_dst.val[0] = v_b;
4044                v_dst.val[1] = v_g;
4045                v_dst.val[2] = v_r;
4046                vst3q_u16(dst, v_dst);
4047            }
4048            else
4049            {
4050                uint16x8x4_t v_dst;
4051                v_dst.val[0] = v_b;
4052                v_dst.val[1] = v_g;
4053                v_dst.val[2] = v_r;
4054                v_dst.val[3] = v_alpha;
4055                vst4q_u16(dst, v_dst);
4056            }
4057        }
4058
4059        for ( ; i <= n - 12; i += 12, dst += dcn * 4)
4060        {
4061            uint16x4x3_t v_src = vld3_u16(src + i);
4062            int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
4063                      v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
4064                      v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
4065
4066            int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
4067            int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
4068            int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
4069            v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
4070            v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
4071            v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
4072
4073            uint16x4_t v_b = vqmovun_s32(v_X);
4074            uint16x4_t v_g = vqmovun_s32(v_Y);
4075            uint16x4_t v_r = vqmovun_s32(v_Z);
4076
4077            if (dcn == 3)
4078            {
4079                uint16x4x3_t v_dst;
4080                v_dst.val[0] = v_b;
4081                v_dst.val[1] = v_g;
4082                v_dst.val[2] = v_r;
4083                vst3_u16(dst, v_dst);
4084            }
4085            else
4086            {
4087                uint16x4x4_t v_dst;
4088                v_dst.val[0] = v_b;
4089                v_dst.val[1] = v_g;
4090                v_dst.val[2] = v_r;
4091                v_dst.val[3] = v_alpha2;
4092                vst4_u16(dst, v_dst);
4093            }
4094        }
4095
4096        for ( ; i < n; i += 3, dst += dcn)
4097        {
4098            int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
4099            int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
4100            int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
4101            dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
4102            dst[2] = saturate_cast<ushort>(R);
4103            if( dcn == 4 )
4104                dst[3] = alpha;
4105        }
4106    }
4107    int dstcn, blueIdx;
4108    int coeffs[9];
4109
4110    int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
4111    uint16x4_t v_alpha2;
4112    uint16x8_t v_alpha;
4113};
4114
4115#endif
4116
4117////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
4118
4119
4120struct RGB2HSV_b
4121{
4122    typedef uchar channel_type;
4123
4124    RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
4125    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
4126    {
4127        CV_Assert( hrange == 180 || hrange == 256 );
4128    }
4129
4130    void operator()(const uchar* src, uchar* dst, int n) const
4131    {
4132        int i, bidx = blueIdx, scn = srccn;
4133        const int hsv_shift = 12;
4134
4135        static int sdiv_table[256];
4136        static int hdiv_table180[256];
4137        static int hdiv_table256[256];
4138        static volatile bool initialized = false;
4139
4140        int hr = hrange;
4141        const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
4142        n *= 3;
4143
4144        if( !initialized )
4145        {
4146            sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
4147            for( i = 1; i < 256; i++ )
4148            {
4149                sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
4150                hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
4151                hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
4152            }
4153            initialized = true;
4154        }
4155
4156        for( i = 0; i < n; i += 3, src += scn )
4157        {
4158            int b = src[bidx], g = src[1], r = src[bidx^2];
4159            int h, s, v = b;
4160            int vmin = b, diff;
4161            int vr, vg;
4162
4163            CV_CALC_MAX_8U( v, g );
4164            CV_CALC_MAX_8U( v, r );
4165            CV_CALC_MIN_8U( vmin, g );
4166            CV_CALC_MIN_8U( vmin, r );
4167
4168            diff = v - vmin;
4169            vr = v == r ? -1 : 0;
4170            vg = v == g ? -1 : 0;
4171
4172            s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
4173            h = (vr & (g - b)) +
4174                (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
4175            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
4176            h += h < 0 ? hr : 0;
4177
4178            dst[i] = saturate_cast<uchar>(h);
4179            dst[i+1] = (uchar)s;
4180            dst[i+2] = (uchar)v;
4181        }
4182    }
4183
4184    int srccn, blueIdx, hrange;
4185};
4186
4187
struct RGB2HSV_f
{
    typedef float channel_type;

    // _srccn   - source stride in channels per pixel
    // _blueIdx - index (0 or 2) of the blue channel inside a source pixel
    // _hrange  - value the full 360-degree hue circle is mapped onto
    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    // Converts n floating-point pixels to 3-channel (h, s, v) triples.
    // All three inputs of a pixel are read before its outputs are written.
    void operator()(const float* src, float* dst, int n) const
    {
        const int bidx = blueIdx, scn = srccn;
        const float hscale = hrange*(1.f/360.f);
        const int total = n*3;

        for( int k = 0; k < total; k += 3, src += scn )
        {
            const float b = src[bidx], g = src[1], r = src[bidx^2];

            // value = largest channel, vmin = smallest channel
            float v = r, vmin = r;
            if( v < g ) v = g;
            if( v < b ) v = b;
            if( vmin > g ) vmin = g;
            if( vmin > b ) vmin = b;

            const float range = v - vmin;

            // FLT_EPSILON keeps both divisions finite for black / grey pixels
            const float s = range/(float)(fabs(v) + FLT_EPSILON);
            const float k60 = (float)(60./(range + FLT_EPSILON));

            // hue in degrees, chosen by the channel that holds the maximum
            float h;
            if( v == r )
                h = (g - b)*k60;
            else if( v == g )
                h = (b - r)*k60 + 120.f;
            else
                h = (r - g)*k60 + 240.f;

            if( h < 0 )
                h += 360.f;

            dst[k] = h*hscale;
            dst[k+1] = s;
            dst[k+2] = v;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
4235
4236
4237struct HSV2RGB_f
4238{
4239    typedef float channel_type;
4240
4241    HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
4242    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
4243
4244    void operator()(const float* src, float* dst, int n) const
4245    {
4246        int i, bidx = blueIdx, dcn = dstcn;
4247        float _hscale = hscale;
4248        float alpha = ColorChannel<float>::max();
4249        n *= 3;
4250
4251        for( i = 0; i < n; i += 3, dst += dcn )
4252        {
4253            float h = src[i], s = src[i+1], v = src[i+2];
4254            float b, g, r;
4255
4256            if( s == 0 )
4257                b = g = r = v;
4258            else
4259            {
4260                static const int sector_data[][3]=
4261                    {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
4262                float tab[4];
4263                int sector;
4264                h *= _hscale;
4265                if( h < 0 )
4266                    do h += 6; while( h < 0 );
4267                else if( h >= 6 )
4268                    do h -= 6; while( h >= 6 );
4269                sector = cvFloor(h);
4270                h -= sector;
4271                if( (unsigned)sector >= 6u )
4272                {
4273                    sector = 0;
4274                    h = 0.f;
4275                }
4276
4277                tab[0] = v;
4278                tab[1] = v*(1.f - s);
4279                tab[2] = v*(1.f - s*h);
4280                tab[3] = v*(1.f - s*(1.f - h));
4281
4282                b = tab[sector_data[sector][0]];
4283                g = tab[sector_data[sector][1]];
4284                r = tab[sector_data[sector][2]];
4285            }
4286
4287            dst[bidx] = b;
4288            dst[1] = g;
4289            dst[bidx^2] = r;
4290            if( dcn == 4 )
4291                dst[3] = alpha;
4292        }
4293    }
4294
4295    int dstcn, blueIdx;
4296    float hscale;
4297};
4298
4299
4300struct HSV2RGB_b
4301{
4302    typedef uchar channel_type;
4303
4304    HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
4305    : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
4306    {
4307        #if CV_NEON
4308        v_scale_inv = vdupq_n_f32(1.f/255.f);
4309        v_scale = vdupq_n_f32(255.f);
4310        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
4311        #elif CV_SSE2
4312        v_scale_inv = _mm_set1_ps(1.f/255.f);
4313        v_scale = _mm_set1_ps(255.0f);
4314        v_zero = _mm_setzero_si128();
4315        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
4316        #endif
4317    }
4318
4319    #if CV_SSE2
4320    // 16s x 8
4321    void process(__m128i v_r, __m128i v_g, __m128i v_b,
4322                 float * buf) const
4323    {
4324        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
4325        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
4326        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
4327
4328        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
4329        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
4330        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
4331
4332        v_g0 = _mm_mul_ps(v_g0, v_scale_inv);
4333        v_b0 = _mm_mul_ps(v_b0, v_scale_inv);
4334
4335        v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
4336        v_b1 = _mm_mul_ps(v_b1, v_scale_inv);
4337
4338        _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
4339
4340        _mm_store_ps(buf, v_r0);
4341        _mm_store_ps(buf + 4, v_r1);
4342        _mm_store_ps(buf + 8, v_g0);
4343        _mm_store_ps(buf + 12, v_g1);
4344        _mm_store_ps(buf + 16, v_b0);
4345        _mm_store_ps(buf + 20, v_b1);
4346    }
4347    #endif
4348
4349    void operator()(const uchar* src, uchar* dst, int n) const
4350    {
4351        int i, j, dcn = dstcn;
4352        uchar alpha = ColorChannel<uchar>::max();
4353        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
4354
4355        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
4356        {
4357            int dn = std::min(n - i, (int)BLOCK_SIZE);
4358            j = 0;
4359
4360            #if CV_NEON
4361            for ( ; j <= (dn - 8) * 3; j += 24)
4362            {
4363                uint8x8x3_t v_src = vld3_u8(src + j);
4364                uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
4365                           v_t1 = vmovl_u8(v_src.val[1]),
4366                           v_t2 = vmovl_u8(v_src.val[2]);
4367
4368                float32x4x3_t v_dst;
4369                v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
4370                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
4371                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
4372                vst3q_f32(buf + j, v_dst);
4373
4374                v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
4375                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
4376                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
4377                vst3q_f32(buf + j + 12, v_dst);
4378            }
4379            #elif CV_SSE2
4380            if (haveSIMD)
4381            {
4382                for ( ; j <= (dn - 32) * 3; j += 96)
4383                {
4384                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
4385                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
4386                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
4387                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
4388                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
4389                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
4390
4391                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
4392
4393                    process(_mm_unpacklo_epi8(v_r0, v_zero),
4394                            _mm_unpacklo_epi8(v_g0, v_zero),
4395                            _mm_unpacklo_epi8(v_b0, v_zero),
4396                            buf + j);
4397
4398                    process(_mm_unpackhi_epi8(v_r0, v_zero),
4399                            _mm_unpackhi_epi8(v_g0, v_zero),
4400                            _mm_unpackhi_epi8(v_b0, v_zero),
4401                            buf + j + 24);
4402
4403                    process(_mm_unpacklo_epi8(v_r1, v_zero),
4404                            _mm_unpacklo_epi8(v_g1, v_zero),
4405                            _mm_unpacklo_epi8(v_b1, v_zero),
4406                            buf + j + 48);
4407
4408                    process(_mm_unpackhi_epi8(v_r1, v_zero),
4409                            _mm_unpackhi_epi8(v_g1, v_zero),
4410                            _mm_unpackhi_epi8(v_b1, v_zero),
4411                            buf + j + 72);
4412                }
4413            }
4414            #endif
4415
4416            for( ; j < dn*3; j += 3 )
4417            {
4418                buf[j] = src[j];
4419                buf[j+1] = src[j+1]*(1.f/255.f);
4420                buf[j+2] = src[j+2]*(1.f/255.f);
4421            }
4422            cvt(buf, buf, dn);
4423
4424            j = 0;
4425            #if CV_NEON
4426            for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
4427            {
4428                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
4429                uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
4430                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
4431                uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
4432                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
4433                uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
4434                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
4435
4436                if (dcn == 4)
4437                {
4438                    uint8x8x4_t v_dst;
4439                    v_dst.val[0] = v_dst0;
4440                    v_dst.val[1] = v_dst1;
4441                    v_dst.val[2] = v_dst2;
4442                    v_dst.val[3] = v_alpha;
4443                    vst4_u8(dst, v_dst);
4444                }
4445                else
4446                {
4447                    uint8x8x3_t v_dst;
4448                    v_dst.val[0] = v_dst0;
4449                    v_dst.val[1] = v_dst1;
4450                    v_dst.val[2] = v_dst2;
4451                    vst3_u8(dst, v_dst);
4452                }
4453            }
4454            #elif CV_SSE2
4455            if (dcn == 3 && haveSIMD)
4456            {
4457                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
4458                {
4459                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
4460                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
4461                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
4462                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
4463
4464                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
4465                                                     _mm_cvtps_epi32(v_src1));
4466                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
4467                                                     _mm_cvtps_epi32(v_src3));
4468
4469                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
4470                }
4471
4472                int jr = j % 3;
4473                if (jr)
4474                    dst -= jr, j -= jr;
4475            }
4476            #endif
4477
4478            for( ; j < dn*3; j += 3, dst += dcn )
4479            {
4480                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
4481                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
4482                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
4483                if( dcn == 4 )
4484                    dst[3] = alpha;
4485            }
4486        }
4487    }
4488
4489    int dstcn;
4490    HSV2RGB_f cvt;
4491    #if CV_NEON
4492    float32x4_t v_scale, v_scale_inv;
4493    uint8x8_t v_alpha;
4494    #elif CV_SSE2
4495    __m128 v_scale_inv, v_scale;
4496    __m128i v_zero;
4497    bool haveSIMD;
4498    #endif
4499};
4500
4501
4502///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
4503
struct RGB2HLS_f
{
    typedef float channel_type;

    // _srccn   - source stride in channels per pixel
    // _blueIdx - index (0 or 2) of the blue channel inside a source pixel
    // _hrange  - value the full 360-degree hue circle is mapped onto
    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    // Converts n floating-point pixels to 3-channel (h, l, s) triples.
    // All three inputs of a pixel are read before its outputs are written.
    void operator()(const float* src, float* dst, int n) const
    {
        const int bidx = blueIdx, scn = srccn;
        const float hscale = hrange*(1.f/360.f);
        const int total = n*3;

        for( int k = 0; k < total; k += 3, src += scn )
        {
            const float b = src[bidx], g = src[1], r = src[bidx^2];

            // extremes of the three channels
            float vmax = r, vmin = r;
            if( vmax < g ) vmax = g;
            if( vmax < b ) vmax = b;
            if( vmin > g ) vmin = g;
            if( vmin > b ) vmin = b;

            const float range = vmax - vmin;
            const float l = (vmax + vmin)*0.5f;

            // hue and saturation stay 0 when the channels are (nearly) equal
            float h = 0.f, s = 0.f;

            if( range > FLT_EPSILON )
            {
                if( l < 0.5f )
                    s = range/(vmax + vmin);
                else
                    s = range/(2 - vmax - vmin);

                const float k60 = 60.f/range;

                // hue in degrees, chosen by the channel that holds the maximum
                if( vmax == r )
                    h = (g - b)*k60;
                else if( vmax == g )
                    h = (b - r)*k60 + 120.f;
                else
                    h = (r - g)*k60 + 240.f;

                if( h < 0.f )
                    h += 360.f;
            }

            dst[k] = h*hscale;
            dst[k+1] = l;
            dst[k+2] = s;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
4556
4557
4558struct RGB2HLS_b
4559{
4560    typedef uchar channel_type;
4561
4562    RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
4563    : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange)
4564    {
4565        #if CV_NEON
4566        v_scale_inv = vdupq_n_f32(1.f/255.f);
4567        v_scale = vdupq_n_f32(255.f);
4568        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
4569        #elif CV_SSE2
4570        v_scale_inv = _mm_set1_ps(1.f/255.f);
4571        v_scale = _mm_set1_ps(255.f);
4572        v_zero = _mm_setzero_si128();
4573        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
4574        #endif
4575    }
4576
4577    #if CV_SSE2
4578    void process(const float * buf,
4579                 __m128i & v_h, __m128i & v_l, __m128i & v_s) const
4580    {
4581        __m128 v_h0f = _mm_load_ps(buf);
4582        __m128 v_h1f = _mm_load_ps(buf + 4);
4583        __m128 v_l0f = _mm_load_ps(buf + 8);
4584        __m128 v_l1f = _mm_load_ps(buf + 12);
4585        __m128 v_s0f = _mm_load_ps(buf + 16);
4586        __m128 v_s1f = _mm_load_ps(buf + 20);
4587
4588        _mm_deinterleave_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f);
4589
4590        v_l0f = _mm_mul_ps(v_l0f, v_scale);
4591        v_l1f = _mm_mul_ps(v_l1f, v_scale);
4592        v_s0f = _mm_mul_ps(v_s0f, v_scale);
4593        v_s1f = _mm_mul_ps(v_s1f, v_scale);
4594
4595        v_h = _mm_packs_epi32(_mm_cvtps_epi32(v_h0f), _mm_cvtps_epi32(v_h1f));
4596        v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
4597        v_s = _mm_packs_epi32(_mm_cvtps_epi32(v_s0f), _mm_cvtps_epi32(v_s1f));
4598    }
4599    #endif
4600
4601    void operator()(const uchar* src, uchar* dst, int n) const
4602    {
4603        int i, j, scn = srccn;
4604        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
4605
4606        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
4607        {
4608            int dn = std::min(n - i, (int)BLOCK_SIZE);
4609            j = 0;
4610
4611            #if CV_NEON
4612            for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
4613            {
4614                uint16x8_t v_t0, v_t1, v_t2;
4615
4616                if (scn == 3)
4617                {
4618                    uint8x8x3_t v_src = vld3_u8(src);
4619                    v_t0 = vmovl_u8(v_src.val[0]);
4620                    v_t1 = vmovl_u8(v_src.val[1]);
4621                    v_t2 = vmovl_u8(v_src.val[2]);
4622                }
4623                else
4624                {
4625                    uint8x8x4_t v_src = vld4_u8(src);
4626                    v_t0 = vmovl_u8(v_src.val[0]);
4627                    v_t1 = vmovl_u8(v_src.val[1]);
4628                    v_t2 = vmovl_u8(v_src.val[2]);
4629                }
4630
4631                float32x4x3_t v_dst;
4632                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
4633                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
4634                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
4635                vst3q_f32(buf + j, v_dst);
4636
4637                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
4638                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
4639                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
4640                vst3q_f32(buf + j + 12, v_dst);
4641            }
4642            #elif CV_SSE2
4643            if (scn == 3 && haveSIMD)
4644            {
4645                for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
4646                {
4647                    __m128i v_src = _mm_loadu_si128((__m128i const *)src);
4648
4649                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
4650                    _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
4651                    _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
4652
4653                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
4654                    _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
4655                    _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
4656                }
4657
4658                int jr = j % 3;
4659                if (jr)
4660                    src -= jr, j -= jr;
4661            }
4662            #endif
4663            for( ; j < dn*3; j += 3, src += scn )
4664            {
4665                buf[j] = src[0]*(1.f/255.f);
4666                buf[j+1] = src[1]*(1.f/255.f);
4667                buf[j+2] = src[2]*(1.f/255.f);
4668            }
4669            cvt(buf, buf, dn);
4670
4671            j = 0;
4672            #if CV_NEON
4673            for ( ; j <= (dn - 8) * 3; j += 24)
4674            {
4675                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
4676
4677                uint8x8x3_t v_dst;
4678                v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])),
4679                                                       vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0]))));
4680                v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
4681                                                       vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
4682                v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
4683                                                       vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
4684                vst3_u8(dst + j, v_dst);
4685            }
4686            #elif CV_SSE2
4687            if (haveSIMD)
4688            {
4689                for ( ; j <= (dn - 32) * 3; j += 96)
4690                {
4691                    __m128i v_h_0, v_l_0, v_s_0;
4692                    process(buf + j,
4693                            v_h_0, v_l_0, v_s_0);
4694
4695                    __m128i v_h_1, v_l_1, v_s_1;
4696                    process(buf + j + 24,
4697                            v_h_1, v_l_1, v_s_1);
4698
4699                    __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1);
4700                    __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1);
4701                    __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1);
4702
4703                    process(buf + j + 48,
4704                            v_h_0, v_l_0, v_s_0);
4705
4706                    process(buf + j + 72,
4707                            v_h_1, v_l_1, v_s_1);
4708
4709                    __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1);
4710                    __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
4711                    __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1);
4712
4713                    _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
4714
4715                    _mm_storeu_si128((__m128i *)(dst + j), v_h0);
4716                    _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1);
4717                    _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0);
4718                    _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1);
4719                    _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0);
4720                    _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1);
4721                }
4722            }
4723            #endif
4724            for( ; j < dn*3; j += 3 )
4725            {
4726                dst[j] = saturate_cast<uchar>(buf[j]);
4727                dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
4728                dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
4729            }
4730        }
4731    }
4732
4733    int srccn;
4734    RGB2HLS_f cvt;
4735    #if CV_NEON
4736    float32x4_t v_scale, v_scale_inv;
4737    uint8x8_t v_alpha;
4738    #elif CV_SSE2
4739    __m128 v_scale, v_scale_inv;
4740    __m128i v_zero;
4741    bool haveSIMD;
4742    #endif
4743};
4744
4745
4746struct HLS2RGB_f
4747{
4748    typedef float channel_type;
4749
4750    HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
4751    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
4752
4753    void operator()(const float* src, float* dst, int n) const
4754    {
4755        int i, bidx = blueIdx, dcn = dstcn;
4756        float _hscale = hscale;
4757        float alpha = ColorChannel<float>::max();
4758        n *= 3;
4759
4760        for( i = 0; i < n; i += 3, dst += dcn )
4761        {
4762            float h = src[i], l = src[i+1], s = src[i+2];
4763            float b, g, r;
4764
4765            if( s == 0 )
4766                b = g = r = l;
4767            else
4768            {
4769                static const int sector_data[][3]=
4770                {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
4771                float tab[4];
4772                int sector;
4773
4774                float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
4775                float p1 = 2*l - p2;
4776
4777                h *= _hscale;
4778                if( h < 0 )
4779                    do h += 6; while( h < 0 );
4780                else if( h >= 6 )
4781                    do h -= 6; while( h >= 6 );
4782
4783                assert( 0 <= h && h < 6 );
4784                sector = cvFloor(h);
4785                h -= sector;
4786
4787                tab[0] = p2;
4788                tab[1] = p1;
4789                tab[2] = p1 + (p2 - p1)*(1-h);
4790                tab[3] = p1 + (p2 - p1)*h;
4791
4792                b = tab[sector_data[sector][0]];
4793                g = tab[sector_data[sector][1]];
4794                r = tab[sector_data[sector][2]];
4795            }
4796
4797            dst[bidx] = b;
4798            dst[1] = g;
4799            dst[bidx^2] = r;
4800            if( dcn == 4 )
4801                dst[3] = alpha;
4802        }
4803    }
4804
4805    int dstcn, blueIdx;
4806    float hscale;
4807};
4808
4809
// 8-bit HLS -> RGB/BGR converter built on top of HLS2RGB_f.
//
// Pixels are handled in chunks of at most BLOCK_SIZE: the three uchar channels
// are expanded into a float scratch buffer (channel 0 unscaled, channels 1 and 2
// multiplied by 1/255), HLS2RGB_f converts the buffer in place, and the result
// is multiplied by 255 and stored as uchar. NEON and SSE2 builds vectorise the
// expansion and the final store; the scalar loops finish whatever is left.
struct HLS2RGB_b
{
    typedef uchar channel_type;

    // _dstcn is the number of output channels (an alpha byte is written when it
    // is 4). The float converter is always created with 3 output channels, so
    // alpha is only added during the final store.
    HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
    : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
    {
        #if CV_NEON
        v_scale_inv = vdupq_n_f32(1.f/255.f);
        v_scale = vdupq_n_f32(255.f);
        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
        #elif CV_SSE2
        v_scale_inv = _mm_set1_ps(1.f/255.f);
        v_scale = _mm_set1_ps(255.f);
        v_zero = _mm_setzero_si128();
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        #endif
    }

    #if CV_SSE2
    // 16s x 8
    // Expands eight pixels given as three planar vectors of 16-bit values into
    // 24 floats at buf. Despite the r/g/b names, the call sites pass channels
    // 0, 1 and 2 of the HLS input: channel 0 is left unscaled, channels 1 and 2
    // are multiplied by 1/255. buf must be 16-byte aligned (aligned stores).
    void process(__m128i v_r, __m128i v_g, __m128i v_b,
                 float * buf) const
    {
        // Zero-extend the low four 16-bit lanes to 32 bits and convert to float.
        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));

        // Same for the high four lanes.
        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));

        v_g0 = _mm_mul_ps(v_g0, v_scale_inv);
        v_b0 = _mm_mul_ps(v_b0, v_scale_inv);

        v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
        v_b1 = _mm_mul_ps(v_b1, v_scale_inv);

        // Helper defined elsewhere; presumably turns the planar registers into
        // pixel-interleaved order in place - confirm against its definition.
        _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

        _mm_store_ps(buf, v_r0);
        _mm_store_ps(buf + 4, v_r1);
        _mm_store_ps(buf + 8, v_g0);
        _mm_store_ps(buf + 12, v_g1);
        _mm_store_ps(buf + 16, v_b0);
        _mm_store_ps(buf + 20, v_b1);
    }
    #endif

    // Converts n pixels: reads 3 bytes per pixel from src and writes dstcn bytes
    // per pixel to dst.
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i, j, dcn = dstcn;
        uchar alpha = ColorChannel<uchar>::max();
        // Scratch buffer for one chunk; 16-byte aligned for the SSE2 aligned
        // loads and stores.
        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];

        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
        {
            int dn = std::min(n - i, (int)BLOCK_SIZE);   // pixels in this chunk
            j = 0;

            // ---- Stage 1: uchar HLS -> float scratch buffer ----
            #if CV_NEON
            // 8 pixels per iteration; vld3_u8 splits the channels and vst3q_f32
            // writes them back interleaved.
            for ( ; j <= (dn - 8) * 3; j += 24)
            {
                uint8x8x3_t v_src = vld3_u8(src + j);
                uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
                           v_t1 = vmovl_u8(v_src.val[1]),
                           v_t2 = vmovl_u8(v_src.val[2]);

                float32x4x3_t v_dst;
                v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
                vst3q_f32(buf + j, v_dst);

                v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
                vst3q_f32(buf + j + 12, v_dst);
            }
            #elif CV_SSE2
            if (haveSIMD)
            {
                // 32 pixels (96 bytes) per iteration.
                for ( ; j <= (dn - 32) * 3; j += 96)
                {
                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));

                    // Helper defined elsewhere; presumably separates the three
                    // interleaved channels into planar registers in place.
                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

                    // Widen bytes to 16 bits and expand 8 pixels per call;
                    // each call fills 24 floats of buf.
                    process(_mm_unpacklo_epi8(v_r0, v_zero),
                            _mm_unpacklo_epi8(v_g0, v_zero),
                            _mm_unpacklo_epi8(v_b0, v_zero),
                            buf + j);

                    process(_mm_unpackhi_epi8(v_r0, v_zero),
                            _mm_unpackhi_epi8(v_g0, v_zero),
                            _mm_unpackhi_epi8(v_b0, v_zero),
                            buf + j + 24);

                    process(_mm_unpacklo_epi8(v_r1, v_zero),
                            _mm_unpacklo_epi8(v_g1, v_zero),
                            _mm_unpacklo_epi8(v_b1, v_zero),
                            buf + j + 48);

                    process(_mm_unpackhi_epi8(v_r1, v_zero),
                            _mm_unpackhi_epi8(v_g1, v_zero),
                            _mm_unpackhi_epi8(v_b1, v_zero),
                            buf + j + 72);
                }
            }
            #endif
            // Scalar tail (and the whole chunk when no SIMD path ran).
            for( ; j < dn*3; j += 3 )
            {
                buf[j] = src[j];
                buf[j+1] = src[j+1]*(1.f/255.f);
                buf[j+2] = src[j+2]*(1.f/255.f);
            }
            // ---- Stage 2: float HLS -> float RGB, in place ----
            cvt(buf, buf, dn);

            // ---- Stage 3: float RGB * 255 -> uchar output ----
            j = 0;
            #if CV_NEON
            // 8 pixels per iteration; cv_vrndq_u32_f32 is a helper defined
            // elsewhere (presumably float -> rounded u32), followed by
            // saturating narrowing to 8 bits.
            for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
            {
                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
                uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
                uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
                uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));

                if (dcn == 4)
                {
                    uint8x8x4_t v_dst;
                    v_dst.val[0] = v_dst0;
                    v_dst.val[1] = v_dst1;
                    v_dst.val[2] = v_dst2;
                    v_dst.val[3] = v_alpha;
                    vst4_u8(dst, v_dst);
                }
                else
                {
                    uint8x8x3_t v_dst;
                    v_dst.val[0] = v_dst0;
                    v_dst.val[1] = v_dst1;
                    v_dst.val[2] = v_dst2;
                    vst3_u8(dst, v_dst);
                }
            }
            #elif CV_SSE2
            // Only the 3-channel layout is vectorised: every float gets the same
            // 255 scale, so 16 consecutive floats map to 16 consecutive bytes
            // without regard to pixel boundaries.
            if (dcn == 3 && haveSIMD)
            {
                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
                {
                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);

                    // float -> int32 -> int16 (signed saturation) -> uint8
                    // (unsigned saturation).
                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
                                                     _mm_cvtps_epi32(v_src1));
                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
                                                     _mm_cvtps_epi32(v_src3));

                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
                }

                // The 16-float stride can stop in the middle of a pixel; step
                // back to that pixel's first channel so the scalar loop below
                // rewrites it completely.
                int jr = j % 3;
                if (jr)
                    dst -= jr, j -= jr;
            }
            #endif

            // Scalar tail; also the only path that writes alpha on non-NEON builds.
            for( ; j < dn*3; j += 3, dst += dcn )
            {
                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
                if( dcn == 4 )
                    dst[3] = alpha;
            }
        }
    }

    int dstcn;          // number of output channels
    HLS2RGB_f cvt;      // float converter applied to the scratch buffer
    #if CV_NEON
    float32x4_t v_scale, v_scale_inv;   // 255 and 1/255 broadcast to all lanes
    uint8x8_t v_alpha;                  // alpha byte broadcast to all lanes
    #elif CV_SSE2
    __m128 v_scale, v_scale_inv;        // 255 and 1/255 broadcast to all lanes
    __m128i v_zero;                     // all-zero register used for widening
    bool haveSIMD;                      // runtime SSE2 availability
    #endif
};
5009
5010
5011///////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////
5012
// Default white point (X, Y, Z) used by the Lab/Luv converters below when the
// caller does not supply one.
static const float D65[] = { 0.950456f, 1.f, 1.088754f };

// Number of intervals in the float cube-root and gamma tables.
enum { LAB_CBRT_TAB_SIZE = 1024, GAMMA_TAB_SIZE = 1024 };
// Filled by splineBuild() in initLabTabs(); four floats per interval
// (presumably the spline coefficients - see splineBuild).
static float LabCbrtTab[LAB_CBRT_TAB_SIZE*4];
// Maps an argument in [0, 1.5] to a position in LabCbrtTab.
static const float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;

// sRGB gamma table and the table built from the inverse curve; both are filled
// by splineBuild() in initLabTabs() from samples over [0, 1].
static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];
// Maps an argument in [0, 1] to a position in the gamma tables.
static const float GammaTabScale = (float)GAMMA_TAB_SIZE;

// 8-bit input -> fixed-point value tables (sRGB-decoded and identity); entries
// are scaled by (1 << gamma_shift) per 8-bit step, see initLabTabs().
static ushort sRGBGammaTab_b[256], linearGammaTab_b[256];
// The integer Lab path reuses the fixed-point shift xyz_shift, which is defined
// elsewhere in this file.
#undef lab_shift
#define lab_shift xyz_shift
// Extra fractional bits carried by the 8-bit gamma tables.
#define gamma_shift 3
// Fixed-point shift of the values stored in LabCbrtTab_b.
#define lab_shift2 (lab_shift + gamma_shift)
// Entry count of the integer cube-root table: 1.5 * 256 * (1 << gamma_shift).
#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
5029
5030static void initLabTabs()
5031{
5032    static bool initialized = false;
5033    if(!initialized)
5034    {
5035        float f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1], scale = 1.f/LabCbrtTabScale;
5036        int i;
5037        for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)
5038        {
5039            float x = i*scale;
5040            f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x);
5041        }
5042        splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab);
5043
5044        scale = 1.f/GammaTabScale;
5045        for(i = 0; i <= GAMMA_TAB_SIZE; i++)
5046        {
5047            float x = i*scale;
5048            g[i] = x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4);
5049            ig[i] = x <= 0.0031308 ? x*12.92f : (float)(1.055*std::pow((double)x, 1./2.4) - 0.055);
5050        }
5051        splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab);
5052        splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab);
5053
5054        for(i = 0; i < 256; i++)
5055        {
5056            float x = i*(1.f/255.f);
5057            sRGBGammaTab_b[i] = saturate_cast<ushort>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4)));
5058            linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
5059        }
5060
5061        for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
5062        {
5063            float x = i*(1.f/(255.f*(1 << gamma_shift)));
5064            LabCbrtTab_b[i] = saturate_cast<ushort>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
5065        }
5066        initialized = true;
5067    }
5068}
5069
5070struct RGB2Lab_b
5071{
5072    typedef uchar channel_type;
5073
5074    RGB2Lab_b(int _srccn, int blueIdx, const float* _coeffs,
5075              const float* _whitept, bool _srgb)
5076    : srccn(_srccn), srgb(_srgb)
5077    {
5078        static volatile int _3 = 3;
5079        initLabTabs();
5080
5081        if (!_coeffs)
5082            _coeffs = sRGB2XYZ_D65;
5083        if (!_whitept)
5084            _whitept = D65;
5085
5086        float scale[] =
5087        {
5088            (1 << lab_shift)/_whitept[0],
5089            (float)(1 << lab_shift),
5090            (1 << lab_shift)/_whitept[2]
5091        };
5092
5093        for( int i = 0; i < _3; i++ )
5094        {
5095            coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]);
5096            coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
5097            coeffs[i*3+blueIdx] = cvRound(_coeffs[i*3+2]*scale[i]);
5098
5099            CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
5100                      coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
5101        }
5102    }
5103
5104    void operator()(const uchar* src, uchar* dst, int n) const
5105    {
5106        const int Lscale = (116*255+50)/100;
5107        const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
5108        const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
5109        int i, scn = srccn;
5110        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5111            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5112            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5113        n *= 3;
5114
5115        for( i = 0; i < n; i += 3, src += scn )
5116        {
5117            int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
5118            int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
5119            int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
5120            int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
5121
5122            int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
5123            int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
5124            int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
5125
5126            dst[i] = saturate_cast<uchar>(L);
5127            dst[i+1] = saturate_cast<uchar>(a);
5128            dst[i+2] = saturate_cast<uchar>(b);
5129        }
5130    }
5131
5132    int srccn;
5133    int coeffs[9];
5134    bool srgb;
5135};
5136
5137
// Clamps a float expression to [0, 1]. The expansion is fully parenthesised and
// carries no trailing semicolon, so it behaves like an ordinary expression.
// `value` is evaluated more than once: pass only side-effect-free arguments.
#define clip(value) \
    ((value) < 0.0f ? 0.0f : (value) > 1.0f ? 1.0f : (value))
5140
5141struct RGB2Lab_f
5142{
5143    typedef float channel_type;
5144
5145    RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,
5146              const float* _whitept, bool _srgb)
5147    : srccn(_srccn), srgb(_srgb)
5148    {
5149        volatile int _3 = 3;
5150        initLabTabs();
5151
5152        if (!_coeffs)
5153            _coeffs = sRGB2XYZ_D65;
5154        if (!_whitept)
5155            _whitept = D65;
5156
5157        float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
5158
5159        for( int i = 0; i < _3; i++ )
5160        {
5161            int j = i * 3;
5162            coeffs[j + (blueIdx ^ 2)] = _coeffs[j] * scale[i];
5163            coeffs[j + 1] = _coeffs[j + 1] * scale[i];
5164            coeffs[j + blueIdx] = _coeffs[j + 2] * scale[i];
5165
5166            CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
5167                       coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*LabCbrtTabScale );
5168        }
5169    }
5170
5171    void operator()(const float* src, float* dst, int n) const
5172    {
5173        int i, scn = srccn;
5174        float gscale = GammaTabScale;
5175        const float* gammaTab = srgb ? sRGBGammaTab : 0;
5176        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5177              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5178              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5179        n *= 3;
5180
5181        static const float _1_3 = 1.0f / 3.0f;
5182        static const float _a = 16.0f / 116.0f;
5183        for (i = 0; i < n; i += 3, src += scn )
5184        {
5185            float R = clip(src[0]);
5186            float G = clip(src[1]);
5187            float B = clip(src[2]);
5188
5189            if (gammaTab)
5190            {
5191                R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
5192                G = splineInterpolate(G * gscale, gammaTab, GAMMA_TAB_SIZE);
5193                B = splineInterpolate(B * gscale, gammaTab, GAMMA_TAB_SIZE);
5194            }
5195            float X = R*C0 + G*C1 + B*C2;
5196            float Y = R*C3 + G*C4 + B*C5;
5197            float Z = R*C6 + G*C7 + B*C8;
5198
5199            float FX = X > 0.008856f ? std::pow(X, _1_3) : (7.787f * X + _a);
5200            float FY = Y > 0.008856f ? std::pow(Y, _1_3) : (7.787f * Y + _a);
5201            float FZ = Z > 0.008856f ? std::pow(Z, _1_3) : (7.787f * Z + _a);
5202
5203            float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
5204            float a = 500.f * (FX - FY);
5205            float b = 200.f * (FY - FZ);
5206
5207            dst[i] = L;
5208            dst[i + 1] = a;
5209            dst[i + 2] = b;
5210        }
5211    }
5212
5213    int srccn;
5214    float coeffs[9];
5215    bool srgb;
5216};
5217
5218struct Lab2RGB_f
5219{
5220    typedef float channel_type;
5221
5222    Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
5223              const float* _whitept, bool _srgb )
5224    : dstcn(_dstcn), srgb(_srgb)
5225    {
5226        initLabTabs();
5227
5228        if(!_coeffs)
5229            _coeffs = XYZ2sRGB_D65;
5230        if(!_whitept)
5231            _whitept = D65;
5232
5233        for( int i = 0; i < 3; i++ )
5234        {
5235            coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i];
5236            coeffs[i+3] = _coeffs[i+3]*_whitept[i];
5237            coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];
5238        }
5239    }
5240
5241    void operator()(const float* src, float* dst, int n) const
5242    {
5243        int i, dcn = dstcn;
5244        const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
5245        float gscale = GammaTabScale;
5246        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5247        C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5248        C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5249        float alpha = ColorChannel<float>::max();
5250        n *= 3;
5251
5252        static const float lThresh = 0.008856f * 903.3f;
5253        static const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
5254        for (i = 0; i < n; i += 3, dst += dcn)
5255        {
5256            float li = src[i];
5257            float ai = src[i + 1];
5258            float bi = src[i + 2];
5259
5260            float y, fy;
5261            if (li <= lThresh)
5262            {
5263                y = li / 903.3f;
5264                fy = 7.787f * y + 16.0f / 116.0f;
5265            }
5266            else
5267            {
5268                fy = (li + 16.0f) / 116.0f;
5269                y = fy * fy * fy;
5270            }
5271
5272            float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
5273
5274            for (int j = 0; j < 2; j++)
5275                if (fxz[j] <= fThresh)
5276                    fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
5277                else
5278                    fxz[j] = fxz[j] * fxz[j] * fxz[j];
5279
5280
5281            float x = fxz[0], z = fxz[1];
5282            float ro = C0 * x + C1 * y + C2 * z;
5283            float go = C3 * x + C4 * y + C5 * z;
5284            float bo = C6 * x + C7 * y + C8 * z;
5285            ro = clip(ro);
5286            go = clip(go);
5287            bo = clip(bo);
5288
5289            if (gammaTab)
5290            {
5291                ro = splineInterpolate(ro * gscale, gammaTab, GAMMA_TAB_SIZE);
5292                go = splineInterpolate(go * gscale, gammaTab, GAMMA_TAB_SIZE);
5293                bo = splineInterpolate(bo * gscale, gammaTab, GAMMA_TAB_SIZE);
5294            }
5295
5296            dst[0] = ro, dst[1] = go, dst[2] = bo;
5297            if( dcn == 4 )
5298                dst[3] = alpha;
5299        }
5300    }
5301
5302    int dstcn;
5303    float coeffs[9];
5304    bool srgb;
5305};
5306
5307#undef clip
5308
// 8-bit CIE L*a*b* -> RGB/BGR converter built on top of Lab2RGB_f.
//
// Pixels are handled in chunks of at most BLOCK_SIZE: the uchar channels are
// expanded into a float scratch buffer (L multiplied by 100/255, a and b shifted
// by -128), Lab2RGB_f converts the buffer in place, and the result is multiplied
// by 255 and stored as uchar. NEON and SSE2 builds vectorise the expansion and
// the final store; the scalar loops finish whatever is left.
struct Lab2RGB_b
{
    typedef uchar channel_type;

    // _dstcn is the number of output channels (an alpha byte is written when it
    // is 4); the remaining arguments are forwarded to Lab2RGB_f, which is always
    // created with 3 output channels.
    Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
               const float* _whitept, bool _srgb )
    : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
    {
        #if CV_NEON
        v_scale_inv = vdupq_n_f32(100.f/255.f);
        v_scale = vdupq_n_f32(255.f);
        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
        v_128 = vdupq_n_f32(128.0f);
        #elif CV_SSE2
        v_scale_inv = _mm_set1_ps(100.f/255.f);
        v_scale = _mm_set1_ps(255.f);
        v_128 = _mm_set1_ps(128.0f);
        v_zero = _mm_setzero_si128();
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        #endif
    }

    #if CV_SSE2
    // 16s x 8
    // Expands eight pixels given as three planar vectors of 16-bit values into
    // 24 floats at buf. Despite the r/g/b names, the call sites pass channels
    // 0, 1 and 2 of the Lab input: channel 0 is multiplied by 100/255, channels
    // 1 and 2 have 128 subtracted. buf must be 16-byte aligned (aligned stores).
    void process(__m128i v_r, __m128i v_g, __m128i v_b,
                 float * buf) const
    {
        // Zero-extend the low four 16-bit lanes to 32 bits and convert to float.
        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));

        // Same for the high four lanes.
        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));

        v_r0 = _mm_mul_ps(v_r0, v_scale_inv);
        v_r1 = _mm_mul_ps(v_r1, v_scale_inv);

        v_g0 = _mm_sub_ps(v_g0, v_128);
        v_g1 = _mm_sub_ps(v_g1, v_128);
        v_b0 = _mm_sub_ps(v_b0, v_128);
        v_b1 = _mm_sub_ps(v_b1, v_128);

        // Helper defined elsewhere; presumably turns the planar registers into
        // pixel-interleaved order in place - confirm against its definition.
        _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

        _mm_store_ps(buf, v_r0);
        _mm_store_ps(buf + 4, v_r1);
        _mm_store_ps(buf + 8, v_g0);
        _mm_store_ps(buf + 12, v_g1);
        _mm_store_ps(buf + 16, v_b0);
        _mm_store_ps(buf + 20, v_b1);
    }
    #endif

    // Converts n pixels: reads 3 bytes per pixel from src and writes dstcn bytes
    // per pixel to dst.
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i, j, dcn = dstcn;
        uchar alpha = ColorChannel<uchar>::max();
        // Scratch buffer for one chunk; 16-byte aligned for the SSE2 aligned
        // loads and stores.
        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];

        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
        {
            int dn = std::min(n - i, (int)BLOCK_SIZE);   // pixels in this chunk
            j = 0;

            // ---- Stage 1: uchar Lab -> float scratch buffer ----
            #if CV_NEON
            // 8 pixels per iteration; vld3_u8 splits the channels and vst3q_f32
            // writes them back interleaved.
            for ( ; j <= (dn - 8) * 3; j += 24)
            {
                uint8x8x3_t v_src = vld3_u8(src + j);
                uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
                           v_t1 = vmovl_u8(v_src.val[1]),
                           v_t2 = vmovl_u8(v_src.val[2]);

                float32x4x3_t v_dst;
                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
                v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_128);
                v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_128);
                vst3q_f32(buf + j, v_dst);

                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
                v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_128);
                v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128);
                vst3q_f32(buf + j + 12, v_dst);
            }
            #elif CV_SSE2
            if (haveSIMD)
            {
                // 32 pixels (96 bytes) per iteration.
                for ( ; j <= (dn - 32) * 3; j += 96)
                {
                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));

                    // Helper defined elsewhere; presumably separates the three
                    // interleaved channels into planar registers in place.
                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

                    // Widen bytes to 16 bits and expand 8 pixels per call;
                    // each call fills 24 floats of buf.
                    process(_mm_unpacklo_epi8(v_r0, v_zero),
                            _mm_unpacklo_epi8(v_g0, v_zero),
                            _mm_unpacklo_epi8(v_b0, v_zero),
                            buf + j);

                    process(_mm_unpackhi_epi8(v_r0, v_zero),
                            _mm_unpackhi_epi8(v_g0, v_zero),
                            _mm_unpackhi_epi8(v_b0, v_zero),
                            buf + j + 24);

                    process(_mm_unpacklo_epi8(v_r1, v_zero),
                            _mm_unpacklo_epi8(v_g1, v_zero),
                            _mm_unpacklo_epi8(v_b1, v_zero),
                            buf + j + 48);

                    process(_mm_unpackhi_epi8(v_r1, v_zero),
                            _mm_unpackhi_epi8(v_g1, v_zero),
                            _mm_unpackhi_epi8(v_b1, v_zero),
                            buf + j + 72);
                }
            }
            #endif

            // Scalar tail (and the whole chunk when no SIMD path ran).
            for( ; j < dn*3; j += 3 )
            {
                buf[j] = src[j]*(100.f/255.f);
                buf[j+1] = (float)(src[j+1] - 128);
                buf[j+2] = (float)(src[j+2] - 128);
            }
            // ---- Stage 2: float Lab -> float RGB, in place ----
            cvt(buf, buf, dn);
            j = 0;

            // ---- Stage 3: float RGB * 255 -> uchar output ----
            #if CV_NEON
            // 8 pixels per iteration; cv_vrndq_u32_f32 is a helper defined
            // elsewhere (presumably float -> rounded u32), followed by
            // saturating narrowing to 8 bits.
            for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
            {
                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
                uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
                uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
                uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));

                if (dcn == 4)
                {
                    uint8x8x4_t v_dst;
                    v_dst.val[0] = v_dst0;
                    v_dst.val[1] = v_dst1;
                    v_dst.val[2] = v_dst2;
                    v_dst.val[3] = v_alpha;
                    vst4_u8(dst, v_dst);
                }
                else
                {
                    uint8x8x3_t v_dst;
                    v_dst.val[0] = v_dst0;
                    v_dst.val[1] = v_dst1;
                    v_dst.val[2] = v_dst2;
                    vst3_u8(dst, v_dst);
                }
            }
            #elif CV_SSE2
            // Only the 3-channel layout is vectorised: every float gets the same
            // 255 scale, so 16 consecutive floats map to 16 consecutive bytes
            // without regard to pixel boundaries.
            if (dcn == 3 && haveSIMD)
            {
                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
                {
                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);

                    // float -> int32 -> int16 (signed saturation) -> uint8
                    // (unsigned saturation).
                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
                                                     _mm_cvtps_epi32(v_src1));
                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
                                                     _mm_cvtps_epi32(v_src3));

                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
                }

                // The 16-float stride can stop in the middle of a pixel; step
                // back to that pixel's first channel so the scalar loop below
                // rewrites it completely.
                int jr = j % 3;
                if (jr)
                    dst -= jr, j -= jr;
            }
            #endif

            // Scalar tail; also the only path that writes alpha on non-NEON builds.
            for( ; j < dn*3; j += 3, dst += dcn )
            {
                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
                if( dcn == 4 )
                    dst[3] = alpha;
            }
        }
    }

    int dstcn;          // number of output channels
    Lab2RGB_f cvt;      // float converter applied to the scratch buffer

    #if CV_NEON
    float32x4_t v_scale, v_scale_inv, v_128;   // 255, 100/255 and 128 broadcast to all lanes
    uint8x8_t v_alpha;                         // alpha byte broadcast to all lanes
    #elif CV_SSE2
    __m128 v_scale, v_scale_inv, v_128;        // 255, 100/255 and 128 broadcast to all lanes
    __m128i v_zero;                            // all-zero register used for widening
    bool haveSIMD;                             // runtime SSE2 availability
    #endif
};
5515
5516
5517///////////////////////////////////// RGB <-> L*u*v* /////////////////////////////////////
5518
5519struct RGB2Luv_f
5520{
5521    typedef float channel_type;
5522
5523    RGB2Luv_f( int _srccn, int blueIdx, const float* _coeffs,
5524               const float* whitept, bool _srgb )
5525    : srccn(_srccn), srgb(_srgb)
5526    {
5527        volatile int i;
5528        initLabTabs();
5529
5530        if(!_coeffs) _coeffs = sRGB2XYZ_D65;
5531        if(!whitept) whitept = D65;
5532
5533        for( i = 0; i < 3; i++ )
5534        {
5535            coeffs[i*3] = _coeffs[i*3];
5536            coeffs[i*3+1] = _coeffs[i*3+1];
5537            coeffs[i*3+2] = _coeffs[i*3+2];
5538            if( blueIdx == 0 )
5539                std::swap(coeffs[i*3], coeffs[i*3+2]);
5540            CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
5541                      coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f );
5542        }
5543
5544        float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
5545        un = 4*whitept[0]*d;
5546        vn = 9*whitept[1]*d;
5547
5548        CV_Assert(whitept[1] == 1.f);
5549    }
5550
5551    void operator()(const float* src, float* dst, int n) const
5552    {
5553        int i, scn = srccn;
5554        float gscale = GammaTabScale;
5555        const float* gammaTab = srgb ? sRGBGammaTab : 0;
5556        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5557              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5558              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5559        float _un = 13*un, _vn = 13*vn;
5560        n *= 3;
5561
5562        for( i = 0; i < n; i += 3, src += scn )
5563        {
5564            float R = src[0], G = src[1], B = src[2];
5565            if( gammaTab )
5566            {
5567                R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
5568                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
5569                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
5570            }
5571
5572            float X = R*C0 + G*C1 + B*C2;
5573            float Y = R*C3 + G*C4 + B*C5;
5574            float Z = R*C6 + G*C7 + B*C8;
5575
5576            float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
5577            L = 116.f*L - 16.f;
5578
5579            float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
5580            float u = L*(X*d - _un);
5581            float v = L*((9*0.25f)*Y*d - _vn);
5582
5583            dst[i] = L; dst[i+1] = u; dst[i+2] = v;
5584        }
5585    }
5586
5587    int srccn;
5588    float coeffs[9], un, vn;
5589    bool srgb;
5590};
5591
5592
5593struct Luv2RGB_f
5594{
5595    typedef float channel_type;
5596
5597    Luv2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
5598              const float* whitept, bool _srgb )
5599    : dstcn(_dstcn), srgb(_srgb)
5600    {
5601        initLabTabs();
5602
5603        if(!_coeffs) _coeffs = XYZ2sRGB_D65;
5604        if(!whitept) whitept = D65;
5605
5606        for( int i = 0; i < 3; i++ )
5607        {
5608            coeffs[i+(blueIdx^2)*3] = _coeffs[i];
5609            coeffs[i+3] = _coeffs[i+3];
5610            coeffs[i+blueIdx*3] = _coeffs[i+6];
5611        }
5612
5613        float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
5614        un = 4*whitept[0]*d;
5615        vn = 9*whitept[1]*d;
5616
5617        CV_Assert(whitept[1] == 1.f);
5618    }
5619
5620    void operator()(const float* src, float* dst, int n) const
5621    {
5622        int i, dcn = dstcn;
5623        const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
5624        float gscale = GammaTabScale;
5625        float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5626              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5627              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5628        float alpha = ColorChannel<float>::max();
5629        float _un = un, _vn = vn;
5630        n *= 3;
5631
5632        for( i = 0; i < n; i += 3, dst += dcn )
5633        {
5634            float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z;
5635            Y = (L + 16.f) * (1.f/116.f);
5636            Y = Y*Y*Y;
5637            d = (1.f/13.f)/L;
5638            u = u*d + _un;
5639            v = v*d + _vn;
5640            float iv = 1.f/v;
5641            X = 2.25f * u * Y * iv ;
5642            Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
5643
5644            float R = X*C0 + Y*C1 + Z*C2;
5645            float G = X*C3 + Y*C4 + Z*C5;
5646            float B = X*C6 + Y*C7 + Z*C8;
5647
5648            R = std::min(std::max(R, 0.f), 1.f);
5649            G = std::min(std::max(G, 0.f), 1.f);
5650            B = std::min(std::max(B, 0.f), 1.f);
5651
5652            if( gammaTab )
5653            {
5654                R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
5655                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
5656                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
5657            }
5658
5659            dst[0] = R; dst[1] = G; dst[2] = B;
5660            if( dcn == 4 )
5661                dst[3] = alpha;
5662        }
5663    }
5664
5665    int dstcn;
5666    float coeffs[9], un, vn;
5667    bool srgb;
5668};
5669
5670
// 8-bit RGB -> L*u*v* converter. Input bytes are scaled to floats in [0, 1],
// converted by the float functor RGB2Luv_f, and the float L, u, v values are
// then mapped back to bytes as
//     L8 = L * 2.55
//     u8 = u * 0.72033898305084743 + 96.525423728813564   (255/354 and 134*255/354)
//     v8 = v * 0.9732824427480916  + 136.259541984732824  (255/262 and 140*255/262)
// with saturation. NEON and SSE2 builds vectorise the byte<->float steps; a scalar
// loop always finishes whatever the vector loops left over.
struct RGB2Luv_b
{
    typedef uchar channel_type;

    // _srccn is the number of bytes per input pixel; the remaining arguments are
    // forwarded to the inner float converter, which is always built for 3 channels
    // because it only ever sees the packed intermediate buffer.
    RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,
               const float* _whitept, bool _srgb )
    : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb)
    {
        // Broadcast the same constants the scalar loops in operator() use.
        #if CV_NEON
        v_scale_inv = vdupq_n_f32(1.f/255.f);
        v_scale = vdupq_n_f32(2.55f);
        v_coeff1 = vdupq_n_f32(0.72033898305084743f);
        v_coeff2 = vdupq_n_f32(96.525423728813564f);
        v_coeff3 = vdupq_n_f32(0.9732824427480916f);
        v_coeff4 = vdupq_n_f32(136.259541984732824f);
        // NOTE(review): v_alpha is initialised here but not referenced by any
        // NEON path of this struct.
        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
        #elif CV_SSE2
        v_zero = _mm_setzero_si128();
        v_scale_inv = _mm_set1_ps(1.f/255.f);
        v_scale = _mm_set1_ps(2.55f);
        v_coeff1 = _mm_set1_ps(0.72033898305084743f);
        v_coeff2 = _mm_set1_ps(96.525423728813564f);
        v_coeff3 = _mm_set1_ps(0.9732824427480916f);
        v_coeff4 = _mm_set1_ps(136.259541984732824f);
        // Run-time check; the SSE2 loops are skipped when it fails.
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        #endif
    }

    #if CV_SSE2
    // Converts 8 pixels (24 floats starting at buf, which must be 16-byte aligned
    // because aligned loads are used) into three registers of eight 16-bit values:
    // v_l, v_u, v_v. The interleaved data is split into planes by
    // _mm_deinterleave_ps (helper defined outside this struct), scaled/offset with
    // the output constants, rounded to int32 and packed with signed saturation.
    void process(const float * buf,
                 __m128i & v_l, __m128i & v_u, __m128i & v_v) const
    {
        __m128 v_l0f = _mm_load_ps(buf);
        __m128 v_l1f = _mm_load_ps(buf + 4);
        __m128 v_u0f = _mm_load_ps(buf + 8);
        __m128 v_u1f = _mm_load_ps(buf + 12);
        __m128 v_v0f = _mm_load_ps(buf + 16);
        __m128 v_v1f = _mm_load_ps(buf + 20);

        _mm_deinterleave_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f);

        v_l0f = _mm_mul_ps(v_l0f, v_scale);
        v_l1f = _mm_mul_ps(v_l1f, v_scale);
        v_u0f = _mm_add_ps(_mm_mul_ps(v_u0f, v_coeff1), v_coeff2);
        v_u1f = _mm_add_ps(_mm_mul_ps(v_u1f, v_coeff1), v_coeff2);
        v_v0f = _mm_add_ps(_mm_mul_ps(v_v0f, v_coeff3), v_coeff4);
        v_v1f = _mm_add_ps(_mm_mul_ps(v_v1f, v_coeff3), v_coeff4);

        v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
        v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f));
        v_v = _mm_packs_epi32(_mm_cvtps_epi32(v_v0f), _mm_cvtps_epi32(v_v1f));
    }
    #endif

    // Converts n pixels from src (srccn bytes per pixel, only the first three are
    // read) to dst (always 3 bytes per pixel). Work is done in blocks of at most
    // BLOCK_SIZE pixels through a stack buffer of 3*BLOCK_SIZE floats.
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i, j, scn = srccn;
        // Aligned so the SSE2 paths may use aligned loads/stores on it.
        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];

        // src is advanced inside the input loops; dst is advanced once per block.
        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
        {
            int dn = std::min(n - i, (int)BLOCK_SIZE);
            j = 0;

            // ---- stage 1: bytes -> floats in [0, 1], packed 3 per pixel in buf ----
            #if CV_NEON
            // 8 pixels per iteration; a 4th input channel, if present, is dropped.
            for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
            {
                uint16x8_t v_t0, v_t1, v_t2;

                if (scn == 3)
                {
                    uint8x8x3_t v_src = vld3_u8(src);
                    v_t0 = vmovl_u8(v_src.val[0]);
                    v_t1 = vmovl_u8(v_src.val[1]);
                    v_t2 = vmovl_u8(v_src.val[2]);
                }
                else
                {
                    uint8x8x4_t v_src = vld4_u8(src);
                    v_t0 = vmovl_u8(v_src.val[0]);
                    v_t1 = vmovl_u8(v_src.val[1]);
                    v_t2 = vmovl_u8(v_src.val[2]);
                }

                float32x4x3_t v_dst;
                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
                vst3q_f32(buf + j, v_dst);

                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
                vst3q_f32(buf + j + 12, v_dst);
            }
            #elif CV_SSE2
            // Only for packed 3-channel input: every byte gets the same scale, so the
            // data can be processed 16 bytes at a time regardless of pixel boundaries.
            if (scn == 3 && haveSIMD)
            {
                for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)src);

                    __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
                    _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
                    _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));

                    v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
                    _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
                    _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
                }

                // 16 is not a multiple of 3: step back to the last pixel boundary so
                // the scalar loop below restarts on a whole pixel.
                int jr = j % 3;
                if (jr)
                    src -= jr, j -= jr;
            }
            #endif
            // Scalar tail (and the whole block when no vector path ran).
            for( ; j < dn*3; j += 3, src += scn )
            {
                buf[j] = src[0]*(1.f/255.f);
                buf[j+1] = (float)(src[1]*(1.f/255.f));
                buf[j+2] = (float)(src[2]*(1.f/255.f));
            }
            // ---- stage 2: float RGB -> float Luv, in place ----
            cvt(buf, buf, dn);

            // ---- stage 3: float Luv -> bytes; dst is indexed by j within the block ----
            j = 0;
            #if CV_NEON
            for ( ; j <= (dn - 8) * 3; j += 24)
            {
                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);

                uint8x8x3_t v_dst;
                v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
                                                       vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
                v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[1], v_coeff1), v_coeff2))),
                                                       vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[1], v_coeff1), v_coeff2)))));
                v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[2], v_coeff3), v_coeff4))),
                                                       vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[2], v_coeff3), v_coeff4)))));

                vst3_u8(dst + j, v_dst);
            }
            #elif CV_SSE2
            if (haveSIMD)
            {
                // 32 pixels (96 floats -> 96 bytes) per iteration: four process()
                // calls of 8 pixels each, packed to bytes with unsigned saturation,
                // re-interleaved by _mm_interleave_epi8 (helper defined elsewhere)
                // and written with six unaligned stores.
                for ( ; j <= (dn - 32) * 3; j += 96)
                {
                    __m128i v_l_0, v_u_0, v_v_0;
                    process(buf + j,
                            v_l_0, v_u_0, v_v_0);

                    __m128i v_l_1, v_u_1, v_v_1;
                    process(buf + j + 24,
                            v_l_1, v_u_1, v_v_1);

                    __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1);
                    __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1);
                    __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1);

                    process(buf + j + 48,
                            v_l_0, v_u_0, v_v_0);

                    process(buf + j + 72,
                            v_l_1, v_u_1, v_v_1);

                    __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
                    __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1);
                    __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1);

                    _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);

                    _mm_storeu_si128((__m128i *)(dst + j), v_l0);
                    _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1);
                    _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0);
                    _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1);
                    _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0);
                    _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1);
                }
            }
            #endif

            // Scalar tail with the same scale/offset constants as the vector paths.
            for( ; j < dn*3; j += 3 )
            {
                dst[j] = saturate_cast<uchar>(buf[j]*2.55f);
                dst[j+1] = saturate_cast<uchar>(buf[j+1]*0.72033898305084743f + 96.525423728813564f);
                dst[j+2] = saturate_cast<uchar>(buf[j+2]*0.9732824427480916f + 136.259541984732824f);
            }
        }
    }

    // Bytes per input pixel.
    int srccn;
    // Float converter applied to the intermediate buffer.
    RGB2Luv_f cvt;

    // Broadcast constants for the vector paths (set in the constructor).
    #if CV_NEON
    float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
    uint8x8_t v_alpha;
    #elif CV_SSE2
    __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
    __m128i v_zero;
    bool haveSIMD;
    #endif
};
5871
5872
// 8-bit L*u*v* -> RGB converter. Input bytes are expanded to float as
//     L = L8 * 100/255
//     u = u8 * 1.388235294117647 - 134     (354/255)
//     v = v8 * 1.027450980392157 - 140     (262/255)
// converted by the float functor Luv2RGB_f, and the float results are multiplied
// by 255 and saturated to bytes. NEON and SSE2 builds vectorise the byte<->float
// steps; a scalar loop always finishes whatever the vector loops left over.
struct Luv2RGB_b
{
    typedef uchar channel_type;

    // _dstcn is the number of bytes per output pixel (a 4th byte, when present, is
    // filled with the maximum channel value); the remaining arguments are forwarded
    // to the inner float converter, which is always built for 3 channels.
    Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
               const float* _whitept, bool _srgb )
    : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
    {
        // Broadcast the same constants the scalar loops in operator() use.
        #if CV_NEON
        v_scale_inv = vdupq_n_f32(100.f/255.f);
        v_coeff1 = vdupq_n_f32(1.388235294117647f);
        v_coeff2 = vdupq_n_f32(1.027450980392157f);
        v_134 = vdupq_n_f32(134.f);
        v_140 = vdupq_n_f32(140.f);
        v_scale = vdupq_n_f32(255.f);
        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
        #elif CV_SSE2
        v_scale_inv = _mm_set1_ps(100.f/255.f);
        v_coeff1 = _mm_set1_ps(1.388235294117647f);
        v_coeff2 = _mm_set1_ps(1.027450980392157f);
        v_134 = _mm_set1_ps(134.f);
        v_140 = _mm_set1_ps(140.f);
        v_scale = _mm_set1_ps(255.f);
        v_zero = _mm_setzero_si128();
        // Run-time check; the SSE2 loops are skipped when it fails.
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        #endif
    }

    #if CV_SSE2
    // Expands 8 pixels given as three registers of eight 16-bit values (v_l, v_u,
    // v_v) to floats, applies the input scale/offset, interleaves the planes with
    // _mm_interleave_ps (helper defined outside this struct) and writes 24 floats
    // to buf. buf must be 16-byte aligned because aligned stores are used.
    void process(__m128i v_l, __m128i v_u, __m128i v_v,
                 float * buf) const
    {
        __m128 v_l0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_l, v_zero));
        __m128 v_u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_u, v_zero));
        __m128 v_v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_v, v_zero));

        __m128 v_l1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_l, v_zero));
        __m128 v_u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_u, v_zero));
        __m128 v_v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_v, v_zero));

        v_l0 = _mm_mul_ps(v_l0, v_scale_inv);
        v_l1 = _mm_mul_ps(v_l1, v_scale_inv);

        v_u0 = _mm_sub_ps(_mm_mul_ps(v_u0, v_coeff1), v_134);
        v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134);
        v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140);
        v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140);

        _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);

        _mm_store_ps(buf, v_l0);
        _mm_store_ps(buf + 4, v_l1);
        _mm_store_ps(buf + 8, v_u0);
        _mm_store_ps(buf + 12, v_u1);
        _mm_store_ps(buf + 16, v_v0);
        _mm_store_ps(buf + 20, v_v1);
    }
    #endif

    // Converts n pixels from src (always 3 bytes per pixel) to dst (dstcn bytes per
    // pixel). Work is done in blocks of at most BLOCK_SIZE pixels through a stack
    // buffer of 3*BLOCK_SIZE floats.
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i, j, dcn = dstcn;
        uchar alpha = ColorChannel<uchar>::max();
        // Aligned so the SSE2 paths may use aligned loads/stores on it.
        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];

        // src is indexed by j within a block and advanced once per block; dst is
        // advanced inside the output loops.
        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
        {
            int dn = std::min(n - i, (int)BLOCK_SIZE);
            j = 0;

            // ---- stage 1: bytes -> float L, u, v, packed 3 per pixel in buf ----
            #if CV_NEON
            // 8 pixels per iteration.
            for ( ; j <= (dn - 8) * 3; j += 24)
            {
                uint8x8x3_t v_src = vld3_u8(src + j);
                uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
                           v_t1 = vmovl_u8(v_src.val[1]),
                           v_t2 = vmovl_u8(v_src.val[2]);

                float32x4x3_t v_dst;
                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
                v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_coeff1), v_134);
                v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_coeff2), v_140);
                vst3q_f32(buf + j, v_dst);

                v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
                v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_coeff1), v_134);
                v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140);
                vst3q_f32(buf + j + 12, v_dst);
            }
            #elif CV_SSE2
            if (haveSIMD)
            {
                // 32 pixels (96 bytes -> 96 floats) per iteration: six unaligned
                // loads, split into planes by _mm_deinterleave_epi8 (helper defined
                // elsewhere), then four process() calls of 8 pixels each.
                // NOTE(review): the registers are named r/g/b but hold the L, u, v
                // planes of the input.
                for ( ; j <= (dn - 32) * 3; j += 96)
                {
                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));

                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

                    process(_mm_unpacklo_epi8(v_r0, v_zero),
                            _mm_unpacklo_epi8(v_g0, v_zero),
                            _mm_unpacklo_epi8(v_b0, v_zero),
                            buf + j);

                    process(_mm_unpackhi_epi8(v_r0, v_zero),
                            _mm_unpackhi_epi8(v_g0, v_zero),
                            _mm_unpackhi_epi8(v_b0, v_zero),
                            buf + j + 24);

                    process(_mm_unpacklo_epi8(v_r1, v_zero),
                            _mm_unpacklo_epi8(v_g1, v_zero),
                            _mm_unpacklo_epi8(v_b1, v_zero),
                            buf + j + 48);

                    process(_mm_unpackhi_epi8(v_r1, v_zero),
                            _mm_unpackhi_epi8(v_g1, v_zero),
                            _mm_unpackhi_epi8(v_b1, v_zero),
                            buf + j + 72);
                }
            }
            #endif
            // Scalar tail with the same scale/offset constants as the vector paths.
            for( ; j < dn*3; j += 3 )
            {
                buf[j] = src[j]*(100.f/255.f);
                buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f);
                buf[j+2] = (float)(src[j+2]*1.027450980392157f - 140.f);
            }
            // ---- stage 2: float Luv -> float RGB, in place ----
            cvt(buf, buf, dn);

            // ---- stage 3: float RGB -> bytes (x255, saturated) ----
            j = 0;
            #if CV_NEON
            // 8 pixels per iteration, for both 3- and 4-channel output.
            for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
            {
                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
                uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
                uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
                uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));

                if (dcn == 4)
                {
                    uint8x8x4_t v_dst;
                    v_dst.val[0] = v_dst0;
                    v_dst.val[1] = v_dst1;
                    v_dst.val[2] = v_dst2;
                    v_dst.val[3] = v_alpha;
                    vst4_u8(dst, v_dst);
                }
                else
                {
                    uint8x8x3_t v_dst;
                    v_dst.val[0] = v_dst0;
                    v_dst.val[1] = v_dst1;
                    v_dst.val[2] = v_dst2;
                    vst3_u8(dst, v_dst);
                }
            }
            #elif CV_SSE2
            // Only for packed 3-channel output: every float gets the same scale, so
            // 16 values can be converted at a time regardless of pixel boundaries.
            if (dcn == 3 && haveSIMD)
            {
                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
                {
                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);

                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
                                                     _mm_cvtps_epi32(v_src1));
                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
                                                     _mm_cvtps_epi32(v_src3));

                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
                }

                // 16 is not a multiple of 3: step back to the last pixel boundary so
                // the scalar loop below restarts on a whole pixel (it rewrites the
                // bytes of that partial pixel).
                int jr = j % 3;
                if (jr)
                    dst -= jr, j -= jr;
            }
            #endif

            // Scalar tail (and the whole block when no vector path ran).
            for( ; j < dn*3; j += 3, dst += dcn )
            {
                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
                if( dcn == 4 )
                    dst[3] = alpha;
            }
        }
    }

    // Bytes per output pixel.
    int dstcn;
    // Float converter applied to the intermediate buffer.
    Luv2RGB_f cvt;

    // Broadcast constants for the vector paths (set in the constructor).
    #if CV_NEON
    float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
    uint8x8_t v_alpha;
    #elif CV_SSE2
    __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
    __m128i v_zero;
    bool haveSIMD;
    #endif
};
6084
6085
6086///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
6087
// Fixed-point coefficients for YUV -> RGB. Each is the floating-point factor from
// the formulas quoted in the invokers below, scaled by 2^ITUR_BT_601_SHIFT and
// rounded:
//   CY  = 1.164 * 2^20   (luma gain)
//   CUB = 2.018 * 2^20   (U contribution to B)
//   CUG = -0.391 * 2^20  (U contribution to G)
//   CVG = -0.813 * 2^20  (V contribution to G)
//   CVR = 1.596 * 2^20   (V contribution to R)
// Results are shifted right by ITUR_BT_601_SHIFT after adding 1 << (SHIFT - 1)
// for rounding.
const int ITUR_BT_601_CY = 1220542;
const int ITUR_BT_601_CUB = 2116026;
const int ITUR_BT_601_CUG = -409993;
const int ITUR_BT_601_CVG = -852492;
const int ITUR_BT_601_CVR = 1673527;
const int ITUR_BT_601_SHIFT = 20;

// Coefficients for RGB to YUV420p conversion. The name suffix reads
// <source channel><destination channel>, e.g. CRY is the weight of R in Y.
// Numerically each value is a factor scaled by 2^20 (269484 ~ 0.257 * 2^20,
// 528482 ~ 0.504 * 2^20, 102760 ~ 0.098 * 2^20, ...).
// NOTE(review): no ITUR_BT_601_CRV is declared in this list; presumably the
// R -> V weight reuses another constant - confirm at the use site.
const int ITUR_BT_601_CRY =  269484;
const int ITUR_BT_601_CGY =  528482;
const int ITUR_BT_601_CBY =  102760;
const int ITUR_BT_601_CRU = -155188;
const int ITUR_BT_601_CGU = -305135;
const int ITUR_BT_601_CBU =  460324;
const int ITUR_BT_601_CGV = -385875;
const int ITUR_BT_601_CBV = -74448;
6104
6105template<int bIdx, int uIdx>
6106struct YUV420sp2RGB888Invoker : ParallelLoopBody
6107{
6108    Mat* dst;
6109    const uchar* my1, *muv;
6110    int width, stride;
6111
6112    YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
6113        : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
6114
6115    void operator()(const Range& range) const
6116    {
6117        int rangeBegin = range.start * 2;
6118        int rangeEnd = range.end * 2;
6119
6120        //R = 1.164(Y - 16) + 1.596(V - 128)
6121        //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
6122        //B = 1.164(Y - 16)                  + 2.018(U - 128)
6123
6124        //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
6125        //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
6126        //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
6127
6128        const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
6129
6130#ifdef HAVE_TEGRA_OPTIMIZATION
6131        if(tegra::useTegra() && tegra::cvtYUV4202RGB(bIdx, uIdx, 3, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
6132            return;
6133#endif
6134
6135        for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
6136        {
6137            uchar* row1 = dst->ptr<uchar>(j);
6138            uchar* row2 = dst->ptr<uchar>(j + 1);
6139            const uchar* y2 = y1 + stride;
6140
6141            for (int i = 0; i < width; i += 2, row1 += 6, row2 += 6)
6142            {
6143                int u = int(uv[i + 0 + uIdx]) - 128;
6144                int v = int(uv[i + 1 - uIdx]) - 128;
6145
6146                int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6147                int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6148                int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6149
6150                int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
6151                row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6152                row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6153                row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6154
6155                int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
6156                row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6157                row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6158                row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6159
6160                int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
6161                row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
6162                row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
6163                row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
6164
6165                int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
6166                row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
6167                row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
6168                row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
6169            }
6170        }
6171    }
6172};
6173
6174template<int bIdx, int uIdx>
6175struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
6176{
6177    Mat* dst;
6178    const uchar* my1, *muv;
6179    int width, stride;
6180
6181    YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
6182        : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
6183
6184    void operator()(const Range& range) const
6185    {
6186        int rangeBegin = range.start * 2;
6187        int rangeEnd = range.end * 2;
6188
6189        //R = 1.164(Y - 16) + 1.596(V - 128)
6190        //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
6191        //B = 1.164(Y - 16)                  + 2.018(U - 128)
6192
6193        //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
6194        //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
6195        //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
6196
6197        const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
6198
6199#ifdef HAVE_TEGRA_OPTIMIZATION
6200        if(tegra::useTegra() && tegra::cvtYUV4202RGB(bIdx, uIdx, 4, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
6201            return;
6202#endif
6203
6204        for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
6205        {
6206            uchar* row1 = dst->ptr<uchar>(j);
6207            uchar* row2 = dst->ptr<uchar>(j + 1);
6208            const uchar* y2 = y1 + stride;
6209
6210            for (int i = 0; i < width; i += 2, row1 += 8, row2 += 8)
6211            {
6212                int u = int(uv[i + 0 + uIdx]) - 128;
6213                int v = int(uv[i + 1 - uIdx]) - 128;
6214
6215                int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6216                int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6217                int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6218
6219                int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
6220                row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6221                row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6222                row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6223                row1[3]      = uchar(0xff);
6224
6225                int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
6226                row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6227                row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6228                row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6229                row1[7]      = uchar(0xff);
6230
6231                int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
6232                row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
6233                row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
6234                row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
6235                row2[3]      = uchar(0xff);
6236
6237                int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
6238                row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
6239                row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
6240                row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
6241                row2[7]      = uchar(0xff);
6242            }
6243        }
6244    }
6245};
6246
6247template<int bIdx>
6248struct YUV420p2RGB888Invoker : ParallelLoopBody
6249{
6250    Mat* dst;
6251    const uchar* my1, *mu, *mv;
6252    int width, stride;
6253    int ustepIdx, vstepIdx;
6254
6255    YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
6256        : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
6257
6258    void operator()(const Range& range) const
6259    {
6260        const int rangeBegin = range.start * 2;
6261        const int rangeEnd = range.end * 2;
6262
6263        int uvsteps[2] = {width/2, stride - width/2};
6264        int usIdx = ustepIdx, vsIdx = vstepIdx;
6265
6266        const uchar* y1 = my1 + rangeBegin * stride;
6267        const uchar* u1 = mu + (range.start / 2) * stride;
6268        const uchar* v1 = mv + (range.start / 2) * stride;
6269
6270        if(range.start % 2 == 1)
6271        {
6272            u1 += uvsteps[(usIdx++) & 1];
6273            v1 += uvsteps[(vsIdx++) & 1];
6274        }
6275
6276        for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
6277        {
6278            uchar* row1 = dst->ptr<uchar>(j);
6279            uchar* row2 = dst->ptr<uchar>(j + 1);
6280            const uchar* y2 = y1 + stride;
6281
6282            for (int i = 0; i < width / 2; i += 1, row1 += 6, row2 += 6)
6283            {
6284                int u = int(u1[i]) - 128;
6285                int v = int(v1[i]) - 128;
6286
6287                int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6288                int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6289                int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6290
6291                int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
6292                row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6293                row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6294                row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6295
6296                int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
6297                row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6298                row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6299                row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6300
6301                int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
6302                row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
6303                row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
6304                row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
6305
6306                int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
6307                row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
6308                row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
6309                row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
6310            }
6311        }
6312    }
6313};
6314
6315template<int bIdx>
6316struct YUV420p2RGBA8888Invoker : ParallelLoopBody
6317{
6318    Mat* dst;
6319    const uchar* my1, *mu, *mv;
6320    int width, stride;
6321    int ustepIdx, vstepIdx;
6322
6323    YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
6324        : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
6325
6326    void operator()(const Range& range) const
6327    {
6328        int rangeBegin = range.start * 2;
6329        int rangeEnd = range.end * 2;
6330
6331        int uvsteps[2] = {width/2, stride - width/2};
6332        int usIdx = ustepIdx, vsIdx = vstepIdx;
6333
6334        const uchar* y1 = my1 + rangeBegin * stride;
6335        const uchar* u1 = mu + (range.start / 2) * stride;
6336        const uchar* v1 = mv + (range.start / 2) * stride;
6337
6338        if(range.start % 2 == 1)
6339        {
6340            u1 += uvsteps[(usIdx++) & 1];
6341            v1 += uvsteps[(vsIdx++) & 1];
6342        }
6343
6344        for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
6345        {
6346            uchar* row1 = dst->ptr<uchar>(j);
6347            uchar* row2 = dst->ptr<uchar>(j + 1);
6348            const uchar* y2 = y1 + stride;
6349
6350            for (int i = 0; i < width / 2; i += 1, row1 += 8, row2 += 8)
6351            {
6352                int u = int(u1[i]) - 128;
6353                int v = int(v1[i]) - 128;
6354
6355                int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6356                int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6357                int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6358
6359                int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
6360                row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6361                row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6362                row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6363                row1[3]      = uchar(0xff);
6364
6365                int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
6366                row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6367                row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6368                row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6369                row1[7]      = uchar(0xff);
6370
6371                int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
6372                row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
6373                row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
6374                row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
6375                row2[3]      = uchar(0xff);
6376
6377                int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
6378                row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
6379                row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
6380                row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
6381                row2[7]      = uchar(0xff);
6382            }
6383        }
6384    }
6385};
6386
// Threshold compared against _dst.total() by the YUV420 converters below: at or above it
// the conversion goes through parallel_for_, below it the invoker is called directly.
#define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240)
6388
6389template<int bIdx, int uIdx>
6390inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
6391{
6392    YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
6393    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
6394        parallel_for_(Range(0, _dst.rows/2), converter);
6395    else
6396        converter(Range(0, _dst.rows/2));
6397}
6398
6399template<int bIdx, int uIdx>
6400inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
6401{
6402    YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
6403    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
6404        parallel_for_(Range(0, _dst.rows/2), converter);
6405    else
6406        converter(Range(0, _dst.rows/2));
6407}
6408
6409template<int bIdx>
6410inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
6411{
6412    YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
6413    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
6414        parallel_for_(Range(0, _dst.rows/2), converter);
6415    else
6416        converter(Range(0, _dst.rows/2));
6417}
6418
6419template<int bIdx>
6420inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
6421{
6422    YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
6423    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
6424        parallel_for_(Range(0, _dst.rows/2), converter);
6425    else
6426        converter(Range(0, _dst.rows/2));
6427}
6428
6429///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
6430
6431template<int bIdx>
6432struct RGB888toYUV420pInvoker: public ParallelLoopBody
6433{
6434    RGB888toYUV420pInvoker( const Mat& src, Mat* dst, const int uIdx )
6435        : src_(src),
6436          dst_(dst),
6437          uIdx_(uIdx) { }
6438
6439    void operator()(const Range& rowRange) const
6440    {
6441        const int w = src_.cols;
6442        const int h = src_.rows;
6443
6444        const int cn = src_.channels();
6445        for( int i = rowRange.start; i < rowRange.end; i++ )
6446        {
6447            const uchar* row0 = src_.ptr<uchar>(2 * i);
6448            const uchar* row1 = src_.ptr<uchar>(2 * i + 1);
6449
6450            uchar* y = dst_->ptr<uchar>(2*i);
6451            uchar* u = dst_->ptr<uchar>(h + i/2) + (i % 2) * (w/2);
6452            uchar* v = dst_->ptr<uchar>(h + (i + h/2)/2) + ((i + h/2) % 2) * (w/2);
6453            if( uIdx_ == 2 ) std::swap(u, v);
6454
6455            for( int j = 0, k = 0; j < w * cn; j += 2 * cn, k++ )
6456            {
6457                int r00 = row0[2-bIdx + j];      int g00 = row0[1 + j];      int b00 = row0[bIdx + j];
6458                int r01 = row0[2-bIdx + cn + j]; int g01 = row0[1 + cn + j]; int b01 = row0[bIdx + cn + j];
6459                int r10 = row1[2-bIdx + j];      int g10 = row1[1 + j];      int b10 = row1[bIdx + j];
6460                int r11 = row1[2-bIdx + cn + j]; int g11 = row1[1 + cn + j]; int b11 = row1[bIdx + cn + j];
6461
6462                const int shifted16 = (16 << ITUR_BT_601_SHIFT);
6463                const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
6464                int y00 = ITUR_BT_601_CRY * r00 + ITUR_BT_601_CGY * g00 + ITUR_BT_601_CBY * b00 + halfShift + shifted16;
6465                int y01 = ITUR_BT_601_CRY * r01 + ITUR_BT_601_CGY * g01 + ITUR_BT_601_CBY * b01 + halfShift + shifted16;
6466                int y10 = ITUR_BT_601_CRY * r10 + ITUR_BT_601_CGY * g10 + ITUR_BT_601_CBY * b10 + halfShift + shifted16;
6467                int y11 = ITUR_BT_601_CRY * r11 + ITUR_BT_601_CGY * g11 + ITUR_BT_601_CBY * b11 + halfShift + shifted16;
6468
6469                y[2*k + 0]            = saturate_cast<uchar>(y00 >> ITUR_BT_601_SHIFT);
6470                y[2*k + 1]            = saturate_cast<uchar>(y01 >> ITUR_BT_601_SHIFT);
6471                y[2*k + dst_->step + 0] = saturate_cast<uchar>(y10 >> ITUR_BT_601_SHIFT);
6472                y[2*k + dst_->step + 1] = saturate_cast<uchar>(y11 >> ITUR_BT_601_SHIFT);
6473
6474                const int shifted128 = (128 << ITUR_BT_601_SHIFT);
6475                int u00 = ITUR_BT_601_CRU * r00 + ITUR_BT_601_CGU * g00 + ITUR_BT_601_CBU * b00 + halfShift + shifted128;
6476                int v00 = ITUR_BT_601_CBU * r00 + ITUR_BT_601_CGV * g00 + ITUR_BT_601_CBV * b00 + halfShift + shifted128;
6477
6478                u[k] = saturate_cast<uchar>(u00 >> ITUR_BT_601_SHIFT);
6479                v[k] = saturate_cast<uchar>(v00 >> ITUR_BT_601_SHIFT);
6480            }
6481        }
6482    }
6483
6484    static bool isFit( const Mat& src )
6485    {
6486        return (src.total() >= 320*240);
6487    }
6488
6489private:
6490    RGB888toYUV420pInvoker& operator=(const RGB888toYUV420pInvoker&);
6491
6492    const Mat& src_;
6493    Mat* const dst_;
6494    const int uIdx_;
6495};
6496
6497template<int bIdx, int uIdx>
6498static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
6499{
6500    RGB888toYUV420pInvoker<bIdx> colorConverter(src, &dst, uIdx);
6501    if( RGB888toYUV420pInvoker<bIdx>::isFit(src) )
6502        parallel_for_(Range(0, src.rows/2), colorConverter);
6503    else
6504        colorConverter(Range(0, src.rows/2));
6505}
6506
6507///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
6508
6509template<int bIdx, int uIdx, int yIdx>
6510struct YUV422toRGB888Invoker : ParallelLoopBody
6511{
6512    Mat* dst;
6513    const uchar* src;
6514    int width, stride;
6515
6516    YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
6517        : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
6518
6519    void operator()(const Range& range) const
6520    {
6521        int rangeBegin = range.start;
6522        int rangeEnd = range.end;
6523
6524        const int uidx = 1 - yIdx + uIdx * 2;
6525        const int vidx = (2 + uidx) % 4;
6526        const uchar* yuv_src = src + rangeBegin * stride;
6527
6528        for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
6529        {
6530            uchar* row = dst->ptr<uchar>(j);
6531
6532            for (int i = 0; i < 2 * width; i += 4, row += 6)
6533            {
6534                int u = int(yuv_src[i + uidx]) - 128;
6535                int v = int(yuv_src[i + vidx]) - 128;
6536
6537                int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6538                int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6539                int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6540
6541                int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
6542                row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6543                row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6544                row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6545
6546                int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
6547                row[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6548                row[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6549                row[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6550            }
6551        }
6552    }
6553};
6554
6555template<int bIdx, int uIdx, int yIdx>
6556struct YUV422toRGBA8888Invoker : ParallelLoopBody
6557{
6558    Mat* dst;
6559    const uchar* src;
6560    int width, stride;
6561
6562    YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
6563        : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
6564
6565    void operator()(const Range& range) const
6566    {
6567        int rangeBegin = range.start;
6568        int rangeEnd = range.end;
6569
6570        const int uidx = 1 - yIdx + uIdx * 2;
6571        const int vidx = (2 + uidx) % 4;
6572        const uchar* yuv_src = src + rangeBegin * stride;
6573
6574        for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
6575        {
6576            uchar* row = dst->ptr<uchar>(j);
6577
6578            for (int i = 0; i < 2 * width; i += 4, row += 8)
6579            {
6580                int u = int(yuv_src[i + uidx]) - 128;
6581                int v = int(yuv_src[i + vidx]) - 128;
6582
6583                int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6584                int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6585                int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6586
6587                int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
6588                row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6589                row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6590                row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6591                row[3]      = uchar(0xff);
6592
6593                int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
6594                row[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6595                row[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6596                row[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6597                row[7]      = uchar(0xff);
6598            }
6599        }
6600    }
6601};
6602
// Threshold compared against _dst.total() by the YUV422 converters below: at or above it
// the conversion goes through parallel_for_, below it the invoker is called directly.
#define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240)
6604
6605template<int bIdx, int uIdx, int yIdx>
6606inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
6607{
6608    YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
6609    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
6610        parallel_for_(Range(0, _dst.rows), converter);
6611    else
6612        converter(Range(0, _dst.rows));
6613}
6614
6615template<int bIdx, int uIdx, int yIdx>
6616inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
6617{
6618    YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
6619    if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
6620        parallel_for_(Range(0, _dst.rows), converter);
6621    else
6622        converter(Range(0, _dst.rows));
6623}
6624
6625/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
6626
6627template<typename _Tp>
6628struct RGBA2mRGBA
6629{
6630    typedef _Tp channel_type;
6631
6632    void operator()(const _Tp* src, _Tp* dst, int n) const
6633    {
6634        _Tp max_val  = ColorChannel<_Tp>::max();
6635        _Tp half_val = ColorChannel<_Tp>::half();
6636        for( int i = 0; i < n; i++ )
6637        {
6638            _Tp v0 = *src++;
6639            _Tp v1 = *src++;
6640            _Tp v2 = *src++;
6641            _Tp v3 = *src++;
6642
6643            *dst++ = (v0 * v3 + half_val) / max_val;
6644            *dst++ = (v1 * v3 + half_val) / max_val;
6645            *dst++ = (v2 * v3 + half_val) / max_val;
6646            *dst++ = v3;
6647        }
6648    }
6649};
6650
6651
6652template<typename _Tp>
6653struct mRGBA2RGBA
6654{
6655    typedef _Tp channel_type;
6656
6657    void operator()(const _Tp* src, _Tp* dst, int n) const
6658    {
6659        _Tp max_val = ColorChannel<_Tp>::max();
6660        for( int i = 0; i < n; i++ )
6661        {
6662            _Tp v0 = *src++;
6663            _Tp v1 = *src++;
6664            _Tp v2 = *src++;
6665            _Tp v3 = *src++;
6666            _Tp v3_half = v3 / 2;
6667
6668            *dst++ = (v3==0)? 0 : (v0 * max_val + v3_half) / v3;
6669            *dst++ = (v3==0)? 0 : (v1 * max_val + v3_half) / v3;
6670            *dst++ = (v3==0)? 0 : (v2 * max_val + v3_half) / v3;
6671            *dst++ = v3;
6672        }
6673    }
6674};
6675
6676#ifdef HAVE_OPENCL
6677
6678static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
6679{
6680    bool ok = false;
6681    UMat src = _src.getUMat(), dst;
6682    Size sz = src.size(), dstSz = sz;
6683    int scn = src.channels(), depth = src.depth(), bidx, uidx, yidx;
6684    int dims = 2, stripeSize = 1;
6685    ocl::Kernel k;
6686
6687    if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
6688        return false;
6689
6690    ocl::Device dev = ocl::Device::getDefault();
6691    int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
6692    int pxPerWIx = 1;
6693
6694    size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy };
6695    cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
6696                             depth, scn, pxPerWIy);
6697
6698    switch (code)
6699    {
6700    case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
6701    case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
6702    {
6703        CV_Assert(scn == 3 || scn == 4);
6704        dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
6705        bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
6706        k.create("RGB", ocl::imgproc::cvtcolor_oclsrc,
6707                 opts + format("-D dcn=%d -D bidx=0 -D %s", dcn,
6708                        reverse ? "REVERSE" : "ORDER"));
6709        break;
6710    }
6711    case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
6712    case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
6713    {
6714        dcn = code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA || code == COLOR_BGR5652RGBA || code == COLOR_BGR5552RGBA ? 4 : 3;
6715        CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);
6716        bidx = code == COLOR_BGR5652BGR || code == COLOR_BGR5552BGR ||
6717            code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA ? 0 : 2;
6718        int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
6719            code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
6720        k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc,
6721                 opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits));
6722        break;
6723    }
6724    case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
6725    case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
6726    {
6727        CV_Assert((scn == 3 || scn == 4) && depth == CV_8U );
6728        bidx = code == COLOR_BGR2BGR565 || code == COLOR_BGR2BGR555 ||
6729            code == COLOR_BGRA2BGR565 || code == COLOR_BGRA2BGR555 ? 0 : 2;
6730        int greenbits = code == COLOR_BGR2BGR565 || code == COLOR_RGB2BGR565 ||
6731            code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
6732        dcn = 2;
6733        k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc,
6734                 opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits));
6735        break;
6736    }
6737    case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
6738    {
6739        CV_Assert(scn == 2 && depth == CV_8U);
6740        dcn = 1;
6741        int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
6742        k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc,
6743                 opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits));
6744        break;
6745    }
6746    case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
6747    {
6748        CV_Assert(scn == 1 && depth == CV_8U);
6749        dcn = 2;
6750        int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
6751        k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc,
6752                 opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits));
6753        break;
6754    }
6755    case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
6756    case COLOR_RGB2GRAY: case COLOR_RGBA2GRAY:
6757    {
6758        CV_Assert(scn == 3 || scn == 4);
6759        bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
6760        dcn = 1;
6761        k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
6762                 opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
6763                               bidx, stripeSize));
6764        globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
6765        break;
6766    }
6767    case COLOR_GRAY2BGR:
6768    case COLOR_GRAY2BGRA:
6769    {
6770        CV_Assert(scn == 1);
6771        dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
6772        k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
6773                 opts + format("-D bidx=0 -D dcn=%d", dcn));
6774        break;
6775    }
6776    case COLOR_BGR2YUV:
6777    case COLOR_RGB2YUV:
6778    {
6779        CV_Assert(scn == 3 || scn == 4);
6780        bidx = code == COLOR_RGB2YUV ? 0 : 2;
6781        dcn = 3;
6782        k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
6783                 opts + format("-D dcn=3 -D bidx=%d", bidx));
6784        break;
6785    }
6786    case COLOR_YUV2BGR:
6787    case COLOR_YUV2RGB:
6788    {
6789        if(dcn < 0) dcn = 3;
6790        CV_Assert(dcn == 3 || dcn == 4);
6791        bidx = code == COLOR_YUV2RGB ? 0 : 2;
6792        k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
6793                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
6794        break;
6795    }
6796    case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV21:
6797    case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV21:
6798    {
6799        CV_Assert( scn == 1 );
6800        CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6801        dcn  = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ||
6802               code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2RGBA_NV21 ? 4 : 3;
6803        bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ||
6804               code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 0 : 2;
6805        uidx = code == COLOR_YUV2RGBA_NV21 || code == COLOR_YUV2RGB_NV21 ||
6806               code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 1 : 0;
6807
6808        dstSz = Size(sz.width, sz.height * 2 / 3);
6809        globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
6810        k.create("YUV2RGB_NVx", ocl::imgproc::cvtcolor_oclsrc,
6811                 opts + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx));
6812        break;
6813    }
6814    case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
6815    case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
6816    {
6817        CV_Assert( scn == 1 );
6818        CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6819        dcn  = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2RGBA_YV12 ||
6820               code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2RGBA_IYUV ? 4 : 3;
6821        bidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
6822               code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2BGR_IYUV ? 0 : 2;
6823        uidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
6824               code == COLOR_YUV2RGBA_YV12 || code == COLOR_YUV2RGB_YV12 ? 1 : 0;
6825
6826        dstSz = Size(sz.width, sz.height * 2 / 3);
6827        globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
6828        k.create("YUV2RGB_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
6829                 opts + format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
6830                 src.isContinuous() ? " -D SRC_CONT" : ""));
6831        break;
6832    }
6833    case COLOR_YUV2GRAY_420:
6834    {
6835        if (dcn <= 0) dcn = 1;
6836
6837        CV_Assert( dcn == 1 );
6838        CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6839
6840        dstSz = Size(sz.width, sz.height * 2 / 3);
6841        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6842        dst = _dst.getUMat();
6843
6844        src.rowRange(0, dstSz.height).copyTo(dst);
6845        return true;
6846    }
6847    case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
6848    case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
6849    {
6850        if (dcn <= 0) dcn = 1;
6851        bidx = code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ||
6852               code == COLOR_BGRA2YUV_IYUV || code == COLOR_BGR2YUV_IYUV ? 0 : 2;
6853        uidx = code == COLOR_RGBA2YUV_YV12 || code == COLOR_RGB2YUV_YV12 ||
6854               code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ? 1 : 0;
6855
6856        CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
6857        CV_Assert( dcn == 1 );
6858        CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
6859
6860        dstSz = Size(sz.width, sz.height / 2 * 3);
6861        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6862        dst = _dst.getUMat();
6863
6864        if (dev.isIntel() && src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
6865            dst.step % 4 == 0 && dst.offset % 4 == 0)
6866        {
6867            pxPerWIx = 2;
6868        }
6869        globalsize[0] = dstSz.width / (2 * pxPerWIx); globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy;
6870
6871        k.create("RGB2YUV_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
6872                 opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D PIX_PER_WI_X=%d", dcn, bidx, uidx, pxPerWIx));
6873        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
6874        return k.run(2, globalsize, NULL, false);
6875    }
6876    case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
6877    case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
6878    case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
6879    {
6880        if (dcn <= 0)
6881            dcn = (code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2RGBA_YUY2 ||
6882                   code==COLOR_YUV2BGRA_YUY2 || code==COLOR_YUV2RGBA_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 4 : 3;
6883
6884        bidx = (code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2BGRA_YUY2 ||
6885                code==COLOR_YUV2BGR_YUY2 || code==COLOR_YUV2BGRA_YVYU || code==COLOR_YUV2BGR_YVYU) ? 0 : 2;
6886        yidx = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
6887        uidx = (code==COLOR_YUV2RGB_YVYU || code==COLOR_YUV2RGBA_YVYU ||
6888                code==COLOR_YUV2BGR_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 2 : 0;
6889        uidx = 1 - yidx + uidx;
6890
6891        CV_Assert( dcn == 3 || dcn == 4 );
6892        CV_Assert( scn == 2 && depth == CV_8U );
6893
6894        k.create("YUV2RGB_422", ocl::imgproc::cvtcolor_oclsrc,
6895                 opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx,
6896                                src.offset % 4 == 0 && src.step % 4 == 0 ? " -D USE_OPTIMIZED_LOAD" : ""));
6897        break;
6898    }
6899    case COLOR_BGR2YCrCb:
6900    case COLOR_RGB2YCrCb:
6901    {
6902        CV_Assert(scn == 3 || scn == 4);
6903        bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
6904        dcn = 3;
6905        k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
6906                 opts + format("-D dcn=3 -D bidx=%d", bidx));
6907        break;
6908    }
6909    case COLOR_YCrCb2BGR:
6910    case COLOR_YCrCb2RGB:
6911    {
6912        if( dcn <= 0 )
6913            dcn = 3;
6914        CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
6915        bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
6916        k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc,
6917                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
6918        break;
6919    }
6920    case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
6921    {
6922        CV_Assert(scn == 3 || scn == 4);
6923        bidx = code == COLOR_BGR2XYZ ? 0 : 2;
6924
6925        UMat c;
6926        if (depth == CV_32F)
6927        {
6928            float coeffs[] =
6929            {
6930                0.412453f, 0.357580f, 0.180423f,
6931                0.212671f, 0.715160f, 0.072169f,
6932                0.019334f, 0.119193f, 0.950227f
6933            };
6934            if (bidx == 0)
6935            {
6936                std::swap(coeffs[0], coeffs[2]);
6937                std::swap(coeffs[3], coeffs[5]);
6938                std::swap(coeffs[6], coeffs[8]);
6939            }
6940            Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
6941        }
6942        else
6943        {
6944            int coeffs[] =
6945            {
6946                1689,    1465,    739,
6947                871,     2929,    296,
6948                79,      488,     3892
6949            };
6950            if (bidx == 0)
6951            {
6952                std::swap(coeffs[0], coeffs[2]);
6953                std::swap(coeffs[3], coeffs[5]);
6954                std::swap(coeffs[6], coeffs[8]);
6955            }
6956            Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
6957        }
6958
6959        _dst.create(dstSz, CV_MAKETYPE(depth, 3));
6960        dst = _dst.getUMat();
6961
6962        k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
6963                 opts + format("-D dcn=3 -D bidx=%d", bidx));
6964        if (k.empty())
6965            return false;
6966        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
6967        return k.run(2, globalsize, 0, false);
6968    }
6969    case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
6970    {
6971        if (dcn <= 0)
6972            dcn = 3;
6973        CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
6974        bidx = code == COLOR_XYZ2BGR ? 0 : 2;
6975
6976        UMat c;
6977        if (depth == CV_32F)
6978        {
6979            float coeffs[] =
6980            {
6981                3.240479f, -1.53715f, -0.498535f,
6982                -0.969256f, 1.875991f, 0.041556f,
6983                0.055648f, -0.204043f, 1.057311f
6984            };
6985            if (bidx == 0)
6986            {
6987                std::swap(coeffs[0], coeffs[6]);
6988                std::swap(coeffs[1], coeffs[7]);
6989                std::swap(coeffs[2], coeffs[8]);
6990            }
6991            Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
6992        }
6993        else
6994        {
6995            int coeffs[] =
6996            {
6997                13273,  -6296,  -2042,
6998                -3970,   7684,    170,
6999                  228,   -836,   4331
7000            };
7001            if (bidx == 0)
7002            {
7003                std::swap(coeffs[0], coeffs[6]);
7004                std::swap(coeffs[1], coeffs[7]);
7005                std::swap(coeffs[2], coeffs[8]);
7006            }
7007            Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
7008        }
7009
7010        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
7011        dst = _dst.getUMat();
7012
7013        k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
7014                 opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
7015        if (k.empty())
7016            return false;
7017        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
7018        return k.run(2, globalsize, 0, false);
7019    }
7020    case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
7021    case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
7022    {
7023        CV_Assert((scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F));
7024        bidx = code == COLOR_BGR2HSV || code == COLOR_BGR2HLS ||
7025            code == COLOR_BGR2HSV_FULL || code == COLOR_BGR2HLS_FULL ? 0 : 2;
7026        int hrange = depth == CV_32F ? 360 : code == COLOR_BGR2HSV || code == COLOR_RGB2HSV ||
7027            code == COLOR_BGR2HLS || code == COLOR_RGB2HLS ? 180 : 256;
7028        bool is_hsv = code == COLOR_BGR2HSV || code == COLOR_RGB2HSV || code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
7029        String kernelName = String("RGB2") + (is_hsv ? "HSV" : "HLS");
7030        dcn = 3;
7031
7032        if (is_hsv && depth == CV_8U)
7033        {
7034            static UMat sdiv_data;
7035            static UMat hdiv_data180;
7036            static UMat hdiv_data256;
7037            static int sdiv_table[256];
7038            static int hdiv_table180[256];
7039            static int hdiv_table256[256];
7040            static volatile bool initialized180 = false, initialized256 = false;
7041            volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;
7042
7043            if (!initialized)
7044            {
7045                int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
7046                UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;
7047
7048                sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
7049
7050                int v = 255 << hsv_shift;
7051                if (!initialized180 && !initialized256)
7052                {
7053                    for(int i = 1; i < 256; i++ )
7054                        sdiv_table[i] = saturate_cast<int>(v/(1.*i));
7055                    Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data);
7056                }
7057
7058                v = hrange << hsv_shift;
7059                for (int i = 1; i < 256; i++ )
7060                    hdiv_table[i] = saturate_cast<int>(v/(6.*i));
7061
7062                Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data);
7063                initialized = true;
7064            }
7065
7066            _dst.create(dstSz, CV_8UC3);
7067            dst = _dst.getUMat();
7068
7069            k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc,
7070                     opts + format("-D hrange=%d -D bidx=%d -D dcn=3",
7071                                   hrange, bidx));
7072            if (k.empty())
7073                return false;
7074
7075            k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
7076                   ocl::KernelArg::PtrReadOnly(sdiv_data), hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
7077                                                                       ocl::KernelArg::PtrReadOnly(hdiv_data180));
7078
7079            return k.run(2, globalsize, NULL, false);
7080        }
7081        else
7082            k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
7083                     opts + format("-D hscale=%ff -D bidx=%d -D dcn=3",
7084                                   hrange*(1.f/360.f), bidx));
7085        break;
7086    }
7087    case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
7088    case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
7089    {
7090        if (dcn <= 0)
7091            dcn = 3;
7092        CV_Assert(scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
7093        bidx = code == COLOR_HSV2BGR || code == COLOR_HLS2BGR ||
7094            code == COLOR_HSV2BGR_FULL || code == COLOR_HLS2BGR_FULL ? 0 : 2;
7095        int hrange = depth == CV_32F ? 360 : code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
7096            code == COLOR_HLS2BGR || code == COLOR_HLS2RGB ? 180 : 255;
7097        bool is_hsv = code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
7098                code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL;
7099
7100        String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
7101        k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
7102                 opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
7103                               dcn, bidx, hrange, 6.f/hrange));
7104        break;
7105    }
7106    case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
7107    {
7108        CV_Assert(scn == 4 && depth == CV_8U);
7109        dcn = 4;
7110
7111        k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc,
7112                 opts + "-D dcn=4 -D bidx=3");
7113        break;
7114    }
7115    case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
7116    case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
7117    {
7118        CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
7119
7120        bidx = code == CV_BGR2Lab || code == CV_LBGR2Lab || code == CV_BGR2Luv || code == CV_LBGR2Luv ? 0 : 2;
7121        bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_RGB2Luv || code == CV_BGR2Luv;
7122        bool lab = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_LBGR2Lab || code == CV_LRGB2Lab;
7123        float un, vn;
7124        dcn = 3;
7125
7126        k.create(format("BGR2%s", lab ? "Lab" : "Luv").c_str(),
7127                 ocl::imgproc::cvtcolor_oclsrc,
7128                 opts + format("-D dcn=%d -D bidx=%d%s",
7129                               dcn, bidx, srgb ? " -D SRGB" : ""));
7130        if (k.empty())
7131            return false;
7132
7133        initLabTabs();
7134
7135        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
7136        dst = _dst.getUMat();
7137
7138        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
7139                dstarg = ocl::KernelArg::WriteOnly(dst);
7140
7141        if (depth == CV_8U && lab)
7142        {
7143            static UMat usRGBGammaTab, ulinearGammaTab, uLabCbrtTab, ucoeffs;
7144
7145            if (srgb && usRGBGammaTab.empty())
7146                Mat(1, 256, CV_16UC1, sRGBGammaTab_b).copyTo(usRGBGammaTab);
7147            else if (ulinearGammaTab.empty())
7148                Mat(1, 256, CV_16UC1, linearGammaTab_b).copyTo(ulinearGammaTab);
7149            if (uLabCbrtTab.empty())
7150                Mat(1, LAB_CBRT_TAB_SIZE_B, CV_16UC1, LabCbrtTab_b).copyTo(uLabCbrtTab);
7151
7152            {
7153                int coeffs[9];
7154                const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
7155                const float scale[] =
7156                {
7157                    (1 << lab_shift)/_whitept[0],
7158                    (float)(1 << lab_shift),
7159                    (1 << lab_shift)/_whitept[2]
7160                };
7161
7162                for (int i = 0; i < 3; i++ )
7163                {
7164                    coeffs[i*3+(bidx^2)] = cvRound(_coeffs[i*3]*scale[i]);
7165                    coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
7166                    coeffs[i*3+bidx] = cvRound(_coeffs[i*3+2]*scale[i]);
7167
7168                    CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
7169                              coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
7170                }
7171                Mat(1, 9, CV_32SC1, coeffs).copyTo(ucoeffs);
7172            }
7173
7174            const int Lscale = (116*255+50)/100;
7175            const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
7176
7177            k.args(srcarg, dstarg,
7178                   ocl::KernelArg::PtrReadOnly(srgb ? usRGBGammaTab : ulinearGammaTab),
7179                   ocl::KernelArg::PtrReadOnly(uLabCbrtTab), ocl::KernelArg::PtrReadOnly(ucoeffs),
7180                   Lscale, Lshift);
7181        }
7182        else
7183        {
7184            static UMat usRGBGammaTab, ucoeffs, uLabCbrtTab;
7185
7186            if (srgb && usRGBGammaTab.empty())
7187                Mat(1, GAMMA_TAB_SIZE * 4, CV_32FC1, sRGBGammaTab).copyTo(usRGBGammaTab);
7188            if (!lab && uLabCbrtTab.empty())
7189                Mat(1, LAB_CBRT_TAB_SIZE * 4, CV_32FC1, LabCbrtTab).copyTo(uLabCbrtTab);
7190
7191            {
7192                float coeffs[9];
7193                const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
7194                float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
7195
7196                for (int i = 0; i < 3; i++)
7197                {
7198                    int j = i * 3;
7199                    coeffs[j + (bidx ^ 2)] = _coeffs[j] * (lab ? scale[i] : 1);
7200                    coeffs[j + 1] = _coeffs[j + 1] * (lab ? scale[i] : 1);
7201                    coeffs[j + bidx] = _coeffs[j + 2] * (lab ? scale[i] : 1);
7202
7203                    CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
7204                               coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*(lab ? LabCbrtTabScale : 1) );
7205                }
7206
7207                float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
7208                un = 13*4*_whitept[0]*d;
7209                vn = 13*9*_whitept[1]*d;
7210
7211                Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
7212            }
7213
7214            float _1_3 = 1.0f / 3.0f, _a = 16.0f / 116.0f;
7215            ocl::KernelArg ucoeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
7216
7217            if (lab)
7218            {
7219                if (srgb)
7220                    k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
7221                           ucoeffsarg, _1_3, _a);
7222                else
7223                    k.args(srcarg, dstarg, ucoeffsarg, _1_3, _a);
7224            }
7225            else
7226            {
7227                ocl::KernelArg LabCbrtTabarg = ocl::KernelArg::PtrReadOnly(uLabCbrtTab);
7228                if (srgb)
7229                    k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
7230                           LabCbrtTabarg, ucoeffsarg, un, vn);
7231                else
7232                    k.args(srcarg, dstarg, LabCbrtTabarg, ucoeffsarg, un, vn);
7233            }
7234        }
7235
7236        return k.run(dims, globalsize, NULL, false);
7237    }
7238    case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
7239    case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
7240    {
7241        if( dcn <= 0 )
7242            dcn = 3;
7243        CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
7244
7245        bidx = code == CV_Lab2BGR || code == CV_Lab2LBGR || code == CV_Luv2BGR || code == CV_Luv2LBGR ? 0 : 2;
7246        bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Luv2BGR || code == CV_Luv2RGB;
7247        bool lab = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Lab2LBGR || code == CV_Lab2LRGB;
7248        float un, vn;
7249
7250        k.create(format("%s2BGR", lab ? "Lab" : "Luv").c_str(),
7251                 ocl::imgproc::cvtcolor_oclsrc,
7252                 opts + format("-D dcn=%d -D bidx=%d%s",
7253                               dcn, bidx, srgb ? " -D SRGB" : ""));
7254        if (k.empty())
7255            return false;
7256
7257        initLabTabs();
7258        static UMat ucoeffs, usRGBInvGammaTab;
7259
7260        if (srgb && usRGBInvGammaTab.empty())
7261            Mat(1, GAMMA_TAB_SIZE*4, CV_32FC1, sRGBInvGammaTab).copyTo(usRGBInvGammaTab);
7262
7263        {
7264            float coeffs[9];
7265            const float * const _coeffs = XYZ2sRGB_D65, * const _whitept = D65;
7266
7267            for( int i = 0; i < 3; i++ )
7268            {
7269                coeffs[i+(bidx^2)*3] = _coeffs[i] * (lab ? _whitept[i] : 1);
7270                coeffs[i+3] = _coeffs[i+3] * (lab ? _whitept[i] : 1);
7271                coeffs[i+bidx*3] = _coeffs[i+6] * (lab ? _whitept[i] : 1);
7272            }
7273
7274            float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
7275            un = 4*_whitept[0]*d;
7276            vn = 9*_whitept[1]*d;
7277
7278            Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
7279        }
7280
7281        _dst.create(sz, CV_MAKETYPE(depth, dcn));
7282        dst = _dst.getUMat();
7283
7284        float lThresh = 0.008856f * 903.3f;
7285        float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
7286
7287        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
7288                dstarg = ocl::KernelArg::WriteOnly(dst),
7289                coeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
7290
7291        if (lab)
7292        {
7293            if (srgb)
7294                k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
7295                       coeffsarg, lThresh, fThresh);
7296            else
7297                k.args(srcarg, dstarg, coeffsarg, lThresh, fThresh);
7298        }
7299        else
7300        {
7301            if (srgb)
7302                k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
7303                       coeffsarg, un, vn);
7304            else
7305                k.args(srcarg, dstarg, coeffsarg, un, vn);
7306        }
7307
7308        return k.run(dims, globalsize, NULL, false);
7309    }
7310    default:
7311        break;
7312    }
7313
7314    if( !k.empty() )
7315    {
7316        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
7317        dst = _dst.getUMat();
7318        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
7319        ok = k.run(dims, globalsize, NULL, false);
7320    }
7321    return ok;
7322}
7323
7324#endif
7325
7326}//namespace cv
7327
7328//////////////////////////////////////////////////////////////////////////////////////////
7329//                                   The main function                                  //
7330//////////////////////////////////////////////////////////////////////////////////////////
7331
7332void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
7333{
7334    int stype = _src.type();
7335    int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
7336
7337    CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat() && !(depth == CV_8U && (code == CV_Luv2BGR || code == CV_Luv2RGB)),
7338                ocl_cvtColor(_src, _dst, code, dcn) )
7339
7340    Mat src = _src.getMat(), dst;
7341    Size sz = src.size();
7342
7343    CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );
7344
7345    switch( code )
7346    {
7347        case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
7348        case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
7349            CV_Assert( scn == 3 || scn == 4 );
7350            dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;
7351            bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;
7352
7353            _dst.create( sz, CV_MAKETYPE(depth, dcn));
7354            dst = _dst.getMat();
7355
7356#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7357            CV_IPP_CHECK()
7358            {
7359                if( code == CV_BGR2BGRA)
7360                {
7361                    if ( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
7362                    {
7363                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7364                        return;
7365                    }
7366                    setIppErrorStatus();
7367                }
7368                else if( code == CV_BGRA2BGR )
7369                {
7370                    if ( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
7371                    {
7372                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7373                        return;
7374                    }
7375                    setIppErrorStatus();
7376                }
7377                else if( code == CV_BGR2RGBA )
7378                {
7379                    if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
7380                    {
7381                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7382                        return;
7383                    }
7384                    setIppErrorStatus();
7385                }
7386                else if( code == CV_RGBA2BGR )
7387                {
7388                    if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
7389                    {
7390                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7391                        return;
7392                    }
7393                    setIppErrorStatus();
7394                }
7395                else if( code == CV_RGB2BGR )
7396                {
7397                    if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
7398                    {
7399                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7400                        return;
7401                    }
7402                    setIppErrorStatus();
7403                }
7404#if IPP_VERSION_X100 >= 801
7405                else if( code == CV_RGBA2BGRA )
7406                {
7407                    if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
7408                    {
7409                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7410                        return;
7411                    }
7412                    setIppErrorStatus();
7413                }
7414#endif
7415            }
7416#endif
7417
7418            if( depth == CV_8U )
7419            {
7420#ifdef HAVE_TEGRA_OPTIMIZATION
7421                if(tegra::useTegra() && tegra::cvtBGR2RGB(src, dst, bidx))
7422                    break;
7423#endif
7424                CvtColorLoop(src, dst, RGB2RGB<uchar>(scn, dcn, bidx));
7425            }
7426            else if( depth == CV_16U )
7427                CvtColorLoop(src, dst, RGB2RGB<ushort>(scn, dcn, bidx));
7428            else
7429                CvtColorLoop(src, dst, RGB2RGB<float>(scn, dcn, bidx));
7430            break;
7431
7432        case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
7433        case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
7434            CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
7435            _dst.create(sz, CV_8UC2);
7436            dst = _dst.getMat();
7437
7438#if defined(HAVE_IPP) && 0 // breaks OCL accuracy tests
7439            CV_IPP_CHECK()
7440            {
7441                CV_SUPPRESS_DEPRECATED_START
7442
7443                if (code == CV_BGR2BGR565 && scn == 3)
7444                {
7445                    if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R)))
7446                    {
7447                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7448                        return;
7449                    }
7450                    setIppErrorStatus();
7451                }
7452                else if (code == CV_BGRA2BGR565 && scn == 4)
7453                {
7454                    if (CvtColorIPPLoopCopy(src, dst,
7455                                            IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
7456                                            (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 0, 1, 2, depth)))
7457                    {
7458                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7459                        return;
7460                    }
7461                    setIppErrorStatus();
7462                }
7463                else if (code == CV_RGB2BGR565 && scn == 3)
7464                {
7465                    if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
7466                                                                               (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
7467                    {
7468                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7469                        return;
7470                    }
7471                    setIppErrorStatus();
7472                }
7473                else if (code == CV_RGBA2BGR565 && scn == 4)
7474                {
7475                    if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
7476                                                                               (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
7477                    {
7478                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7479                        return;
7480                    }
7481                    setIppErrorStatus();
7482                }
7483                CV_SUPPRESS_DEPRECATED_END
7484            }
7485#endif
7486
7487#ifdef HAVE_TEGRA_OPTIMIZATION
7488            if(code == CV_BGR2BGR565 || code == CV_BGRA2BGR565 || code == CV_RGB2BGR565  || code == CV_RGBA2BGR565)
7489                if(tegra::useTegra() && tegra::cvtRGB2RGB565(src, dst, code == CV_RGB2BGR565 || code == CV_RGBA2BGR565 ? 0 : 2))
7490                    break;
7491#endif
7492
7493            CvtColorLoop(src, dst, RGB2RGB5x5(scn,
7494                      code == CV_BGR2BGR565 || code == CV_BGR2BGR555 ||
7495                      code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2,
7496                      code == CV_BGR2BGR565 || code == CV_RGB2BGR565 ||
7497                      code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5 // green bits
7498                                              ));
7499            break;
7500
7501        case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
7502        case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
7503            if(dcn <= 0) dcn = (code==CV_BGR5652BGRA || code==CV_BGR5552BGRA || code==CV_BGR5652RGBA || code==CV_BGR5552RGBA) ? 4 : 3;
7504            CV_Assert( (dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U );
7505            _dst.create(sz, CV_MAKETYPE(depth, dcn));
7506            dst = _dst.getMat();
7507
7508#ifdef HAVE_IPP
7509            CV_IPP_CHECK()
7510            {
7511                CV_SUPPRESS_DEPRECATED_START
7512                if (code == CV_BGR5652BGR && dcn == 3)
7513                {
7514                    if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R)))
7515                    {
7516                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7517                        return;
7518                    }
7519                    setIppErrorStatus();
7520                }
7521                else if (code == CV_BGR5652RGB && dcn == 3)
7522                {
7523                    if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
7524                                                                           ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
7525                    {
7526                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7527                        return;
7528                    }
7529                    setIppErrorStatus();
7530                }
7531                else if (code == CV_BGR5652BGRA && dcn == 4)
7532                {
7533                    if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
7534                                                                           ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
7535                    {
7536                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7537                        return;
7538                    }
7539                    setIppErrorStatus();
7540                }
7541                else if (code == CV_BGR5652RGBA && dcn == 4)
7542                {
7543                    if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
7544                                                                           ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
7545                    {
7546                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7547                        return;
7548                    }
7549                    setIppErrorStatus();
7550                }
7551                CV_SUPPRESS_DEPRECATED_END
7552            }
7553#endif
7554
7555            CvtColorLoop(src, dst, RGB5x52RGB(dcn,
7556                      code == CV_BGR5652BGR || code == CV_BGR5552BGR ||
7557                      code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2, // blue idx
7558                      code == CV_BGR5652BGR || code == CV_BGR5652RGB ||
7559                      code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5 // green bits
7560                      ));
7561            break;
7562
7563        case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
7564            CV_Assert( scn == 3 || scn == 4 );
7565            _dst.create(sz, CV_MAKETYPE(depth, 1));
7566            dst = _dst.getMat();
7567
7568#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7569            CV_IPP_CHECK()
7570            {
7571                if( code == CV_BGR2GRAY && depth == CV_32F )
7572                {
7573                    if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) )
7574                    {
7575                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7576                        return;
7577                    }
7578                    setIppErrorStatus();
7579                }
7580                else if( code == CV_RGB2GRAY && depth == CV_32F )
7581                {
7582                    if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) )
7583                    {
7584                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7585                        return;
7586                    }
7587                    setIppErrorStatus();
7588                }
7589                else if( code == CV_BGRA2GRAY && depth == CV_32F )
7590                {
7591                    if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) )
7592                    {
7593                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7594                        return;
7595                    }
7596                    setIppErrorStatus();
7597                }
7598                else if( code == CV_RGBA2GRAY && depth == CV_32F )
7599                {
7600                    if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) )
7601                    {
7602                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7603                        return;
7604                    }
7605                    setIppErrorStatus();
7606                }
7607            }
7608#endif
7609
7610            bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
7611
7612            if( depth == CV_8U )
7613            {
7614#ifdef HAVE_TEGRA_OPTIMIZATION
7615                if(tegra::useTegra() && tegra::cvtRGB2Gray(src, dst, bidx))
7616                    break;
7617#endif
7618                CvtColorLoop(src, dst, RGB2Gray<uchar>(scn, bidx, 0));
7619            }
7620            else if( depth == CV_16U )
7621                CvtColorLoop(src, dst, RGB2Gray<ushort>(scn, bidx, 0));
7622            else
7623                CvtColorLoop(src, dst, RGB2Gray<float>(scn, bidx, 0));
7624            break;
7625
7626        case CV_BGR5652GRAY: case CV_BGR5552GRAY:
7627            CV_Assert( scn == 2 && depth == CV_8U );
7628            _dst.create(sz, CV_8UC1);
7629            dst = _dst.getMat();
7630
7631            CvtColorLoop(src, dst, RGB5x52Gray(code == CV_BGR5652GRAY ? 6 : 5));
7632            break;
7633
7634        case CV_GRAY2BGR: case CV_GRAY2BGRA:
            // Gray -> BGR / BGRA.
            // When the caller leaves dcn unset (<= 0) it defaults to 4 for
            // CV_GRAY2BGRA and 3 otherwise. The source must be single-channel;
            // the destination keeps the source depth and has dcn channels.
7635            if( dcn <= 0 ) dcn = (code==CV_GRAY2BGRA) ? 4 : 3;
7636            CV_Assert( scn == 1 && (dcn == 3 || dcn == 4));
7637            _dst.create(sz, CV_MAKETYPE(depth, dcn));
7638            dst = _dst.getMat();
7639
            // Optional IPP path, compiled only with HAVE_IPP and IPP major version >= 7.
            // If the IPP loop reports success the function returns right away;
            // otherwise setIppErrorStatus() is called and control falls through
            // to the generic implementation below.
7640#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7641            CV_IPP_CHECK()
7642            {
7643                if( code == CV_GRAY2BGR )
7644                {
7645                    if( CvtColorIPPLoop(src, dst, IPPGray2BGRFunctor(ippiCopyP3C3RTab[depth])) )
7646                    {
7647                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7648                        return;
7649                    }
7650                    setIppErrorStatus();
7651                }
7652                else if( code == CV_GRAY2BGRA )
7653                {
7654                    if( CvtColorIPPLoop(src, dst, IPPGray2BGRAFunctor(ippiCopyP3C3RTab[depth], ippiSwapChannelsC3C4RTab[depth], depth)) )
7655                    {
7656                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7657                        return;
7658                    }
7659                    setIppErrorStatus();
7660                }
7661            }
7662#endif
7663
7664
            // Generic path: select the Gray2RGB functor by depth. Any depth other
            // than CV_8U / CV_16U is handled by the float instantiation.
7665            if( depth == CV_8U )
7666            {
7667#ifdef HAVE_TEGRA_OPTIMIZATION
                // Tegra build only: if the Tegra routine reports success, leave the switch.
7668                if(tegra::useTegra() && tegra::cvtGray2RGB(src, dst))
7669                    break;
7670#endif
7671                CvtColorLoop(src, dst, Gray2RGB<uchar>(dcn));
7672            }
7673            else if( depth == CV_16U )
7674                CvtColorLoop(src, dst, Gray2RGB<ushort>(dcn));
7675            else
7676                CvtColorLoop(src, dst, Gray2RGB<float>(dcn));
7677            break;
7678
7679        case CV_GRAY2BGR565: case CV_GRAY2BGR555:
            // Gray -> BGR565 / BGR555.
            // Requires a single-channel CV_8U source; the destination is always CV_8UC2.
7680            CV_Assert( scn == 1 && depth == CV_8U );
7681            _dst.create(sz, CV_8UC2);
7682            dst = _dst.getMat();
7683
            // The functor receives 6 for the 565 code and 5 for the 555 code
            // (presumably the green-channel bit width -- confirm in Gray2RGB5x5).
7684            CvtColorLoop(src, dst, Gray2RGB5x5(code == CV_GRAY2BGR565 ? 6 : 5));
7685            break;
7686
7687        case CV_BGR2YCrCb: case CV_RGB2YCrCb:
7688        case CV_BGR2YUV: case CV_RGB2YUV:
7689            {
            // BGR/RGB -> YCrCb or YUV. Accepts 3- or 4-channel input; the
            // destination always has 3 channels of the source depth.
7690            CV_Assert( scn == 3 || scn == 4 );
            // bidx is 0 for the BGR-ordered codes and 2 for the RGB-ordered codes.
7691            bidx = code == CV_BGR2YCrCb || code == CV_BGR2YUV ? 0 : 2;
            // Coefficient tables used only by the YUV codes. The YCrCb codes pass a
            // null pointer instead (presumably selecting the functor's built-in
            // coefficients -- confirm in RGB2YCrCb_i / RGB2YCrCb_f).
7692            static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
7693            static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
7694            const float* coeffs_f = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_f;
7695            const int* coeffs_i = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_i;
7696
7697            _dst.create(sz, CV_MAKETYPE(depth, 3));
7698            dst = _dst.getMat();
7699
            // The "&& 0" in the condition below means this IPP section is never compiled.
7700#if defined HAVE_IPP && 0
7701            CV_IPP_CHECK()
7702            {
7703                if (code == CV_RGB2YUV && scn == 3 && depth == CV_8U)
7704                {
7705                    if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R)))
7706                    {
7707                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7708                        return;
7709                    }
7710                    setIppErrorStatus();
7711                }
7712                else if (code == CV_BGR2YUV && scn == 3 && depth == CV_8U)
7713                {
7714                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
7715                                                                           (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
7716                    {
7717                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7718                        return;
7719                    }
7720                    setIppErrorStatus();
7721                }
7722                else if (code == CV_RGB2YUV && scn == 4 && depth == CV_8U)
7723                {
7724                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
7725                                                                           (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth)))
7726                    {
7727                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7728                        return;
7729                    }
7730                    setIppErrorStatus();
7731                }
7732                else if (code == CV_BGR2YUV && scn == 4 && depth == CV_8U)
7733                {
7734                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
7735                                                                           (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
7736                    {
7737                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7738                        return;
7739                    }
7740                    setIppErrorStatus();
7741                }
7742            }
7743#endif
7744
            // Generic path: CV_8U and CV_16U use the integer functor with coeffs_i;
            // every other depth uses the float functor with coeffs_f.
7745            if( depth == CV_8U )
7746            {
7747#ifdef HAVE_TEGRA_OPTIMIZATION
                // Tegra build only, and only for the YCrCb codes: leave the switch
                // when the Tegra routine reports success.
7748                if((code == CV_RGB2YCrCb || code == CV_BGR2YCrCb) && tegra::useTegra() && tegra::cvtRGB2YCrCb(src, dst, bidx))
7749                    break;
7750#endif
7751                CvtColorLoop(src, dst, RGB2YCrCb_i<uchar>(scn, bidx, coeffs_i));
7752            }
7753            else if( depth == CV_16U )
7754                CvtColorLoop(src, dst, RGB2YCrCb_i<ushort>(scn, bidx, coeffs_i));
7755            else
7756                CvtColorLoop(src, dst, RGB2YCrCb_f<float>(scn, bidx, coeffs_f));
7757            }
7758            break;
7759
7760        case CV_YCrCb2BGR: case CV_YCrCb2RGB:
7761        case CV_YUV2BGR: case CV_YUV2RGB:
7762            {
            // YCrCb or YUV -> BGR/RGB. The source must have 3 channels; the
            // destination has dcn channels (3 by default, 4 allowed) of the source depth.
7763            if( dcn <= 0 ) dcn = 3;
7764            CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
            // bidx is 0 for the BGR-ordered codes and 2 for the RGB-ordered codes.
7765            bidx = code == CV_YCrCb2BGR || code == CV_YUV2BGR ? 0 : 2;
            // Coefficient tables used only by the YUV codes. The YCrCb codes pass a
            // null pointer instead (presumably selecting the functor's built-in
            // coefficients -- confirm in YCrCb2RGB_i / YCrCb2RGB_f).
7766            static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
7767            static const int yuv_i[] = { 33292, -6472, -9519, 18678 };
7768            const float* coeffs_f = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_f;
7769            const int* coeffs_i = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_i;
7770
7771            _dst.create(sz, CV_MAKETYPE(depth, dcn));
7772            dst = _dst.getMat();
7773
            // The "&& 0" in the condition below means this IPP section is never compiled.
7774#if defined HAVE_IPP && 0
7775            CV_IPP_CHECK()
7776            {
7777                if (code == CV_YUV2RGB && dcn == 3 && depth == CV_8U)
7778                {
7779                    if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R)))
7780                    {
7781                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7782                        return;
7783                    }
7784                    setIppErrorStatus();
7785                }
7786                else if (code == CV_YUV2BGR && dcn == 3 && depth == CV_8U)
7787                {
7788                    if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
7789                                                                           ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
7790                    {
7791                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7792                        return;
7793                    }
7794                    setIppErrorStatus();
7795                }
7796                else if (code == CV_YUV2RGB && dcn == 4 && depth == CV_8U)
7797                {
7798                    if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
7799                                                                           ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
7800                    {
7801                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7802                        return;
7803                    }
7804                    setIppErrorStatus();
7805                }
7806                else if (code == CV_YUV2BGR && dcn == 4 && depth == CV_8U)
7807                {
7808                    if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
7809                                                                           ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
7810                    {
7811                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7812                        return;
7813                    }
7814                    setIppErrorStatus();
7815                }
7816            }
7817#endif
7818
            // Generic path: CV_8U and CV_16U use the integer functor with coeffs_i;
            // every other depth uses the float functor with coeffs_f.
7819            if( depth == CV_8U )
7820                CvtColorLoop(src, dst, YCrCb2RGB_i<uchar>(dcn, bidx, coeffs_i));
7821            else if( depth == CV_16U )
7822                CvtColorLoop(src, dst, YCrCb2RGB_i<ushort>(dcn, bidx, coeffs_i));
7823            else
7824                CvtColorLoop(src, dst, YCrCb2RGB_f<float>(dcn, bidx, coeffs_f));
7825            }
7826            break;
7827
7828        case CV_BGR2XYZ: case CV_RGB2XYZ:
            // BGR/RGB -> XYZ. Accepts 3- or 4-channel input; the destination
            // always has 3 channels of the source depth.
7829            CV_Assert( scn == 3 || scn == 4 );
            // bidx is 0 for CV_BGR2XYZ and 2 for CV_RGB2XYZ.
7830            bidx = code == CV_BGR2XYZ ? 0 : 2;
7831
7832            _dst.create(sz, CV_MAKETYPE(depth, 3));
7833            dst = _dst.getMat();
7834
            // Optional IPP path, compiled only with HAVE_IPP and IPP major version >= 7
            // and attempted only when depth != CV_32F. The four branches cover each
            // (code, scn) combination. On success the function returns; on failure
            // setIppErrorStatus() is called and the generic code below runs.
7835#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7836            CV_IPP_CHECK()
7837            {
7838                if( code == CV_BGR2XYZ && scn == 3 && depth != CV_32F )
7839                {
7840                    if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
7841                    {
7842                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7843                        return;
7844                    }
7845                    setIppErrorStatus();
7846                }
7847                else if( code == CV_BGR2XYZ && scn == 4 && depth != CV_32F )
7848                {
7849                    if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
7850                    {
7851                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7852                        return;
7853                    }
7854                    setIppErrorStatus();
7855                }
7856                else if( code == CV_RGB2XYZ && scn == 3 && depth != CV_32F )
7857                {
7858                    if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2XYZTab[depth])) )
7859                    {
7860                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7861                        return;
7862                    }
7863                    setIppErrorStatus();
7864                }
7865                else if( code == CV_RGB2XYZ && scn == 4 && depth != CV_32F )
7866                {
7867                    if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 0, 1, 2, depth)) )
7868                    {
7869                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7870                        return;
7871                    }
7872                    setIppErrorStatus();
7873                }
7874            }
7875#endif
7876
            // Generic path: CV_8U and CV_16U use the integer functor; every other
            // depth uses the float functor. The trailing 0 is passed as the functor's
            // third argument (presumably a null coefficient pointer -- confirm in RGB2XYZ_*).
7877            if( depth == CV_8U )
7878                CvtColorLoop(src, dst, RGB2XYZ_i<uchar>(scn, bidx, 0));
7879            else if( depth == CV_16U )
7880                CvtColorLoop(src, dst, RGB2XYZ_i<ushort>(scn, bidx, 0));
7881            else
7882                CvtColorLoop(src, dst, RGB2XYZ_f<float>(scn, bidx, 0));
7883            break;
7884
7885        case CV_XYZ2BGR: case CV_XYZ2RGB:
            // XYZ -> BGR/RGB. The source must have 3 channels; the destination has
            // dcn channels (3 by default, 4 allowed) of the source depth.
7886            if( dcn <= 0 ) dcn = 3;
7887            CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
            // bidx is 0 for CV_XYZ2BGR and 2 for CV_XYZ2RGB.
7888            bidx = code == CV_XYZ2BGR ? 0 : 2;
7889
7890            _dst.create(sz, CV_MAKETYPE(depth, dcn));
7891            dst = _dst.getMat();
7892
            // Optional IPP path, compiled only with HAVE_IPP and IPP major version >= 7
            // and attempted only when depth != CV_32F. The four (code, dcn) combinations
            // form one mutually exclusive else-if chain, like the CV_BGR2XYZ case.
            // On success the function returns; on failure setIppErrorStatus() is
            // called and the generic code below runs.
7893#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7894            CV_IPP_CHECK()
7895            {
7896                if( code == CV_XYZ2BGR && dcn == 3 && depth != CV_32F )
7897                {
7898                    if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
7899                    {
7900                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7901                        return;
7902                    }
7903                    setIppErrorStatus();
7904                }
7905                else if( code == CV_XYZ2BGR && dcn == 4 && depth != CV_32F )
7906                {
7907                    if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
7908                    {
7909                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7910                        return;
7911                    }
7912                    setIppErrorStatus();
7913                }
7914                else if( code == CV_XYZ2RGB && dcn == 3 && depth != CV_32F )
7915                {
7916                    if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiXYZ2RGBTab[depth])) )
7917                    {
7918                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7919                        return;
7920                    }
7921                    setIppErrorStatus();
7922                }
7923                else if( code == CV_XYZ2RGB && dcn == 4 && depth != CV_32F )
7924                {
7925                    if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
7926                    {
7927                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7928                        return;
7929                    }
7930                    setIppErrorStatus();
7931                }
7932            }
7933#endif
7934
            // Generic path: CV_8U and CV_16U use the integer functor; every other
            // depth uses the float functor. The trailing 0 is passed as the functor's
            // third argument (presumably a null coefficient pointer -- confirm in XYZ2RGB_*).
7935            if( depth == CV_8U )
7936                CvtColorLoop(src, dst, XYZ2RGB_i<uchar>(dcn, bidx, 0));
7937            else if( depth == CV_16U )
7938                CvtColorLoop(src, dst, XYZ2RGB_i<ushort>(dcn, bidx, 0));
7939            else
7940                CvtColorLoop(src, dst, XYZ2RGB_f<float>(dcn, bidx, 0));
7941            break;
7942
7943        case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
7944        case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
7945            {
            // BGR/RGB -> HSV or HLS. Accepts 3- or 4-channel input of depth CV_8U
            // or CV_32F; the destination always has 3 channels of the source depth.
7946            CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
            // bidx is 0 for the BGR-ordered codes and 2 for the RGB-ordered codes.
7947            bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||
7948                code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;
            // Hue range passed to the functors: 360 for CV_32F; otherwise 180 for
            // the plain codes and 256 for the *_FULL codes.
7949            int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||
7950                code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 256;
7951
7952            _dst.create(sz, CV_MAKETYPE(depth, 3));
7953            dst = _dst.getMat();
7954
            // Optional IPP path, compiled only with HAVE_IPP and IPP major version >= 7.
            // Only *_FULL codes are handled here. On success the function returns; on
            // failure setIppErrorStatus() is called and the generic code below runs.
            // NOTE(review): the assert above admits only CV_8U / CV_32F, so the
            // depth == CV_16U tests in this section look unreachable (assuming
            // CV_Assert rejects the call when its condition fails) -- confirm.
7955#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7956            CV_IPP_CHECK()
7957            {
7958                if( depth == CV_8U || depth == CV_16U )
7959                {
                    // The section between "#if 0" and its "#endif" is compiled out,
                    // including the trailing "else", so the chain effectively starts
                    // at the CV_RGB2HSV_FULL test that follows the "#endif".
7960#if 0 // breaks OCL accuracy tests
7961                    if( code == CV_BGR2HSV_FULL && scn == 3 )
7962                    {
7963                        if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
7964                        {
7965                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7966                            return;
7967                        }
7968                        setIppErrorStatus();
7969                    }
7970                    else if( code == CV_BGR2HSV_FULL && scn == 4 )
7971                    {
7972                        if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
7973                        {
7974                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7975                            return;
7976                        }
7977                        setIppErrorStatus();
7978                    }
7979                    else if( code == CV_RGB2HSV_FULL && scn == 4 )
7980                    {
7981                        if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) )
7982                        {
7983                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7984                            return;
7985                        }
7986                        setIppErrorStatus();
7987                    } else
7988#endif
7989                    if( code == CV_RGB2HSV_FULL && scn == 3 && depth == CV_16U )
7990                    {
7991                        if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HSVTab[depth])) )
7992                        {
7993                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7994                            return;
7995                        }
7996                        setIppErrorStatus();
7997                    }
7998                    else if( code == CV_BGR2HLS_FULL && scn == 3 )
7999                    {
8000                        if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
8001                        {
8002                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8003                            return;
8004                        }
8005                        setIppErrorStatus();
8006                    }
8007                    else if( code == CV_BGR2HLS_FULL && scn == 4 )
8008                    {
8009                        if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
8010                        {
8011                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8012                            return;
8013                        }
8014                        setIppErrorStatus();
8015                    }
8016                    else if( code == CV_RGB2HLS_FULL && scn == 3 )
8017                    {
8018                        if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HLSTab[depth])) )
8019                        {
8020                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8021                            return;
8022                        }
8023                        setIppErrorStatus();
8024                    }
8025                    else if( code == CV_RGB2HLS_FULL && scn == 4 )
8026                    {
8027                        if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) )
8028                        {
8029                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8030                            return;
8031                        }
8032                        setIppErrorStatus();
8033                    }
8034                }
8035            }
8036#endif
8037
            // Generic path: the HSV codes use the RGB2HSV functors, all remaining
            // codes of this case use the RGB2HLS functors. CV_8U selects the "_b"
            // functor; any other depth selects the "_f" functor with hrange as float.
8038            if( code == CV_BGR2HSV || code == CV_RGB2HSV ||
8039                code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL )
8040            {
8041#ifdef HAVE_TEGRA_OPTIMIZATION
                // Tegra build only: leave the switch when the Tegra routine reports success.
8042                if(tegra::useTegra() && tegra::cvtRGB2HSV(src, dst, bidx, hrange))
8043                    break;
8044#endif
8045                if( depth == CV_8U )
8046                    CvtColorLoop(src, dst, RGB2HSV_b(scn, bidx, hrange));
8047                else
8048                    CvtColorLoop(src, dst, RGB2HSV_f(scn, bidx, (float)hrange));
8049            }
8050            else
8051            {
8052                if( depth == CV_8U )
8053                    CvtColorLoop(src, dst, RGB2HLS_b(scn, bidx, hrange));
8054                else
8055                    CvtColorLoop(src, dst, RGB2HLS_f(scn, bidx, (float)hrange));
8056            }
8057            }
8058            break;
8059
8060        case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
8061        case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
8062            {
            // HSV or HLS -> BGR/RGB. The source must have 3 channels of depth CV_8U
            // or CV_32F; the destination has dcn channels (3 by default, 4 allowed).
8063            if( dcn <= 0 ) dcn = 3;
8064            CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
            // bidx is 0 for the BGR-ordered codes and 2 for the RGB-ordered codes.
8065            bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||
8066                code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;
            // Hue range passed to the functors: 360 for CV_32F; otherwise 180 for
            // the plain codes and 255 for the *_FULL codes.
            // NOTE(review): the forward BGR/RGB -> HSV/HLS case in this switch uses
            // 256 for the *_FULL codes -- confirm the 255/256 asymmetry is intended.
8067            int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||
8068                code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;
8069
8070            _dst.create(sz, CV_MAKETYPE(depth, dcn));
8071            dst = _dst.getMat();
8072
            // Optional IPP path, compiled only with HAVE_IPP and IPP major version >= 7.
            // Only *_FULL codes are handled here, one branch per (code, dcn) pair.
            // On success the function returns; on failure setIppErrorStatus() is
            // called and the generic code below runs.
            // NOTE(review): the assert above admits only CV_8U / CV_32F, so the
            // depth == CV_16U alternative below looks unreachable (assuming
            // CV_Assert rejects the call when its condition fails) -- confirm.
8073#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
8074            CV_IPP_CHECK()
8075            {
8076                if( depth == CV_8U || depth == CV_16U )
8077                {
8078                    if( code == CV_HSV2BGR_FULL && dcn == 3 )
8079                    {
8080                        if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
8081                        {
8082                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8083                            return;
8084                        }
8085                        setIppErrorStatus();
8086                    }
8087                    else if( code == CV_HSV2BGR_FULL && dcn == 4 )
8088                    {
8089                        if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
8090                        {
8091                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8092                            return;
8093                        }
8094                        setIppErrorStatus();
8095                    }
8096                    else if( code == CV_HSV2RGB_FULL && dcn == 3 )
8097                    {
8098                        if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHSV2RGBTab[depth])) )
8099                        {
8100                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8101                            return;
8102                        }
8103                        setIppErrorStatus();
8104                    }
8105                    else if( code == CV_HSV2RGB_FULL && dcn == 4 )
8106                    {
8107                        if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
8108                        {
8109                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8110                            return;
8111                        }
8112                        setIppErrorStatus();
8113                    }
8114                    else if( code == CV_HLS2BGR_FULL && dcn == 3 )
8115                    {
8116                        if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
8117                        {
8118                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8119                            return;
8120                        }
8121                        setIppErrorStatus();
8122                    }
8123                    else if( code == CV_HLS2BGR_FULL && dcn == 4 )
8124                    {
8125                        if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
8126                        {
8127                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8128                            return;
8129                        }
8130                        setIppErrorStatus();
8131                    }
8132                    else if( code == CV_HLS2RGB_FULL && dcn == 3 )
8133                    {
8134                        if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHLS2RGBTab[depth])) )
8135                        {
8136                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8137                            return;
8138                        }
8139                        setIppErrorStatus();
8140                    }
8141                    else if( code == CV_HLS2RGB_FULL && dcn == 4 )
8142                    {
8143                        if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
8144                        {
8145                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8146                            return;
8147                        }
8148                        setIppErrorStatus();
8149                    }
8150                }
8151            }
8152#endif
8153
            // Generic path: the HSV codes use the HSV2RGB functors, all remaining
            // codes of this case use the HLS2RGB functors. CV_8U selects the "_b"
            // functor; any other depth selects the "_f" functor with hrange as float.
8154            if( code == CV_HSV2BGR || code == CV_HSV2RGB ||
8155                code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL )
8156            {
8157                if( depth == CV_8U )
8158                    CvtColorLoop(src, dst, HSV2RGB_b(dcn, bidx, hrange));
8159                else
8160                    CvtColorLoop(src, dst, HSV2RGB_f(dcn, bidx, (float)hrange));
8161            }
8162            else
8163            {
8164                if( depth == CV_8U )
8165                    CvtColorLoop(src, dst, HLS2RGB_b(dcn, bidx, hrange));
8166                else
8167                    CvtColorLoop(src, dst, HLS2RGB_f(dcn, bidx, (float)hrange));
8168            }
8169            }
8170            break;
8171
8172        case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
8173        case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
8174            {
            // BGR/RGB -> Lab or Luv. Accepts 3- or 4-channel input of depth CV_8U
            // or CV_32F; the destination always has 3 channels of the source depth.
8175            CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
            // bidx is 0 for the BGR-ordered codes (BGR / LBGR) and 2 for the
            // RGB-ordered codes.
8176            bidx = code == CV_BGR2Lab || code == CV_BGR2Luv ||
8177                   code == CV_LBGR2Lab || code == CV_LBGR2Luv ? 0 : 2;
            // srgb is true for the codes without the "L" prefix and false for the
            // LBGR / LRGB codes; it is forwarded to the functors unchanged.
8178            bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab ||
8179                        code == CV_BGR2Luv || code == CV_RGB2Luv;
8180
8181            _dst.create(sz, CV_MAKETYPE(depth, 3));
8182            dst = _dst.getMat();
8183
            // The "&& 0" in the condition below means this IPP section is never compiled.
8184#if defined HAVE_IPP && 0
8185            CV_IPP_CHECK()
8186            {
8187                if (code == CV_LBGR2Lab && scn == 3 && depth == CV_8U)
8188                {
8189                    if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToLab_8u_C3R)))
8190                    {
8191                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8192                        return;
8193                    }
8194                    setIppErrorStatus();
8195                }
8196                else if (code == CV_LBGR2Lab && scn == 4 && depth == CV_8U)
8197                {
8198                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
8199                                                                           (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 0, 1, 2, depth)))
8200                    {
8201                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8202                        return;
8203                    }
8204                    setIppErrorStatus();
8205                }
8206                else
8207                if (code == CV_LRGB2Lab && scn == 3 && depth == CV_8U) // slower than OpenCV
8208                {
8209                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
8210                                                                           (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
8211                    {
8212                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8213                        return;
8214                    }
8215                    setIppErrorStatus();
8216                }
8217                else if (code == CV_LRGB2Lab && scn == 4 && depth == CV_8U) // slower than OpenCV
8218                {
8219                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
8220                                                                           (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
8221                    {
8222                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8223                        return;
8224                    }
8225                    setIppErrorStatus();
8226                }
8227                else if (code == CV_LRGB2Luv && scn == 3)
8228                {
8229                    if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGBToLUVTab[depth])))
8230                    {
8231                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8232                        return;
8233                    }
8234                    setIppErrorStatus();
8235                }
8236                else if (code == CV_LRGB2Luv && scn == 4)
8237                {
8238                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
8239                                                                           ippiRGBToLUVTab[depth], 0, 1, 2, depth)))
8240                    {
8241                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8242                        return;
8243                    }
8244                    setIppErrorStatus();
8245                }
8246                else if (code == CV_LBGR2Luv && scn == 3)
8247                {
8248                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
8249                                                                           ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
8250                    {
8251                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8252                        return;
8253                    }
8254                    setIppErrorStatus();
8255                }
8256                else if (code == CV_LBGR2Luv && scn == 4)
8257                {
8258                    if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
8259                                                                           ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
8260                    {
8261                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8262                        return;
8263                    }
8264                    setIppErrorStatus();
8265                }
8266            }
8267#endif
8268
            // Generic path: the Lab codes use the RGB2Lab functors, all remaining
            // codes of this case use the RGB2Luv functors. CV_8U selects the "_b"
            // functor; any other depth selects the "_f" functor. The two 0 arguments
            // are passed through as-is (presumably null coefficient / white-point
            // pointers selecting defaults -- confirm in the functor constructors).
8269            if( code == CV_BGR2Lab || code == CV_RGB2Lab ||
8270                code == CV_LBGR2Lab || code == CV_LRGB2Lab )
8271            {
8272                if( depth == CV_8U )
8273                    CvtColorLoop(src, dst, RGB2Lab_b(scn, bidx, 0, 0, srgb));
8274                else
8275                    CvtColorLoop(src, dst, RGB2Lab_f(scn, bidx, 0, 0, srgb));
8276            }
8277            else
8278            {
8279                if( depth == CV_8U )
8280                    CvtColorLoop(src, dst, RGB2Luv_b(scn, bidx, 0, 0, srgb));
8281                else
8282                    CvtColorLoop(src, dst, RGB2Luv_f(scn, bidx, 0, 0, srgb));
8283            }
8284            }
8285            break;
8286
8287        case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
8288        case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
8289            {
8290            if( dcn <= 0 ) dcn = 3;
8291            CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
8292            bidx = code == CV_Lab2BGR || code == CV_Luv2BGR ||
8293                   code == CV_Lab2LBGR || code == CV_Luv2LBGR ? 0 : 2;
8294            bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB ||
8295                    code == CV_Luv2BGR || code == CV_Luv2RGB;
8296
8297            _dst.create(sz, CV_MAKETYPE(depth, dcn));
8298            dst = _dst.getMat();
8299
8300#if defined HAVE_IPP && 0
8301            CV_IPP_CHECK()
8302            {
8303                if( code == CV_Lab2LBGR && dcn == 3 && depth == CV_8U)
8304                {
8305                    if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R)) )
8306                    {
8307                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8308                        return;
8309                    }
8310                    setIppErrorStatus();
8311                }
8312                else if( code == CV_Lab2LBGR && dcn == 4 && depth == CV_8U )
8313                {
8314                    if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
8315                                        ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
8316                    {
8317                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8318                        return;
8319                    }
8320                    setIppErrorStatus();
8321                }
8322                if( code == CV_Lab2LRGB && dcn == 3 && depth == CV_8U )
8323                {
8324                    if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
8325                                                                               ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
8326                    {
8327                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8328                        return;
8329                    }
8330                    setIppErrorStatus();
8331                }
8332                else if( code == CV_Lab2LRGB && dcn == 4 && depth == CV_8U )
8333                {
8334                    if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
8335                                                                           ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
8336                    {
8337                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8338                        return;
8339                    }
8340                    setIppErrorStatus();
8341                }
8342                if( code == CV_Luv2LRGB && dcn == 3 )
8343                {
8344                    if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiLUVToRGBTab[depth])) )
8345                        return;
8346                }
8347                else if( code == CV_Luv2LRGB && dcn == 4 )
8348                {
8349                    if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
8350                                                                           ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
8351                    {
8352                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8353                        return;
8354                    }
8355                }
8356                if( code == CV_Luv2LBGR && dcn == 3 )
8357                {
8358                    if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
8359                                                                           ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
8360                    {
8361                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8362                        return;
8363                    }
8364                }
8365                else if( code == CV_Luv2LBGR && dcn == 4 )
8366                {
8367                    if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
8368                                                                           ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
8369                    {
8370                        CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8371                        return;
8372                    }
8373                }
8374            }
8375#endif
8376
8377            if( code == CV_Lab2BGR || code == CV_Lab2RGB ||
8378                code == CV_Lab2LBGR || code == CV_Lab2LRGB )
8379            {
8380                if( depth == CV_8U )
8381                    CvtColorLoop(src, dst, Lab2RGB_b(dcn, bidx, 0, 0, srgb));
8382                else
8383                    CvtColorLoop(src, dst, Lab2RGB_f(dcn, bidx, 0, 0, srgb));
8384            }
8385            else
8386            {
8387                if( depth == CV_8U )
8388                    CvtColorLoop(src, dst, Luv2RGB_b(dcn, bidx, 0, 0, srgb));
8389                else
8390                    CvtColorLoop(src, dst, Luv2RGB_f(dcn, bidx, 0, 0, srgb));
8391            }
8392            }
8393            break;
8394
8395        case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
8396        case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
8397        case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:
8398        case CV_BayerBG2BGR_EA: case CV_BayerGB2BGR_EA: case CV_BayerRG2BGR_EA: case CV_BayerGR2BGR_EA:
8399            demosaicing(src, _dst, code, dcn);
8400            break;
8401
8402        case CV_YUV2BGR_NV21:  case CV_YUV2RGB_NV21:  case CV_YUV2BGR_NV12:  case CV_YUV2RGB_NV12:
8403        case CV_YUV2BGRA_NV21: case CV_YUV2RGBA_NV21: case CV_YUV2BGRA_NV12: case CV_YUV2RGBA_NV12:
8404            {
8405                // http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
8406                // http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
8407
8408                if (dcn <= 0) dcn = (code==CV_YUV420sp2BGRA || code==CV_YUV420sp2RGBA || code==CV_YUV2BGRA_NV12 || code==CV_YUV2RGBA_NV12) ? 4 : 3;
8409                const int bIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2BGR_NV12 || code==CV_YUV2BGRA_NV12) ? 0 : 2;
8410                const int uIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2RGB_NV21 || code==CV_YUV2RGBA_NV21) ? 1 : 0;
8411
8412                CV_Assert( dcn == 3 || dcn == 4 );
8413                CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
8414
8415                Size dstSz(sz.width, sz.height * 2 / 3);
8416                _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
8417                dst = _dst.getMat();
8418
8419                int srcstep = (int)src.step;
8420                const uchar* y = src.ptr();
8421                const uchar* uv = y + srcstep * dstSz.height;
8422
8423                switch(dcn*100 + bIdx * 10 + uIdx)
8424                {
8425                    case 300: cvtYUV420sp2RGB<0, 0> (dst, srcstep, y, uv); break;
8426                    case 301: cvtYUV420sp2RGB<0, 1> (dst, srcstep, y, uv); break;
8427                    case 320: cvtYUV420sp2RGB<2, 0> (dst, srcstep, y, uv); break;
8428                    case 321: cvtYUV420sp2RGB<2, 1> (dst, srcstep, y, uv); break;
8429                    case 400: cvtYUV420sp2RGBA<0, 0>(dst, srcstep, y, uv); break;
8430                    case 401: cvtYUV420sp2RGBA<0, 1>(dst, srcstep, y, uv); break;
8431                    case 420: cvtYUV420sp2RGBA<2, 0>(dst, srcstep, y, uv); break;
8432                    case 421: cvtYUV420sp2RGBA<2, 1>(dst, srcstep, y, uv); break;
8433                    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
8434                };
8435            }
8436            break;
8437        case CV_YUV2BGR_YV12: case CV_YUV2RGB_YV12: case CV_YUV2BGRA_YV12: case CV_YUV2RGBA_YV12:
8438        case CV_YUV2BGR_IYUV: case CV_YUV2RGB_IYUV: case CV_YUV2BGRA_IYUV: case CV_YUV2RGBA_IYUV:
8439            {
8440                //http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
8441                //http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxN Y plane followed by (N/2)x(N/2) U and V planes
8442
8443                if (dcn <= 0) dcn = (code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12 || code==CV_YUV2RGBA_IYUV || code==CV_YUV2BGRA_IYUV) ? 4 : 3;
8444                const int bIdx = (code==CV_YUV2BGR_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2BGR_IYUV || code==CV_YUV2BGRA_IYUV) ? 0 : 2;
8445                const int uIdx  = (code==CV_YUV2BGR_YV12 || code==CV_YUV2RGB_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12) ? 1 : 0;
8446
8447                CV_Assert( dcn == 3 || dcn == 4 );
8448                CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
8449
8450                Size dstSz(sz.width, sz.height * 2 / 3);
8451                _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
8452                dst = _dst.getMat();
8453
8454                int srcstep = (int)src.step;
8455                const uchar* y = src.ptr();
8456                const uchar* u = y + srcstep * dstSz.height;
8457                const uchar* v = y + srcstep * (dstSz.height + dstSz.height/4) + (dstSz.width/2) * ((dstSz.height % 4)/2);
8458
8459                int ustepIdx = 0;
8460                int vstepIdx = dstSz.height % 4 == 2 ? 1 : 0;
8461
8462                if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }
8463
8464                switch(dcn*10 + bIdx)
8465                {
8466                    case 30: cvtYUV420p2RGB<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
8467                    case 32: cvtYUV420p2RGB<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
8468                    case 40: cvtYUV420p2RGBA<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
8469                    case 42: cvtYUV420p2RGBA<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
8470                    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
8471                };
8472            }
8473            break;
8474        case CV_YUV2GRAY_420:
8475            {
8476                if (dcn <= 0) dcn = 1;
8477
8478                CV_Assert( dcn == 1 );
8479                CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
8480
8481                Size dstSz(sz.width, sz.height * 2 / 3);
8482                _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
8483                dst = _dst.getMat();
8484#if defined HAVE_IPP
8485                CV_IPP_CHECK()
8486                {
8487                    if (ippStsNoErr == ippiCopy_8u_C1R(src.data, (int)src.step, dst.data, (int)dst.step,
8488                            ippiSize(dstSz.width, dstSz.height)))
8489                    {
8490                        CV_IMPL_ADD(CV_IMPL_IPP);
8491                        return;
8492                    }
8493                    setIppErrorStatus();
8494                }
8495#endif
8496                src(Range(0, dstSz.height), Range::all()).copyTo(dst);
8497            }
8498            break;
8499        case CV_RGB2YUV_YV12: case CV_BGR2YUV_YV12: case CV_RGBA2YUV_YV12: case CV_BGRA2YUV_YV12:
8500        case CV_RGB2YUV_IYUV: case CV_BGR2YUV_IYUV: case CV_RGBA2YUV_IYUV: case CV_BGRA2YUV_IYUV:
8501            {
8502                if (dcn <= 0) dcn = 1;
8503                const int bIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_BGR2YUV_YV12 || code == CV_BGRA2YUV_YV12) ? 0 : 2;
8504                const int uIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_RGB2YUV_IYUV || code == CV_RGBA2YUV_IYUV) ? 1 : 2;
8505
8506                CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
8507                CV_Assert( dcn == 1 );
8508                CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
8509
8510                Size dstSz(sz.width, sz.height / 2 * 3);
8511                _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
8512                dst = _dst.getMat();
8513
8514                switch(bIdx + uIdx*10)
8515                {
8516                    case 10: cvtRGBtoYUV420p<0, 1>(src, dst); break;
8517                    case 12: cvtRGBtoYUV420p<2, 1>(src, dst); break;
8518                    case 20: cvtRGBtoYUV420p<0, 2>(src, dst); break;
8519                    case 22: cvtRGBtoYUV420p<2, 2>(src, dst); break;
8520                    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
8521                };
8522            }
8523            break;
8524        case CV_YUV2RGB_UYVY: case CV_YUV2BGR_UYVY: case CV_YUV2RGBA_UYVY: case CV_YUV2BGRA_UYVY:
8525        case CV_YUV2RGB_YUY2: case CV_YUV2BGR_YUY2: case CV_YUV2RGB_YVYU: case CV_YUV2BGR_YVYU:
8526        case CV_YUV2RGBA_YUY2: case CV_YUV2BGRA_YUY2: case CV_YUV2RGBA_YVYU: case CV_YUV2BGRA_YVYU:
8527            {
8528                //http://www.fourcc.org/yuv.php#UYVY
8529                //http://www.fourcc.org/yuv.php#YUY2
8530                //http://www.fourcc.org/yuv.php#YVYU
8531
8532                if (dcn <= 0) dcn = (code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2RGBA_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 4 : 3;
8533                const int bIdx = (code==CV_YUV2BGR_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2BGR_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2BGR_YVYU || code==CV_YUV2BGRA_YVYU) ? 0 : 2;
8534                const int ycn  = (code==CV_YUV2RGB_UYVY || code==CV_YUV2BGR_UYVY || code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY) ? 1 : 0;
8535                const int uIdx = (code==CV_YUV2RGB_YVYU || code==CV_YUV2BGR_YVYU || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 1 : 0;
8536
8537                CV_Assert( dcn == 3 || dcn == 4 );
8538                CV_Assert( scn == 2 && depth == CV_8U );
8539
8540                _dst.create(sz, CV_8UC(dcn));
8541                dst = _dst.getMat();
8542
8543                switch(dcn*1000 + bIdx*100 + uIdx*10 + ycn)
8544                {
8545                    case 3000: cvtYUV422toRGB<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8546                    case 3001: cvtYUV422toRGB<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8547                    case 3010: cvtYUV422toRGB<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8548                    case 3011: cvtYUV422toRGB<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8549                    case 3200: cvtYUV422toRGB<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8550                    case 3201: cvtYUV422toRGB<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8551                    case 3210: cvtYUV422toRGB<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8552                    case 3211: cvtYUV422toRGB<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8553                    case 4000: cvtYUV422toRGBA<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8554                    case 4001: cvtYUV422toRGBA<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8555                    case 4010: cvtYUV422toRGBA<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8556                    case 4011: cvtYUV422toRGBA<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8557                    case 4200: cvtYUV422toRGBA<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8558                    case 4201: cvtYUV422toRGBA<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8559                    case 4210: cvtYUV422toRGBA<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8560                    case 4211: cvtYUV422toRGBA<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8561                    default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
8562                };
8563            }
8564            break;
8565        case CV_YUV2GRAY_UYVY: case CV_YUV2GRAY_YUY2:
8566            {
8567                if (dcn <= 0) dcn = 1;
8568
8569                CV_Assert( dcn == 1 );
8570                CV_Assert( scn == 2 && depth == CV_8U );
8571
8572                extractChannel(_src, _dst, code == CV_YUV2GRAY_UYVY ? 1 : 0);
8573            }
8574            break;
8575        case CV_RGBA2mRGBA:
8576            {
8577                if (dcn <= 0) dcn = 4;
8578                CV_Assert( scn == 4 && dcn == 4 );
8579
8580                _dst.create(sz, CV_MAKETYPE(depth, dcn));
8581                dst = _dst.getMat();
8582
8583                if( depth == CV_8U )
8584                {
8585#if defined(HAVE_IPP)
8586                    CV_IPP_CHECK()
8587                    {
8588                        if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R)))
8589                        {
8590                            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8591                            return;
8592                        }
8593                        setIppErrorStatus();
8594                    }
8595#endif
8596                    CvtColorLoop(src, dst, RGBA2mRGBA<uchar>());
8597                }
8598                else
8599                {
8600                    CV_Error( CV_StsBadArg, "Unsupported image depth" );
8601                }
8602            }
8603            break;
8604        case CV_mRGBA2RGBA:
8605            {
8606                if (dcn <= 0) dcn = 4;
8607                CV_Assert( scn == 4 && dcn == 4 );
8608
8609                _dst.create(sz, CV_MAKETYPE(depth, dcn));
8610                dst = _dst.getMat();
8611
8612                if( depth == CV_8U )
8613                    CvtColorLoop(src, dst, mRGBA2RGBA<uchar>());
8614                else
8615                {
8616                    CV_Error( CV_StsBadArg, "Unsupported image depth" );
8617                }
8618            }
8619            break;
8620        default:
8621            CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
8622    }
8623}
8624
8625CV_IMPL void
8626cvCvtColor( const CvArr* srcarr, CvArr* dstarr, int code )
8627{
8628    cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
8629    CV_Assert( src.depth() == dst.depth() );
8630
8631    cv::cvtColor(src, dst, code, dst.channels());
8632    CV_Assert( dst.data == dst0.data );
8633}
8634
8635
8636/* End of file. */
8637