1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html.
4
5// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
6// Third party copyrights are property of their respective owners.
7
8#include "precomp.hpp"
9#ifndef __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
10#define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
11
12#include "opencl_kernels_photo.hpp"
13
14#ifdef HAVE_OPENCL
15
16namespace cv {
17
// Tile dimensions: the image is partitioned into BLOCK_COLS x BLOCK_ROWS
// tiles when the number of kernel blocks is computed.
enum
{
    BLOCK_ROWS = 32,
    BLOCK_COLS = 32
};

// Local work-group size along x: one value for Intel devices, another for
// every other device.
enum
{
    CTA_SIZE_INTEL = 64,
    CTA_SIZE_DEFAULT = 256
};
25
// Ceiling division: returns a / b rounded up to the next integer.
// Expects b > 0. The result is built from the quotient and a remainder test
// instead of (a + b - 1) / b, so no intermediate sum can overflow int when a
// is close to INT_MAX. For every a >= 0 the result equals the classic formula.
static int divUp(int a, int b)
{
    return a / b + (a % b > 0 ? 1 : 0);
}
30
31template <typename FT, typename ST, typename WT>
32static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,
33                                      int searchWindowSize, int templateWindowSize,
34                                      const FT *h, int hn, int cn, int normType,
35                                      int & almostTemplateWindowSizeSqBinShift)
36{
37    const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
38        std::numeric_limits<ST>::max();
39    int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
40                                           std::numeric_limits<int>::max());
41    int depth = DataType<FT>::depth;
42    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
43
44    if (depth == CV_64F && !doubleSupport)
45        return false;
46
47    // precalc weight for every possible l2 dist between blocks
48    // additional optimization of precalced weights to replace division(averaging) by binary shift
49    CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX)
50    int templateWindowSizeSq = templateWindowSize * templateWindowSize;
51    almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq);
52    FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
53
54    const FT WEIGHT_THRESHOLD = 1e-3f;
55    int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn :
56        std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
57    int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
58    FT den[4];
59    CV_Assert(hn > 0 && hn <= 4);
60    for (int i=0; i<hn; i++)
61        den[i] = 1.0f / (h[i] * h[i] * cn);
62
63    almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
64
65    char buf[40];
66    ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
67                  format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
68                         " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
69                         ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
70                         ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
71                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
72                         normType == NORM_L1 ? " -D ABS" : ""));
73    if (k.empty())
74        return false;
75
76    k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
77           almostDist2ActualDistMultiplier, fixedPointMult,
78           ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
79
80    size_t globalsize[1] = { almostMaxDist };
81    return k.run(1, globalsize, NULL, false);
82}
83
84static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
85                                     int templateWindowSize, int searchWindowSize, int normType)
86{
87    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
88    int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
89    Size size = _src.size();
90
91    if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) &&
92                             (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U))))
93        return false;
94
95    int templateWindowHalfWize = templateWindowSize / 2;
96    int searchWindowHalfSize = searchWindowSize / 2;
97    templateWindowSize  = templateWindowHalfWize * 2 + 1;
98    searchWindowSize = searchWindowHalfSize * 2 + 1;
99    int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
100    int almostTemplateWindowSizeSqBinShift = -1;
101
102    char buf[4][40];
103    String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
104                         " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
105                         " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
106                         " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
107                         " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
108                         " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
109                         templateWindowSize, searchWindowSize,
110                         ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
111                         ocl::typeToStr(CV_32SC(hn)),
112                         depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
113                         format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
114                         depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
115                         format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
116                         depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
117                         format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
118                         depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
119                         format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
120                         BLOCK_COLS, BLOCK_ROWS,
121                         ctaSize, templateWindowHalfWize, searchWindowHalfSize,
122                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
123                         (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
124                         ocl::convertTypeStr(CV_32S, depth, cn, buf[3]),
125                         normType == NORM_L1 ? " -D ABS" : "");
126
127    ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
128    if (k.empty())
129        return false;
130
131    UMat almostDist2Weight;
132    if ((depth == CV_8U &&
133         !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
134                                                       searchWindowSize, templateWindowSize,
135                                                       h, hn, cn, normType,
136                                                       almostTemplateWindowSizeSqBinShift)) ||
137        (depth == CV_16U &&
138         !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
139                                                          searchWindowSize, templateWindowSize,
140                                                          h, hn, cn, normType,
141                                                          almostTemplateWindowSizeSqBinShift)))
142        return false;
143    CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
144
145    UMat srcex;
146    int borderSize = searchWindowHalfSize + templateWindowHalfWize;
147    if (cn == 3) {
148        srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4));
149        UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height));
150        int from_to[] = { 0,0, 1,1, 2,2 };
151        mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3);
152        copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize,
153                       BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place
154    }
155    else
156        copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
157
158    _dst.create(size, type);
159    UMat dst;
160    if (cn == 3)
161        dst.create(size, CV_MAKE_TYPE(depth, 4));
162    else
163        dst = _dst.getUMat();
164
165    int searchWindowSizeSq = searchWindowSize * searchWindowSize;
166    Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
167    Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
168    UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));
169
170    srcex = srcex(Rect(Point(borderSize, borderSize), size));
171    k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
172           ocl::KernelArg::PtrReadOnly(almostDist2Weight),
173           ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
174
175    size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
176    if (!k.run(2, globalsize, localsize, false)) return false;
177
178    if (cn == 3) {
179        int from_to[] = { 0,0, 1,1, 2,2 };
180        mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
181    }
182
183    return true;
184}
185
186static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
187                                      float h, float hForColorComponents,
188                                      int templateWindowSize, int searchWindowSize)
189{
190    UMat src = _src.getUMat();
191    _dst.create(src.size(), src.type());
192    UMat dst = _dst.getUMat();
193
194    UMat src_lab;
195    cvtColor(src, src_lab, COLOR_LBGR2Lab);
196
197    UMat l(src.size(), CV_8U);
198    UMat ab(src.size(), CV_8UC2);
199    std::vector<UMat> l_ab(2), l_ab_denoised(2);
200    l_ab[0] = l;
201    l_ab[1] = ab;
202    l_ab_denoised[0].create(src.size(), CV_8U);
203    l_ab_denoised[1].create(src.size(), CV_8UC2);
204
205    int from_to[] = { 0,0, 1,1, 2,2 };
206    mixChannels(std::vector<UMat>(1, src_lab), l_ab, from_to, 3);
207
208    fastNlMeansDenoising(l_ab[0], l_ab_denoised[0], h, templateWindowSize, searchWindowSize);
209    fastNlMeansDenoising(l_ab[1], l_ab_denoised[1], hForColorComponents, templateWindowSize, searchWindowSize);
210
211    UMat dst_lab(src.size(), CV_8UC3);
212    mixChannels(l_ab_denoised, std::vector<UMat>(1, dst_lab), from_to, 3);
213
214    cvtColor(dst_lab, dst, COLOR_Lab2LBGR, src.channels());
215    return true;
216}
217
218}
219
220#endif
221#endif
222