1// This file is part of OpenCV project. 2// It is subject to the license terms in the LICENSE file found in the top-level directory 3// of this distribution and at http://opencv.org/license.html. 4 5// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. 6// Third party copyrights are property of their respective owners. 7 8#include "precomp.hpp" 9#ifndef __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__ 10#define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__ 11 12#include "opencl_kernels_photo.hpp" 13 14#ifdef HAVE_OPENCL 15 16namespace cv { 17 18enum 19{ 20 BLOCK_ROWS = 32, 21 BLOCK_COLS = 32, 22 CTA_SIZE_INTEL = 64, 23 CTA_SIZE_DEFAULT = 256 24}; 25 26static int divUp(int a, int b) 27{ 28 return (a + b - 1) / b; 29} 30 31template <typename FT, typename ST, typename WT> 32static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, 33 int searchWindowSize, int templateWindowSize, 34 const FT *h, int hn, int cn, int normType, 35 int & almostTemplateWindowSizeSqBinShift) 36{ 37 const WT maxEstimateSumValue = searchWindowSize * searchWindowSize * 38 std::numeric_limits<ST>::max(); 39 int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue, 40 std::numeric_limits<int>::max()); 41 int depth = DataType<FT>::depth; 42 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; 43 44 if (depth == CV_64F && !doubleSupport) 45 return false; 46 47 // precalc weight for every possible l2 dist between blocks 48 // additional optimization of precalced weights to replace division(averaging) by binary shift 49 CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX) 50 int templateWindowSizeSq = templateWindowSize * templateWindowSize; 51 almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq); 52 FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; 53 54 const FT WEIGHT_THRESHOLD = 1e-3f; 55 int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn : 56 std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn; 57 int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1); 58 FT den[4]; 59 CV_Assert(hn > 0 && hn <= 4); 60 for (int i=0; i<hn; i++) 61 den[i] = 1.0f / (h[i] * h[i] * cn); 62 63 almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn)); 64 65 char buf[40]; 66 ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc, 67 format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s" 68 " -D wlut_t=%s -D convert_wlut_t=%s%s%s", 69 ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)), 70 ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf), 71 doubleSupport ? " -D DOUBLE_SUPPORT" : "", 72 normType == NORM_L1 ? " -D ABS" : "")); 73 if (k.empty()) 74 return false; 75 76 k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist, 77 almostDist2ActualDistMultiplier, fixedPointMult, 78 ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD); 79 80 size_t globalsize[1] = { almostMaxDist }; 81 return k.run(1, globalsize, NULL, false); 82} 83 84static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn, 85 int templateWindowSize, int searchWindowSize, int normType) 86{ 87 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 88 int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT; 89 Size size = _src.size(); 90 91 if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) && 92 (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U)))) 93 return false; 94 95 int templateWindowHalfWize = templateWindowSize / 2; 96 int searchWindowHalfSize = searchWindowSize / 2; 97 templateWindowSize = templateWindowHalfWize * 2 + 1; 98 searchWindowSize = searchWindowHalfSize * 2 + 1; 99 int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS); 100 int almostTemplateWindowSizeSqBinShift = -1; 101 102 char buf[4][40]; 103 String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d" 104 " -D pixel_t=%s -D int_t=%s -D wlut_t=%s" 105 " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s" 106 " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d" 107 " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d" 108 " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s", 109 templateWindowSize, searchWindowSize, 110 ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)), 111 ocl::typeToStr(CV_32SC(hn)), 112 depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) : 113 format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(), 114 depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) : 115 format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(), 116 depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) : 117 format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(), 118 depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) : 119 format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(), 120 BLOCK_COLS, BLOCK_ROWS, 121 ctaSize, templateWindowHalfWize, searchWindowHalfSize, 122 ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn, 123 (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn), 124 ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), 125 normType == NORM_L1 ? " -D ABS" : ""); 126 127 ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts); 128 if (k.empty()) 129 return false; 130 131 UMat almostDist2Weight; 132 if ((depth == CV_8U && 133 !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight, 134 searchWindowSize, templateWindowSize, 135 h, hn, cn, normType, 136 almostTemplateWindowSizeSqBinShift)) || 137 (depth == CV_16U && 138 !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight, 139 searchWindowSize, templateWindowSize, 140 h, hn, cn, normType, 141 almostTemplateWindowSizeSqBinShift))) 142 return false; 143 CV_Assert(almostTemplateWindowSizeSqBinShift >= 0); 144 145 UMat srcex; 146 int borderSize = searchWindowHalfSize + templateWindowHalfWize; 147 if (cn == 3) { 148 srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4)); 149 UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height)); 150 int from_to[] = { 0,0, 1,1, 2,2 }; 151 mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3); 152 copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize, 153 BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place 154 } 155 else 156 copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT); 157 158 _dst.create(size, type); 159 UMat dst; 160 if (cn == 3) 161 dst.create(size, CV_MAKE_TYPE(depth, 4)); 162 else 163 dst = _dst.getUMat(); 164 165 int searchWindowSizeSq = searchWindowSize * searchWindowSize; 166 Size upColSumSize(size.width, searchWindowSizeSq * nblocksy); 167 Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy); 168 UMat buffer(upColSumSize + colSumSize, CV_32SC(cn)); 169 170 srcex = srcex(Rect(Point(borderSize, borderSize), size)); 171 k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst), 172 ocl::KernelArg::PtrReadOnly(almostDist2Weight), 173 ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); 174 175 size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 }; 176 if (!k.run(2, globalsize, localsize, false)) return false; 177 178 if (cn == 3) { 179 int from_to[] = { 0,0, 1,1, 2,2 }; 180 mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3); 181 } 182 183 return true; 184} 185 186static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst, 187 float h, float hForColorComponents, 188 int templateWindowSize, int searchWindowSize) 189{ 190 UMat src = _src.getUMat(); 191 _dst.create(src.size(), src.type()); 192 UMat dst = _dst.getUMat(); 193 194 UMat src_lab; 195 cvtColor(src, src_lab, COLOR_LBGR2Lab); 196 197 UMat l(src.size(), CV_8U); 198 UMat ab(src.size(), CV_8UC2); 199 std::vector<UMat> l_ab(2), l_ab_denoised(2); 200 l_ab[0] = l; 201 l_ab[1] = ab; 202 l_ab_denoised[0].create(src.size(), CV_8U); 203 l_ab_denoised[1].create(src.size(), CV_8UC2); 204 205 int from_to[] = { 0,0, 1,1, 2,2 }; 206 mixChannels(std::vector<UMat>(1, src_lab), l_ab, from_to, 3); 207 208 fastNlMeansDenoising(l_ab[0], l_ab_denoised[0], h, templateWindowSize, searchWindowSize); 209 fastNlMeansDenoising(l_ab[1], l_ab_denoised[1], hForColorComponents, templateWindowSize, searchWindowSize); 210 211 UMat dst_lab(src.size(), CV_8UC3); 212 mixChannels(l_ab_denoised, std::vector<UMat>(1, dst_lab), from_to, 3); 213 214 cvtColor(dst_lab, dst, COLOR_Lab2LBGR, src.channels()); 215 return true; 216} 217 218} 219 220#endif 221#endif 222