1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// Redistribution and use in source and binary forms, with or without modification, 18// are permitted provided that the following conditions are met: 19// 20// * Redistribution's of source code must retain the above copyright notice, 21// this list of conditions and the following disclaimer. 22// 23// * Redistribution's in binary form must reproduce the above copyright notice, 24// this list of conditions and the following disclaimer in the documentation 25// and/or other materials provided with the distribution. 26// 27// * The name of the copyright holders may not be used to endorse or promote products 28// derived from this software without specific prior written permission. 29// 30// This software is provided by the copyright holders and contributors "as is" and 31// any express or implied warranties, including, but not limited to, the implied 32// warranties of merchantability and fitness for a particular purpose are disclaimed. 33// In no event shall the Intel Corporation or contributors be liable for any direct, 34// indirect, incidental, special, exemplary, or consequential damages 35// (including, but not limited to, procurement of substitute goods or services; 36// loss of use, data, or profits; or business interruption) however caused 37// and on any theory of liability, whether in contract, strict liability, 38// or tort (including negligence or otherwise) arising in any way out of 39// the use of this software, even if advised of the possibility of such damage. 40// 41//M*/ 42 43#include "opencv2/core/cuda/common.hpp" 44#include "opencv2/core/cuda/vec_traits.hpp" 45#include "opencv2/core/cuda/vec_math.hpp" 46#include "opencv2/core/cuda/functional.hpp" 47#include "opencv2/core/cuda/reduce.hpp" 48#include "opencv2/core/cuda/border_interpolate.hpp" 49 50using namespace cv::cuda; 51 52typedef unsigned char uchar; 53typedef unsigned short ushort; 54 55////////////////////////////////////////////////////////////////////////////////// 56//// Non Local Means Denosing 57 58namespace cv { namespace cuda { namespace device 59{ 60 namespace imgproc 61 { 62 __device__ __forceinline__ float norm2(const float& v) { return v*v; } 63 __device__ __forceinline__ float norm2(const float2& v) { return v.x*v.x + v.y*v.y; } 64 __device__ __forceinline__ float norm2(const float3& v) { return v.x*v.x + v.y*v.y + v.z*v.z; } 65 __device__ __forceinline__ float norm2(const float4& v) { return v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w; } 66 67 template<typename T, typename B> 68 __global__ void nlm_kernel(const PtrStep<T> src, PtrStepSz<T> dst, const B b, int search_radius, int block_radius, float noise_mult) 69 { 70 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type; 71 72 const int i = blockDim.y * blockIdx.y + threadIdx.y; 73 const int j = blockDim.x * blockIdx.x + threadIdx.x; 74 75 if (j >= dst.cols || i >= dst.rows) 76 return; 77 78 int bsize = search_radius + block_radius; 79 int search_window = 2 * search_radius + 1; 80 float minus_search_window2_inv = -1.f/(search_window * search_window); 81 82 value_type sum1 = VecTraits<value_type>::all(0); 83 float sum2 = 0.f; 84 85 if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows) 86 { 87 for(float y = -search_radius; y <= search_radius; ++y) 88 for(float x = -search_radius; x <= search_radius; ++x) 89 { 90 float dist2 = 0; 91 for(float ty = -block_radius; ty <= block_radius; ++ty) 92 for(float tx = -block_radius; tx <= block_radius; ++tx) 93 { 94 value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx)); 95 value_type av = saturate_cast<value_type>(src(i + ty, j + tx)); 96 97 dist2 += norm2(av - bv); 98 } 99 100 float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv); 101 102 /*if (i == 255 && j == 255) 103 printf("%f %f\n", w, dist2 * minus_h2_inv + (x * x + y * y) * minus_search_window2_inv);*/ 104 105 sum1 = sum1 + w * saturate_cast<value_type>(src(i + y, j + x)); 106 sum2 += w; 107 } 108 } 109 else 110 { 111 for(float y = -search_radius; y <= search_radius; ++y) 112 for(float x = -search_radius; x <= search_radius; ++x) 113 { 114 float dist2 = 0; 115 for(float ty = -block_radius; ty <= block_radius; ++ty) 116 for(float tx = -block_radius; tx <= block_radius; ++tx) 117 { 118 value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src)); 119 value_type av = saturate_cast<value_type>(b.at(i + ty, j + tx, src)); 120 dist2 += norm2(av - bv); 121 } 122 123 float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv); 124 125 sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src)); 126 sum2 += w; 127 } 128 129 } 130 131 dst(i, j) = saturate_cast<T>(sum1 / sum2); 132 133 } 134 135 template<typename T, template <typename> class B> 136 void nlm_caller(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream) 137 { 138 dim3 block (32, 8); 139 dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y)); 140 141 B<T> b(src.rows, src.cols); 142 143 int block_window = 2 * block_radius + 1; 144 float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn); 145 float noise_mult = minus_h2_inv/(block_window * block_window); 146 147 cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) ); 148 nlm_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult); 149 cudaSafeCall ( cudaGetLastError () ); 150 151 if (stream == 0) 152 cudaSafeCall( cudaDeviceSynchronize() ); 153 } 154 155 template<typename T> 156 void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream) 157 { 158 typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream); 159 160 static func_t funcs[] = 161 { 162 nlm_caller<T, BrdConstant>, 163 nlm_caller<T, BrdReplicate>, 164 nlm_caller<T, BrdReflect>, 165 nlm_caller<T, BrdWrap>, 166 nlm_caller<T, BrdReflect101> 167 }; 168 funcs[borderMode](src, dst, search_radius, block_radius, h, stream); 169 } 170 171 template void nlm_bruteforce_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t); 172 template void nlm_bruteforce_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t); 173 template void nlm_bruteforce_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t); 174 } 175}}} 176 177////////////////////////////////////////////////////////////////////////////////// 178//// Non Local Means Denosing (fast approximate version) 179 180namespace cv { namespace cuda { namespace device 181{ 182 namespace imgproc 183 { 184 185 template <int cn> struct Unroll; 186 template <> struct Unroll<1> 187 { 188 template <int BLOCK_SIZE> 189 static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem) 190 { 191 return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE); 192 } 193 194 static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2) 195 { 196 return thrust::tie(val1, val2); 197 } 198 199 static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op() 200 { 201 plus<float> op; 202 return thrust::make_tuple(op, op); 203 } 204 }; 205 template <> struct Unroll<2> 206 { 207 template <int BLOCK_SIZE> 208 static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem) 209 { 210 return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE); 211 } 212 213 static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2) 214 { 215 return thrust::tie(val1, val2.x, val2.y); 216 } 217 218 static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op() 219 { 220 plus<float> op; 221 return thrust::make_tuple(op, op, op); 222 } 223 }; 224 template <> struct Unroll<3> 225 { 226 template <int BLOCK_SIZE> 227 static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem) 228 { 229 return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE); 230 } 231 232 static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2) 233 { 234 return thrust::tie(val1, val2.x, val2.y, val2.z); 235 } 236 237 static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op() 238 { 239 plus<float> op; 240 return thrust::make_tuple(op, op, op, op); 241 } 242 }; 243 template <> struct Unroll<4> 244 { 245 template <int BLOCK_SIZE> 246 static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem) 247 { 248 return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE); 249 } 250 251 static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2) 252 { 253 return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w); 254 } 255 256 static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op() 257 { 258 plus<float> op; 259 return thrust::make_tuple(op, op, op, op, op); 260 } 261 }; 262 263 __device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); } 264 __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); } 265 __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); } 266 267 template <class T> struct FastNonLocalMeans 268 { 269 enum 270 { 271 CTA_SIZE = 128, 272 273 TILE_COLS = 128, 274 TILE_ROWS = 32, 275 276 STRIDE = CTA_SIZE 277 }; 278 279 struct plus 280 { 281 __device__ __forceinline__ float operator()(float v1, float v2) const { return v1 + v2; } 282 }; 283 284 int search_radius; 285 int block_radius; 286 287 int search_window; 288 int block_window; 289 float minus_h2_inv; 290 291 FastNonLocalMeans(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2), 292 search_window(search_window_), block_window(block_window_), minus_h2_inv(-1.f/(h * h * VecTraits<T>::cn)) {} 293 294 PtrStep<T> src; 295 mutable PtrStepi buffer; 296 297 __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const 298 { 299 for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) 300 { 301 dist_sums[index] = 0; 302 303 for(int tx = 0; tx < block_window; ++tx) 304 col_sums(tx, index) = 0; 305 306 int y = index / search_window; 307 int x = index - y * search_window; 308 309 int ay = i; 310 int ax = j; 311 312 int by = i + y - search_radius; 313 int bx = j + x - search_radius; 314 315#if 1 316 for (int tx = -block_radius; tx <= block_radius; ++tx) 317 { 318 int col_sum = 0; 319 for (int ty = -block_radius; ty <= block_radius; ++ty) 320 { 321 int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx)); 322 323 dist_sums[index] += dist; 324 col_sum += dist; 325 } 326 col_sums(tx + block_radius, index) = col_sum; 327 } 328#else 329 for (int ty = -block_radius; ty <= block_radius; ++ty) 330 for (int tx = -block_radius; tx <= block_radius; ++tx) 331 { 332 int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx)); 333 334 dist_sums[index] += dist; 335 col_sums(tx + block_radius, index) += dist; 336 } 337#endif 338 339 up_col_sums(j, index) = col_sums(block_window - 1, index); 340 } 341 } 342 343 __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const 344 { 345 for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) 346 { 347 int y = index / search_window; 348 int x = index - y * search_window; 349 350 int ay = i; 351 int ax = j + block_radius; 352 353 int by = i + y - search_radius; 354 int bx = j + x - search_radius + block_radius; 355 356 int col_sum = 0; 357 358 for (int ty = -block_radius; ty <= block_radius; ++ty) 359 col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx)); 360 361 dist_sums[index] += col_sum - col_sums(first, index); 362 363 col_sums(first, index) = col_sum; 364 up_col_sums(j, index) = col_sum; 365 } 366 } 367 368 __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const 369 { 370 int ay = i; 371 int ax = j + block_radius; 372 373 T a_up = src(ay - block_radius - 1, ax); 374 T a_down = src(ay + block_radius, ax); 375 376 for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) 377 { 378 int y = index / search_window; 379 int x = index - y * search_window; 380 381 int by = i + y - search_radius; 382 int bx = j + x - search_radius + block_radius; 383 384 T b_up = src(by - block_radius - 1, bx); 385 T b_down = src(by + block_radius, bx); 386 387 int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up); 388 389 dist_sums[index] += col_sum - col_sums(first, index); 390 col_sums(first, index) = col_sum; 391 up_col_sums(j, index) = col_sum; 392 } 393 } 394 395 __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, T& dst) const 396 { 397 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_type; 398 399 float weights_sum = 0; 400 sum_type sum = VecTraits<sum_type>::all(0); 401 402 float bw2_inv = 1.f/(block_window * block_window); 403 404 int sx = j - search_radius; 405 int sy = i - search_radius; 406 407 for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE) 408 { 409 int y = index / search_window; 410 int x = index - y * search_window; 411 412 float avg_dist = dist_sums[index] * bw2_inv; 413 float weight = __expf(avg_dist * minus_h2_inv); 414 weights_sum += weight; 415 416 sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x)); 417 } 418 419 __shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)]; 420 421 reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer), 422 Unroll<VecTraits<T>::cn>::tie(weights_sum, sum), 423 threadIdx.x, 424 Unroll<VecTraits<T>::cn>::op()); 425 426 if (threadIdx.x == 0) 427 dst = saturate_cast<T>(sum / weights_sum); 428 } 429 430 __device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const 431 { 432 int tbx = blockIdx.x * TILE_COLS; 433 int tby = blockIdx.y * TILE_ROWS; 434 435 int tex = ::min(tbx + TILE_COLS, dst.cols); 436 int tey = ::min(tby + TILE_ROWS, dst.rows); 437 438 PtrStepi col_sums; 439 col_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window; 440 col_sums.step = buffer.step; 441 442 PtrStepi up_col_sums; 443 up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window; 444 up_col_sums.step = buffer.step; 445 446 extern __shared__ int dist_sums[]; //search_window * search_window 447 448 int first = 0; 449 450 for (int i = tby; i < tey; ++i) 451 for (int j = tbx; j < tex; ++j) 452 { 453 __syncthreads(); 454 455 if (j == tbx) 456 { 457 initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums); 458 first = 0; 459 } 460 else 461 { 462 if (i == tby) 463 shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums); 464 else 465 shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums); 466 467 first = (first + 1) % block_window; 468 } 469 470 __syncthreads(); 471 472 convolve_window(i, j, dist_sums, dst(i, j)); 473 } 474 } 475 476 }; 477 478 template<typename T> 479 __global__ void fast_nlm_kernel(const FastNonLocalMeans<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); } 480 481 void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows) 482 { 483 typedef FastNonLocalMeans<uchar> FNLM; 484 dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS)); 485 486 buffer_cols = search_window * search_window * grid.y; 487 buffer_rows = src.cols + block_window * grid.x; 488 } 489 490 template<typename T> 491 void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer, 492 int search_window, int block_window, float h, cudaStream_t stream) 493 { 494 typedef FastNonLocalMeans<T> FNLM; 495 FNLM fnlm(search_window, block_window, h); 496 497 fnlm.src = (PtrStepSz<T>)src; 498 fnlm.buffer = buffer; 499 500 dim3 block(FNLM::CTA_SIZE, 1); 501 dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS)); 502 int smem = search_window * search_window * sizeof(int); 503 504 505 fast_nlm_kernel<<<grid, block, smem>>>(fnlm, (PtrStepSz<T>)dst); 506 cudaSafeCall ( cudaGetLastError () ); 507 if (stream == 0) 508 cudaSafeCall( cudaDeviceSynchronize() ); 509 } 510 511 template void nlm_fast_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t); 512 template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t); 513 template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t); 514 515 516 517 __global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab) 518 { 519 int x = threadIdx.x + blockIdx.x * blockDim.x; 520 int y = threadIdx.y + blockIdx.y * blockDim.y; 521 522 if (x < lab.cols && y < lab.rows) 523 { 524 uchar3 p = lab(y, x); 525 ab(y,x) = make_uchar2(p.y, p.z); 526 l(y,x) = p.x; 527 } 528 } 529 530 void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream) 531 { 532 dim3 b(32, 8); 533 dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y)); 534 535 fnlm_split_kernel<<<g, b>>>(lab, l, ab); 536 cudaSafeCall ( cudaGetLastError () ); 537 if (stream == 0) 538 cudaSafeCall( cudaDeviceSynchronize() ); 539 } 540 541 __global__ void fnlm_merge_kernel(const PtrStepb l, const PtrStep<uchar2> ab, PtrStepSz<uchar3> lab) 542 { 543 int x = threadIdx.x + blockIdx.x * blockDim.x; 544 int y = threadIdx.y + blockIdx.y * blockDim.y; 545 546 if (x < lab.cols && y < lab.rows) 547 { 548 uchar2 p = ab(y, x); 549 lab(y, x) = make_uchar3(l(y, x), p.x, p.y); 550 } 551 } 552 553 void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream) 554 { 555 dim3 b(32, 8); 556 dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y)); 557 558 fnlm_merge_kernel<<<g, b>>>(l, ab, lab); 559 cudaSafeCall ( cudaGetLastError () ); 560 if (stream == 0) 561 cudaSafeCall( cudaDeviceSynchronize() ); 562 } 563 } 564}}} 565