1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. 14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// @Authors 18// Wenju He, wenju@multicorewareinc.com 19// 20// Redistribution and use in source and binary forms, with or without modification, 21// are permitted provided that the following conditions are met: 22// 23// * Redistribution's of source code must retain the above copyright notice, 24// this list of conditions and the following disclaimer. 25// 26// * Redistribution's in binary form must reproduce the above copyright notice, 27// this list of conditions and the following disclaimer in the documentation 28// and/or other materials provided with the distribution. 29// 30// * The name of the copyright holders may not be used to endorse or promote products 31// derived from this software without specific prior written permission. 32// 33// This software is provided by the copyright holders and contributors as is and 34// any express or implied warranties, including, but not limited to, the implied 35// warranties of merchantability and fitness for a particular purpose are disclaimed. 36// In no event shall the Intel Corporation or contributors be liable for any direct, 37// indirect, incidental, special, exemplary, or consequential damages 38// (including, but not limited to, procurement of substitute goods or services; 39// loss of use, data, or profits; or business interruption) however caused 40// and on any theory of liability, whether in contract, strict liability, 41// or tort (including negligence or otherwise) arising in any way out of 42// the use of this software, even if advised of the possibility of such damage. 43// 44//M*/ 45 46#define CELL_WIDTH 8 47#define CELL_HEIGHT 8 48#define CELLS_PER_BLOCK_X 2 49#define CELLS_PER_BLOCK_Y 2 50#define NTHREADS 256 51#define CV_PI_F M_PI_F 52 53#ifdef INTEL_DEVICE 54#define QANGLE_TYPE int 55#define QANGLE_TYPE2 int2 56#else 57#define QANGLE_TYPE uchar 58#define QANGLE_TYPE2 uchar2 59#endif 60 61//---------------------------------------------------------------------------- 62// Histogram computation 63// 12 threads for a cell, 12x4 threads per block 64// Use pre-computed gaussian and interp_weight lookup tables 65__kernel void compute_hists_lut_kernel( 66 const int cblock_stride_x, const int cblock_stride_y, 67 const int cnbins, const int cblock_hist_size, const int img_block_width, 68 const int blocks_in_group, const int blocks_total, 69 const int grad_quadstep, const int qangle_step, 70 __global const float* grad, __global const QANGLE_TYPE* qangle, 71 __global const float* gauss_w_lut, 72 __global float* block_hists, __local float* smem) 73{ 74 const int lx = get_local_id(0); 75 const int lp = lx / 24; /* local group id */ 76 const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */ 77 const int gidY = gid / img_block_width; 78 const int gidX = gid - gidY * img_block_width; 79 80 const int lidX = lx - lp * 24; 81 const int lidY = get_local_id(1); 82 83 const int cell_x = lidX / 12; 84 const int cell_y = lidY; 85 const int cell_thread_x = lidX - cell_x * 12; 86 87 __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X * 88 CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y); 89 __local float* final_hist = hists + cnbins * 90 (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12); 91 92 const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x; 93 const int offset_y = gidY * cblock_stride_y + (cell_y << 2); 94 95 __global const float* grad_ptr = (gid < blocks_total) ? 96 grad + offset_y * grad_quadstep + (offset_x << 1) : grad; 97 __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ? 98 qangle + offset_y * qangle_step + (offset_x << 1) : qangle; 99 100 __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + 101 cell_thread_x; 102 for (int bin_id = 0; bin_id < cnbins; ++bin_id) 103 hist[bin_id * 48] = 0.f; 104 105 const int dist_x = -4 + cell_thread_x - 4 * cell_x; 106 const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); 107 108 const int dist_y_begin = -4 - 4 * lidY; 109 for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) 110 { 111 float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); 112 QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]); 113 114 grad_ptr += grad_quadstep; 115 qangle_ptr += qangle_step; 116 117 int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); 118 119 int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8); 120 float gaussian = gauss_w_lut[idx]; 121 idx = (dist_y + 8) * 16 + (dist_x + 8); 122 float interp_weight = gauss_w_lut[256+idx]; 123 124 hist[bin.x * 48] += gaussian * interp_weight * vote.x; 125 hist[bin.y * 48] += gaussian * interp_weight * vote.y; 126 } 127 barrier(CLK_LOCAL_MEM_FENCE); 128 129 volatile __local float* hist_ = hist; 130 for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48) 131 { 132 if (cell_thread_x < 6) 133 hist_[0] += hist_[6]; 134 barrier(CLK_LOCAL_MEM_FENCE); 135 if (cell_thread_x < 3) 136 hist_[0] += hist_[3]; 137#ifdef CPU 138 barrier(CLK_LOCAL_MEM_FENCE); 139#endif 140 if (cell_thread_x == 0) 141 final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = 142 hist_[0] + hist_[1] + hist_[2]; 143 } 144 145 barrier(CLK_LOCAL_MEM_FENCE); 146 147 int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x; 148 if ((tid < cblock_hist_size) && (gid < blocks_total)) 149 { 150 __global float* block_hist = block_hists + 151 (gidY * img_block_width + gidX) * cblock_hist_size; 152 block_hist[tid] = final_hist[tid]; 153 } 154} 155 156//------------------------------------------------------------- 157// Normalization of histograms via L2Hys_norm 158// optimized for the case of 9 bins 159__kernel void normalize_hists_36_kernel(__global float* block_hists, 160 const float threshold, __local float *squares) 161{ 162 const int tid = get_local_id(0); 163 const int gid = get_global_id(0); 164 const int bid = tid / 36; /* block-hist id, (0 - 6) */ 165 const int boffset = bid * 36; /* block-hist offset in the work-group */ 166 const int hid = tid - boffset; /* histogram bin id, (0 - 35) */ 167 168 float elem = block_hists[gid]; 169 squares[tid] = elem * elem; 170 barrier(CLK_LOCAL_MEM_FENCE); 171 172 __local float* smem = squares + boffset; 173 float sum = smem[hid]; 174 if (hid < 18) 175 smem[hid] = sum = sum + smem[hid + 18]; 176 barrier(CLK_LOCAL_MEM_FENCE); 177 if (hid < 9) 178 smem[hid] = sum = sum + smem[hid + 9]; 179 barrier(CLK_LOCAL_MEM_FENCE); 180 if (hid < 4) 181 smem[hid] = sum + smem[hid + 4]; 182 barrier(CLK_LOCAL_MEM_FENCE); 183 sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8]; 184 185 elem = elem / (sqrt(sum) + 3.6f); 186 elem = min(elem, threshold); 187 188 barrier(CLK_LOCAL_MEM_FENCE); 189 squares[tid] = elem * elem; 190 barrier(CLK_LOCAL_MEM_FENCE); 191 192 sum = smem[hid]; 193 if (hid < 18) 194 smem[hid] = sum = sum + smem[hid + 18]; 195 barrier(CLK_LOCAL_MEM_FENCE); 196 if (hid < 9) 197 smem[hid] = sum = sum + smem[hid + 9]; 198 barrier(CLK_LOCAL_MEM_FENCE); 199 if (hid < 4) 200 smem[hid] = sum + smem[hid + 4]; 201 barrier(CLK_LOCAL_MEM_FENCE); 202 sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8]; 203 204 block_hists[gid] = elem / (sqrt(sum) + 1e-3f); 205} 206 207//------------------------------------------------------------- 208// Normalization of histograms via L2Hys_norm 209// 210inline float reduce_smem(volatile __local float* smem, int size) 211{ 212 unsigned int tid = get_local_id(0); 213 float sum = smem[tid]; 214 215 if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; 216 barrier(CLK_LOCAL_MEM_FENCE); } 217 if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; 218 barrier(CLK_LOCAL_MEM_FENCE); } 219 if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; 220 barrier(CLK_LOCAL_MEM_FENCE); } 221#ifdef CPU 222 if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; 223 barrier(CLK_LOCAL_MEM_FENCE); } 224 if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; 225 barrier(CLK_LOCAL_MEM_FENCE); } 226 if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8]; 227 barrier(CLK_LOCAL_MEM_FENCE); } 228 if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4]; 229 barrier(CLK_LOCAL_MEM_FENCE); } 230 if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2]; 231 barrier(CLK_LOCAL_MEM_FENCE); } 232 if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; 233 barrier(CLK_LOCAL_MEM_FENCE); } 234#else 235 if (tid < 32) 236 { 237 if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; 238#if WAVE_SIZE < 32 239 } barrier(CLK_LOCAL_MEM_FENCE); 240 if (tid < 16) { 241#endif 242 if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; 243 if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; 244 if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; 245 if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; 246 if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; 247 } 248#endif 249 250 return sum; 251} 252 253__kernel void normalize_hists_kernel( 254 const int nthreads, const int block_hist_size, const int img_block_width, 255 __global float* block_hists, const float threshold, __local float *squares) 256{ 257 const int tid = get_local_id(0); 258 const int gidX = get_group_id(0); 259 const int gidY = get_group_id(1); 260 261 __global float* hist = block_hists + (gidY * img_block_width + gidX) * 262 block_hist_size + tid; 263 264 float elem = 0.f; 265 if (tid < block_hist_size) 266 elem = hist[0]; 267 268 squares[tid] = elem * elem; 269 270 barrier(CLK_LOCAL_MEM_FENCE); 271 float sum = reduce_smem(squares, nthreads); 272 273 float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size); 274 elem = min(elem * scale, threshold); 275 276 barrier(CLK_LOCAL_MEM_FENCE); 277 squares[tid] = elem * elem; 278 279 barrier(CLK_LOCAL_MEM_FENCE); 280 sum = reduce_smem(squares, nthreads); 281 scale = 1.0f / (sqrt(sum) + 1e-3f); 282 283 if (tid < block_hist_size) 284 hist[0] = elem * scale; 285} 286 287//--------------------------------------------------------------------- 288// Linear SVM based classification 289// 48x96 window, 9 bins and default parameters 290// 180 threads, each thread corresponds to a bin in a row 291__kernel void classify_hists_180_kernel( 292 const int cdescr_width, const int cdescr_height, const int cblock_hist_size, 293 const int img_win_width, const int img_block_width, 294 const int win_block_stride_x, const int win_block_stride_y, 295 __global const float * block_hists, __global const float* coefs, 296 float free_coef, float threshold, __global uchar* labels) 297{ 298 const int tid = get_local_id(0); 299 const int gidX = get_group_id(0); 300 const int gidY = get_group_id(1); 301 302 __global const float* hist = block_hists + (gidY * win_block_stride_y * 303 img_block_width + gidX * win_block_stride_x) * cblock_hist_size; 304 305 float product = 0.f; 306 307 for (int i = 0; i < cdescr_height; i++) 308 { 309 product += coefs[i * cdescr_width + tid] * 310 hist[i * img_block_width * cblock_hist_size + tid]; 311 } 312 313 __local float products[180]; 314 315 products[tid] = product; 316 317 barrier(CLK_LOCAL_MEM_FENCE); 318 319 if (tid < 90) products[tid] = product = product + products[tid + 90]; 320 barrier(CLK_LOCAL_MEM_FENCE); 321 322 if (tid < 45) products[tid] = product = product + products[tid + 45]; 323 barrier(CLK_LOCAL_MEM_FENCE); 324 325 volatile __local float* smem = products; 326#ifdef CPU 327 if (tid < 13) smem[tid] = product = product + smem[tid + 32]; 328 barrier(CLK_LOCAL_MEM_FENCE); 329 if (tid < 16) smem[tid] = product = product + smem[tid + 16]; 330 barrier(CLK_LOCAL_MEM_FENCE); 331 if(tid<8) smem[tid] = product = product + smem[tid + 8]; 332 barrier(CLK_LOCAL_MEM_FENCE); 333 if(tid<4) smem[tid] = product = product + smem[tid + 4]; 334 barrier(CLK_LOCAL_MEM_FENCE); 335 if(tid<2) smem[tid] = product = product + smem[tid + 2]; 336 barrier(CLK_LOCAL_MEM_FENCE); 337#else 338 if (tid < 13) 339 { 340 smem[tid] = product = product + smem[tid + 32]; 341 } 342#if WAVE_SIZE < 32 343 barrier(CLK_LOCAL_MEM_FENCE); 344#endif 345 if (tid < 16) 346 { 347 smem[tid] = product = product + smem[tid + 16]; 348 smem[tid] = product = product + smem[tid + 8]; 349 smem[tid] = product = product + smem[tid + 4]; 350 smem[tid] = product = product + smem[tid + 2]; 351 } 352#endif 353 354 if (tid == 0){ 355 product = product + smem[tid + 1]; 356 labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); 357 } 358} 359 360//--------------------------------------------------------------------- 361// Linear SVM based classification 362// 64x128 window, 9 bins and default parameters 363// 256 threads, 252 of them are used 364__kernel void classify_hists_252_kernel( 365 const int cdescr_width, const int cdescr_height, const int cblock_hist_size, 366 const int img_win_width, const int img_block_width, 367 const int win_block_stride_x, const int win_block_stride_y, 368 __global const float * block_hists, __global const float* coefs, 369 float free_coef, float threshold, __global uchar* labels) 370{ 371 const int tid = get_local_id(0); 372 const int gidX = get_group_id(0); 373 const int gidY = get_group_id(1); 374 375 __global const float* hist = block_hists + (gidY * win_block_stride_y * 376 img_block_width + gidX * win_block_stride_x) * cblock_hist_size; 377 378 float product = 0.f; 379 if (tid < cdescr_width) 380 { 381 for (int i = 0; i < cdescr_height; i++) 382 product += coefs[i * cdescr_width + tid] * 383 hist[i * img_block_width * cblock_hist_size + tid]; 384 } 385 386 __local float products[NTHREADS]; 387 388 products[tid] = product; 389 390 barrier(CLK_LOCAL_MEM_FENCE); 391 392 if (tid < 128) products[tid] = product = product + products[tid + 128]; 393 barrier(CLK_LOCAL_MEM_FENCE); 394 395 if (tid < 64) products[tid] = product = product + products[tid + 64]; 396 barrier(CLK_LOCAL_MEM_FENCE); 397 398 volatile __local float* smem = products; 399#ifdef CPU 400 if(tid<32) smem[tid] = product = product + smem[tid + 32]; 401 barrier(CLK_LOCAL_MEM_FENCE); 402 if(tid<16) smem[tid] = product = product + smem[tid + 16]; 403 barrier(CLK_LOCAL_MEM_FENCE); 404 if(tid<8) smem[tid] = product = product + smem[tid + 8]; 405 barrier(CLK_LOCAL_MEM_FENCE); 406 if(tid<4) smem[tid] = product = product + smem[tid + 4]; 407 barrier(CLK_LOCAL_MEM_FENCE); 408 if(tid<2) smem[tid] = product = product + smem[tid + 2]; 409 barrier(CLK_LOCAL_MEM_FENCE); 410#else 411 if (tid < 32) 412 { 413 smem[tid] = product = product + smem[tid + 32]; 414#if WAVE_SIZE < 32 415 } barrier(CLK_LOCAL_MEM_FENCE); 416 if (tid < 16) { 417#endif 418 smem[tid] = product = product + smem[tid + 16]; 419 smem[tid] = product = product + smem[tid + 8]; 420 smem[tid] = product = product + smem[tid + 4]; 421 smem[tid] = product = product + smem[tid + 2]; 422 } 423#endif 424 if (tid == 0){ 425 product = product + smem[tid + 1]; 426 labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); 427 } 428} 429 430//--------------------------------------------------------------------- 431// Linear SVM based classification 432// 256 threads 433__kernel void classify_hists_kernel( 434 const int cdescr_size, const int cdescr_width, const int cblock_hist_size, 435 const int img_win_width, const int img_block_width, 436 const int win_block_stride_x, const int win_block_stride_y, 437 __global const float * block_hists, __global const float* coefs, 438 float free_coef, float threshold, __global uchar* labels) 439{ 440 const int tid = get_local_id(0); 441 const int gidX = get_group_id(0); 442 const int gidY = get_group_id(1); 443 444 __global const float* hist = block_hists + (gidY * win_block_stride_y * 445 img_block_width + gidX * win_block_stride_x) * cblock_hist_size; 446 447 float product = 0.f; 448 for (int i = tid; i < cdescr_size; i += NTHREADS) 449 { 450 int offset_y = i / cdescr_width; 451 int offset_x = i - offset_y * cdescr_width; 452 product += coefs[i] * 453 hist[offset_y * img_block_width * cblock_hist_size + offset_x]; 454 } 455 456 __local float products[NTHREADS]; 457 458 products[tid] = product; 459 460 barrier(CLK_LOCAL_MEM_FENCE); 461 462 if (tid < 128) products[tid] = product = product + products[tid + 128]; 463 barrier(CLK_LOCAL_MEM_FENCE); 464 465 if (tid < 64) products[tid] = product = product + products[tid + 64]; 466 barrier(CLK_LOCAL_MEM_FENCE); 467 468 volatile __local float* smem = products; 469#ifdef CPU 470 if(tid<32) smem[tid] = product = product + smem[tid + 32]; 471 barrier(CLK_LOCAL_MEM_FENCE); 472 if(tid<16) smem[tid] = product = product + smem[tid + 16]; 473 barrier(CLK_LOCAL_MEM_FENCE); 474 if(tid<8) smem[tid] = product = product + smem[tid + 8]; 475 barrier(CLK_LOCAL_MEM_FENCE); 476 if(tid<4) smem[tid] = product = product + smem[tid + 4]; 477 barrier(CLK_LOCAL_MEM_FENCE); 478 if(tid<2) smem[tid] = product = product + smem[tid + 2]; 479 barrier(CLK_LOCAL_MEM_FENCE); 480#else 481 if (tid < 32) 482 { 483 smem[tid] = product = product + smem[tid + 32]; 484#if WAVE_SIZE < 32 485 } barrier(CLK_LOCAL_MEM_FENCE); 486 if (tid < 16) { 487#endif 488 smem[tid] = product = product + smem[tid + 16]; 489 smem[tid] = product = product + smem[tid + 8]; 490 smem[tid] = product = product + smem[tid + 4]; 491 smem[tid] = product = product + smem[tid + 2]; 492 } 493#endif 494 if (tid == 0){ 495 smem[tid] = product = product + smem[tid + 1]; 496 labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); 497 } 498} 499 500//---------------------------------------------------------------------------- 501// Extract descriptors 502 503__kernel void extract_descrs_by_rows_kernel( 504 const int cblock_hist_size, const int descriptors_quadstep, 505 const int cdescr_size, const int cdescr_width, const int img_block_width, 506 const int win_block_stride_x, const int win_block_stride_y, 507 __global const float* block_hists, __global float* descriptors) 508{ 509 int tid = get_local_id(0); 510 int gidX = get_group_id(0); 511 int gidY = get_group_id(1); 512 513 // Get left top corner of the window in src 514 __global const float* hist = block_hists + (gidY * win_block_stride_y * 515 img_block_width + gidX * win_block_stride_x) * cblock_hist_size; 516 517 // Get left top corner of the window in dst 518 __global float* descriptor = descriptors + 519 (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; 520 521 // Copy elements from src to dst 522 for (int i = tid; i < cdescr_size; i += NTHREADS) 523 { 524 int offset_y = i / cdescr_width; 525 int offset_x = i - offset_y * cdescr_width; 526 descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x]; 527 } 528} 529 530__kernel void extract_descrs_by_cols_kernel( 531 const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, 532 const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, 533 const int win_block_stride_x, const int win_block_stride_y, 534 __global const float* block_hists, __global float* descriptors) 535{ 536 int tid = get_local_id(0); 537 int gidX = get_group_id(0); 538 int gidY = get_group_id(1); 539 540 // Get left top corner of the window in src 541 __global const float* hist = block_hists + (gidY * win_block_stride_y * 542 img_block_width + gidX * win_block_stride_x) * cblock_hist_size; 543 544 // Get left top corner of the window in dst 545 __global float* descriptor = descriptors + 546 (gidY * get_num_groups(0) + gidX) * descriptors_quadstep; 547 548 // Copy elements from src to dst 549 for (int i = tid; i < cdescr_size; i += NTHREADS) 550 { 551 int block_idx = i / cblock_hist_size; 552 int idx_in_block = i - block_idx * cblock_hist_size; 553 554 int y = block_idx / cnblocks_win_x; 555 int x = block_idx - y * cnblocks_win_x; 556 557 descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = 558 hist[(y * img_block_width + x) * cblock_hist_size + idx_in_block]; 559 } 560} 561 562//---------------------------------------------------------------------------- 563// Gradients computation 564 565__kernel void compute_gradients_8UC4_kernel( 566 const int height, const int width, 567 const int img_step, const int grad_quadstep, const int qangle_step, 568 const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle, 569 const float angle_scale, const char correct_gamma, const int cnbins) 570{ 571 const int x = get_global_id(0); 572 const int tid = get_local_id(0); 573 const int gSizeX = get_local_size(0); 574 const int gidY = get_group_id(1); 575 576 __global const uchar4* row = img + gidY * img_step; 577 578 __local float sh_row[(NTHREADS + 2) * 3]; 579 580 uchar4 val; 581 if (x < width) 582 val = row[x]; 583 else 584 val = row[width - 2]; 585 586 sh_row[tid + 1] = val.x; 587 sh_row[tid + 1 + (NTHREADS + 2)] = val.y; 588 sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z; 589 590 if (tid == 0) 591 { 592 val = row[max(x - 1, 1)]; 593 sh_row[0] = val.x; 594 sh_row[(NTHREADS + 2)] = val.y; 595 sh_row[2 * (NTHREADS + 2)] = val.z; 596 } 597 598 if (tid == gSizeX - 1) 599 { 600 val = row[min(x + 1, width - 2)]; 601 sh_row[gSizeX + 1] = val.x; 602 sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y; 603 sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z; 604 } 605 606 barrier(CLK_LOCAL_MEM_FENCE); 607 if (x < width) 608 { 609 float4 a = (float4) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], 610 sh_row[tid + 2 * (NTHREADS + 2)], 0); 611 float4 b = (float4) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], 612 sh_row[tid + 2 + 2 * (NTHREADS + 2)], 0); 613 614 float4 dx; 615 if (correct_gamma == 1) 616 dx = sqrt(b) - sqrt(a); 617 else 618 dx = b - a; 619 620 float4 dy = (float4) 0.f; 621 622 if (gidY > 0 && gidY < height - 1) 623 { 624 a = convert_float4(img[(gidY - 1) * img_step + x].xyzw); 625 b = convert_float4(img[(gidY + 1) * img_step + x].xyzw); 626 627 if (correct_gamma == 1) 628 dy = sqrt(b) - sqrt(a); 629 else 630 dy = b - a; 631 } 632 633 float4 mag = hypot(dx, dy); 634 float best_dx = dx.x; 635 float best_dy = dy.x; 636 637 float mag0 = mag.x; 638 if (mag0 < mag.y) 639 { 640 best_dx = dx.y; 641 best_dy = dy.y; 642 mag0 = mag.y; 643 } 644 645 if (mag0 < mag.z) 646 { 647 best_dx = dx.z; 648 best_dy = dy.z; 649 mag0 = mag.z; 650 } 651 652 float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f; 653 int hidx = (int)floor(ang); 654 ang -= hidx; 655 hidx = (hidx + cnbins) % cnbins; 656 657 qangle[(gidY * qangle_step + x) << 1] = hidx; 658 qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins; 659 grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang); 660 grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang; 661 } 662} 663 664__kernel void compute_gradients_8UC1_kernel( 665 const int height, const int width, 666 const int img_step, const int grad_quadstep, const int qangle_step, 667 __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle, 668 const float angle_scale, const char correct_gamma, const int cnbins) 669{ 670 const int x = get_global_id(0); 671 const int tid = get_local_id(0); 672 const int gSizeX = get_local_size(0); 673 const int gidY = get_group_id(1); 674 675 __global const uchar* row = img + gidY * img_step; 676 677 __local float sh_row[NTHREADS + 2]; 678 679 if (x < width) 680 sh_row[tid + 1] = row[x]; 681 else 682 sh_row[tid + 1] = row[width - 2]; 683 684 if (tid == 0) 685 sh_row[0] = row[max(x - 1, 1)]; 686 687 if (tid == gSizeX - 1) 688 sh_row[gSizeX + 1] = row[min(x + 1, width - 2)]; 689 690 barrier(CLK_LOCAL_MEM_FENCE); 691 if (x < width) 692 { 693 float dx; 694 695 if (correct_gamma == 1) 696 dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]); 697 else 698 dx = sh_row[tid + 2] - sh_row[tid]; 699 700 float dy = 0.f; 701 if (gidY > 0 && gidY < height - 1) 702 { 703 float a = (float) img[ (gidY + 1) * img_step + x ]; 704 float b = (float) img[ (gidY - 1) * img_step + x ]; 705 if (correct_gamma == 1) 706 dy = sqrt(a) - sqrt(b); 707 else 708 dy = a - b; 709 } 710 float mag = hypot(dx, dy); 711 712 float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f; 713 int hidx = (int)floor(ang); 714 ang -= hidx; 715 hidx = (hidx + cnbins) % cnbins; 716 717 qangle[ (gidY * qangle_step + x) << 1 ] = hidx; 718 qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins; 719 grad[ (gidY * grad_quadstep + x) << 1 ] = mag * (1.f - ang); 720 grad[ ((gidY * grad_quadstep + x) << 1) + 1 ] = mag * ang; 721 } 722} 723