1/* 2 * kernel_gauss_lap_pyramid.cl 3 * input0 4 * input1 5 * output 6 * window, pos_x, pos_y, width, height 7 */ 8 9#ifndef PYRAMID_UV 10#define PYRAMID_UV 0 11#endif 12 13#ifndef CL_PYRAMID_ENABLE_DUMP 14#define CL_PYRAMID_ENABLE_DUMP 0 15#endif 16 17#ifndef ENABLE_MASK_GAUSS_SCALE 18#define ENABLE_MASK_GAUSS_SCALE 0 19#endif 20 21#define fixed_pixels 8 22#define GAUSS_V_R 2 23#define GAUSS_H_R 1 24#define COEFF_MID 4 25 26#define zero8 (float8)(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f) 27 28__constant const float coeffs[9] = {0.0f, 0.0f, 0.152f, 0.222f, 0.252f, 0.222f, 0.152f, 0.0f, 0.0f}; 29 30#define ARG_FORMAT4 "(%.1f,%.1f,%.1f,%.1f)" 31#define ARGS4(a) a.s0, a.s1, a.s2, a.s3 32 33#define ARG_FORMAT8 "(%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f)" 34#define ARGS8(a) a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7 35 36/* 37 * input: RGBA-CL_UNSIGNED_INT16 38 * output_gauss: RGBA-CL_UNSIGNED_INT8 39 * output_lap:RGBA-CL_UNSIGNED_INT16 40 * each work-item calc 2 lines 41 */ 42__kernel void 43kernel_gauss_scale_transform ( 44 __read_only image2d_t input, int in_offset_x, 45 __write_only image2d_t output_gauss 46#if CL_PYRAMID_ENABLE_DUMP 47 , __write_only image2d_t dump_orig 48#endif 49) 50{ 51 int g_x = get_global_id (0); 52 int in_x = g_x + in_offset_x; 53 int g_y = get_global_id (1) * 4; 54 const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 55 56 int g_out_x = get_global_id (0); 57 int g_out_y = get_global_id (1) * 2; 58 59#if CL_PYRAMID_ENABLE_DUMP 60 write_imageui (dump_orig, (int2)(g_x, g_y + 0), read_imageui(input, sampler, (int2)(in_x, g_y))); 61 write_imageui (dump_orig, (int2)(g_x, g_y + 1), read_imageui(input, sampler, (int2)(in_x, g_y + 1))); 62 write_imageui (dump_orig, (int2)(g_x, g_y + 2), read_imageui(input, sampler, (int2)(in_x, g_y + 2))); 63 write_imageui (dump_orig, (int2)(g_x, g_y + 3), read_imageui(input, sampler, (int2)(in_x, g_y + 3))); 64#endif 65 66 float8 result_pre[2] = {zero8, zero8}; 67 float8 result_next[2] = {zero8, zero8}; 68 float8 result_cur[2] = {zero8, zero8}; 69 float4 final_g[2]; 70 71 float8 tmp_data; 72 int i_ver; 73 74#pragma unroll 75 for (i_ver = -GAUSS_V_R; i_ver <= GAUSS_V_R + 2; i_ver++) { 76 int cur_g_y = g_y + i_ver; 77 float coeff0 = coeffs[i_ver + COEFF_MID]; 78 float coeff1 = coeffs[i_ver + COEFF_MID - 2]; 79 tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x - 1, cur_g_y))))); 80 result_pre[0] += tmp_data * coeff0; 81 result_pre[1] += tmp_data * coeff1; 82 tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x, cur_g_y))))); 83 result_cur[0] += tmp_data * coeff0; 84 result_cur[1] += tmp_data * coeff1; 85 tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x + 1, cur_g_y))))); 86 result_next[1] += tmp_data * coeff1; 87 result_next[0] += tmp_data * coeff0; 88 } 89 90 int i_line; 91#pragma unroll 92 for (i_line = 0; i_line < 2; ++i_line) { 93#if !PYRAMID_UV 94 final_g[i_line] = result_cur[i_line].even * coeffs[COEFF_MID] + 95 (float4)(result_pre[i_line].s7, result_cur[i_line].s135) * coeffs[COEFF_MID + 1] + 96 (float4)(result_pre[i_line].s6, result_cur[i_line].s024) * coeffs[COEFF_MID + 2] + 97 (float4)(result_cur[i_line].s1357) * coeffs[COEFF_MID + 1] + 98 (float4)(result_cur[i_line].s246, result_next[i_line].s0) * coeffs[COEFF_MID + 2]; 99#else 100 final_g[i_line] = result_cur[i_line].s0145 * coeffs[COEFF_MID] + 101 (float4)(result_pre[i_line].s67, result_cur[i_line].s23) * coeffs[COEFF_MID + 1] + 102 (float4)(result_pre[i_line].s45, result_cur[i_line].s01) * coeffs[COEFF_MID + 2] + 103 (float4)(result_cur[i_line].s2367) * coeffs[COEFF_MID + 1] + 104 (float4)(result_cur[i_line].s45, result_next[i_line].s01) * coeffs[COEFF_MID + 2]; 105#endif 106 final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f); 107 write_imageui (output_gauss, (int2)(g_out_x, g_out_y + i_line), convert_uint4(final_g[i_line])); 108 } 109 110} 111 112inline float8 113read_scale_y (__read_only image2d_t input, const sampler_t sampler, float2 pos_start, float step_x) 114{ 115 float8 data; 116 data.s0 = read_imagef (input, sampler, pos_start).x; 117 pos_start.x += step_x; 118 data.s1 = read_imagef (input, sampler, pos_start).x; 119 pos_start.x += step_x; 120 data.s2 = read_imagef (input, sampler, pos_start).x; 121 pos_start.x += step_x; 122 data.s3 = read_imagef (input, sampler, pos_start).x; 123 pos_start.x += step_x; 124 data.s4 = read_imagef (input, sampler, pos_start).x; 125 pos_start.x += step_x; 126 data.s5 = read_imagef (input, sampler, pos_start).x; 127 pos_start.x += step_x; 128 data.s6 = read_imagef (input, sampler, pos_start).x; 129 pos_start.x += step_x; 130 data.s7 = read_imagef (input, sampler, pos_start).x; 131 return data; 132} 133 134inline float8 135read_scale_uv (__read_only image2d_t input, const sampler_t sampler, float2 pos_start, float step_x) 136{ 137 float8 data; 138 data.s01 = read_imagef (input, sampler, pos_start).xy; 139 pos_start.x += step_x; 140 data.s23 = read_imagef (input, sampler, pos_start).xy; 141 pos_start.x += step_x; 142 data.s45 = read_imagef (input, sampler, pos_start).xy; 143 pos_start.x += step_x; 144 data.s67 = read_imagef (input, sampler, pos_start).xy; 145 return data; 146} 147 148/* 149 * input_gauss: RGBA-CL_UNSIGNED_INT18 150 * input_lap: RGBA-CL_UNSIGNED_INT16 151 * output: RGBA-CL_UNSIGNED_INT16 152 * each work-item calc 2 lines 153 */ 154__kernel void 155kernel_gauss_lap_reconstruct ( 156 __read_only image2d_t input_gauss, 157 float in_sampler_offset_x, float in_sampler_offset_y, 158 __read_only image2d_t input_lap, 159 __write_only image2d_t output, int out_offset_x, float out_width, float out_height 160#if CL_PYRAMID_ENABLE_DUMP 161 , __write_only image2d_t dump_resize, __write_only image2d_t dump_final 162#endif 163) 164{ 165 int g_x = get_global_id (0); 166 int g_y = get_global_id (1); 167 const sampler_t lap_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 168 const sampler_t gauss_sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; 169 170 //if (g_x > out_width + 0.9f || g_y > out_height + 0.5f) 171 // return; 172 173 float8 lap = convert_float8(as_uchar8(convert_ushort4(read_imageui(input_lap, lap_sampler, (int2)(g_x, g_y))))); 174 lap = (lap - 128.0f) * 2.0f; 175 176 float8 data_g; 177 float2 input_gauss_pos; 178 float step_x; 179 input_gauss_pos.x = g_x / out_width + in_sampler_offset_x; 180 input_gauss_pos.y = g_y / out_height + in_sampler_offset_y; 181#if !PYRAMID_UV 182 step_x = 0.125f / out_width; 183 data_g = read_scale_y (input_gauss, gauss_sampler, input_gauss_pos, step_x) * 256.0f; 184#else 185 step_x = 0.25f / out_width; 186 data_g = read_scale_uv (input_gauss, gauss_sampler, input_gauss_pos, step_x) * 256.0f; 187#endif 188 189#if CL_PYRAMID_ENABLE_DUMP 190 write_imageui (dump_resize, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g)))); 191#endif 192 193 data_g += lap + 0.5f; 194 data_g = clamp (data_g, 0.0f, 255.0f); 195 write_imageui (output, (int2)(g_x + out_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g)))); 196#if CL_PYRAMID_ENABLE_DUMP 197 write_imageui (dump_final, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g)))); 198#endif 199} 200 201__kernel void 202kernel_pyramid_blend ( 203 __read_only image2d_t input0, __read_only image2d_t input1, 204#if !PYRAMID_UV 205 __global const float8 *input0_mask, 206#else 207 __global const float4 *input0_mask, 208#endif 209 __write_only image2d_t output) 210{ 211 sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 212 const int g_x = get_global_id (0); 213 const int g_y = get_global_id (1); 214 int2 pos = (int2) (g_x, g_y); 215 216 float8 data0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, pos)))); 217 float8 data1 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, pos)))); 218 float8 out_data; 219 220#if !PYRAMID_UV 221 out_data = (data0 - data1) * input0_mask[g_x] + data1; 222#else 223 float8 coeff; 224 coeff.even = input0_mask[g_x]; 225 coeff.odd = coeff.even; 226 out_data = (data0 - data1) * coeff + data1; 227#endif 228 229 out_data = clamp (out_data + 0.5f, 0.0f, 255.0f); 230 231 write_imageui(output, pos, convert_uint4(as_ushort4(convert_uchar8(out_data)))); 232} 233 234__kernel void 235kernel_pyramid_scale ( 236 __read_only image2d_t input, __write_only image2d_t output, 237 int out_offset_x, int output_width, int output_height) 238{ 239 const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; 240 int g_x = get_global_id (0); 241 int g_y = get_global_id (1); 242 243 float2 normCoor = (float2)(g_x, g_y) / (float2)(output_width, output_height); 244 float8 out_data; 245 float step_x; 246 247#if !PYRAMID_UV 248 step_x = 0.125f / output_width; 249 out_data = read_scale_y (input, sampler, normCoor, step_x) * 255.0f; 250#else 251 step_x = 0.25f / output_width; 252 out_data = read_scale_uv (input, sampler, normCoor, step_x) * 255.0f; 253#endif 254 255 out_data = clamp (out_data + 0.5f, 0.0f, 255.0f); 256 write_imageui (output, (int2)(g_x + out_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(out_data)))); 257} 258 259__kernel void 260kernel_pyramid_copy ( 261 __read_only image2d_t input, int in_offset_x, 262 __write_only image2d_t output, int out_offset_x, 263 int max_g_x, int max_g_y) 264{ 265 sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 266 const int g_x = get_global_id (0); 267 const int g_y = get_global_id (1); 268 269 if (g_x >= max_g_x || g_y >= max_g_y) 270 return; 271 272 uint4 data = read_imageui (input, sampler, (int2)(g_x + in_offset_x, g_y)); 273 write_imageui (output, (int2)(g_x + out_offset_x, g_y), data); 274} 275 276/* 277 * input_gauss: RGBA-CL_UNSIGNED_INT18 278 * input_lap: RGBA-CL_UNSIGNED_INT16 279 * output: RGBA-CL_UNSIGNED_INT16 280 * each work-item calc 2 lines 281 */ 282__kernel void 283kernel_lap_transform ( 284 __read_only image2d_t input_gauss0, int gauss0_offset_x, 285 __read_only image2d_t input_gauss1, 286 float gauss1_sampler_offset_x, float gauss1_sampler_offset_y, 287 __write_only image2d_t output, int lap_offset_x, float out_width, float out_height) 288{ 289 int g_x = get_global_id (0); 290 int g_y = get_global_id (1); 291 const sampler_t gauss0_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 292 const sampler_t gauss1_sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; 293 294 float8 orig = convert_float8(as_uchar8(convert_ushort4( 295 read_imageui(input_gauss0, gauss0_sampler, (int2)(g_x + gauss0_offset_x, g_y))))); 296 float8 zoom_in; 297 float2 gauss1_pos; 298 float sampler_step; 299 gauss1_pos.y = (g_y / out_height) + gauss1_sampler_offset_y; 300 gauss1_pos.x = (g_x / out_width) + gauss1_sampler_offset_x; 301 302#if !PYRAMID_UV 303 sampler_step = 0.125f / out_width; 304 zoom_in.s0 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x; 305 gauss1_pos.x += sampler_step; 306 zoom_in.s1 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x; 307 gauss1_pos.x += sampler_step; 308 zoom_in.s2 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x; 309 gauss1_pos.x += sampler_step; 310 zoom_in.s3 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x; 311 gauss1_pos.x += sampler_step; 312 zoom_in.s4 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x; 313 gauss1_pos.x += sampler_step; 314 zoom_in.s5 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x; 315 gauss1_pos.x += sampler_step; 316 zoom_in.s6 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x; 317 gauss1_pos.x += sampler_step; 318 zoom_in.s7 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x; 319#else 320 sampler_step = 0.25f / out_width; 321 zoom_in.s01 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy; 322 gauss1_pos.x += sampler_step; 323 zoom_in.s23 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy; 324 gauss1_pos.x += sampler_step; 325 zoom_in.s45 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy; 326 gauss1_pos.x += sampler_step; 327 zoom_in.s67 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy; 328#endif 329 float8 lap = (orig - zoom_in * 256.0f) * 0.5f + 128.0f + 0.5f; 330 lap = clamp (lap, 0.0f, 255.0f); 331 write_imageui (output, (int2)(g_x + lap_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(lap)))); 332} 333 334 335/* 336 * input0: RGBA-CL_UNSIGNED_INT16 337 * input1: RGBA-CL_UNSIGNED_INT16 338 * out_diff: RGBA-CL_UNSIGNED_INT16 339 */ 340__kernel void 341kernel_image_diff ( 342 __read_only image2d_t input0, int offset0, 343 __read_only image2d_t input1, int offset1, 344 __write_only image2d_t out_diff) 345{ 346 int g_x = get_global_id (0); 347 int g_y = get_global_id (1); 348 const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 349 350 int8 data0 = convert_int8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, (int2)(g_x + offset0, g_y))))); 351 int8 data1 = convert_int8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, (int2)(g_x + offset1, g_y))))); 352 uint8 diff = abs_diff (data0, data1); 353 write_imageui (out_diff, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(diff)))); 354} 355 356 357/* 358 * input0: RGBA-CL_UNSIGNED_INT16 359 */ 360#define LEFT_POS (int)(-1) 361#define MID_POS (int)(0) 362#define RIGHT_POS (int)(1) 363 364__inline int pos_buf_index (int x, int y, int stride) 365{ 366 return mad24 (stride, y, x); 367} 368 369__kernel void 370kernel_seam_dp ( 371 __read_only image2d_t image, 372 __global short *pos_buf, __global float *sum_buf, int offset_x, int valid_width, 373 int max_pos, int seam_height, int seam_stride) 374{ 375 int l_x = get_local_id (0); 376 int group_id = get_group_id (0); 377 if (l_x >= valid_width) 378 return; 379 380 // group0 fill first half slice image curve y = [0, seam_height/2 - 1] 381 // group1 fill send half slice image curve = [seam_height - 1, seam_height/2] 382 int first_slice_h = seam_height / 2; 383 int group_h = (group_id == 0 ? first_slice_h : seam_height - first_slice_h); 384 385 __local float slm_sum[4096]; 386 float mid, left, right, cur; 387 int slm_idx; 388 int default_pos; 389 390 int x = l_x + offset_x; 391 const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 392 int y = (group_id == 0 ? 0 : seam_height - 1); 393 float sum = convert_float(read_imageui(image, sampler, (int2)(x, y)).x); 394 395 default_pos = x; 396 slm_sum[l_x] = sum; 397 barrier (CLK_LOCAL_MEM_FENCE); 398 pos_buf[pos_buf_index(x, y, seam_stride)] = convert_short(default_pos); 399 400 for (int i = 0; i < group_h; ++i) { 401 y = (group_id == 0 ? i : seam_height - i - 1); 402 slm_idx = l_x - 1; 403 slm_idx = (slm_idx > 0 ? slm_idx : 0); 404 left = slm_sum[slm_idx]; 405 slm_idx = l_x + 1; 406 slm_idx = (slm_idx < valid_width ? slm_idx : valid_width - 1); 407 right = slm_sum[slm_idx]; 408 409 cur = convert_float(read_imageui(image, sampler, (int2)(x, y)).x); 410 411 left = left + cur; 412 right = right + cur; 413 mid = sum + cur; 414 415 int pos; 416 pos = (left < mid) ? LEFT_POS : MID_POS; 417 sum = min (left, mid); 418 pos = (sum < right) ? pos : RIGHT_POS; 419 sum = min (sum, right); 420 slm_sum[l_x] = sum; 421 barrier (CLK_LOCAL_MEM_FENCE); 422 423 pos += default_pos; 424 pos = clamp (pos, offset_x, max_pos); 425 //if (l_x == 3) 426 // printf ("s:%f, pos:%d, mid:%f, offset_x:%d\n", sum.s0, pos.s0, mid.s0, offset_x); 427 pos_buf[pos_buf_index(x, y, seam_stride)] = convert_short(pos); 428 } 429 sum_buf[group_id * seam_stride + x] = sum; 430 //printf ("sum(x):%f(x:%d)\n", sum_buf[x].s0, x); 431} 432 433__kernel void 434kernel_seam_mask_blend ( 435 __read_only image2d_t input0, __read_only image2d_t input1, 436 __read_only image2d_t seam_mask, 437 __write_only image2d_t output) 438{ 439 sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 440 const int g_x = get_global_id (0); 441 const int g_y = get_global_id (1); 442 int2 pos = (int2) (g_x, g_y); 443 444 float8 data0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, pos)))); 445 float8 data1 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, pos)))); 446 float8 coeff0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(seam_mask, sampler, pos)))) / 255.0f; 447 float8 out_data; 448 449#if !PYRAMID_UV 450 out_data = (data0 - data1) * coeff0 + data1; 451#else 452 coeff0.even = (coeff0.even + coeff0.odd) * 0.5f; 453 coeff0.odd = coeff0.even; 454 out_data = (data0 - data1) * coeff0 + data1; 455#endif 456 457 out_data = clamp (out_data + 0.5f, 0.0f, 255.0f); 458 459 write_imageui(output, pos, convert_uint4(as_ushort4(convert_uchar8(out_data)))); 460} 461 462 463 464#define MASK_GAUSS_R 4 465#define MASK_COEFF_MID 7 466 467__constant const float mask_coeffs[] = {0.0f, 0.0f, 0.0f, 0.082f, 0.102f, 0.119f, 0.130f, 0.134f, 0.130f, 0.119f, 0.102f, 0.082f, 0.0f, 0.0f, 0.0f}; 468 469/* 470 * input: RGBA-CL_UNSIGNED_INT16 471 * output_gauss: RGBA-CL_UNSIGNED_INT8 ? 472 * output_lap:RGBA-CL_UNSIGNED_INT16 473 * each work-item calc 2 lines 474 */ 475__kernel void 476kernel_mask_gauss_scale_slm ( 477 __read_only image2d_t input, 478 __write_only image2d_t output_gauss, 479 int image_width 480#if ENABLE_MASK_GAUSS_SCALE 481 , __write_only image2d_t output_scale 482#endif 483) 484{ 485#define WI_LINES 2 486// input image width MUST < MASK_GAUSS_SLM_WIDTH*4 487#define MASK_GAUSS_SLM_WIDTH 256 488#define CONV_COEFF 128.0f 489 490 int g_x = get_global_id (0); 491 int g_y = get_global_id (1) * WI_LINES; 492 const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 493 __local ushort4 slm_gauss_y[WI_LINES][MASK_GAUSS_SLM_WIDTH]; 494 495 float8 result_cur[WI_LINES] = {zero8, zero8}; 496 float8 tmp_data; 497 int i_line; 498 int cur_g_y; 499 500#pragma unroll 501 for (i_line = -MASK_GAUSS_R; i_line <= MASK_GAUSS_R + 1; i_line++) { 502 cur_g_y = g_y + i_line; 503 tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(g_x, cur_g_y))))); 504 result_cur[0] += tmp_data * mask_coeffs[i_line + MASK_COEFF_MID]; 505 result_cur[1] += tmp_data * mask_coeffs[i_line + MASK_COEFF_MID - 1]; 506 } 507 ((__local ushort8*)(slm_gauss_y[0]))[g_x] = convert_ushort8(result_cur[0] * CONV_COEFF); 508 ((__local ushort8*)(slm_gauss_y[1]))[g_x] = convert_ushort8(result_cur[1] * CONV_COEFF); 509 barrier (CLK_LOCAL_MEM_FENCE); 510 511 float8 final_g[WI_LINES]; 512 float4 result_pre; 513 float4 result_next; 514 515#pragma unroll 516 for (i_line = 0; i_line < WI_LINES; ++i_line) { 517 result_pre = convert_float4(slm_gauss_y[i_line][clamp (g_x * 2 - 1, 0, image_width * 2)]) / CONV_COEFF; 518 result_next = convert_float4(slm_gauss_y[i_line][clamp (g_x * 2 + 2, 0, image_width * 2)]) / CONV_COEFF; 519 final_g[i_line] = result_cur[i_line] * mask_coeffs[MASK_COEFF_MID] + 520 (float8)(result_pre.s3, result_cur[i_line].s0123, result_cur[i_line].s456) * 521 mask_coeffs[MASK_COEFF_MID + 1] + 522 (float8)(result_cur[i_line].s1234, result_cur[i_line].s567, result_next.s0) * 523 mask_coeffs[MASK_COEFF_MID + 1] + 524 (float8)(result_pre.s23, result_cur[i_line].s0123, result_cur[i_line].s45) * 525 mask_coeffs[MASK_COEFF_MID + 2] + 526 (float8)(result_cur[i_line].s2345, result_cur[i_line].s67, result_next.s01) * 527 mask_coeffs[MASK_COEFF_MID + 2] + 528 (float8)(result_pre.s123, result_cur[i_line].s0123, result_cur[i_line].s4) * 529 mask_coeffs[MASK_COEFF_MID + 3] + 530 (float8)(result_cur[i_line].s3456, result_cur[i_line].s7, result_next.s012) * 531 mask_coeffs[MASK_COEFF_MID + 3] + 532 (float8)(result_pre.s0123, result_cur[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4] + 533 (float8)(result_cur[i_line].s4567, result_next.s0123) * mask_coeffs[MASK_COEFF_MID + 4]; 534 final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f); 535 //if ((g_x == 9 || g_x == 8) && g_y == 0) { 536 // printf ("(x:%d, y:0), pre:" ARG_FORMAT4 "cur" ARG_FORMAT8 "next" ARG_FORMAT4 "final:" ARG_FORMAT8 "\n", 537 // g_x, ARGS4(result_pre), ARGS8(result_cur[i_line]), ARGS4(result_next), ARGS8(final_g[i_line])); 538 //} 539 write_imageui (output_gauss, (int2)(g_x, g_y + i_line), convert_uint4(as_ushort4(convert_uchar8(final_g[i_line])))); 540 } 541 542#if ENABLE_MASK_GAUSS_SCALE 543 write_imageui (output_scale, (int2)(g_x, get_global_id (1)), convert_uint4(final_g[0].even)); 544#endif 545} 546 547__kernel void 548kernel_mask_gauss_scale ( 549 __read_only image2d_t input, 550 __write_only image2d_t output_gauss 551#if ENABLE_MASK_GAUSS_SCALE 552 , __write_only image2d_t output_scale 553#endif 554) 555{ 556 int g_x = get_global_id (0); 557 int in_x = g_x; 558 int g_y = get_global_id (1) * 2; 559 const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 560 561 float8 result_pre[2] = {zero8, zero8}; 562 float8 result_next[2] = {zero8, zero8}; 563 float8 result_cur[2] = {zero8, zero8}; 564 float8 final_g[2]; 565 566 float8 tmp_data; 567 int i_line; 568 int cur_g_y; 569 float coeff0, coeff1; 570 571#pragma unroll 572 for (i_line = -MASK_GAUSS_R; i_line <= MASK_GAUSS_R + 1; i_line++) { 573 cur_g_y = g_y + i_line; 574 coeff0 = mask_coeffs[i_line + MASK_COEFF_MID]; 575 coeff1 = mask_coeffs[i_line + MASK_COEFF_MID - 1]; 576 tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x - 1, cur_g_y))))); 577 result_pre[0] += tmp_data * coeff0; 578 result_pre[1] += tmp_data * coeff1; 579 580 tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x, cur_g_y))))); 581 result_cur[0] += tmp_data * coeff0; 582 result_cur[1] += tmp_data * coeff1; 583 tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x + 1, cur_g_y))))); 584 result_next[1] += tmp_data * coeff1; 585 result_next[0] += tmp_data * coeff0; 586 } 587 588#pragma unroll 589 for (i_line = 0; i_line < 2; ++i_line) { 590 final_g[i_line] = result_cur[i_line] * mask_coeffs[MASK_COEFF_MID] + 591 (float8)(result_pre[i_line].s7, result_cur[i_line].s0123, result_cur[i_line].s456) * 592 mask_coeffs[MASK_COEFF_MID + 1] + 593 (float8)(result_cur[i_line].s1234, result_cur[i_line].s567, result_next[i_line].s0) * 594 mask_coeffs[MASK_COEFF_MID + 1] + 595 (float8)(result_pre[i_line].s67, result_cur[i_line].s0123, result_cur[i_line].s45) * 596 mask_coeffs[MASK_COEFF_MID + 2] + 597 (float8)(result_cur[i_line].s2345, result_cur[i_line].s67, result_next[i_line].s01) * 598 mask_coeffs[MASK_COEFF_MID + 2] + 599 (float8)(result_pre[i_line].s567, result_cur[i_line].s0123, result_cur[i_line].s4) * 600 mask_coeffs[MASK_COEFF_MID + 3] + 601 (float8)(result_cur[i_line].s3456,result_cur[i_line].s7, result_next[i_line].s012) * 602 mask_coeffs[MASK_COEFF_MID + 3] + 603 (float8)(result_pre[i_line].s4567, result_cur[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4] + 604 (float8)(result_cur[i_line].s4567, result_next[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4]; 605 final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f); 606 write_imageui (output_gauss, (int2)(g_x, g_y + i_line), convert_uint4(as_ushort4(convert_uchar8(final_g[i_line])))); 607 } 608 609#if ENABLE_MASK_GAUSS_SCALE 610 write_imageui (output_scale, (int2)(g_x, get_global_id (1)), convert_uint4(final_g[0].even)); 611#endif 612 613} 614 615