1/*
2 * kernel_gauss_lap_pyramid.cl
3 * input0
4 * input1
5 * output
6 * window, pos_x, pos_y, width, height
7 */
8
9#ifndef PYRAMID_UV
10#define PYRAMID_UV 0
11#endif
12
13#ifndef CL_PYRAMID_ENABLE_DUMP
14#define CL_PYRAMID_ENABLE_DUMP 0
15#endif
16
17#ifndef ENABLE_MASK_GAUSS_SCALE
18#define ENABLE_MASK_GAUSS_SCALE 0
19#endif
20
21#define fixed_pixels 8
22#define GAUSS_V_R 2
23#define GAUSS_H_R 1
24#define COEFF_MID 4
25
26#define zero8 (float8)(0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
27
28__constant const float coeffs[9] = {0.0f, 0.0f, 0.152f, 0.222f, 0.252f, 0.222f, 0.152f, 0.0f, 0.0f};
29
30#define ARG_FORMAT4 "(%.1f,%.1f,%.1f,%.1f)"
31#define ARGS4(a) a.s0, a.s1, a.s2, a.s3
32
33#define ARG_FORMAT8 "(%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f)"
34#define ARGS8(a) a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7
35
36/*
37 * input: RGBA-CL_UNSIGNED_INT16
38 * output_gauss: RGBA-CL_UNSIGNED_INT8
39 * output_lap:RGBA-CL_UNSIGNED_INT16
40 * each work-item calc 2 lines
41 */
42__kernel void
43kernel_gauss_scale_transform (
44    __read_only image2d_t input, int in_offset_x,
45    __write_only image2d_t output_gauss
46#if CL_PYRAMID_ENABLE_DUMP
47    , __write_only image2d_t dump_orig
48#endif
49)
50{
51    int g_x = get_global_id (0);
52    int in_x = g_x + in_offset_x;
53    int g_y = get_global_id (1) * 4;
54    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
55
56    int g_out_x = get_global_id (0);
57    int g_out_y = get_global_id (1) * 2;
58
59#if CL_PYRAMID_ENABLE_DUMP
60    write_imageui (dump_orig, (int2)(g_x, g_y + 0), read_imageui(input, sampler, (int2)(in_x, g_y)));
61    write_imageui (dump_orig, (int2)(g_x, g_y + 1), read_imageui(input, sampler, (int2)(in_x, g_y + 1)));
62    write_imageui (dump_orig, (int2)(g_x, g_y + 2), read_imageui(input, sampler, (int2)(in_x, g_y + 2)));
63    write_imageui (dump_orig, (int2)(g_x, g_y + 3), read_imageui(input, sampler, (int2)(in_x, g_y + 3)));
64#endif
65
66    float8 result_pre[2] = {zero8, zero8};
67    float8 result_next[2] = {zero8, zero8};
68    float8 result_cur[2] = {zero8, zero8};
69    float4 final_g[2];
70
71    float8 tmp_data;
72    int i_ver;
73
74#pragma unroll
75    for (i_ver = -GAUSS_V_R; i_ver <= GAUSS_V_R + 2; i_ver++) {
76        int cur_g_y = g_y + i_ver;
77        float coeff0 = coeffs[i_ver + COEFF_MID];
78        float coeff1 = coeffs[i_ver + COEFF_MID - 2];
79        tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x - 1, cur_g_y)))));
80        result_pre[0] += tmp_data * coeff0;
81        result_pre[1] += tmp_data * coeff1;
82        tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x, cur_g_y)))));
83        result_cur[0] += tmp_data * coeff0;
84        result_cur[1] += tmp_data * coeff1;
85        tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x + 1, cur_g_y)))));
86        result_next[1] += tmp_data * coeff1;
87        result_next[0] += tmp_data * coeff0;
88    }
89
90    int i_line;
91#pragma unroll
92    for (i_line = 0; i_line < 2; ++i_line) {
93#if !PYRAMID_UV
94        final_g[i_line] = result_cur[i_line].even * coeffs[COEFF_MID] +
95                          (float4)(result_pre[i_line].s7, result_cur[i_line].s135) * coeffs[COEFF_MID + 1] +
96                          (float4)(result_pre[i_line].s6, result_cur[i_line].s024) * coeffs[COEFF_MID + 2] +
97                          (float4)(result_cur[i_line].s1357) * coeffs[COEFF_MID + 1] +
98                          (float4)(result_cur[i_line].s246, result_next[i_line].s0) * coeffs[COEFF_MID + 2];
99#else
100        final_g[i_line] = result_cur[i_line].s0145 * coeffs[COEFF_MID] +
101                          (float4)(result_pre[i_line].s67, result_cur[i_line].s23) * coeffs[COEFF_MID + 1] +
102                          (float4)(result_pre[i_line].s45, result_cur[i_line].s01) * coeffs[COEFF_MID + 2] +
103                          (float4)(result_cur[i_line].s2367) * coeffs[COEFF_MID + 1] +
104                          (float4)(result_cur[i_line].s45, result_next[i_line].s01) * coeffs[COEFF_MID + 2];
105#endif
106        final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f);
107        write_imageui (output_gauss, (int2)(g_out_x, g_out_y + i_line), convert_uint4(final_g[i_line]));
108    }
109
110}
111
112inline float8
113read_scale_y (__read_only image2d_t input, const sampler_t sampler, float2 pos_start, float step_x)
114{
115    float8 data;
116    data.s0 = read_imagef (input, sampler, pos_start).x;
117    pos_start.x += step_x;
118    data.s1 = read_imagef (input, sampler, pos_start).x;
119    pos_start.x += step_x;
120    data.s2 = read_imagef (input, sampler, pos_start).x;
121    pos_start.x += step_x;
122    data.s3 = read_imagef (input, sampler, pos_start).x;
123    pos_start.x += step_x;
124    data.s4 = read_imagef (input, sampler, pos_start).x;
125    pos_start.x += step_x;
126    data.s5 = read_imagef (input, sampler, pos_start).x;
127    pos_start.x += step_x;
128    data.s6 = read_imagef (input, sampler, pos_start).x;
129    pos_start.x += step_x;
130    data.s7 = read_imagef (input, sampler, pos_start).x;
131    return data;
132}
133
134inline float8
135read_scale_uv (__read_only image2d_t input, const sampler_t sampler, float2 pos_start, float step_x)
136{
137    float8 data;
138    data.s01 = read_imagef (input, sampler, pos_start).xy;
139    pos_start.x += step_x;
140    data.s23 = read_imagef (input, sampler, pos_start).xy;
141    pos_start.x += step_x;
142    data.s45 = read_imagef (input, sampler, pos_start).xy;
143    pos_start.x += step_x;
144    data.s67 = read_imagef (input, sampler, pos_start).xy;
145    return data;
146}
147
148/*
149 * input_gauss: RGBA-CL_UNSIGNED_INT18
150 * input_lap: RGBA-CL_UNSIGNED_INT16
151 * output:     RGBA-CL_UNSIGNED_INT16
152 * each work-item calc 2 lines
153 */
154__kernel void
155kernel_gauss_lap_reconstruct (
156    __read_only image2d_t input_gauss,
157    float in_sampler_offset_x, float in_sampler_offset_y,
158    __read_only image2d_t input_lap,
159    __write_only image2d_t output, int out_offset_x, float out_width, float out_height
160#if CL_PYRAMID_ENABLE_DUMP
161    , __write_only image2d_t dump_resize, __write_only image2d_t dump_final
162#endif
163)
164{
165    int g_x = get_global_id (0);
166    int g_y = get_global_id (1);
167    const sampler_t lap_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
168    const sampler_t gauss_sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
169
170    //if (g_x > out_width + 0.9f || g_y > out_height + 0.5f)
171    //    return;
172
173    float8 lap = convert_float8(as_uchar8(convert_ushort4(read_imageui(input_lap, lap_sampler, (int2)(g_x, g_y)))));
174    lap = (lap - 128.0f) * 2.0f;
175
176    float8 data_g;
177    float2 input_gauss_pos;
178    float step_x;
179    input_gauss_pos.x = g_x / out_width + in_sampler_offset_x;
180    input_gauss_pos.y = g_y / out_height + in_sampler_offset_y;
181#if !PYRAMID_UV
182    step_x = 0.125f / out_width;
183    data_g = read_scale_y (input_gauss, gauss_sampler, input_gauss_pos, step_x) * 256.0f;
184#else
185    step_x = 0.25f / out_width;
186    data_g = read_scale_uv (input_gauss, gauss_sampler, input_gauss_pos, step_x) * 256.0f;
187#endif
188
189#if CL_PYRAMID_ENABLE_DUMP
190    write_imageui (dump_resize, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g))));
191#endif
192
193    data_g += lap + 0.5f;
194    data_g = clamp (data_g, 0.0f, 255.0f);
195    write_imageui (output, (int2)(g_x + out_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g))));
196#if CL_PYRAMID_ENABLE_DUMP
197    write_imageui (dump_final, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(data_g))));
198#endif
199}
200
201__kernel void
202kernel_pyramid_blend (
203    __read_only image2d_t input0, __read_only image2d_t input1,
204#if !PYRAMID_UV
205    __global const float8 *input0_mask,
206#else
207    __global const float4 *input0_mask,
208#endif
209    __write_only image2d_t output)
210{
211    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
212    const int g_x = get_global_id (0);
213    const int g_y = get_global_id (1);
214    int2 pos = (int2) (g_x, g_y);
215
216    float8 data0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, pos))));
217    float8 data1 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, pos))));
218    float8 out_data;
219
220#if !PYRAMID_UV
221    out_data = (data0 - data1) * input0_mask[g_x] + data1;
222#else
223    float8 coeff;
224    coeff.even = input0_mask[g_x];
225    coeff.odd = coeff.even;
226    out_data = (data0 - data1) * coeff + data1;
227#endif
228
229    out_data = clamp (out_data + 0.5f, 0.0f, 255.0f);
230
231    write_imageui(output, pos, convert_uint4(as_ushort4(convert_uchar8(out_data))));
232}
233
234__kernel void
235kernel_pyramid_scale (
236    __read_only image2d_t input, __write_only image2d_t output,
237    int out_offset_x, int output_width, int output_height)
238{
239    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
240    int g_x = get_global_id (0);
241    int g_y = get_global_id (1);
242
243    float2 normCoor = (float2)(g_x, g_y) / (float2)(output_width, output_height);
244    float8 out_data;
245    float step_x;
246
247#if !PYRAMID_UV
248    step_x = 0.125f / output_width;
249    out_data = read_scale_y (input, sampler, normCoor, step_x) * 255.0f;
250#else
251    step_x = 0.25f / output_width;
252    out_data = read_scale_uv (input, sampler, normCoor, step_x) * 255.0f;
253#endif
254
255    out_data = clamp (out_data + 0.5f, 0.0f, 255.0f);
256    write_imageui (output, (int2)(g_x + out_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(out_data))));
257}
258
259__kernel void
260kernel_pyramid_copy (
261    __read_only image2d_t input, int in_offset_x,
262    __write_only image2d_t output, int out_offset_x,
263    int max_g_x, int max_g_y)
264{
265    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
266    const int g_x = get_global_id (0);
267    const int g_y = get_global_id (1);
268
269    if (g_x >= max_g_x || g_y >= max_g_y)
270        return;
271
272    uint4 data = read_imageui (input, sampler, (int2)(g_x + in_offset_x, g_y));
273    write_imageui (output, (int2)(g_x + out_offset_x, g_y), data);
274}
275
276/*
277 * input_gauss: RGBA-CL_UNSIGNED_INT18
278 * input_lap: RGBA-CL_UNSIGNED_INT16
279 * output:     RGBA-CL_UNSIGNED_INT16
280 * each work-item calc 2 lines
281 */
282__kernel void
283kernel_lap_transform (
284    __read_only image2d_t input_gauss0, int gauss0_offset_x,
285    __read_only image2d_t input_gauss1,
286    float gauss1_sampler_offset_x, float gauss1_sampler_offset_y,
287    __write_only image2d_t output, int lap_offset_x, float out_width, float out_height)
288{
289    int g_x = get_global_id (0);
290    int g_y = get_global_id (1);
291    const sampler_t gauss0_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
292    const sampler_t gauss1_sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
293
294    float8 orig = convert_float8(as_uchar8(convert_ushort4(
295                      read_imageui(input_gauss0, gauss0_sampler, (int2)(g_x + gauss0_offset_x, g_y)))));
296    float8 zoom_in;
297    float2 gauss1_pos;
298    float sampler_step;
299    gauss1_pos.y = (g_y / out_height) + gauss1_sampler_offset_y;
300    gauss1_pos.x = (g_x / out_width) + gauss1_sampler_offset_x;
301
302#if !PYRAMID_UV
303    sampler_step = 0.125f / out_width;
304    zoom_in.s0 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
305    gauss1_pos.x += sampler_step;
306    zoom_in.s1 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
307    gauss1_pos.x += sampler_step;
308    zoom_in.s2 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
309    gauss1_pos.x += sampler_step;
310    zoom_in.s3 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
311    gauss1_pos.x += sampler_step;
312    zoom_in.s4 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
313    gauss1_pos.x += sampler_step;
314    zoom_in.s5 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
315    gauss1_pos.x += sampler_step;
316    zoom_in.s6 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
317    gauss1_pos.x += sampler_step;
318    zoom_in.s7 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).x;
319#else
320    sampler_step = 0.25f / out_width;
321    zoom_in.s01 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy;
322    gauss1_pos.x += sampler_step;
323    zoom_in.s23 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy;
324    gauss1_pos.x += sampler_step;
325    zoom_in.s45 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy;
326    gauss1_pos.x += sampler_step;
327    zoom_in.s67 = read_imagef (input_gauss1, gauss1_sampler, gauss1_pos).xy;
328#endif
329    float8 lap = (orig - zoom_in * 256.0f) * 0.5f + 128.0f + 0.5f;
330    lap = clamp (lap, 0.0f, 255.0f);
331    write_imageui (output, (int2)(g_x + lap_offset_x, g_y), convert_uint4(as_ushort4(convert_uchar8(lap))));
332}
333
334
335/*
336 * input0: RGBA-CL_UNSIGNED_INT16
337 * input1: RGBA-CL_UNSIGNED_INT16
338 * out_diff:  RGBA-CL_UNSIGNED_INT16
339 */
340__kernel void
341kernel_image_diff (
342    __read_only image2d_t input0, int offset0,
343    __read_only image2d_t input1, int offset1,
344    __write_only image2d_t out_diff)
345{
346    int g_x = get_global_id (0);
347    int g_y = get_global_id (1);
348    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
349
350    int8 data0 = convert_int8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, (int2)(g_x + offset0, g_y)))));
351    int8 data1 = convert_int8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, (int2)(g_x + offset1, g_y)))));
352    uint8 diff = abs_diff (data0, data1);
353    write_imageui (out_diff, (int2)(g_x, g_y), convert_uint4(as_ushort4(convert_uchar8(diff))));
354}
355
356
357/*
358 * input0: RGBA-CL_UNSIGNED_INT16
359 */
360#define LEFT_POS (int)(-1)
361#define MID_POS (int)(0)
362#define RIGHT_POS (int)(1)
363
364__inline int pos_buf_index (int x, int y, int stride)
365{
366    return mad24 (stride, y, x);
367}
368
369__kernel void
370kernel_seam_dp (
371    __read_only image2d_t image,
372    __global short *pos_buf, __global float *sum_buf, int offset_x, int valid_width,
373    int max_pos, int seam_height, int seam_stride)
374{
375    int l_x = get_local_id (0);
376    int group_id = get_group_id (0);
377    if (l_x >= valid_width)
378        return;
379
380    // group0 fill first half slice image curve y = [0, seam_height/2 - 1]
381    // group1 fill send half slice image curve = [seam_height - 1, seam_height/2]
382    int first_slice_h = seam_height / 2;
383    int group_h = (group_id == 0 ? first_slice_h : seam_height - first_slice_h);
384
385    __local float slm_sum[4096];
386    float mid, left, right, cur;
387    int slm_idx;
388    int default_pos;
389
390    int x = l_x + offset_x;
391    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
392    int y = (group_id == 0 ? 0 : seam_height - 1);
393    float sum = convert_float(read_imageui(image, sampler, (int2)(x, y)).x);
394
395    default_pos = x;
396    slm_sum[l_x] = sum;
397    barrier (CLK_LOCAL_MEM_FENCE);
398    pos_buf[pos_buf_index(x, y, seam_stride)] = convert_short(default_pos);
399
400    for (int i = 0; i < group_h; ++i) {
401        y = (group_id == 0 ? i : seam_height - i - 1);
402        slm_idx = l_x - 1;
403        slm_idx = (slm_idx > 0 ? slm_idx : 0);
404        left = slm_sum[slm_idx];
405        slm_idx = l_x + 1;
406        slm_idx = (slm_idx < valid_width ? slm_idx : valid_width - 1);
407        right = slm_sum[slm_idx];
408
409        cur = convert_float(read_imageui(image, sampler, (int2)(x, y)).x);
410
411        left = left + cur;
412        right = right + cur;
413        mid = sum + cur;
414
415        int pos;
416        pos = (left < mid) ? LEFT_POS : MID_POS;
417        sum = min (left, mid);
418        pos = (sum < right) ? pos : RIGHT_POS;
419        sum = min (sum, right);
420        slm_sum[l_x] = sum;
421        barrier (CLK_LOCAL_MEM_FENCE);
422
423        pos += default_pos;
424        pos = clamp (pos, offset_x, max_pos);
425        //if (l_x == 3)
426        //    printf ("s:%f, pos:%d, mid:%f, offset_x:%d\n", sum.s0, pos.s0, mid.s0, offset_x);
427        pos_buf[pos_buf_index(x, y, seam_stride)] = convert_short(pos);
428    }
429    sum_buf[group_id * seam_stride + x] = sum;
430    //printf ("sum(x):%f(x:%d)\n", sum_buf[x].s0, x);
431}
432
433__kernel void
434kernel_seam_mask_blend (
435    __read_only image2d_t input0, __read_only image2d_t input1,
436    __read_only image2d_t seam_mask,
437    __write_only image2d_t output)
438{
439    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
440    const int g_x = get_global_id (0);
441    const int g_y = get_global_id (1);
442    int2 pos = (int2) (g_x, g_y);
443
444    float8 data0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input0, sampler, pos))));
445    float8 data1 = convert_float8(as_uchar8(convert_ushort4(read_imageui(input1, sampler, pos))));
446    float8 coeff0 = convert_float8(as_uchar8(convert_ushort4(read_imageui(seam_mask, sampler, pos)))) / 255.0f;
447    float8 out_data;
448
449#if !PYRAMID_UV
450    out_data = (data0 - data1) * coeff0 + data1;
451#else
452    coeff0.even = (coeff0.even + coeff0.odd) * 0.5f;
453    coeff0.odd = coeff0.even;
454    out_data = (data0 - data1) * coeff0 + data1;
455#endif
456
457    out_data = clamp (out_data + 0.5f, 0.0f, 255.0f);
458
459    write_imageui(output, pos, convert_uint4(as_ushort4(convert_uchar8(out_data))));
460}
461
462
463
464#define MASK_GAUSS_R 4
465#define MASK_COEFF_MID 7
466
467__constant const float mask_coeffs[] = {0.0f, 0.0f, 0.0f, 0.082f, 0.102f, 0.119f, 0.130f, 0.134f, 0.130f, 0.119f, 0.102f, 0.082f, 0.0f, 0.0f, 0.0f};
468
469/*
470 * input: RGBA-CL_UNSIGNED_INT16
471 * output_gauss: RGBA-CL_UNSIGNED_INT8 ?
472 * output_lap:RGBA-CL_UNSIGNED_INT16
473 * each work-item calc 2 lines
474 */
475__kernel void
476kernel_mask_gauss_scale_slm (
477    __read_only image2d_t input,
478    __write_only image2d_t output_gauss,
479    int image_width
480#if ENABLE_MASK_GAUSS_SCALE
481    , __write_only image2d_t output_scale
482#endif
483)
484{
485#define WI_LINES 2
486// input image width MUST < MASK_GAUSS_SLM_WIDTH*4
487#define MASK_GAUSS_SLM_WIDTH  256
488#define CONV_COEFF 128.0f
489
490    int g_x = get_global_id (0);
491    int g_y = get_global_id (1) * WI_LINES;
492    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
493    __local ushort4 slm_gauss_y[WI_LINES][MASK_GAUSS_SLM_WIDTH];
494
495    float8 result_cur[WI_LINES] = {zero8, zero8};
496    float8 tmp_data;
497    int i_line;
498    int cur_g_y;
499
500#pragma unroll
501    for (i_line = -MASK_GAUSS_R; i_line <= MASK_GAUSS_R + 1; i_line++) {
502        cur_g_y = g_y + i_line;
503        tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(g_x, cur_g_y)))));
504        result_cur[0] += tmp_data * mask_coeffs[i_line + MASK_COEFF_MID];
505        result_cur[1] += tmp_data * mask_coeffs[i_line + MASK_COEFF_MID - 1];
506    }
507    ((__local ushort8*)(slm_gauss_y[0]))[g_x] = convert_ushort8(result_cur[0] * CONV_COEFF);
508    ((__local ushort8*)(slm_gauss_y[1]))[g_x] = convert_ushort8(result_cur[1] * CONV_COEFF);
509    barrier (CLK_LOCAL_MEM_FENCE);
510
511    float8 final_g[WI_LINES];
512    float4 result_pre;
513    float4 result_next;
514
515#pragma unroll
516    for (i_line = 0; i_line < WI_LINES; ++i_line) {
517        result_pre = convert_float4(slm_gauss_y[i_line][clamp (g_x * 2 - 1, 0, image_width * 2)]) / CONV_COEFF;
518        result_next = convert_float4(slm_gauss_y[i_line][clamp (g_x * 2 + 2, 0, image_width * 2)]) / CONV_COEFF;
519        final_g[i_line] = result_cur[i_line] * mask_coeffs[MASK_COEFF_MID] +
520                          (float8)(result_pre.s3, result_cur[i_line].s0123, result_cur[i_line].s456) *
521                                  mask_coeffs[MASK_COEFF_MID + 1] +
522                          (float8)(result_cur[i_line].s1234, result_cur[i_line].s567, result_next.s0) *
523                                  mask_coeffs[MASK_COEFF_MID + 1] +
524                          (float8)(result_pre.s23, result_cur[i_line].s0123, result_cur[i_line].s45) *
525                                  mask_coeffs[MASK_COEFF_MID + 2] +
526                          (float8)(result_cur[i_line].s2345, result_cur[i_line].s67, result_next.s01) *
527                                  mask_coeffs[MASK_COEFF_MID + 2] +
528                          (float8)(result_pre.s123, result_cur[i_line].s0123, result_cur[i_line].s4) *
529                                  mask_coeffs[MASK_COEFF_MID + 3] +
530                          (float8)(result_cur[i_line].s3456, result_cur[i_line].s7, result_next.s012) *
531                                  mask_coeffs[MASK_COEFF_MID + 3] +
532                          (float8)(result_pre.s0123, result_cur[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4] +
533                          (float8)(result_cur[i_line].s4567, result_next.s0123) * mask_coeffs[MASK_COEFF_MID + 4];
534        final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f);
535        //if ((g_x == 9 || g_x == 8) && g_y == 0) {
536        //    printf ("(x:%d, y:0), pre:" ARG_FORMAT4 "cur" ARG_FORMAT8 "next" ARG_FORMAT4 "final:" ARG_FORMAT8 "\n",
537        //        g_x, ARGS4(result_pre), ARGS8(result_cur[i_line]), ARGS4(result_next), ARGS8(final_g[i_line]));
538        //}
539        write_imageui (output_gauss, (int2)(g_x, g_y + i_line), convert_uint4(as_ushort4(convert_uchar8(final_g[i_line]))));
540    }
541
542#if ENABLE_MASK_GAUSS_SCALE
543    write_imageui (output_scale, (int2)(g_x, get_global_id (1)), convert_uint4(final_g[0].even));
544#endif
545}
546
547__kernel void
548kernel_mask_gauss_scale (
549    __read_only image2d_t input,
550    __write_only image2d_t output_gauss
551#if ENABLE_MASK_GAUSS_SCALE
552    , __write_only image2d_t output_scale
553#endif
554)
555{
556    int g_x = get_global_id (0);
557    int in_x = g_x;
558    int g_y = get_global_id (1) * 2;
559    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
560
561    float8 result_pre[2] = {zero8, zero8};
562    float8 result_next[2] = {zero8, zero8};
563    float8 result_cur[2] = {zero8, zero8};
564    float8 final_g[2];
565
566    float8 tmp_data;
567    int i_line;
568    int cur_g_y;
569    float coeff0, coeff1;
570
571#pragma unroll
572    for (i_line = -MASK_GAUSS_R; i_line <= MASK_GAUSS_R + 1; i_line++) {
573        cur_g_y = g_y + i_line;
574        coeff0 = mask_coeffs[i_line + MASK_COEFF_MID];
575        coeff1 = mask_coeffs[i_line + MASK_COEFF_MID - 1];
576        tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x - 1, cur_g_y)))));
577        result_pre[0] += tmp_data * coeff0;
578        result_pre[1] += tmp_data * coeff1;
579
580        tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x, cur_g_y)))));
581        result_cur[0] += tmp_data * coeff0;
582        result_cur[1] += tmp_data * coeff1;
583        tmp_data = convert_float8(as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(in_x + 1, cur_g_y)))));
584        result_next[1] += tmp_data * coeff1;
585        result_next[0] += tmp_data * coeff0;
586    }
587
588#pragma unroll
589    for (i_line = 0; i_line < 2; ++i_line) {
590        final_g[i_line] = result_cur[i_line] * mask_coeffs[MASK_COEFF_MID] +
591                          (float8)(result_pre[i_line].s7, result_cur[i_line].s0123, result_cur[i_line].s456) *
592                                  mask_coeffs[MASK_COEFF_MID + 1] +
593                          (float8)(result_cur[i_line].s1234, result_cur[i_line].s567, result_next[i_line].s0) *
594                                  mask_coeffs[MASK_COEFF_MID + 1] +
595                          (float8)(result_pre[i_line].s67, result_cur[i_line].s0123, result_cur[i_line].s45) *
596                                  mask_coeffs[MASK_COEFF_MID + 2] +
597                          (float8)(result_cur[i_line].s2345, result_cur[i_line].s67, result_next[i_line].s01) *
598                                  mask_coeffs[MASK_COEFF_MID + 2] +
599                          (float8)(result_pre[i_line].s567, result_cur[i_line].s0123, result_cur[i_line].s4) *
600                                  mask_coeffs[MASK_COEFF_MID + 3] +
601                          (float8)(result_cur[i_line].s3456,result_cur[i_line].s7, result_next[i_line].s012) *
602                                  mask_coeffs[MASK_COEFF_MID + 3] +
603                          (float8)(result_pre[i_line].s4567, result_cur[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4] +
604                          (float8)(result_cur[i_line].s4567, result_next[i_line].s0123) * mask_coeffs[MASK_COEFF_MID + 4];
605        final_g[i_line] = clamp (final_g[i_line] + 0.5f, 0.0f, 255.0f);
606        write_imageui (output_gauss, (int2)(g_x, g_y + i_line), convert_uint4(as_ushort4(convert_uchar8(final_g[i_line]))));
607    }
608
609#if ENABLE_MASK_GAUSS_SCALE
610    write_imageui (output_scale, (int2)(g_x, get_global_id (1)), convert_uint4(final_g[0].even));
611#endif
612
613}
614
615