kernel_bayer_pipe.cl revision 66a677ec4817859937a8886ff3f48fc79daa8430
1/*
2 * function: kernel_bayer_pipe
3 * params:
4 *   input:    image2d_t as read only
5 *   output:   image2d_t as write only
6 *   blc_config: black level correction configuration
7 *   wb_config: whitebalance configuration
8 *   gamma_table: RGGB table
9 *   stats_output: 3a stats output
10 */
11
12
13#define WORKGROUP_CELL_WIDTH 64
14#define WORKGROUP_CELL_HEIGHT 4
15
16#define DEMOSAIC_X_CELL_PER_WORKITEM 2
17
18#define PIXEL_PER_CELL 2
19
20#define SLM_CELL_X_OFFSET 4
21#define SLM_CELL_Y_OFFSET 1
22
23// 8x8
24#define SLM_CELL_X_VALID_SIZE WORKGROUP_CELL_WIDTH
25#define SLM_CELL_Y_VALID_SIZE WORKGROUP_CELL_HEIGHT
26
27// 10x10
28#define SLM_CELL_X_SIZE (SLM_CELL_X_VALID_SIZE + SLM_CELL_X_OFFSET * 2)
29#define SLM_CELL_Y_SIZE (SLM_CELL_Y_VALID_SIZE + SLM_CELL_Y_OFFSET * 2)
30
31#define GUASS_DELTA_S_1      1.031739f
32#define GUASS_DELTA_S_1_5    1.072799f
33#define GUASS_DELTA_S_2      1.133173f
34#define GUASS_DELTA_S_2_5    1.215717f
35
36typedef struct
37{
38    float           ee_gain;
39    float           ee_threshold;
40    float           nr_gain;
41} CLEeConfig;
42
43inline int get_shared_pos_x (int i)
44{
45    return i % SLM_CELL_X_SIZE;
46}
47
48inline int get_shared_pos_y (int i)
49{
50    return i / SLM_CELL_X_SIZE;
51}
52
53inline int shared_pos (int x, int y)
54{
55    return mad24(y, SLM_CELL_X_SIZE, x);
56}
57
58/* BA10=> GRBG  */
59inline void grbg_slm_load (
60    __local float *px, __local float *py, __local float *pz, __local float *pw,
61    int index, __read_only image2d_t input, int x_start, int y_start
62)
63{
64    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
65    float4 data1, data2, line1, line2;
66    int x0 = (get_shared_pos_x (index) + x_start) / 4;
67    int y0 = get_shared_pos_y (index) + y_start;
68    int2 pos = (int2)(x0, y0);
69    float4 gr, r, b, gb;
70
71    gr = read_imagef (input, sampler, (int2)(x0, y0));
72    r = read_imagef (input, sampler, (int2)(x0, y0 + 544));
73    b = read_imagef (input, sampler, (int2)(x0, y0 + 544 * 2));
74    gb = read_imagef (input, sampler, (int2)(x0, y0 + 544 * 3));
75
76    (*(__local float4 *)(px + index)) = gr;
77    (*(__local float4 *)(py + index)) = r;
78    (*(__local float4 *)(pz + index)) = b;
79    (*(__local float4 *)(pw + index)) = gb;
80}
81
82#define MAX_DELTA_COFF 5.0f
83#define MIN_DELTA_COFF 1.0f
84#define DEFAULT_DELTA_COFF 4.0f
85
86inline float2 delta_coff (float2 in, __local float *table)
87{
88    float2 out;
89    out.x = table[(int)(fabs(in.x * 64.0f))];
90    out.y = table[(int)(fabs(in.y * 64.0f))];
91
92    return out;
93}
94
95inline float2 dot_denoise (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, __local float *table, float gain)
96{
97    float2 coff0, coff1, coff2, coff3, coff4, coff5;
98    coff0 = delta_coff (0.0, table) * gain;
99    coff1 = delta_coff (in1 - value, table);
100    coff2 = delta_coff (in2 - value, table);
101    coff3 = delta_coff (in3 - value, table);
102    coff4 = delta_coff (in4 - value, table);
103    //(in1 * coff1 + in2 * coff2 + in3 * coff3 + in4 * coff4 + value * coff0)
104    float2 sum1 = (mad (in1, coff1,
105                        mad (in2, coff2,
106                             mad (in3, coff3,
107                                  mad (in4, coff4, value * coff0)))));
108    return  sum1 / (coff0 + coff1 + coff2 + coff3 + coff4);
109}
110
111inline float2 dot_ee (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, float2 out, CLEeConfig ee_config, float2 *egain)
112{
113    float2 eH = mad(in1, -0.5f, value);
114    eH = mad(in3, -0.5f, eH);
115    float2 eV = mad(in2, -0.5f, value);
116    eV = mad(in4, -0.5f, eV);
117
118    eH = fmax(eH, eV);
119
120    eH = eH > ee_config.ee_threshold ? eH : 0.0f;
121
122    egain[0] = mad(eH, ee_config.ee_gain, out) / out;
123
124    return out * egain[0];
125}
126
127inline float2 dot_denoise_ee (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, __local float *table, float gain, float2 *egain, CLEeConfig ee_config)
128{
129    float2 out = dot_denoise(value, in1, in2, in3, in4, table, gain);
130    return dot_ee(value, in1, in2, in3, in4, out, ee_config, egain);
131}
132
133void demosaic_2_cell (
134    __local float *x_data_in, __local float *y_data_in, __local float *z_data_in, __local float *w_data_in,
135    int in_x, int in_y,
136    __write_only image2d_t out, uint out_height, int out_x, int out_y)
137{
138    float4 out_data;
139    float2 value;
140    int index;
141    {
142        float3 R_y[2];
143        index = shared_pos (in_x - 1, in_y);
144        R_y[0] = *(__local float3*)(y_data_in + index);
145        index = shared_pos (in_x - 1, in_y + 1);
146        R_y[1] = *(__local float3*)(y_data_in + index);
147
148        out_data.s02 = (R_y[0].s01 + R_y[0].s12) * 0.5f;
149        out_data.s13 = R_y[0].s12;
150        write_imagef (out, (int2)(out_x, out_y), out_data);
151
152        out_data.s02 = (R_y[0].s01 + R_y[0].s12 + R_y[1].s01 + R_y[1].s12) * 0.25f;
153        out_data.s13 = (R_y[0].s12 + R_y[1].s12) * 0.5f;
154        write_imagef (out, (int2)(out_x, out_y + 1), out_data);
155    }
156
157    {
158        float3 B_z[2];
159        index = shared_pos (in_x, in_y - 1);
160        B_z[0] = *(__local float3*)(z_data_in + index);
161        index = shared_pos (in_x, in_y);
162        B_z[1] = *(__local float3*)(z_data_in + index);
163
164        out_data.s02 = (B_z[0].s01 + B_z[1].s01) * 0.5f;
165        out_data.s13 = (B_z[0].s01 + B_z[0].s12 + B_z[1].s01 + B_z[1].s12) * 0.25f;
166        write_imagef (out, (int2)(out_x, out_y + out_height * 2), out_data);
167
168        out_data.s02 = B_z[1].s01;
169        out_data.s13 = (B_z[1].s01 + B_z[1].s12) * 0.5f;
170        write_imagef (out, (int2)(out_x, out_y + 1 + out_height * 2), out_data);
171    }
172
173    {
174        float3 Gr_x[2], Gb_w[2];
175        index = shared_pos (in_x, in_y);
176        Gr_x[0] = *(__local float3*)(x_data_in + index);
177        index = shared_pos (in_x, in_y + 1);
178        Gr_x[1] = *(__local float3*)(x_data_in + index);
179
180        index = shared_pos (in_x - 1, in_y - 1);
181        Gb_w[0] = *(__local float3*)(w_data_in + index);
182        index = shared_pos (in_x - 1, in_y);
183        Gb_w[1] = *(__local float3*)(w_data_in + index);
184
185        out_data.s02 = (Gr_x[0].s01 * 4.0f + Gb_w[0].s01 +
186                        Gb_w[0].s12 + Gb_w[1].s01 + Gb_w[1].s12) * 0.125f;
187        out_data.s13 = (Gr_x[0].s01 + Gr_x[0].s12 + Gb_w[0].s12 + Gb_w[1].s12) * 0.25f;
188        write_imagef (out, (int2)(out_x, out_y + out_height), out_data);
189
190        out_data.s02 = (Gr_x[0].s01 + Gr_x[1].s01 + Gb_w[1].s01 + Gb_w[1].s12) * 0.25f;
191
192        out_data.s13 = (Gb_w[1].s12 * 4.0f + Gr_x[0].s01 +
193                        Gr_x[0].s12 + Gr_x[1].s01 + Gr_x[1].s12) * 0.125f;
194        write_imagef (out, (int2)(out_x, out_y + 1 + out_height), out_data);
195    }
196}
197
198void demosaic_denoise_2_cell (
199    __local float *x_data_in, __local float *y_data_in, __local float *z_data_in, __local float *w_data_in,
200    int in_x, int in_y,
201    __write_only image2d_t out, uint out_height, int out_x, int out_y, __local float *table, CLEeConfig ee_config)
202{
203    float4 out_data[2];
204    float2 value;
205    int index;
206    float2 egain[4];
207
208    ///////////////////////////////////////G///////////////////////////////////
209    {
210        float3 Gr_x[2], Gb_w[2];
211        index = shared_pos (in_x - 1, in_y - 1);
212        Gb_w[0] = *(__local float3*)(w_data_in + index);
213        index = shared_pos (in_x - 1, in_y);
214        Gb_w[1] = *(__local float3*)(w_data_in + index);
215
216        index = shared_pos (in_x, in_y);
217        Gr_x[0] = *(__local float3*)(x_data_in + index);
218        index = shared_pos (in_x, in_y + 1);
219        Gr_x[1] = *(__local float3*)(x_data_in + index);
220
221        value = mad (Gr_x[0].s01, 4.0f,  (Gb_w[0].s01 +
222                                          Gb_w[0].s12 + Gb_w[1].s01 + Gb_w[1].s12)) * 0.125f;
223        out_data[0].s02 = dot_denoise_ee (value, Gb_w[0].s01, Gb_w[0].s12, Gb_w[1].s01, Gb_w[1].s12, table, GUASS_DELTA_S_1_5, &egain[0], ee_config);
224        value = (Gr_x[0].s01 + Gr_x[0].s12 +
225                 Gb_w[0].s12 + Gb_w[1].s12) * 0.25f;
226        out_data[0].s13 = dot_denoise_ee(value, Gr_x[0].s01, Gr_x[0].s12, Gb_w[0].s12, Gb_w[1].s12, table, GUASS_DELTA_S_1, &egain[1], ee_config);
227
228        value = (Gr_x[0].s01 + Gr_x[1].s01 +
229                 Gb_w[1].s01 + Gb_w[1].s12) * 0.25f;
230        out_data[1].s02 = dot_denoise_ee (value, Gr_x[0].s01, Gr_x[1].s01, Gb_w[1].s01, Gb_w[1].s12, table, GUASS_DELTA_S_1, &egain[2], ee_config);
231
232        value = mad (Gb_w[1].s12, 4.0f, (Gr_x[0].s01 +
233                                         Gr_x[0].s12 + Gr_x[1].s01 + Gr_x[1].s12)) * 0.125f;
234        out_data[1].s13 = dot_denoise_ee (value, Gr_x[0].s01, Gr_x[0].s12, Gr_x[1].s01, Gr_x[1].s12, table, GUASS_DELTA_S_1_5, &egain[3], ee_config);
235
236        write_imagef (out, (int2)(out_x, out_y + out_height), out_data[0]);
237        write_imagef (out, (int2)(out_x, out_y + 1 + out_height), out_data[1]);
238    }
239
240    ////////////////////////////////R//////////////////////////////////////////
241    {
242        float4 R_y[3];
243        index = shared_pos (in_x - 1, in_y - 1);
244        R_y[0] = *(__local float4*)(y_data_in + index);
245        index = shared_pos (in_x - 1, in_y);
246        R_y[1] = *(__local float4*)(y_data_in + index);
247        index = shared_pos (in_x - 1, in_y + 1);
248        R_y[2] = *(__local float4*)(y_data_in + index);
249
250        value = (R_y[1].s01 + R_y[1].s12) * 0.5f;
251        out_data[0].s02 = dot_denoise (value, R_y[0].s01, R_y[0].s12, R_y[2].s01, R_y[2].s12, table, GUASS_DELTA_S_2_5) * egain[0];
252
253        value = R_y[1].s12;
254        out_data[0].s13 = dot_denoise (value, R_y[0].s12, R_y[1].s01, R_y[1].s23, R_y[2].s12, table, GUASS_DELTA_S_2) * egain[1];
255
256        value = (R_y[1].s01 + R_y[1].s12 +
257                 R_y[2].s01 + R_y[2].s12) * 0.25f;
258        out_data[1].s02 = dot_denoise (value, R_y[1].s01, R_y[1].s12, R_y[2].s01, R_y[2].s12, table, GUASS_DELTA_S_1_5) * egain[2];
259
260        value = (R_y[1].s12 + R_y[2].s12) * 0.5f;
261        out_data[1].s13 = dot_denoise (value, R_y[1].s01, R_y[1].s23, R_y[2].s01, R_y[2].s23, table, GUASS_DELTA_S_2_5) * egain[3];
262
263        write_imagef (out, (int2)(out_x, out_y), out_data[0]);
264        write_imagef (out, (int2)(out_x, out_y + 1), out_data[1]);
265
266    }
267    ////////////////////////////////B//////////////////////////////////////////
268    {
269        float4 B_z[3];
270        index = shared_pos (in_x - 1, in_y - 1);
271        B_z[0] = *(__local float4*)(z_data_in + index);
272        index = shared_pos (in_x - 1, in_y);
273        B_z[1] = *(__local float4*)(z_data_in + index);
274        index = shared_pos (in_x - 1, in_y + 1);
275        B_z[2] = *(__local float4*)(z_data_in + index);
276
277        value = (B_z[0].s12 + B_z[1].s12) * 0.5f;
278        out_data[0].s02 = dot_denoise (value, B_z[0].s01, B_z[0].s23, B_z[1].s01, B_z[1].s23, table, GUASS_DELTA_S_2_5) * egain[0];
279
280        value = (B_z[0].s12 + B_z[0].s23 +
281                 B_z[1].s12 + B_z[1].s23) * 0.25f;
282        out_data[0].s13 = dot_denoise (value, B_z[0].s12, B_z[0].s23, B_z[1].s12, B_z[1].s23, table, GUASS_DELTA_S_1_5) * egain[1];
283
284        value = B_z[1].s12;
285        out_data[1].s02 = dot_denoise (value, B_z[0].s12, B_z[1].s01, B_z[1].s23, B_z[2].s12, table, GUASS_DELTA_S_2) * egain[2];
286
287        value = (B_z[1].s12 + B_z[1].s23) * 0.5f;
288        out_data[1].s13 = dot_denoise (value, B_z[0].s12, B_z[0].s23, B_z[2].s12, B_z[2].s23, table, GUASS_DELTA_S_2_5) * egain[3];
289
290        write_imagef (out, (int2)(out_x, out_y + out_height * 2), out_data[0]);
291        write_imagef (out, (int2)(out_x, out_y + 1 + out_height * 2), out_data[1]);
292    }
293}
294
295void shared_demosaic (
296    __local float *x_data_in, __local float *y_data_in, __local float *z_data_in, __local float *w_data_in,
297    int in_x, int in_y,
298    __write_only image2d_t out, uint output_height, int out_x, int out_y,
299    uint has_denoise, __local float *table, CLEeConfig ee_config)
300{
301    if (has_denoise) {
302        demosaic_denoise_2_cell (
303            x_data_in, y_data_in, z_data_in, w_data_in, in_x, in_y,
304            out, output_height, out_x, out_y, table, ee_config);
305    } else {
306        demosaic_2_cell (
307            x_data_in, y_data_in, z_data_in, w_data_in, in_x, in_y,
308            out, output_height, out_x, out_y);
309    }
310}
311
312__kernel void kernel_bayer_pipe (__read_only image2d_t input,
313                                 uint input_height,
314                                 __write_only image2d_t output,
315                                 uint output_height,
316                                 __global float * bnr_table,
317                                 uint has_denoise,
318                                 CLEeConfig ee_config
319                                )
320{
321    int g_id_x = get_global_id (0);
322    int g_id_y = get_global_id (1);
323    int g_size_x = get_global_size (0);
324    int g_size_y = get_global_size (1);
325
326    int l_id_x = get_local_id(0);
327    int l_id_y = get_local_id(1);
328    int l_size_x = get_local_size (0);
329    int l_size_y = get_local_size (1);
330
331    __local float p1_x[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_y[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_z[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_w[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE];
332    __local float4 p2[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE];
333    __local float4 *stats_cache = p2;
334    __local float SLM_delta_coef_table[64];
335
336    int out_x_start, out_y_start;
337    int x_start = get_group_id (0) * WORKGROUP_CELL_WIDTH;
338    int y_start = get_group_id (1) * WORKGROUP_CELL_HEIGHT;
339    int i = mad24 (l_id_y, l_size_x, l_id_x);
340    int j = i;
341
342    i *= 4;
343    for (; i < SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE; i += (l_size_x * l_size_y) * 4) {
344        grbg_slm_load (p1_x, p1_y, p1_z, p1_w, i,
345                       input,
346                       x_start - SLM_CELL_X_OFFSET, y_start - SLM_CELL_Y_OFFSET);
347    }
348    for(; j < 64; j += l_size_x * l_size_y)
349        SLM_delta_coef_table[j] = bnr_table[j];
350
351    barrier(CLK_LOCAL_MEM_FENCE);
352
353    i = mad24 (l_id_y, l_size_x, l_id_x);
354    int workitem_x_size = (SLM_CELL_X_VALID_SIZE / DEMOSAIC_X_CELL_PER_WORKITEM);
355    int input_x = (i % workitem_x_size) * DEMOSAIC_X_CELL_PER_WORKITEM;
356    int input_y = i / workitem_x_size;
357
358    shared_demosaic (
359        p1_x, p1_y, p1_z, p1_w,
360        input_x + SLM_CELL_X_OFFSET, input_y + SLM_CELL_Y_OFFSET,
361        output, output_height,
362        (input_x + x_start) * PIXEL_PER_CELL / 4, (input_y + y_start) * PIXEL_PER_CELL, has_denoise, SLM_delta_coef_table, ee_config);
363}
364
365