16e05115ccae00564c2fad6e02d16230ca38459d5yaowang/*
26e05115ccae00564c2fad6e02d16230ca38459d5yaowang * function: kernel_tonemapping
36e05115ccae00564c2fad6e02d16230ca38459d5yaowang *     implementation of tone mapping
46e05115ccae00564c2fad6e02d16230ca38459d5yaowang * input:    image2d_t as read only
56e05115ccae00564c2fad6e02d16230ca38459d5yaowang * output:   image2d_t as write only
66e05115ccae00564c2fad6e02d16230ca38459d5yaowang */
76e05115ccae00564c2fad6e02d16230ca38459d5yaowang
81f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai#define WORK_ITEM_X_SIZE 8
91f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai#define WORK_ITEM_Y_SIZE 8
101f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai
111f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai#define SHARED_PIXEL_X_SIZE 10
121f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai#define SHARED_PIXEL_Y_SIZE 10
131f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai
14053eeebbc0a6b37db36a035ba5a9d62335196aa7wujunkai__kernel void kernel_tonemapping (__read_only image2d_t input, __write_only image2d_t output, float y_max, float y_target, int image_height)
156e05115ccae00564c2fad6e02d16230ca38459d5yaowang{
168500c1ff487f2700c6d795ab88e7e29722b53328wujunkai    int g_id_x = get_global_id (0);
178500c1ff487f2700c6d795ab88e7e29722b53328wujunkai    int g_id_y = get_global_id (1);
188500c1ff487f2700c6d795ab88e7e29722b53328wujunkai
198500c1ff487f2700c6d795ab88e7e29722b53328wujunkai    int group_id_x = get_group_id(0);
208500c1ff487f2700c6d795ab88e7e29722b53328wujunkai    int group_id_y = get_group_id(1);
218500c1ff487f2700c6d795ab88e7e29722b53328wujunkai
2261eca6a683f43581f5c6698976ec88a642f280efwujunkai    int local_id_x = get_local_id(0);
2361eca6a683f43581f5c6698976ec88a642f280efwujunkai    int local_id_y = get_local_id(1);
2461eca6a683f43581f5c6698976ec88a642f280efwujunkai
2561eca6a683f43581f5c6698976ec88a642f280efwujunkai    int g_size_x = get_global_size (0);
2661eca6a683f43581f5c6698976ec88a642f280efwujunkai    int g_size_y = get_global_size (1);
278500c1ff487f2700c6d795ab88e7e29722b53328wujunkai
287082441027a3a00d745f5dad58f36290f7d7b614wujunkai    int local_index = local_id_y * WORK_ITEM_X_SIZE + local_id_x;
297082441027a3a00d745f5dad58f36290f7d7b614wujunkai
306e05115ccae00564c2fad6e02d16230ca38459d5yaowang    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
316e05115ccae00564c2fad6e02d16230ca38459d5yaowang
327082441027a3a00d745f5dad58f36290f7d7b614wujunkai    __local float4 local_src_data[SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE];
337082441027a3a00d745f5dad58f36290f7d7b614wujunkai
347082441027a3a00d745f5dad58f36290f7d7b614wujunkai    float4 src_data_Gr = read_imagef (input, sampler, (int2)(g_id_x, g_id_y));
357082441027a3a00d745f5dad58f36290f7d7b614wujunkai    float4 src_data_R = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height));
36053eeebbc0a6b37db36a035ba5a9d62335196aa7wujunkai    float4 src_data_B = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height * 2));
377082441027a3a00d745f5dad58f36290f7d7b614wujunkai    float4 src_data_Gb = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height * 3));
387082441027a3a00d745f5dad58f36290f7d7b614wujunkai
397082441027a3a00d745f5dad58f36290f7d7b614wujunkai    float4 src_data_G = (src_data_Gr + src_data_Gb) / 2;
407082441027a3a00d745f5dad58f36290f7d7b614wujunkai
417082441027a3a00d745f5dad58f36290f7d7b614wujunkai    float4 src_y_data = 0.0f;
427082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_y_data = mad(src_data_R, 255.f * 0.299f, src_y_data);
437082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_y_data = mad(src_data_G, 255.f * 0.587f, src_y_data);
447082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_y_data = mad(src_data_B, 255.f * 0.114f, src_y_data);
457082441027a3a00d745f5dad58f36290f7d7b614wujunkai
467082441027a3a00d745f5dad58f36290f7d7b614wujunkai    local_src_data[(local_id_y + 1) * SHARED_PIXEL_X_SIZE + local_id_x + 1] = src_y_data;
477082441027a3a00d745f5dad58f36290f7d7b614wujunkai
487082441027a3a00d745f5dad58f36290f7d7b614wujunkai    if(local_index < SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE - WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE)
497082441027a3a00d745f5dad58f36290f7d7b614wujunkai    {
507082441027a3a00d745f5dad58f36290f7d7b614wujunkai        int target_index = local_index <= SHARED_PIXEL_X_SIZE ? local_index : (local_index <= (SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE - WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE - SHARED_PIXEL_X_SIZE) ? (local_index + WORK_ITEM_X_SIZE + (local_index - (SHARED_PIXEL_X_SIZE + 1)) / 2 * WORK_ITEM_X_SIZE) : (local_index + WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE));
517082441027a3a00d745f5dad58f36290f7d7b614wujunkai        int start_x = mad24(group_id_x, WORK_ITEM_X_SIZE, -1);
527082441027a3a00d745f5dad58f36290f7d7b614wujunkai        int start_y = mad24(group_id_y, WORK_ITEM_Y_SIZE, -1);
537082441027a3a00d745f5dad58f36290f7d7b614wujunkai        int offset_x = target_index % SHARED_PIXEL_X_SIZE;
547082441027a3a00d745f5dad58f36290f7d7b614wujunkai        int offset_y = target_index / SHARED_PIXEL_X_SIZE;
557082441027a3a00d745f5dad58f36290f7d7b614wujunkai
567082441027a3a00d745f5dad58f36290f7d7b614wujunkai        float4 data_Gr = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y));
577082441027a3a00d745f5dad58f36290f7d7b614wujunkai        float4 data_R = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height));
587082441027a3a00d745f5dad58f36290f7d7b614wujunkai        float4 data_B = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height * 2));
597082441027a3a00d745f5dad58f36290f7d7b614wujunkai        float4 data_Gb = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height * 3));
607082441027a3a00d745f5dad58f36290f7d7b614wujunkai
617082441027a3a00d745f5dad58f36290f7d7b614wujunkai        float4 data_G = (data_Gr + data_Gb) / 2;
627082441027a3a00d745f5dad58f36290f7d7b614wujunkai
637082441027a3a00d745f5dad58f36290f7d7b614wujunkai        float4 y_data = 0.0f;
647082441027a3a00d745f5dad58f36290f7d7b614wujunkai        y_data = mad(data_R, 255.f * 0.299f, y_data);
657082441027a3a00d745f5dad58f36290f7d7b614wujunkai        y_data = mad(data_G, 255.f * 0.587f, y_data);
667082441027a3a00d745f5dad58f36290f7d7b614wujunkai        y_data = mad(data_B, 255.f * 0.114f, y_data);
677082441027a3a00d745f5dad58f36290f7d7b614wujunkai        local_src_data[target_index] = y_data;
687082441027a3a00d745f5dad58f36290f7d7b614wujunkai    }
698500c1ff487f2700c6d795ab88e7e29722b53328wujunkai
707082441027a3a00d745f5dad58f36290f7d7b614wujunkai    barrier(CLK_LOCAL_MEM_FENCE);
7161eca6a683f43581f5c6698976ec88a642f280efwujunkai
7261eca6a683f43581f5c6698976ec88a642f280efwujunkai    float gaussian_table[9] = {0.075f, 0.124f, 0.075f,
7361eca6a683f43581f5c6698976ec88a642f280efwujunkai                               0.124f, 0.204f, 0.124f,
7461eca6a683f43581f5c6698976ec88a642f280efwujunkai                               0.075f, 0.124f, 0.075f
7561eca6a683f43581f5c6698976ec88a642f280efwujunkai                              };
76053eeebbc0a6b37db36a035ba5a9d62335196aa7wujunkai    float4 src_ym_data = 0.0f;
77053eeebbc0a6b37db36a035ba5a9d62335196aa7wujunkai
787082441027a3a00d745f5dad58f36290f7d7b614wujunkai    float16 integrate_data = *((__local float16 *)(local_src_data + local_id_y * SHARED_PIXEL_X_SIZE + local_id_x));
791f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai
807082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[0], src_ym_data);
817082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(integrate_data.s4567, (float4)gaussian_table[1], src_ym_data);
827082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[2], src_ym_data);
831f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai
847082441027a3a00d745f5dad58f36290f7d7b614wujunkai    integrate_data = *((__local float16 *)(local_src_data + (local_id_y + 1) * SHARED_PIXEL_X_SIZE + local_id_x));
851f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai
867082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[3], src_ym_data);
877082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(src_y_data, (float4)gaussian_table[4], src_ym_data);
887082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[5], src_ym_data);
891f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai
907082441027a3a00d745f5dad58f36290f7d7b614wujunkai    integrate_data = *((__local float16 *)(local_src_data + (local_id_y + 2) * SHARED_PIXEL_X_SIZE + local_id_x));
911f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai
927082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[6], src_ym_data);
937082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(integrate_data.s4567, (float4)gaussian_table[7], src_ym_data);
947082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[8], src_ym_data);
951f1b2ff37eae59f74eebffbe25a9ef98a06f7055wujunkai
967082441027a3a00d745f5dad58f36290f7d7b614wujunkai    float4 gain = ((float4)(y_max + y_target) + src_ym_data) / (src_y_data + src_ym_data + (float4)y_target);
977082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_data_Gr = src_data_Gr * gain;
98053eeebbc0a6b37db36a035ba5a9d62335196aa7wujunkai    src_data_R = src_data_R * gain;
99053eeebbc0a6b37db36a035ba5a9d62335196aa7wujunkai    src_data_B = src_data_B * gain;
1007082441027a3a00d745f5dad58f36290f7d7b614wujunkai    src_data_Gb = src_data_Gb * gain;
101053eeebbc0a6b37db36a035ba5a9d62335196aa7wujunkai
1027082441027a3a00d745f5dad58f36290f7d7b614wujunkai    write_imagef(output, (int2)(g_id_x, g_id_y), src_data_Gr);
1037082441027a3a00d745f5dad58f36290f7d7b614wujunkai    write_imagef(output, (int2)(g_id_x, g_id_y + image_height), src_data_R);
104053eeebbc0a6b37db36a035ba5a9d62335196aa7wujunkai    write_imagef(output, (int2)(g_id_x, g_id_y + image_height * 2), src_data_B);
1057082441027a3a00d745f5dad58f36290f7d7b614wujunkai    write_imagef(output, (int2)(g_id_x, g_id_y + image_height * 3), src_data_Gb);
1066e05115ccae00564c2fad6e02d16230ca38459d5yaowang}
107