1/* 2 * function: kernel_tonemapping 3 * implementation of tone mapping 4 * input: image2d_t as read only 5 * output: image2d_t as write only 6 */ 7 8#define WORK_ITEM_X_SIZE 8 9#define WORK_ITEM_Y_SIZE 8 10 11#define SHARED_PIXEL_X_SIZE 10 12#define SHARED_PIXEL_Y_SIZE 10 13 14__kernel void kernel_tonemapping (__read_only image2d_t input, __write_only image2d_t output, float y_max, float y_target, int image_height) 15{ 16 int g_id_x = get_global_id (0); 17 int g_id_y = get_global_id (1); 18 19 int group_id_x = get_group_id(0); 20 int group_id_y = get_group_id(1); 21 22 int local_id_x = get_local_id(0); 23 int local_id_y = get_local_id(1); 24 25 int g_size_x = get_global_size (0); 26 int g_size_y = get_global_size (1); 27 28 int local_index = local_id_y * WORK_ITEM_X_SIZE + local_id_x; 29 30 sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 31 32 __local float4 local_src_data[SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE]; 33 34 float4 src_data_Gr = read_imagef (input, sampler, (int2)(g_id_x, g_id_y)); 35 float4 src_data_R = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height)); 36 float4 src_data_B = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height * 2)); 37 float4 src_data_Gb = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height * 3)); 38 39 float4 src_data_G = (src_data_Gr + src_data_Gb) / 2; 40 41 float4 src_y_data = 0.0f; 42 src_y_data = mad(src_data_R, 255.f * 0.299f, src_y_data); 43 src_y_data = mad(src_data_G, 255.f * 0.587f, src_y_data); 44 src_y_data = mad(src_data_B, 255.f * 0.114f, src_y_data); 45 46 local_src_data[(local_id_y + 1) * SHARED_PIXEL_X_SIZE + local_id_x + 1] = src_y_data; 47 48 if(local_index < SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE - WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE) 49 { 50 int target_index = local_index <= SHARED_PIXEL_X_SIZE ? local_index : (local_index <= (SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE - WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE - SHARED_PIXEL_X_SIZE) ? (local_index + WORK_ITEM_X_SIZE + (local_index - (SHARED_PIXEL_X_SIZE + 1)) / 2 * WORK_ITEM_X_SIZE) : (local_index + WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE)); 51 int start_x = mad24(group_id_x, WORK_ITEM_X_SIZE, -1); 52 int start_y = mad24(group_id_y, WORK_ITEM_Y_SIZE, -1); 53 int offset_x = target_index % SHARED_PIXEL_X_SIZE; 54 int offset_y = target_index / SHARED_PIXEL_X_SIZE; 55 56 float4 data_Gr = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y)); 57 float4 data_R = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height)); 58 float4 data_B = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height * 2)); 59 float4 data_Gb = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height * 3)); 60 61 float4 data_G = (data_Gr + data_Gb) / 2; 62 63 float4 y_data = 0.0f; 64 y_data = mad(data_R, 255.f * 0.299f, y_data); 65 y_data = mad(data_G, 255.f * 0.587f, y_data); 66 y_data = mad(data_B, 255.f * 0.114f, y_data); 67 local_src_data[target_index] = y_data; 68 } 69 70 barrier(CLK_LOCAL_MEM_FENCE); 71 72 float gaussian_table[9] = {0.075f, 0.124f, 0.075f, 73 0.124f, 0.204f, 0.124f, 74 0.075f, 0.124f, 0.075f 75 }; 76 float4 src_ym_data = 0.0f; 77 78 float16 integrate_data = *((__local float16 *)(local_src_data + local_id_y * SHARED_PIXEL_X_SIZE + local_id_x)); 79 80 src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[0], src_ym_data); 81 src_ym_data = mad(integrate_data.s4567, (float4)gaussian_table[1], src_ym_data); 82 src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[2], src_ym_data); 83 84 integrate_data = *((__local float16 *)(local_src_data + (local_id_y + 1) * SHARED_PIXEL_X_SIZE + local_id_x)); 85 86 src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[3], src_ym_data); 87 src_ym_data = mad(src_y_data, (float4)gaussian_table[4], src_ym_data); 88 src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[5], src_ym_data); 89 90 integrate_data = *((__local float16 *)(local_src_data + (local_id_y + 2) * SHARED_PIXEL_X_SIZE + local_id_x)); 91 92 src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[6], src_ym_data); 93 src_ym_data = mad(integrate_data.s4567, (float4)gaussian_table[7], src_ym_data); 94 src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[8], src_ym_data); 95 96 float4 gain = ((float4)(y_max + y_target) + src_ym_data) / (src_y_data + src_ym_data + (float4)y_target); 97 src_data_Gr = src_data_Gr * gain; 98 src_data_R = src_data_R * gain; 99 src_data_B = src_data_B * gain; 100 src_data_Gb = src_data_Gb * gain; 101 102 write_imagef(output, (int2)(g_id_x, g_id_y), src_data_Gr); 103 write_imagef(output, (int2)(g_id_x, g_id_y + image_height), src_data_R); 104 write_imagef(output, (int2)(g_id_x, g_id_y + image_height * 2), src_data_B); 105 write_imagef(output, (int2)(g_id_x, g_id_y + image_height * 3), src_data_Gb); 106} 107