1/*
2 * function: kernel_bayer_copy
3 *     sample code of default kernel arguments
4 * input:    image2d_t as read only
5 * output:   image2d_t as write only
6 */
7
8//#define ENABLE_IMAGE_2D_INPUT 0
9
10#ifndef STATS_BITS
11#define STATS_BITS 8
12#endif
13
14/*
15 * GROUP_PIXEL_X_SIZE = 2 * GROUP_CELL_X_SIZE
16 * GROUP_PIXEL_Y_SIZE = 2 * GROUP_CELL_Y_SIZE
17*/
18
19#define GROUP_CELL_X_SIZE 64
20#define GROUP_CELL_Y_SIZE 4
21
22//float4; 16
23#define SLM_X_SIZE (GROUP_CELL_X_SIZE / 4)
24#define SLM_Y_SIZE GROUP_CELL_Y_SIZE
25
26#define STATS_3A_CELL_X_SIZE 8
27#define STATS_3A_CELL_Y_SIZE GROUP_CELL_Y_SIZE
28
29typedef struct  {
30    float  level_gr;  /* Black level for GR pixels */
31    float  level_r;   /* Black level for R pixels */
32    float  level_b;   /* Black level for B pixels */
33    float  level_gb;  /* Black level for GB pixels */
34    uint   color_bits;
35} CLBLCConfig;
36
37
38typedef struct
39{
40    float r_gain;
41    float gr_gain;
42    float gb_gain;
43    float b_gain;
44} CLWBConfig;
45
46inline int slm_pos (const int x, const int y)
47{
48    return mad24 (y, SLM_X_SIZE, x);
49}
50
51inline void gamma_correct(float8 *in_out, __global float *table)
52{
53    in_out->s0 = table[clamp(convert_int(in_out->s0 * 255.0f), 0, 255)];
54    in_out->s1 = table[clamp(convert_int(in_out->s1 * 255.0f), 0, 255)];
55    in_out->s2 = table[clamp(convert_int(in_out->s2 * 255.0f), 0, 255)];
56    in_out->s3 = table[clamp(convert_int(in_out->s3 * 255.0f), 0, 255)];
57    in_out->s4 = table[clamp(convert_int(in_out->s4 * 255.0f), 0, 255)];
58    in_out->s5 = table[clamp(convert_int(in_out->s5 * 255.0f), 0, 255)];
59    in_out->s6 = table[clamp(convert_int(in_out->s6 * 255.0f), 0, 255)];
60    in_out->s7 = table[clamp(convert_int(in_out->s7 * 255.0f), 0, 255)];
61}
62
63inline float avg_float8 (float8 data)
64{
65    return (data.s0 + data.s1 + data.s2 + data.s3 + data.s4 + data.s5 + data.s6 + data.s7) * 0.125f;
66}
67
68inline void stats_3a_calculate (
69    __local float4 * slm_gr,
70    __local float4 * slm_r,
71    __local float4 * slm_b,
72    __local float4 * slm_gb,
73    __global ushort8 * stats_output,
74    CLWBConfig *wb_config)
75{
76    const int group_x_size = get_num_groups (0);
77    const int group_id_x = get_group_id (0);
78    const int group_id_y = get_group_id (1);
79
80    const int l_id_x = get_local_id (0);
81    const int l_id_y = get_local_id (1);
82    const int l_size_x = get_local_size (0);
83    const int stats_float4_x_count = STATS_3A_CELL_X_SIZE / 4;
84    int count =  stats_float4_x_count * STATS_3A_CELL_Y_SIZE / 4;
85
86    int index = mad24 (l_id_y, l_size_x, l_id_x);
87    int index_x = index % SLM_X_SIZE;
88    int index_y = index / SLM_X_SIZE;
89
90    if (mad24 (index_y,  stats_float4_x_count, index_x % stats_float4_x_count) < count) {
91        int pitch_count = count / stats_float4_x_count * SLM_X_SIZE;
92        int index1 = index + pitch_count;
93        int index2 = index1 + pitch_count;
94        int index3 = index2 + pitch_count;
95        slm_gr[index] = (slm_gr[index] + slm_gr[index1] + slm_gr[index2] + slm_gr[index3]) * 0.25f;
96        slm_r[index] = (slm_r[index] + slm_r[index1] + slm_r[index2] + slm_r[index3]) * 0.25f;
97        slm_b[index] = (slm_b[index] + slm_b[index1] + slm_b[index2] + slm_b[index3]) * 0.25f;
98        slm_gb[index] = (slm_gb[index] + slm_gb[index1] + slm_gb[index2] + slm_gb[index3]) * 0.25f;
99    }
100    barrier (CLK_LOCAL_MEM_FENCE);
101
102    if (index < SLM_X_SIZE / 2) {
103        float result_gr, result_r, result_b, result_gb, avg_y;
104        float8 tmp;
105        tmp = ((__local float8*)slm_gr)[index];
106        result_gr = avg_float8 (tmp);
107
108        tmp = ((__local float8*)slm_r)[index];
109        result_r = avg_float8 (tmp);
110
111        tmp = ((__local float8*)slm_b)[index];
112        result_b = avg_float8 (tmp);
113
114        tmp = ((__local float8*)slm_gb)[index];
115        result_gb = avg_float8 (tmp);
116
117        int out_index = mad24 (mad24 (group_id_y, group_x_size, group_id_x),
118                               (GROUP_CELL_X_SIZE / STATS_3A_CELL_X_SIZE) * (GROUP_CELL_Y_SIZE / STATS_3A_CELL_Y_SIZE),
119                               index);
120
121#if STATS_BITS==8
122        avg_y = mad ((result_gr * wb_config->gr_gain + result_gb * wb_config->gb_gain), 74.843f,
123                     mad (result_r * wb_config->r_gain, 76.245f, result_b * 29.070f));
124
125        //ushort avg_y; avg_r; avg_gr; avg_gb; avg_b; valid_wb_count; f_value1; f_value2;
126        stats_output[out_index] = (ushort8) (
127                                      convert_ushort (convert_uchar_sat (avg_y)),
128                                      convert_ushort (convert_uchar_sat (result_r * 255.0f)),
129                                      convert_ushort (convert_uchar_sat (result_gr * 255.0f)),
130                                      convert_ushort (convert_uchar_sat (result_gb * 255.0f)),
131                                      convert_ushort (convert_uchar_sat (result_b * 255.0f)),
132                                      STATS_3A_CELL_X_SIZE * STATS_3A_CELL_Y_SIZE,
133                                      0,
134                                      0);
135#elif STATS_BITS==12
136        avg_y = mad ((result_gr * wb_config->gr_gain + result_gb * wb_config->gb_gain), 1201.883f,
137                     mad (result_r * wb_config->r_gain, 1224.405f, result_b * 466.830f));
138
139        stats_output[out_index] = (ushort8) (
140                                      convert_ushort (clamp (avg_y, 0.0f, 4095.0f)),
141                                      convert_ushort (clamp (result_r * 4096.0f, 0.0f, 4095.0f)),
142                                      convert_ushort (clamp (result_gr * 4096.0f, 0.0f, 4095.0f)),
143                                      convert_ushort (clamp (result_gb * 4096.0f, 0.0f, 4095.0f)),
144                                      convert_ushort (clamp (result_b * 4096.0f, 0.0f, 4095.0f)),
145                                      STATS_3A_CELL_X_SIZE * STATS_3A_CELL_Y_SIZE,
146                                      0,
147                                      0);
148#else
149        printf ("kernel 3a-stats error, wrong bit depth:%d\n", STATS_BITS);
150#endif
151    }
152}
153
154
155__kernel void kernel_bayer_basic (
156#if ENABLE_IMAGE_2D_INPUT
157    __read_only image2d_t input,
158#else
159    __global const ushort8 *input,
160#endif
161    uint input_aligned_width,
162    __write_only image2d_t output,
163    uint out_height,
164    CLBLCConfig blc_config,
165    CLWBConfig wb_config,
166    __global float *gamma_table,
167    __global ushort8 *stats_output
168)
169{
170    int g_x = get_global_id (0);
171    int g_y = get_global_id (1);
172
173    const int l_x = get_local_id (0);
174    const int l_y = get_local_id (1);
175    const int l_x_size = get_local_size (0);
176    const int l_y_size = get_local_size (1);
177    const int group_id_x = get_group_id (0);
178    const int group_id_y = get_group_id (1);
179
180    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
181
182    int index = mad24 (l_y, l_x_size, l_x);
183    int x_cell_start = (GROUP_CELL_X_SIZE / 4) * group_id_x;
184    int y_cell_start = GROUP_CELL_Y_SIZE * group_id_y;
185    int x, y;
186
187    float blc_multiplier = (float)(1 << (16 - blc_config.color_bits));
188
189    __local float4 slm_gr[SLM_X_SIZE * SLM_Y_SIZE], slm_r[SLM_X_SIZE * SLM_Y_SIZE], slm_b[SLM_X_SIZE * SLM_Y_SIZE], slm_gb[SLM_X_SIZE * SLM_Y_SIZE];
190
191    for (; index < SLM_X_SIZE * SLM_Y_SIZE; index += l_x_size * l_y_size) {
192        float8 line1;
193        float8 line2;
194
195        x = index % SLM_X_SIZE + x_cell_start;
196        y = index / SLM_X_SIZE + y_cell_start;
197
198#if ENABLE_IMAGE_2D_INPUT
199        line1 = convert_float8 (as_ushort8 (read_imageui(input, sampler, (int2)(x, y * 2)))) / 65536.0f;
200        line2 = convert_float8 (as_ushort8 (read_imageui(input, sampler, (int2)(x, y * 2 + 1)))) / 65536.0f;
201#else
202        line1 = convert_float8 (input [y * 2 * input_aligned_width + x]) / 65536.0f;
203        line2 = convert_float8 (input [(y * 2 + 1) * input_aligned_width + x]) / 65536.0f;
204#endif
205
206        float4 gr = mad (line1.even, blc_multiplier, - blc_config.level_gr);
207        float4 r = mad (line1.odd, blc_multiplier, - blc_config.level_r);
208        float4 b = mad (line2.even, blc_multiplier, - blc_config.level_b);
209        float4 gb = mad (line2.odd, blc_multiplier, - blc_config.level_gb);
210
211        slm_gr[index] = gr;
212        slm_r[index] =  r;
213        slm_b[index] =  b;
214        slm_gb[index] = gb;
215    }
216    barrier(CLK_LOCAL_MEM_FENCE);
217
218    float8 data_gr, data_r, data_b, data_gb;
219    index = mad24 (l_y, l_x_size, l_x);
220    x = mad24 (GROUP_CELL_X_SIZE / 8, group_id_x,  index % (SLM_X_SIZE / 2));
221    y = mad24 (GROUP_CELL_Y_SIZE, group_id_y,  index / (SLM_X_SIZE / 2));
222
223    data_gr = ((__local float8*)slm_gr)[index];
224    data_gr = data_gr * wb_config.gr_gain;
225
226    data_r = ((__local float8*)slm_r)[index];
227    data_r  = data_r * wb_config.r_gain;
228
229    data_b = ((__local float8*)slm_b)[index];
230    data_b = data_b * wb_config.b_gain;
231
232    data_gb = ((__local float8*)slm_gb)[index];
233    data_gb = data_gb * wb_config.gb_gain;
234
235#if ENABLE_GAMMA
236    gamma_correct (&data_gr, gamma_table);
237    gamma_correct (&data_r, gamma_table);
238    gamma_correct (&data_b, gamma_table);
239    gamma_correct (&data_gb, gamma_table);
240#endif
241
242#if 0
243    if (x % 16 == 0 && y % 16 == 0) {
244        uint8 value = convert_uint8(convert_uchar8_sat(data_gr * 255.0f));
245        printf ("(x:%d, y:%d) (blc.bit:%d, level:%d) (wb.gr:%f)=> (%d, %d, %d, %d, %d, %d, %d, %d)\n",
246                x * 8, y,
247                blc_config.color_bits, convert_uint(blc_config.level_gr * 255.0f),
248                wb_config.gr_gain,
249                value.s0, value.s1, value.s2, value.s3, value.s4, value.s5, value.s6, value.s7);
250    }
251#endif
252
253    write_imageui (output, (int2)(x, y), as_uint4 (convert_ushort8 (data_gr * 65536.0f)));
254    write_imageui (output, (int2)(x, y + out_height), as_uint4 (convert_ushort8 (data_r * 65536.0f)));
255    write_imageui (output, (int2)(x, y + out_height * 2), as_uint4 (convert_ushort8 (data_b * 65536.0f)));
256    write_imageui (output, (int2)(x, y + out_height * 3), as_uint4 (convert_ushort8 (data_gb * 65536.0f)));
257
258    stats_3a_calculate (slm_gr, slm_r, slm_b, slm_gb, stats_output, &wb_config);
259}
260
261