1 // This file is auto-generated. Do not edit!
2 
3 #include "precomp.hpp"
4 #include "opencl_kernels_photo.hpp"
5 
6 namespace cv
7 {
8 namespace ocl
9 {
10 namespace photo
11 {
12 
// Program entry for the "nlmeans" OpenCL program. The initializer supplies, in
// order: the program name, the program text (all adjacent string literals
// below concatenate into one string), and a final 32-hex-digit string
// (presumably a hash of the program text -- confirm against the generator).
// ProgramEntry itself is declared outside this file.
// NOTE(review): this file is marked auto-generated; lasting changes to the
// program text belong in whatever source the generator reads, not here.
13 const struct ProgramEntry nlmeans={"nlmeans",
// --- Extension pragmas -------------------------------------------------------
// NOTE(review): the cl_amd_printf pragma below is spelled `OPENCL_EXTENSION`
// (underscore) while the two fp64 pragmas use `OPENCL EXTENSION` (space);
// confirm whether the underscore form is intended.
14 "#ifdef cl_amd_printf\n"
15 "#pragma OPENCL_EXTENSION cl_amd_printf:enable\n"
16 "#endif\n"
17 "#ifdef DOUBLE_SUPPORT\n"
18 "#ifdef cl_amd_fp64\n"
19 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
20 "#elif defined cl_khr_fp64\n"
21 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
22 "#endif\n"
23 "#endif\n"
// --- OP_CALC_WEIGHTS variant -------------------------------------------------
// Kernel calcAlmostDist2Weight: the work-item with global id `almostDist`
// (when below almostMaxDist) writes one entry of almostDist2Weight.
//   dist   = almostDist * almostDist2ActualDistMultiplier
//   w      = exp(-dist*dist * den) when ABS is defined, else exp(-dist * den)
//   weight = convert_wlut_t(fixedPointMult * w), with a NaN w replaced by 1.0
// Entries below WEIGHT_THRESHOLD * fixedPointMult are stored as 0.
24 "#ifdef OP_CALC_WEIGHTS\n"
25 "__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist,\n"
26 "FT almostDist2ActualDistMultiplier, int fixedPointMult,\n"
27 "w_t den, FT WEIGHT_THRESHOLD)\n"
28 "{\n"
29 "int almostDist = get_global_id(0);\n"
30 "if (almostDist < almostMaxDist)\n"
31 "{\n"
32 "FT dist = almostDist * almostDist2ActualDistMultiplier;\n"
33 "#ifdef ABS\n"
34 "w_t w = exp((w_t)(-dist*dist) * den);\n"
35 "#else\n"
36 "w_t w = exp((w_t)(-dist) * den);\n"
37 "#endif\n"
38 "wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));\n"
39 "almostDist2Weight[almostDist] =\n"
40 "weight < (wlut_t)(WEIGHT_THRESHOLD * fixedPointMult) ? (wlut_t)0 : weight;\n"
41 "}\n"
42 "}\n"
// --- OP_CALC_FASTNLMEANS variant ---------------------------------------------
// Helper functions followed by the fastNlMeansDenoising kernel. Names such as
// pixel_t, int_t, cn, psz, SEARCH_SIZE, TEMPLATE_SIZE, CTA_SIZE, BLOCK_COLS and
// BLOCK_ROWS are not defined in this text (presumably supplied as build
// options -- confirm against the host code).
43 "#elif defined OP_CALC_FASTNLMEANS\n"
44 "#define noconvert\n"
45 "#define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE)\n"
// calcDist: distance between two pixels, summed over the cn channels.
// With ABS defined the per-channel term is abs_diff(a, b); otherwise it is the
// squared difference. Any cn outside 1..4 triggers the #error below.
46 "inline int calcDist(pixel_t a, pixel_t b)\n"
47 "{\n"
48 "#ifdef ABS\n"
49 "int_t retval = convert_int_t(abs_diff(a, b));\n"
50 "#else\n"
51 "int_t diff = convert_int_t(a) - convert_int_t(b);\n"
52 "int_t retval = diff * diff;\n"
53 "#endif\n"
54 "#if cn == 1\n"
55 "return retval;\n"
56 "#elif cn == 2\n"
57 "return retval.x + retval.y;\n"
58 "#elif cn == 3\n"
59 "return retval.x + retval.y + retval.z;\n"
60 "#elif cn == 4\n"
61 "return retval.x + retval.y + retval.z + retval.w;\n"
62 "#else\n"
63 "#error \"cn should be either 1, 2, 3 or 4\"\n"
64 "#endif\n"
65 "}\n"
// calcDistUpDown: calcDist of the "down" pair minus calcDist of the "up" pair.
// The ABS variant calls calcDist twice. The non-ABS variant computes
// (A - B) * (A + B), i.e. A*A - B*B per channel, where A = down - down_t and
// B = up - up_t, then sums the channels.
66 "#ifdef ABS\n"
67 "inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)\n"
68 "{\n"
69 "return calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);\n"
70 "}\n"
71 "#else\n"
72 "inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)\n"
73 "{\n"
74 "int_t A = convert_int_t(down_value) - convert_int_t(down_value_t);\n"
75 "int_t B = convert_int_t(up_value) - convert_int_t(up_value_t);\n"
76 "int_t retval = (A - B) * (A + B);\n"
77 "#if cn == 1\n"
78 "return retval;\n"
79 "#elif cn == 2\n"
80 "return retval.x + retval.y;\n"
81 "#elif cn == 3\n"
82 "return retval.x + retval.y + retval.z;\n"
83 "#elif cn == 4\n"
84 "return retval.x + retval.y + retval.z + retval.w;\n"
85 "#else\n"
86 "#error \"cn should be either 1, 2, 3 or 4\"\n"
87 "#endif\n"
88 "}\n"
89 "#endif\n"
// COND is defined here but not referenced anywhere else in this program text.
90 "#define COND if (x == 0 && y == 0)\n"
// calcFirstElementInRow: full distance computation for one pixel.
// Each work-item handles search positions i = id, id + CTA_SIZE, ... below
// SEARCH_SIZE_SQ. For each i it accumulates calcDist over TEMPLATE_SIZE rows
// and columns tx in [-TEMPLATE_SIZE2, TEMPLATE_SIZE2], keeping per-column sums
// in a private array. It then copies those sums to col_dists + i * TEMPLATE_SIZE
// and stores the total in dists[i].
// NOTE(review): the final store into up_col_dists[0 + i] reads
// col_dists[TEMPLATE_SIZE - 1], not col_dists_current[TEMPLATE_SIZE - 1];
// confirm this is intended.
91 "inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset,\n"
92 "__local int * dists, int y, int x, int id,\n"
93 "__global int * col_dists, __global int * up_col_dists)\n"
94 "{\n"
95 "y -= TEMPLATE_SIZE2;\n"
96 "int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;\n"
97 "int col_dists_current_private[TEMPLATE_SIZE];\n"
98 "for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)\n"
99 "{\n"
100 "int dist = 0, value;\n"
101 "__global const pixel_t * src_template = (__global const pixel_t *)(src +\n"
102 "mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));\n"
103 "__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));\n"
104 "__global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;\n"
105 "#pragma unroll\n"
106 "for (int j = 0; j < TEMPLATE_SIZE; ++j)\n"
107 "col_dists_current_private[j] = 0;\n"
108 "for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)\n"
109 "{\n"
110 "#pragma unroll\n"
111 "for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx)\n"
112 "{\n"
113 "value = calcDist(src_template[tx], src_current[tx]);\n"
114 "col_dists_current_private[tx + TEMPLATE_SIZE2] += value;\n"
115 "dist += value;\n"
116 "}\n"
117 "src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);\n"
118 "src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);\n"
119 "}\n"
120 "#pragma unroll\n"
121 "for (int j = 0; j < TEMPLATE_SIZE; ++j)\n"
122 "col_dists_current[j] = col_dists_current_private[j];\n"
123 "dists[i] = dist;\n"
124 "up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1];\n"
125 "}\n"
126 "}\n"
// calcElementInFirstRow: incremental update when moving one pixel to the right.
// For each search position i it sums calcDist down one column of TEMPLATE_SIZE
// rows at column x + TEMPLATE_SIZE2. That sum replaces slot `first` of the
// per-position column sums, dists[i] is adjusted by (new - old), and the new
// sum is also stored at up_col_dists[x0 * SEARCH_SIZE_SQ + i].
127 "inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset,\n"
128 "__local int * dists, int y, int x0, int x, int id, int first,\n"
129 "__global int * col_dists, __global int * up_col_dists)\n"
130 "{\n"
131 "x += TEMPLATE_SIZE2;\n"
132 "y -= TEMPLATE_SIZE2;\n"
133 "int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;\n"
134 "for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)\n"
135 "{\n"
136 "__global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));\n"
137 "__global const pixel_t * src_template = (__global const pixel_t *)(src +\n"
138 "mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));\n"
139 "__global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;\n"
140 "int col_dist = 0;\n"
141 "#pragma unroll\n"
142 "for (int ty = 0; ty < TEMPLATE_SIZE; ++ty)\n"
143 "{\n"
144 "col_dist += calcDist(src_current[0], src_template[0]);\n"
145 "src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);\n"
146 "src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);\n"
147 "}\n"
148 "dists[i] += col_dist - col_dists_current[first];\n"
149 "col_dists_current[first] = col_dist;\n"
150 "up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist;\n"
151 "}\n"
152 "}\n"
// calcElement: incremental update that reuses the column sum saved in
// up_col_dists. The new column sum is the saved value plus calcDistUpDown of
// the pixels at row y + TEMPLATE_SIZE2 ("down") and row y - TEMPLATE_SIZE2 - 1
// ("up"), both at column x + TEMPLATE_SIZE2. The result replaces slot `first`
// of the column sums, adjusts dists[i] by (new - old), and is written back to
// the same up_col_dists slot.
153 "inline void calcElement(__global const uchar * src, int src_step, int src_offset,\n"
154 "__local int * dists, int y, int x0, int x, int id, int first,\n"
155 "__global int * col_dists, __global int * up_col_dists)\n"
156 "{\n"
157 "int sx = x + TEMPLATE_SIZE2;\n"
158 "int sy_up = y - TEMPLATE_SIZE2 - 1;\n"
159 "int sy_down = y + TEMPLATE_SIZE2;\n"
160 "pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(psz, sx, src_offset)));\n"
161 "pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(psz, sx, src_offset)));\n"
162 "sx -= SEARCH_SIZE2;\n"
163 "sy_up -= SEARCH_SIZE2;\n"
164 "sy_down -= SEARCH_SIZE2;\n"
165 "for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)\n"
166 "{\n"
167 "int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;\n"
168 "pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(psz, sx + wx, src_offset)));\n"
169 "pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(psz, sx + wx, src_offset)));\n"
170 "__global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);\n"
171 "__global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);\n"
172 "int col_dist = up_col_dists_current[0] + calcDistUpDown(down_value, down_value_t, up_value, up_value_t);\n"
173 "dists[i] += col_dist - col_dists_current[0];\n"
174 "col_dists_current[0] = col_dist;\n"
175 "up_col_dists_current[0] = col_dist;\n"
176 "}\n"
177 "}\n"
// convolveWindow: produces the output pixel at (y, x).
// Each work-item accumulates, over its own search positions, a weight looked
// up as almostDist2Weight[dists[i] >> almostTemplateWindowSizeSqBinShift] and
// the weight multiplied by the source pixel. The partial results go to the
// local arrays. After a barrier, the loop folds the upper half onto the lower
// half while lsize > 2, so the loop itself never folds below four partial
// results. Work-item 0 then adds elements 0..3 explicitly and writes
// weighted_sum / weights to dst.
178 "inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,\n"
179 "__local int * dists, __global const wlut_t * almostDist2Weight,\n"
180 "__global uchar * dst, int dst_step, int dst_offset,\n"
181 "int y, int x, int id, __local weight_t * weights_local,\n"
182 "__local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)\n"
183 "{\n"
184 "int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;\n"
185 "weight_t weights = (weight_t)0;\n"
186 "sum_t weighted_sum = (sum_t)0;\n"
187 "for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)\n"
188 "{\n"
189 "int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, psz, src_offset));\n"
190 "sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));\n"
191 "int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;\n"
192 "weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);\n"
193 "weights += weight;\n"
194 "weighted_sum += (sum_t)weight * src_value;\n"
195 "}\n"
196 "weights_local[id] = weights;\n"
197 "weighted_sum_local[id] = weighted_sum;\n"
198 "barrier(CLK_LOCAL_MEM_FENCE);\n"
199 "for (int lsize = CTA_SIZE >> 1; lsize > 2; lsize >>= 1)\n"
200 "{\n"
201 "if (id < lsize)\n"
202 "{\n"
203 "int id2 = lsize + id;\n"
204 "weights_local[id] += weights_local[id2];\n"
205 "weighted_sum_local[id] += weighted_sum_local[id2];\n"
206 "}\n"
207 "barrier(CLK_LOCAL_MEM_FENCE);\n"
208 "}\n"
209 "if (id == 0)\n"
210 "{\n"
211 "int dst_index = mad24(y, dst_step, mad24(psz, x, dst_offset));\n"
212 "sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +\n"
213 "weighted_sum_local[2] + weighted_sum_local[3];\n"
214 "weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];\n"
215 "*(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);\n"
216 "}\n"
217 "}\n"
// Kernel fastNlMeansDenoising: work-group (block_x, block_y) walks the output
// pixels x0..x1-1 by y0..y1-1, a BLOCK_COLS x BLOCK_ROWS tile clipped to
// dst_cols / dst_rows. col_dists and up_col_dists are carved out of `buffer`
// at a per-group offset (block_data_start). For each pixel:
//   x == x0           -> calcFirstElementInRow, then first = 0
//   x != x0, y == y0  -> calcElementInFirstRow, then first advances mod TEMPLATE_SIZE
//   otherwise         -> calcElement, then first advances mod TEMPLATE_SIZE
// followed by convolveWindow, which writes the output pixel.
// `first` is declared uninitialized but is assigned on the x == x0 iteration,
// which is the first iteration of every row.
218 "__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,\n"
219 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
220 "__global const wlut_t * almostDist2Weight, __global uchar * buffer,\n"
221 "int almostTemplateWindowSizeSqBinShift)\n"
222 "{\n"
223 "int block_x = get_group_id(0), nblocks_x = get_num_groups(0);\n"
224 "int block_y = get_group_id(1);\n"
225 "int id = get_local_id(0), first;\n"
226 "__local int dists[SEARCH_SIZE_SQ];\n"
227 "__local weight_t weights[CTA_SIZE];\n"
228 "__local sum_t weighted_sum[CTA_SIZE];\n"
229 "int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);\n"
230 "int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);\n"
231 "int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE);\n"
232 "__global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int));\n"
233 "__global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;\n"
234 "for (int y = y0; y < y1; ++y)\n"
235 "for (int x = x0; x < x1; ++x)\n"
236 "{\n"
237 "if (x == x0)\n"
238 "{\n"
239 "calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists);\n"
240 "first = 0;\n"
241 "}\n"
242 "else\n"
243 "{\n"
244 "if (y == y0)\n"
245 "calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);\n"
246 "else\n"
247 "calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists);\n"
248 "first = (first + 1) % TEMPLATE_SIZE;\n"
249 "}\n"
250 "convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset,\n"
251 "y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift);\n"
252 "}\n"
253 "}\n"
254 "#endif\n"
// Last initializer: 32 hex digits (presumably a digest of the program text
// above -- if so, it must be regenerated whenever that text changes; confirm).
255 , "094aea838a917cea483f77e19dd39de3"};
// Constructs the ProgramSource object `nlmeans_oclsrc` from the program text
// held in nlmeans.programStr. ProgramSource and the programStr member are
// declared outside this file (presumably programStr is the kernel-text field
// of ProgramEntry -- confirm against its declaration).
256 ProgramSource nlmeans_oclsrc(nlmeans.programStr);
257 }
258 }}
259