1 // This file is auto-generated. Do not edit!
2 
3 #include "precomp.hpp"
4 #include "opencl_kernels_imgproc.hpp"
5 
6 namespace cv
7 {
8 namespace ocl
9 {
10 namespace imgproc
11 {
12 
// Embedded OpenCL program "accumulate".
//
// The initializer lists the program name, the kernel source (adjacent string
// literals concatenated by the compiler) and a 32-hex-digit string
// (presumably a digest of the source used for caching -- TODO confirm against
// the generator).
//
// Kernel: accumulate
//   Names the source uses but does not define (presumably supplied as build
//   options -- confirm against the host code):
//     cn, srcT1, dstT1, convertToDT, rowsPerWI, and exactly one of
//     ACCUMULATE / ACCUMULATE_SQUARE / ACCUMULATE_PRODUCT / ACCUMULATE_WEIGHTED
//     (anything else hits the "#error" branch).
//   Optional switches: DOUBLE_SUPPORT (enables an fp64 extension pragma),
//     HAVE_MASK (adds the mask arguments).
//
//   Each work-item handles column x = get_global_id(0) and up to rowsPerWI
//   consecutive rows starting at get_global_id(1) * rowsPerWI; rows at or
//   beyond dst_rows are skipped. For every channel c < cn:
//     ACCUMULATE          : dst[c] += src[c]
//     ACCUMULATE_SQUARE   : dst[c] += src[c] * src[c]
//     ACCUMULATE_PRODUCT  : dst[c] += src[c] * src2[c]
//     ACCUMULATE_WEIGHTED : dst[c]  = (1 - alpha) * dst[c] + src[c] * alpha
//   With HAVE_MASK the pixel is updated only when its mask byte (one byte per
//   pixel, indexed by x) is non-zero.
13 const struct ProgramEntry accumulate={"accumulate",
14 "#ifdef DOUBLE_SUPPORT\n"
15 "#ifdef cl_amd_fp64\n"
16 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
17 "#elif defined (cl_khr_fp64)\n"
18 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
19 "#endif\n"
20 "#endif\n"
21 "#define SRC_TSIZE cn * (int)sizeof(srcT1)\n"
22 "#define DST_TSIZE cn * (int)sizeof(dstT1)\n"
23 "#define noconvert\n"
24 "__kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,\n"
25 "#ifdef ACCUMULATE_PRODUCT\n"
26 "__global const uchar * src2ptr, int src2_step, int src2_offset,\n"
27 "#endif\n"
28 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols\n"
29 "#ifdef ACCUMULATE_WEIGHTED\n"
30 ", dstT1 alpha\n"
31 "#endif\n"
32 "#ifdef HAVE_MASK\n"
33 ", __global const uchar * mask, int mask_step, int mask_offset\n"
34 "#endif\n"
35 ")\n"
36 "{\n"
37 "int x = get_global_id(0);\n"
38 "int y = get_global_id(1) * rowsPerWI;\n"
39 "if (x < dst_cols)\n"
40 "{\n"
41 "int src_index = mad24(y, src_step, mad24(x, SRC_TSIZE, src_offset));\n"
42 "#ifdef HAVE_MASK\n"
43 "int mask_index = mad24(y, mask_step, mask_offset + x);\n"
44 "mask += mask_index;\n"
45 "#endif\n"
46 "#ifdef ACCUMULATE_PRODUCT\n"
47 "int src2_index = mad24(y, src2_step, mad24(x, SRC_TSIZE, src2_offset));\n"
48 "#endif\n"
49 "int dst_index = mad24(y, dst_step, mad24(x, DST_TSIZE, dst_offset));\n"
50 "#pragma unroll\n"
51 "for (int i = 0; i < rowsPerWI; ++i)\n"
52 "if (y < dst_rows)\n"
53 "{\n"
54 "__global const srcT1 * src = (__global const srcT1 *)(srcptr + src_index);\n"
55 "#ifdef ACCUMULATE_PRODUCT\n"
56 "__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);\n"
57 "#endif\n"
58 "__global dstT1 * dst = (__global dstT1 *)(dstptr + dst_index);\n"
59 "#ifdef HAVE_MASK\n"
60 "if (mask[0])\n"
61 "#endif\n"
62 "#pragma unroll\n"
63 "for (int c = 0; c < cn; ++c)\n"
64 "{\n"
65 "#ifdef ACCUMULATE\n"
66 "dst[c] += convertToDT(src[c]);\n"
67 "#elif defined ACCUMULATE_SQUARE\n"
68 "dstT1 val = convertToDT(src[c]);\n"
69 "dst[c] = fma(val, val, dst[c]);\n"
70 "#elif defined ACCUMULATE_PRODUCT\n"
71 "dst[c] = fma(convertToDT(src[c]), convertToDT(src2[c]), dst[c]);\n"
72 "#elif defined ACCUMULATE_WEIGHTED\n"
73 "dst[c] = fma(1 - alpha, dst[c], src[c] * alpha);\n"
74 "#else\n"
75 "#error \"Unknown accumulation type\"\n"
76 "#endif\n"
77 "}\n"
78 "src_index += src_step;\n"
79 "#ifdef ACCUMULATE_PRODUCT\n"
80 "src2_index += src2_step;\n"
81 "#endif\n"
82 "#ifdef HAVE_MASK\n"
83 "mask += mask_step;\n"
84 "#endif\n"
85 "dst_index += dst_step;\n"
86 "++y;\n"
87 "}\n"
88 "}\n"
89 "}\n"
90 , "5f2c2d40f721d738ad2b8ef755376c6f"};
// ProgramSource object constructed from the source text of the entry above.
91 ProgramSource accumulate_oclsrc(accumulate.programStr);
// Embedded OpenCL program "bilateral".
//
// The initializer lists the program name, the kernel source (adjacent string
// literals concatenated by the compiler) and a 32-hex-digit string
// (presumably a digest of the source used for caching -- TODO confirm against
// the generator).
//
// Names the source uses but does not define (presumably supplied as build
// options -- confirm against the host code):
//   cn (1..4, otherwise "#error"), radius, maxk, gauss_color_coeff,
//   uchar_t, int_t, float_t, convert_uchar_t, convert_int_t, convert_float_t.
// Optional switch: INTEL_DEVICE (float instead of int differences in
//   "bilateral", and compiles "bilateral_float4" when cn == 1).
//
// Kernel: bilateral
//   One work-item per destination pixel (x, y). The centre sample is read
//   from src at (x + radius, y + radius). For each k < maxk the neighbour at
//   byte offset space_ofs[k] from the centre is read, diff is the sum over
//   channels of |neighbour - centre|, and the neighbour is weighted by
//     space_weight[k] * native_exp(diff * diff * gauss_color_coeff).
//   The stored pixel is (weighted sum) / (sum of weights) converted with
//   convert_uchar_t.
//
// Kernel: bilateral_float4 (only with INTEL_DEVICE and cn == 1)
//   Same weighting, but each work-item processes four horizontally adjacent
//   single-channel pixels using float4 arithmetic; the result has 0.5 added
//   and is converted with convert_uchar4_rtz.
//   NOTE(review): the guard is x < dst_cols / 4, so columns at or beyond
//   (dst_cols / 4) * 4 are not written by this kernel -- presumably the host
//   only selects it when dst_cols is a multiple of 4; confirm.
92 const struct ProgramEntry bilateral={"bilateral",
93 "#if cn != 3\n"
94 "#define loadpix(addr) *(__global const uchar_t *)(addr)\n"
95 "#define storepix(val, addr)  *(__global uchar_t *)(addr) = val\n"
96 "#define TSIZE cn\n"
97 "#else\n"
98 "#define loadpix(addr) vload3(0, (__global const uchar *)(addr))\n"
99 "#define storepix(val, addr) vstore3(val, 0, (__global uchar *)(addr))\n"
100 "#define TSIZE 3\n"
101 "#endif\n"
102 "#if cn == 1\n"
103 "#define SUM(a) a\n"
104 "#elif cn == 2\n"
105 "#define SUM(a) a.x + a.y\n"
106 "#elif cn == 3\n"
107 "#define SUM(a) a.x + a.y + a.z\n"
108 "#elif cn == 4\n"
109 "#define SUM(a) a.x + a.y + a.z + a.w\n"
110 "#else\n"
111 "#error \"cn should be <= 4\"\n"
112 "#endif\n"
113 "__kernel void bilateral(__global const uchar * src, int src_step, int src_offset,\n"
114 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
115 "__constant float * space_weight, __constant int * space_ofs)\n"
116 "{\n"
117 "int x = get_global_id(0);\n"
118 "int y = get_global_id(1);\n"
119 "if (y < dst_rows && x < dst_cols)\n"
120 "{\n"
121 "int src_index = mad24(y + radius, src_step, mad24(x + radius, TSIZE, src_offset));\n"
122 "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n"
123 "float_t sum = (float_t)(0.0f);\n"
124 "float wsum = 0.0f;\n"
125 "#ifdef INTEL_DEVICE\n"
126 "float_t val0 = convert_float_t(loadpix(src + src_index));\n"
127 "#else\n"
128 "int_t val0 = convert_int_t(loadpix(src + src_index));\n"
129 "#endif\n"
130 "#pragma unroll\n"
131 "for (int k = 0; k < maxk; k++ )\n"
132 "{\n"
133 "#ifdef INTEL_DEVICE\n"
134 "float_t val = convert_float_t(loadpix(src + src_index + space_ofs[k]));\n"
135 "float diff = SUM(fabs(val - val0));\n"
136 "#else\n"
137 "int_t val = convert_int_t(loadpix(src + src_index + space_ofs[k]));\n"
138 "int diff = SUM(abs(val - val0));\n"
139 "#endif\n"
140 "float w = space_weight[k] * native_exp((float)(diff * diff * gauss_color_coeff));\n"
141 "sum += convert_float_t(val) * (float_t)(w);\n"
142 "wsum += w;\n"
143 "}\n"
144 "storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index);\n"
145 "}\n"
146 "}\n"
147 "#ifdef INTEL_DEVICE\n"
148 "#if cn == 1\n"
149 "__kernel void bilateral_float4(__global const uchar * src, int src_step, int src_offset,\n"
150 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
151 "__constant float * space_weight, __constant int * space_ofs)\n"
152 "{\n"
153 "int x = get_global_id(0);\n"
154 "int y = get_global_id(1);\n"
155 "if (y < dst_rows && x < dst_cols / 4 )\n"
156 "{\n"
157 "int src_index = ((y + radius) * src_step) + x * 4  + (radius + src_offset);\n"
158 "int dst_index = (y  * dst_step) +  x * 4 + dst_offset ;\n"
159 "float4 sum = 0.f, wsum = 0.f;\n"
160 "float4 val0 = convert_float4(vload4(0, src + src_index));\n"
161 "#pragma unroll\n"
162 "for (int k = 0; k < maxk; k++ )\n"
163 "{\n"
164 "float4 val = convert_float4(vload4(0, src + src_index + space_ofs[k]));\n"
165 "float4 w = space_weight[k] * native_exp((val - val0) * (val - val0) * gauss_color_coeff);\n"
166 "sum += val * w;\n"
167 "wsum += w;\n"
168 "}\n"
169 "sum = sum / wsum + .5f;\n"
170 "vstore4(convert_uchar4_rtz(sum), 0, dst + dst_index);\n"
171 "}\n"
172 "}\n"
173 "#endif\n"
174 "#endif\n"
175 , "1cc12569fdb93cbfa05bb215d3d42e64"};
// ProgramSource object constructed from the source text of the entry above.
176 ProgramSource bilateral_oclsrc(bilateral.programStr);
// Embedded OpenCL program "blend_linear".
//
// The initializer lists the program name, the kernel source (adjacent string
// literals concatenated by the compiler) and a 32-hex-digit string
// (presumably a digest of the source used for caching -- TODO confirm against
// the generator).
//
// Kernel: blendLinear
//   Names the source uses but does not define (presumably supplied as build
//   options -- confirm against the host code): T, cn, convertToT.
//   Optional switch: DOUBLE_SUPPORT (enables an fp64 extension pragma).
//
//   One work-item per destination pixel (x, y). A float weight is read for
//   the pixel from each of weight1 and weight2 (w1, w2), and for every
//   channel i < cn the kernel stores
//     dst[i] = convertToT((w1 * src1[i] + w2 * src2[i]) / (w1 + w2 + 1e-5f)).
//   The 1e-5f term keeps the denominator non-zero when both weights are zero.
177 const struct ProgramEntry blend_linear={"blend_linear",
178 "#ifdef DOUBLE_SUPPORT\n"
179 "#ifdef cl_amd_fp64\n"
180 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
181 "#elif defined (cl_khr_fp64)\n"
182 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
183 "#endif\n"
184 "#endif\n"
185 "#define noconvert\n"
186 "__kernel void blendLinear(__global const uchar * src1ptr, int src1_step, int src1_offset,\n"
187 "__global const uchar * src2ptr, int src2_step, int src2_offset,\n"
188 "__global const uchar * weight1, int weight1_step, int weight1_offset,\n"
189 "__global const uchar * weight2, int weight2_step, int weight2_offset,\n"
190 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
191 "{\n"
192 "int x = get_global_id(0);\n"
193 "int y = get_global_id(1);\n"
194 "if (x < dst_cols && y < dst_rows)\n"
195 "{\n"
196 "int src1_index = mad24(y, src1_step, src1_offset + x * cn * (int)sizeof(T));\n"
197 "int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(T));\n"
198 "int weight1_index = mad24(y, weight1_step, weight1_offset + x * (int)sizeof(float));\n"
199 "int weight2_index = mad24(y, weight2_step, weight2_offset + x * (int)sizeof(float));\n"
200 "int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(T));\n"
201 "float w1 = *(__global const float *)(weight1 + weight1_index),\n"
202 "w2 = *(__global const float *)(weight2 + weight2_index);\n"
203 "float den = w1 + w2 + 1e-5f;\n"
204 "__global const T * src1 = (__global const T *)(src1ptr + src1_index);\n"
205 "__global const T * src2 = (__global const T *)(src2ptr + src2_index);\n"
206 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
207 "#pragma unroll\n"
208 "for (int i = 0; i < cn; ++i)\n"
209 "{\n"
210 "float num = w1 * convert_float(src1[i]) + w2 * convert_float(src2[i]);\n"
211 "dst[i] = convertToT(num / den);\n"
212 "}\n"
213 "}\n"
214 "}\n"
215 , "76072b51c3ede4951ee0200aa33297dc"};
// ProgramSource object constructed from the source text of the entry above.
216 ProgramSource blend_linear_oclsrc(blend_linear.programStr);
// Embedded OpenCL program "boxFilter".
//
// The initializer lists the program name, the kernel source (adjacent string
// literals concatenated by the compiler) and a 32-hex-digit string
// (presumably a digest of the source used for caching -- TODO confirm against
// the generator).
//
// Names the source uses but does not define (presumably supplied as build
// options -- confirm against the host code):
//   cn, ST, DT, WT, ST1, DT1, convertToWT, convertToDT,
//   KERNEL_SIZE_X, KERNEL_SIZE_Y, ANCHOR_X, ANCHOR_Y, LOCAL_SIZE_X,
//   BLOCK_SIZE_Y, and one of BORDER_CONSTANT / BORDER_REPLICATE /
//   BORDER_WRAP / BORDER_REFLECT / BORDER_REFLECT_101 (otherwise "#error").
// Optional switches: DOUBLE_SUPPORT, BORDER_ISOLATED, SQR, NORMALIZE.
//
// Helper: readSrcPixel
//   Returns the source pixel at pos converted to WT (squared when SQR is
//   defined). Positions outside the valid rectangle -- [x1, x2) x [y1, y2)
//   with BORDER_ISOLATED, [0, x2) x [0, y2) otherwise -- yield 0 for
//   BORDER_CONSTANT or are remapped by the EXTRAPOLATE macro for the other
//   border modes.
//
// Kernel: boxFilter
//   Each work-item owns one source column. It keeps KERNEL_SIZE_Y vertically
//   consecutive samples in the private array "data" (reused as a circular
//   buffer through sy_index) and publishes their sum in the local array
//   sumOfCols. Work-items whose local id lies in
//     [ANCHOR_X, LOCAL_SIZE_X - (KERNEL_SIZE_X - 1 - ANCHOR_X))
//   and whose x is inside [0, cols) add KERNEL_SIZE_X neighbouring column
//   sums and store the result (multiplied by alpha when NORMALIZE is
//   defined). The loop then slides the window down one row: the oldest
//   sample is subtracted, a new one is read and added, and dst advances by
//   dst_step. Up to min(rows - y, BLOCK_SIZE_Y) rows are produced per
//   work-item. Because x is derived from
//     LOCAL_SIZE_X - (KERNEL_SIZE_X - 1)
//   columns per group, neighbouring work-groups overlap by KERNEL_SIZE_X - 1
//   columns.
217 const struct ProgramEntry boxFilter={"boxFilter",
218 "#ifdef DOUBLE_SUPPORT\n"
219 "#ifdef cl_amd_fp64\n"
220 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
221 "#elif defined (cl_khr_fp64)\n"
222 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
223 "#endif\n"
224 "#endif\n"
225 "#if cn != 3\n"
226 "#define loadpix(addr) *(__global const ST *)(addr)\n"
227 "#define storepix(val, addr)  *(__global DT *)(addr) = val\n"
228 "#define SRCSIZE (int)sizeof(ST)\n"
229 "#define DSTSIZE (int)sizeof(DT)\n"
230 "#else\n"
231 "#define loadpix(addr) vload3(0, (__global const ST1 *)(addr))\n"
232 "#define storepix(val, addr) vstore3(val, 0, (__global DT1 *)(addr))\n"
233 "#define SRCSIZE (int)sizeof(ST1)*cn\n"
234 "#define DSTSIZE (int)sizeof(DT1)*cn\n"
235 "#endif\n"
236 "#ifdef BORDER_CONSTANT\n"
237 "#elif defined BORDER_REPLICATE\n"
238 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n"
239 "{ \\\n"
240 "x = max(min(x, maxX - 1), minX); \\\n"
241 "y = max(min(y, maxY - 1), minY); \\\n"
242 "}\n"
243 "#elif defined BORDER_WRAP\n"
244 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n"
245 "{ \\\n"
246 "if (x < minX) \\\n"
247 "x -= ((x - maxX + 1) / maxX) * maxX; \\\n"
248 "if (x >= maxX) \\\n"
249 "x %= maxX; \\\n"
250 "if (y < minY) \\\n"
251 "y -= ((y - maxY + 1) / maxY) * maxY; \\\n"
252 "if (y >= maxY) \\\n"
253 "y %= maxY; \\\n"
254 "}\n"
255 "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n"
256 "#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \\\n"
257 "{ \\\n"
258 "if (maxX - minX == 1) \\\n"
259 "x = minX; \\\n"
260 "else \\\n"
261 "do \\\n"
262 "{ \\\n"
263 "if (x < minX) \\\n"
264 "x = minX - (x - minX) - 1 + delta; \\\n"
265 "else \\\n"
266 "x = maxX - 1 - (x - maxX) - delta; \\\n"
267 "} \\\n"
268 "while (x >= maxX || x < minX); \\\n"
269 "\\\n"
270 "if (maxY - minY == 1) \\\n"
271 "y = minY; \\\n"
272 "else \\\n"
273 "do \\\n"
274 "{ \\\n"
275 "if (y < minY) \\\n"
276 "y = minY - (y - minY) - 1 + delta; \\\n"
277 "else \\\n"
278 "y = maxY - 1 - (y - maxY) - delta; \\\n"
279 "} \\\n"
280 "while (y >= maxY || y < minY); \\\n"
281 "}\n"
282 "#ifdef BORDER_REFLECT\n"
283 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)\n"
284 "#elif defined(BORDER_REFLECT_101)\n"
285 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)\n"
286 "#endif\n"
287 "#else\n"
288 "#error No extrapolation method\n"
289 "#endif\n"
290 "#define noconvert\n"
291 "#ifdef SQR\n"
292 "#define PROCESS_ELEM(value) (value * value)\n"
293 "#else\n"
294 "#define PROCESS_ELEM(value) value\n"
295 "#endif\n"
296 "struct RectCoords\n"
297 "{\n"
298 "int x1, y1, x2, y2;\n"
299 "};\n"
300 "inline WT readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, const struct RectCoords srcCoords)\n"
301 "{\n"
302 "#ifdef BORDER_ISOLATED\n"
303 "if (pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)\n"
304 "#else\n"
305 "if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)\n"
306 "#endif\n"
307 "{\n"
308 "int src_index = mad24(pos.y, src_step, pos.x * SRCSIZE);\n"
309 "WT value = convertToWT(loadpix(srcptr + src_index));\n"
310 "return PROCESS_ELEM(value);\n"
311 "}\n"
312 "else\n"
313 "{\n"
314 "#ifdef BORDER_CONSTANT\n"
315 "return (WT)(0);\n"
316 "#else\n"
317 "int selected_col = pos.x, selected_row = pos.y;\n"
318 "EXTRAPOLATE(selected_col, selected_row,\n"
319 "#ifdef BORDER_ISOLATED\n"
320 "srcCoords.x1, srcCoords.y1,\n"
321 "#else\n"
322 "0, 0,\n"
323 "#endif\n"
324 "srcCoords.x2, srcCoords.y2);\n"
325 "int src_index = mad24(selected_row, src_step, selected_col * SRCSIZE);\n"
326 "WT value = convertToWT(loadpix(srcptr + src_index));\n"
327 "return PROCESS_ELEM(value);\n"
328 "#endif\n"
329 "}\n"
330 "}\n"
331 "__kernel void boxFilter(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,\n"
332 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols\n"
333 "#ifdef NORMALIZE\n"
334 ", float alpha\n"
335 "#endif\n"
336 ")\n"
337 "{\n"
338 "const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY };\n"
339 "int x = get_local_id(0) + (LOCAL_SIZE_X - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;\n"
340 "int y = get_global_id(1) * BLOCK_SIZE_Y;\n"
341 "int local_id = get_local_id(0);\n"
342 "WT data[KERNEL_SIZE_Y];\n"
343 "__local WT sumOfCols[LOCAL_SIZE_X];\n"
344 "int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);\n"
345 "#pragma unroll\n"
346 "for (int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)\n"
347 "data[sy] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);\n"
348 "WT tmp_sum = (WT)(0);\n"
349 "#pragma unroll\n"
350 "for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)\n"
351 "tmp_sum += data[sy];\n"
352 "sumOfCols[local_id] = tmp_sum;\n"
353 "barrier(CLK_LOCAL_MEM_FENCE);\n"
354 "int dst_index = mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset));\n"
355 "__global DT * dst = (__global DT *)(dstptr + dst_index);\n"
356 "int sy_index = 0;\n"
357 "for (int i = 0, stepY = min(rows - y, BLOCK_SIZE_Y); i < stepY; ++i)\n"
358 "{\n"
359 "if (local_id >= ANCHOR_X && local_id < LOCAL_SIZE_X - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&\n"
360 "x >= 0 && x < cols)\n"
361 "{\n"
362 "WT total_sum = (WT)(0);\n"
363 "#pragma unroll\n"
364 "for (int sx = 0; sx < KERNEL_SIZE_X; sx++)\n"
365 "total_sum += sumOfCols[local_id + sx - ANCHOR_X];\n"
366 "#ifdef NORMALIZE\n"
367 "DT dstval = convertToDT((WT)(alpha) * total_sum);\n"
368 "#else\n"
369 "DT dstval = convertToDT(total_sum);\n"
370 "#endif\n"
371 "storepix(dstval, dst);\n"
372 "}\n"
373 "barrier(CLK_LOCAL_MEM_FENCE);\n"
374 "tmp_sum = sumOfCols[local_id];\n"
375 "tmp_sum -= data[sy_index];\n"
376 "data[sy_index] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);\n"
377 "srcPos.y++;\n"
378 "tmp_sum += data[sy_index];\n"
379 "sumOfCols[local_id] = tmp_sum;\n"
380 "sy_index = sy_index + 1 < KERNEL_SIZE_Y ? sy_index + 1 : 0;\n"
381 "barrier(CLK_LOCAL_MEM_FENCE);\n"
382 "dst = (__global DT *)((__global uchar *)dst + dst_step);\n"
383 "}\n"
384 "}\n"
385 , "d3e542270fa2ea1fc3744043dad50cb4"};
// ProgramSource object constructed from the source text of the entry above.
386 ProgramSource boxFilter_oclsrc(boxFilter.programStr);
// Embedded OpenCL program "calc_back_project".
//
// The initializer lists the program name, the kernel source (adjacent string
// literals concatenated by the compiler) and a 32-hex-digit string
// (presumably a digest of the source used for caching -- TODO confirm against
// the generator).
//
// Names the source uses but does not define (presumably supplied as build
// options -- confirm against the host code):
//   histdims (1 or 2, otherwise "#error"), scn (1-D variant),
//   scn1 and scn2 (2-D variant).
// NOTE(review): the "#error" text mentions nimages although the condition
//   being tested is histdims.
//
// histdims == 1
//   calcLUT: for value x = get_global_id(0), stores OUT_OF_RANGE (-1) in
//     lut[x] when x lies outside [ranges[0], ranges[1]] or the computed bin
//     is >= hist_bins. Otherwise the bin is
//       convert_int_sat_rtn((x - ranges[0]) / gap + ROUNDING_EPS)
//     with gap = (ranges[1] - ranges[0]) / hist_bins, and lut[x] receives
//     the float histogram entry of that bin times scale, rounded and
//     saturated to uchar.
//   LUT: one work-item per destination pixel; reads the source byte at
//     x * scn, looks it up in lut and writes 0 for OUT_OF_RANGE or the
//     looked-up value otherwise.
//
// histdims == 2
//   calcLUT: same binning, but lut[x] (after adding lut_offset) stores the
//     bin index itself (or OUT_OF_RANGE); ranges is advanced by roffset.
//   LUT: reads bin1 = lut[src1 byte] and bin2 = lut[src2 byte + 256];
//     writes 0 when either is OUT_OF_RANGE, otherwise the float histogram
//     entry at row bin1 / column bin2 times scale, rounded and saturated to
//     uchar.
387 const struct ProgramEntry calc_back_project={"calc_back_project",
388 "#define OUT_OF_RANGE -1\n"
389 "#define ROUNDING_EPS 0.000001f\n"
390 "#if histdims == 1\n"
391 "__kernel void calcLUT(__global const uchar * histptr, int hist_step, int hist_offset, int hist_bins,\n"
392 "__global int * lut, float scale, __constant float * ranges)\n"
393 "{\n"
394 "int x = get_global_id(0);\n"
395 "float value = convert_float(x);\n"
396 "if (value > ranges[1] || value < ranges[0])\n"
397 "lut[x] = OUT_OF_RANGE;\n"
398 "else\n"
399 "{\n"
400 "float lb = ranges[0], ub = ranges[1], gap = (ub - lb) / hist_bins;\n"
401 "value -= lb;\n"
402 "int bin = convert_int_sat_rtn(value / gap + ROUNDING_EPS);\n"
403 "if (bin >= hist_bins)\n"
404 "lut[x] = OUT_OF_RANGE;\n"
405 "else\n"
406 "{\n"
407 "int hist_index = mad24(hist_step, bin, hist_offset);\n"
408 "__global const float * hist = (__global const float *)(histptr + hist_index);\n"
409 "lut[x] = (int)convert_uchar_sat_rte(hist[0] * scale);\n"
410 "}\n"
411 "}\n"
412 "}\n"
413 "__kernel void LUT(__global const uchar * src, int src_step, int src_offset,\n"
414 "__constant int * lut,\n"
415 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
416 "{\n"
417 "int x = get_global_id(0);\n"
418 "int y = get_global_id(1);\n"
419 "if (x < dst_cols && y < dst_rows)\n"
420 "{\n"
421 "int src_index = mad24(y, src_step, src_offset + x * scn);\n"
422 "int dst_index = mad24(y, dst_step, dst_offset + x);\n"
423 "int value = lut[src[src_index]];\n"
424 "dst[dst_index] = value == OUT_OF_RANGE ? 0 : convert_uchar(value);\n"
425 "}\n"
426 "}\n"
427 "#elif histdims == 2\n"
428 "__kernel void calcLUT(int hist_bins, __global int * lut, int lut_offset,\n"
429 "__constant float * ranges, int roffset)\n"
430 "{\n"
431 "int x = get_global_id(0);\n"
432 "float value = convert_float(x);\n"
433 "ranges += roffset;\n"
434 "lut += lut_offset;\n"
435 "if (value > ranges[1] || value < ranges[0])\n"
436 "lut[x] = OUT_OF_RANGE;\n"
437 "else\n"
438 "{\n"
439 "float lb = ranges[0], ub = ranges[1], gap = (ub - lb) / hist_bins;\n"
440 "value -= lb;\n"
441 "int bin = convert_int_sat_rtn(value / gap + ROUNDING_EPS);\n"
442 "lut[x] = bin >= hist_bins ? OUT_OF_RANGE : bin;\n"
443 "}\n"
444 "}\n"
445 "__kernel void LUT(__global const uchar * src1, int src1_step, int src1_offset,\n"
446 "__global const uchar * src2, int src2_step, int src2_offset,\n"
447 "__global const uchar * histptr, int hist_step, int hist_offset,\n"
448 "__constant int * lut, float scale,\n"
449 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
450 "{\n"
451 "int x = get_global_id(0);\n"
452 "int y = get_global_id(1);\n"
453 "if (x < dst_cols && y < dst_rows)\n"
454 "{\n"
455 "int src1_index = mad24(y, src1_step, src1_offset + x * scn1);\n"
456 "int src2_index = mad24(y, src2_step, src2_offset + x * scn2);\n"
457 "int dst_index = mad24(y, dst_step, dst_offset + x);\n"
458 "int bin1 = lut[src1[src1_index]];\n"
459 "int bin2 = lut[src2[src2_index] + 256];\n"
460 "dst[dst_index] = bin1 == OUT_OF_RANGE || bin2 == OUT_OF_RANGE ? 0 :\n"
461 "convert_uchar_sat_rte(*(__global const float *)(histptr +\n"
462 "mad24(hist_step, bin1, hist_offset + bin2 * (int)sizeof(float))) * scale);\n"
463 "}\n"
464 "}\n"
465 "#else\n"
466 "#error \"(nimages <= 2) should be true\"\n"
467 "#endif\n"
468 , "6bab391f796ff5b2ba3d38f23929307e"};
// ProgramSource object constructed from the source text of the entry above.
469 ProgramSource calc_back_project_oclsrc(calc_back_project.programStr);
// Embedded OpenCL program "canny".
//
// The initializer lists the program name, the kernel source (adjacent string
// literals concatenated by the compiler) and a 32-hex-digit string
// (presumably a digest of the source used for caching -- TODO confirm against
// the generator).
//
// Exactly one section of the source is compiled, selected by which of
// WITH_SOBEL / WITHOUT_SOBEL / STAGE2 / GET_EDGES is defined. Names the
// source uses but does not define (presumably supplied as build options --
// confirm against the host code):
//   cn, TYPE, floatN, convert_floatN, GRP_SIZEX, GRP_SIZEY (stage 1),
//   LOCAL_X, LOCAL_Y, PIX_PER_WI (stage 2 / getEdges); optional L2GRAD.
//
// Every stage reads or writes an int-per-pixel "map" in which the kernels
// use the values 0, 1 and 2:
//   1 = default written by stage 1,
//   0 = magnitude above low_thr and a local maximum along the quantised
//       gradient direction, but not above high_thr,
//   2 = as 0 but above high_thr, or promoted to 2 by stage 2.
//
// WITH_SOBEL -> stage1_with_sobel
//   Loads a (GRP_SIZEX + 4) x (GRP_SIZEY + 4) tile, clamped to the image
//   borders, into local memory, evaluates 3x3 Sobel dx/dy (helper "sobel")
//   and the magnitude (dx*dx + dy*dy with L2GRAD, |dx| + |dy| otherwise; for
//   multi-channel input the channel with the largest magnitude is used),
//   then applies the thresholding / non-maximum test described above. The
//   direction is quantised using the constants TG22 and TG67.
//
// WITHOUT_SOBEL -> stage1_without_sobel
//   Same classification, but dx/dy are read as shorts from dxptr/dyptr and
//   the thresholds are ints.
//   NOTE(review): when cn > 1 the kernel stores cdx/cdy back through dx[0]
//   and dy[0], i.e. into buffers declared "const" in the signature.
//   NOTE(review): "#undef CANNY_SHIFT" has no matching #define in this
//   source.
//
// STAGE2 -> stage2_hysteresis
//   Collects the positions whose map value is 2 into a local stack, then
//   repeatedly pops positions, sets each 8-neighbour whose map value is 0 to
//   2 and queues it (first on the private p_stack, then on the local stack)
//   until the local stack is empty.
//   NOTE(review): p_stack holds p_stack_size (8) entries while a single
//   popped position can push up to 8 neighbours and a work-item may pop
//   several positions per round -- confirm this cannot overflow.
//
// GET_EDGES -> getEdges
//   Writes dst = (uchar)(-(map >> 1)) for up to PIX_PER_WI rows per
//   work-item, i.e. 255 for map value 2 and 0 for map values 0 and 1.
470 const struct ProgramEntry canny={"canny",
471 "#define TG22 0.4142135623730950488016887242097f\n"
472 "#define TG67 2.4142135623730950488016887242097f\n"
473 "#ifdef WITH_SOBEL\n"
474 "#if cn == 1\n"
475 "#define loadpix(addr) convert_floatN(*(__global const TYPE *)(addr))\n"
476 "#else\n"
477 "#define loadpix(addr) convert_floatN(vload3(0, (__global const TYPE *)(addr)))\n"
478 "#endif\n"
479 "#define storepix(value, addr) *(__global int *)(addr) = (int)(value)\n"
480 "__constant int prev[4][2] = {\n"
481 "{ 0, -1 },\n"
482 "{ -1, 1 },\n"
483 "{ -1, 0 },\n"
484 "{ -1, -1 }\n"
485 "};\n"
486 "__constant int next[4][2] = {\n"
487 "{ 0, 1 },\n"
488 "{ 1, -1 },\n"
489 "{ 1, 0 },\n"
490 "{ 1, 1 }\n"
491 "};\n"
492 "inline float3 sobel(int idx, __local const floatN *smem)\n"
493 "{\n"
494 "float3 res;\n"
495 "floatN dx = fma(2, smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4],\n"
496 "smem[idx + 2] - smem[idx] + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8]);\n"
497 "floatN dy = fma(2, smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9],\n"
498 "smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10] + smem[idx] - smem[idx + 2 * GRP_SIZEX + 8]);\n"
499 "#ifdef L2GRAD\n"
500 "floatN magN = fma(dx, dx, dy * dy);\n"
501 "#else\n"
502 "floatN magN = fabs(dx) + fabs(dy);\n"
503 "#endif\n"
504 "#if cn == 1\n"
505 "res.z = magN;\n"
506 "res.x = dx;\n"
507 "res.y = dy;\n"
508 "#else\n"
509 "res.z = max(magN.x, max(magN.y, magN.z));\n"
510 "if (res.z == magN.y)\n"
511 "{\n"
512 "dx.x = dx.y;\n"
513 "dy.x = dy.y;\n"
514 "}\n"
515 "else if (res.z == magN.z)\n"
516 "{\n"
517 "dx.x = dx.z;\n"
518 "dy.x = dy.z;\n"
519 "}\n"
520 "res.x = dx.x;\n"
521 "res.y = dy.x;\n"
522 "#endif\n"
523 "return res;\n"
524 "}\n"
525 "__kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src_offset, int rows, int cols,\n"
526 "__global uchar *map, int map_step, int map_offset,\n"
527 "float low_thr, float high_thr)\n"
528 "{\n"
529 "__local floatN smem[(GRP_SIZEX + 4) * (GRP_SIZEY + 4)];\n"
530 "int lidx = get_local_id(0);\n"
531 "int lidy = get_local_id(1);\n"
532 "int start_x = GRP_SIZEX * get_group_id(0);\n"
533 "int start_y = GRP_SIZEY * get_group_id(1);\n"
534 "int i = lidx + lidy * GRP_SIZEX;\n"
535 "for (int j = i;  j < (GRP_SIZEX + 4) * (GRP_SIZEY + 4); j += GRP_SIZEX * GRP_SIZEY)\n"
536 "{\n"
537 "int x = clamp(start_x - 2 + (j % (GRP_SIZEX + 4)), 0, cols - 1);\n"
538 "int y = clamp(start_y - 2 + (j / (GRP_SIZEX + 4)), 0, rows - 1);\n"
539 "smem[j] = loadpix(src + mad24(y, src_step, mad24(x, cn * (int)sizeof(TYPE), src_offset)));\n"
540 "}\n"
541 "barrier(CLK_LOCAL_MEM_FENCE);\n"
542 "__local float mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];\n"
543 "lidx++;\n"
544 "lidy++;\n"
545 "if (i < GRP_SIZEX + 2)\n"
546 "{\n"
547 "int grp_sizey = min(GRP_SIZEY + 1, rows - start_y);\n"
548 "mag[i] = (sobel(i, smem)).z;\n"
549 "mag[i + grp_sizey * (GRP_SIZEX + 2)] = (sobel(i + grp_sizey * (GRP_SIZEX + 4), smem)).z;\n"
550 "}\n"
551 "if (i < GRP_SIZEY + 2)\n"
552 "{\n"
553 "int grp_sizex = min(GRP_SIZEX + 1, cols - start_x);\n"
554 "mag[i * (GRP_SIZEX + 2)] = (sobel(i * (GRP_SIZEX + 4), smem)).z;\n"
555 "mag[i * (GRP_SIZEX + 2) + grp_sizex] = (sobel(i * (GRP_SIZEX + 4) + grp_sizex, smem)).z;\n"
556 "}\n"
557 "int idx = lidx + lidy * (GRP_SIZEX + 4);\n"
558 "i = lidx + lidy * (GRP_SIZEX + 2);\n"
559 "float3 res = sobel(idx, smem);\n"
560 "mag[i] = res.z;\n"
561 "barrier(CLK_LOCAL_MEM_FENCE);\n"
562 "int x = (int) res.x;\n"
563 "int y = (int) res.y;\n"
564 "int gidx = get_global_id(0);\n"
565 "int gidy = get_global_id(1);\n"
566 "if (gidx >= cols || gidy >= rows)\n"
567 "return;\n"
568 "float mag0 = mag[i];\n"
569 "int value = 1;\n"
570 "if (mag0 > low_thr)\n"
571 "{\n"
572 "int a = (y / (float)x) * TG22;\n"
573 "int b = (y / (float)x) * TG67;\n"
574 "a = min((int)abs(a), 1) + 1;\n"
575 "b = min((int)abs(b), 1);\n"
576 "int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31);\n"
577 "int dir = a * b + 2 * dir3;\n"
578 "float prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];\n"
579 "float next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);\n"
580 "if (mag0 > prev_mag && mag0 >= next_mag)\n"
581 "{\n"
582 "value = (mag0 > high_thr) ? 2 : 0;\n"
583 "}\n"
584 "}\n"
585 "storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));\n"
586 "}\n"
587 "#elif defined WITHOUT_SOBEL\n"
588 "#define loadpix(addr) (__global short *)(addr)\n"
589 "#define storepix(val, addr) *(__global int *)(addr) = (int)(val)\n"
590 "#ifdef L2GRAD\n"
591 "#define dist(x, y) ((int)(x) * (x) + (int)(y) * (y))\n"
592 "#else\n"
593 "#define dist(x, y) (abs(x) + abs(y))\n"
594 "#endif\n"
595 "__constant int prev[4][2] = {\n"
596 "{ 0, -1 },\n"
597 "{ -1, -1 },\n"
598 "{ -1, 0 },\n"
599 "{ -1, 1 }\n"
600 "};\n"
601 "__constant int next[4][2] = {\n"
602 "{ 0, 1 },\n"
603 "{ 1, 1 },\n"
604 "{ 1, 0 },\n"
605 "{ 1, -1 }\n"
606 "};\n"
607 "__kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int dx_offset,\n"
608 "__global const uchar *dyptr, int dy_step, int dy_offset,\n"
609 "__global uchar *map, int map_step, int map_offset, int rows, int cols,\n"
610 "int low_thr, int high_thr)\n"
611 "{\n"
612 "int start_x = get_group_id(0) * GRP_SIZEX;\n"
613 "int start_y = get_group_id(1) * GRP_SIZEY;\n"
614 "int lidx = get_local_id(0);\n"
615 "int lidy = get_local_id(1);\n"
616 "__local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];\n"
617 "__local short2 sigma[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];\n"
618 "#pragma unroll\n"
619 "for (int i = lidx + lidy * GRP_SIZEX; i < (GRP_SIZEX + 2) * (GRP_SIZEY + 2); i += GRP_SIZEX * GRP_SIZEY)\n"
620 "{\n"
621 "int x = clamp(start_x - 1 + i % (GRP_SIZEX + 2), 0, cols - 1);\n"
622 "int y = clamp(start_y - 1 + i / (GRP_SIZEX + 2), 0, rows - 1);\n"
623 "int dx_index = mad24(y, dx_step, mad24(x, cn * (int)sizeof(short), dx_offset));\n"
624 "int dy_index = mad24(y, dy_step, mad24(x, cn * (int)sizeof(short), dy_offset));\n"
625 "__global short *dx = loadpix(dxptr + dx_index);\n"
626 "__global short *dy = loadpix(dyptr + dy_index);\n"
627 "int mag0 = dist(dx[0], dy[0]);\n"
628 "#if cn > 1\n"
629 "short cdx = dx[0], cdy = dy[0];\n"
630 "#pragma unroll\n"
631 "for (int j = 1; j < cn; ++j)\n"
632 "{\n"
633 "int mag1 = dist(dx[j], dy[j]);\n"
634 "if (mag1 > mag0)\n"
635 "{\n"
636 "mag0 = mag1;\n"
637 "cdx = dx[j];\n"
638 "cdy = dy[j];\n"
639 "}\n"
640 "}\n"
641 "dx[0] = cdx;\n"
642 "dy[0] = cdy;\n"
643 "#endif\n"
644 "mag[i] = mag0;\n"
645 "sigma[i] = (short2)(dx[0], dy[0]);\n"
646 "}\n"
647 "barrier(CLK_LOCAL_MEM_FENCE);\n"
648 "int gidx = get_global_id(0);\n"
649 "int gidy = get_global_id(1);\n"
650 "if (gidx >= cols || gidy >= rows)\n"
651 "return;\n"
652 "lidx++;\n"
653 "lidy++;\n"
654 "int mag0 = mag[lidx + lidy * (GRP_SIZEX + 2)];\n"
655 "short x = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).x;\n"
656 "short y = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).y;\n"
657 "int value = 1;\n"
658 "if (mag0 > low_thr)\n"
659 "{\n"
660 "int a = (y / (float)x) * TG22;\n"
661 "int b = (y / (float)x) * TG67;\n"
662 "a = min((int)abs(a), 1) + 1;\n"
663 "b = min((int)abs(b), 1);\n"
664 "int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31);\n"
665 "int dir = a * b + 2 * dir3;\n"
666 "int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];\n"
667 "int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);\n"
668 "if (mag0 > prev_mag && mag0 >= next_mag)\n"
669 "{\n"
670 "value = (mag0 > high_thr) ? 2 : 0;\n"
671 "}\n"
672 "}\n"
673 "storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));\n"
674 "}\n"
675 "#undef TG22\n"
676 "#undef CANNY_SHIFT\n"
677 "#elif defined STAGE2\n"
678 "#define loadpix(addr) *(__global int *)(addr)\n"
679 "#define storepix(val, addr) *(__global int *)(addr) = (int)(val)\n"
680 "#define LOCAL_TOTAL (LOCAL_X*LOCAL_Y)\n"
681 "#define l_stack_size (4*LOCAL_TOTAL)\n"
682 "#define p_stack_size 8\n"
683 "__constant short move_dir[2][8] = {\n"
684 "{ -1, -1, -1, 0, 0, 1, 1, 1 },\n"
685 "{ -1, 0, 1, -1, 1, -1, 0, 1 }\n"
686 "};\n"
687 "__kernel void stage2_hysteresis(__global uchar *map_ptr, int map_step, int map_offset, int rows, int cols)\n"
688 "{\n"
689 "map_ptr += map_offset;\n"
690 "int x = get_global_id(0);\n"
691 "int y = get_global_id(1) * PIX_PER_WI;\n"
692 "int lid = get_local_id(0) + get_local_id(1) * LOCAL_X;\n"
693 "__local ushort2 l_stack[l_stack_size];\n"
694 "__local int l_counter;\n"
695 "if (lid == 0)\n"
696 "l_counter = 0;\n"
697 "barrier(CLK_LOCAL_MEM_FENCE);\n"
698 "if (x < cols)\n"
699 "{\n"
700 "__global uchar* map = map_ptr + mad24(y, map_step, x * (int)sizeof(int));\n"
701 "#pragma unroll\n"
702 "for (int cy = 0; cy < PIX_PER_WI; ++cy)\n"
703 "{\n"
704 "if (y < rows)\n"
705 "{\n"
706 "int type = loadpix(map);\n"
707 "if (type == 2)\n"
708 "{\n"
709 "l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);\n"
710 "}\n"
711 "y++;\n"
712 "map += map_step;\n"
713 "}\n"
714 "}\n"
715 "}\n"
716 "barrier(CLK_LOCAL_MEM_FENCE);\n"
717 "ushort2 p_stack[p_stack_size];\n"
718 "int p_counter = 0;\n"
719 "while(l_counter != 0)\n"
720 "{\n"
721 "int mod = l_counter % LOCAL_TOTAL;\n"
722 "int pix_per_thr = l_counter / LOCAL_TOTAL + ((lid < mod) ? 1 : 0);\n"
723 "for (int i = 0; i < pix_per_thr; ++i)\n"
724 "{\n"
725 "int index = atomic_dec(&l_counter) - 1;\n"
726 "if (index < 0)\n"
727 "continue;\n"
728 "ushort2 pos = l_stack[ index ];\n"
729 "#pragma unroll\n"
730 "for (int j = 0; j < 8; ++j)\n"
731 "{\n"
732 "ushort posx = pos.x + move_dir[0][j];\n"
733 "ushort posy = pos.y + move_dir[1][j];\n"
734 "if (posx < 0 || posy < 0 || posx >= cols || posy >= rows)\n"
735 "continue;\n"
736 "__global uchar *addr = map_ptr + mad24(posy, map_step, posx * (int)sizeof(int));\n"
737 "int type = loadpix(addr);\n"
738 "if (type == 0)\n"
739 "{\n"
740 "p_stack[p_counter++] = (ushort2)(posx, posy);\n"
741 "storepix(2, addr);\n"
742 "}\n"
743 "}\n"
744 "}\n"
745 "barrier(CLK_LOCAL_MEM_FENCE);\n"
746 "if (l_counter < 0)\n"
747 "l_counter = 0;\n"
748 "barrier(CLK_LOCAL_MEM_FENCE);\n"
749 "while (p_counter > 0)\n"
750 "{\n"
751 "l_stack[ atomic_inc(&l_counter) ] = p_stack[--p_counter];\n"
752 "}\n"
753 "barrier(CLK_LOCAL_MEM_FENCE);\n"
754 "}\n"
755 "}\n"
756 "#elif defined GET_EDGES\n"
757 "__kernel void getEdges(__global const uchar *mapptr, int map_step, int map_offset, int rows, int cols,\n"
758 "__global uchar *dst, int dst_step, int dst_offset)\n"
759 "{\n"
760 "int x = get_global_id(0);\n"
761 "int y = get_global_id(1) * PIX_PER_WI;\n"
762 "if (x < cols)\n"
763 "{\n"
764 "int map_index = mad24(map_step, y, mad24(x, (int)sizeof(int), map_offset));\n"
765 "int dst_index = mad24(dst_step, y, x + dst_offset);\n"
766 "#pragma unroll\n"
767 "for (int cy = 0; cy < PIX_PER_WI; ++cy)\n"
768 "{\n"
769 "if (y < rows)\n"
770 "{\n"
771 "__global const int * map = (__global const int *)(mapptr + map_index);\n"
772 "dst[dst_index] = (uchar)(-(map[0] >> 1));\n"
773 "y++;\n"
774 "map_index += map_step;\n"
775 "dst_index += dst_step;\n"
776 "}\n"
777 "}\n"
778 "}\n"
779 "}\n"
780 "#endif\n"
781 , "00cd5c7db7816a3deac5680f13536a02"};
// ProgramSource object constructed from the source text of the entry above.
782 ProgramSource canny_oclsrc(canny.programStr);
783 const struct ProgramEntry clahe={"clahe",
784 "#ifndef WAVE_SIZE\n"
785 "#define WAVE_SIZE 1\n"
786 "#endif\n"
787 "inline int calc_lut(__local int* smem, int val, int tid)\n"
788 "{\n"
789 "smem[tid] = val;\n"
790 "barrier(CLK_LOCAL_MEM_FENCE);\n"
791 "if (tid == 0)\n"
792 "for (int i = 1; i < 256; ++i)\n"
793 "smem[i] += smem[i - 1];\n"
794 "barrier(CLK_LOCAL_MEM_FENCE);\n"
795 "return smem[tid];\n"
796 "}\n"
797 "#ifdef CPU\n"
798 "inline void reduce(volatile __local int* smem, int val, int tid)\n"
799 "{\n"
800 "smem[tid] = val;\n"
801 "barrier(CLK_LOCAL_MEM_FENCE);\n"
802 "if (tid < 128)\n"
803 "smem[tid] = val += smem[tid + 128];\n"
804 "barrier(CLK_LOCAL_MEM_FENCE);\n"
805 "if (tid < 64)\n"
806 "smem[tid] = val += smem[tid + 64];\n"
807 "barrier(CLK_LOCAL_MEM_FENCE);\n"
808 "if (tid < 32)\n"
809 "smem[tid] += smem[tid + 32];\n"
810 "barrier(CLK_LOCAL_MEM_FENCE);\n"
811 "if (tid < 16)\n"
812 "smem[tid] += smem[tid + 16];\n"
813 "barrier(CLK_LOCAL_MEM_FENCE);\n"
814 "if (tid < 8)\n"
815 "smem[tid] += smem[tid + 8];\n"
816 "barrier(CLK_LOCAL_MEM_FENCE);\n"
817 "if (tid < 4)\n"
818 "smem[tid] += smem[tid + 4];\n"
819 "barrier(CLK_LOCAL_MEM_FENCE);\n"
820 "if (tid < 2)\n"
821 "smem[tid] += smem[tid + 2];\n"
822 "barrier(CLK_LOCAL_MEM_FENCE);\n"
823 "if (tid < 1)\n"
824 "smem[256] = smem[tid] + smem[tid + 1];\n"
825 "barrier(CLK_LOCAL_MEM_FENCE);\n"
826 "}\n"
827 "#else\n"
828 "inline void reduce(__local volatile int* smem, int val, int tid)\n"
829 "{\n"
830 "smem[tid] = val;\n"
831 "barrier(CLK_LOCAL_MEM_FENCE);\n"
832 "if (tid < 128)\n"
833 "smem[tid] = val += smem[tid + 128];\n"
834 "barrier(CLK_LOCAL_MEM_FENCE);\n"
835 "if (tid < 64)\n"
836 "smem[tid] = val += smem[tid + 64];\n"
837 "barrier(CLK_LOCAL_MEM_FENCE);\n"
838 "if (tid < 32)\n"
839 "{\n"
840 "smem[tid] += smem[tid + 32];\n"
841 "#if WAVE_SIZE < 32\n"
842 "} barrier(CLK_LOCAL_MEM_FENCE);\n"
843 "if (tid < 16)\n"
844 "{\n"
845 "#endif\n"
846 "smem[tid] += smem[tid + 16];\n"
847 "#if WAVE_SIZE < 16\n"
848 "}\n"
849 "barrier(CLK_LOCAL_MEM_FENCE);\n"
850 "if (tid < 8)\n"
851 "{\n"
852 "#endif\n"
853 "smem[tid] += smem[tid + 8];\n"
854 "smem[tid] += smem[tid + 4];\n"
855 "smem[tid] += smem[tid + 2];\n"
856 "smem[tid] += smem[tid + 1];\n"
857 "}\n"
858 "}\n"
859 "#endif\n"
// OpenCL source text of kernel calcLut. One work-group handles the tile whose
// coordinates are its group ids (tx, ty): it histograms the tile's 8-bit
// source values in local memory, optionally clips the histogram at clipLimit
// and spreads the clipped excess over the bins, then writes one lookup-table
// byte per work-item.
//   src / srcStep / src_offset : source bytes, row stride and start offset
//   lut / dstStep / dst_offset : output table, one row of entries per tile
//   tileSize                   : tile width (.x) and height (.y) in pixels
//   tilesX                     : number of tiles per row of tiles
//   clipLimit                  : clipping threshold; values <= 0 skip clipping
//   lutScale                   : factor applied to the calc_lut() result
860 "__kernel void calcLut(__global __const uchar * src, const int srcStep,\n"
861 "const int src_offset, __global uchar * lut,\n"
862 "const int dstStep, const int dst_offset,\n"
863 "const int2 tileSize, const int tilesX,\n"
864 "const int clipLimit, const float lutScale)\n"
865 "{\n"
866 "__local int smem[512];\n"
867 "int tx = get_group_id(0);\n"
868 "int ty = get_group_id(1);\n"
// Flattened index of the work-item inside its group; it doubles as the
// histogram bin this work-item owns.
869 "int tid = get_local_id(1) * get_local_size(0)\n"
870 "+ get_local_id(0);\n"
871 "smem[tid] = 0;\n"
872 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// The work-items stride over the tile's rows and columns and count each
// source value with an atomic increment on the local histogram.
873 "for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))\n"
874 "{\n"
875 "__global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);\n"
876 "for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))\n"
877 "{\n"
878 "const int data = srcPtr[j];\n"
879 "atomic_inc(&smem[data]);\n"
880 "}\n"
881 "}\n"
882 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Each work-item copies its own bin count before smem is reused by reduce().
883 "int tHistVal = smem[tid];\n"
884 "barrier(CLK_LOCAL_MEM_FENCE);\n"
885 "if (clipLimit > 0)\n"
886 "{\n"
// Excess above clipLimit is removed from this bin and summed over the group.
887 "int clipped = 0;\n"
888 "if (tHistVal > clipLimit)\n"
889 "{\n"
890 "clipped = tHistVal - clipLimit;\n"
891 "tHistVal = clipLimit;\n"
892 "}\n"
893 "reduce(smem, clipped, tid);\n"
894 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// The two reduce() variants leave the total in different slots.
895 "#ifdef CPU\n"
896 "clipped = smem[256];\n"
897 "#else\n"
898 "clipped = smem[0];\n"
899 "#endif\n"
// Work-item 0 publishes the total through a second local variable.
900 "__local int totalClipped;\n"
901 "if (tid == 0)\n"
902 "totalClipped = clipped;\n"
903 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Every bin receives totalClipped / 256; the remainder is handed out one
// count each to the bins with tid below `residual`.
904 "int redistBatch = totalClipped / 256;\n"
905 "tHistVal += redistBatch;\n"
906 "int residual = totalClipped - redistBatch * 256;\n"
907 "if (tid < residual)\n"
908 "++tHistVal;\n"
909 "}\n"
// calc_lut() is defined earlier in this program text (outside this excerpt);
// presumably it turns the bin counts into a cumulative value -- TODO confirm.
910 "const int lutVal = calc_lut(smem, tHistVal, tid);\n"
// Scale, round with convert_int_rte, clamp to 0..255 and store entry `tid`
// of the row belonging to tile (tx, ty).
911 "uint ires = (uint)convert_int_rte(lutScale * lutVal);\n"
912 "lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =\n"
913 "convert_uchar(clamp(ires, (uint)0, (uint)255));\n"
914 "}\n"
// OpenCL source text of kernel transform. One work-item per pixel (x, y):
// the source byte is looked up in the tables of up to four neighbouring tiles
// and the four results are blended with bilinear weights.
//   src / dst      : 8-bit input and output with their row steps and offsets
//   lut / lutStep  : lookup tables, one row of lutStep bytes per tile
//   cols, rows     : extent checked before any memory access
//   tileSize       : tile width (.x) and height (.y) in pixels
//   tilesX, tilesY : number of tiles horizontally and vertically
915 "__kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset,\n"
916 "__global uchar * dst, const int dstStep, const int dst_offset,\n"
917 "__global uchar * lut, const int lutStep, int lut_offset,\n"
918 "const int cols, const int rows,\n"
919 "const int2 tileSize,\n"
920 "const int tilesX, const int tilesY)\n"
921 "{\n"
922 "const int x = get_global_id(0);\n"
923 "const int y = get_global_id(1);\n"
924 "if (x >= cols || y >= rows)\n"
925 "return;\n"
// Vertical position in tile units, shifted by half a tile. ty1 is the value
// rounded towards negative infinity, ty2 the next tile, ya the fractional
// part; ya is taken before the indices are clamped to the valid tile range.
926 "const float tyf = (convert_float(y) / tileSize.y) - 0.5f;\n"
927 "int ty1 = convert_int_rtn(tyf);\n"
928 "int ty2 = ty1 + 1;\n"
929 "const float ya = tyf - ty1;\n"
930 "ty1 = max(ty1, 0);\n"
931 "ty2 = min(ty2, tilesY - 1);\n"
// Same computation horizontally.
932 "const float txf = (convert_float(x) / tileSize.x) - 0.5f;\n"
933 "int tx1 = convert_int_rtn(txf);\n"
934 "int tx2 = tx1 + 1;\n"
935 "const float xa = txf - tx1;\n"
936 "tx1 = max(tx1, 0);\n"
937 "tx2 = min(tx2, tilesX - 1);\n"
938 "const int srcVal = src[mad24(y, srcStep, x + src_offset)];\n"
// Weighted sum of the four table entries for srcVal.
939 "float res = 0;\n"
940 "res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));\n"
941 "res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));\n"
942 "res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));\n"
943 "res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));\n"
// Round with convert_int_rte, clamp to 0..255 and store the output byte.
944 "uint ires = (uint)convert_int_rte(res);\n"
945 "dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));\n"
946 "}\n"
947 , "1240500336efb8988a25b1da384c217d"};
// Constructs the clahe_oclsrc object from the programStr member of the `clahe`
// entry defined above (ProgramSource is declared outside this file).
948 ProgramSource clahe_oclsrc(clahe.programStr);
// Program entry "corner": the OpenCL source text of the `corner` kernel as
// adjacent string literals, followed by a 32-hex-digit string (presumably a
// digest of the text -- the generator is not visible here), and the
// ProgramSource object built from the text.
// The text uses ksX, ksY, anX and anY without defining them, so they must be
// supplied when the program is built, together with exactly one BORDER_* and
// one CORNER_* define.
949 const struct ProgramEntry corner={"corner",
// Border handling. BORDER_CONSTANT defines no macro (the kernel substitutes
// 0.0f itself); the other modes define EXTRAPOLATE(x, maxV), which rewrites x
// in place.
950 "#ifdef BORDER_CONSTANT\n"
951 "#elif defined BORDER_REPLICATE\n"
// Clamp x into [0, maxV - 1].
952 "#define EXTRAPOLATE(x, maxV) \\\n"
953 "{ \\\n"
954 "x = max(min(x, maxV - 1), 0); \\\n"
955 "}\n"
956 "#elif defined BORDER_WRAP\n"
// Negative x is shifted up by a multiple of maxV; x >= maxV is taken modulo.
957 "#define EXTRAPOLATE(x, maxV) \\\n"
958 "{ \\\n"
959 "if (x < 0) \\\n"
960 "x -= ((x - maxV + 1) / maxV) * maxV; \\\n"
961 "if (x >= maxV) \\\n"
962 "x %= maxV; \\\n"
963 "}\n"
964 "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101)\n"
// Mirror x about the violated edge until it lies in range; delta is 0 for
// BORDER_REFLECT and 1 for BORDER_REFLECT101 (see the two defines below).
965 "#define EXTRAPOLATE_(x, maxV, delta) \\\n"
966 "{ \\\n"
967 "if (maxV == 1) \\\n"
968 "x = 0; \\\n"
969 "else \\\n"
970 "do \\\n"
971 "{ \\\n"
972 "if ( x < 0 ) \\\n"
973 "x = -x - 1 + delta; \\\n"
974 "else \\\n"
975 "x = maxV - 1 - (x - maxV) - delta; \\\n"
976 "} \\\n"
977 "while (x >= maxV || x < 0); \\\n"
978 "}\n"
979 "#ifdef BORDER_REFLECT\n"
980 "#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)\n"
981 "#else\n"
982 "#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)\n"
983 "#endif\n"
984 "#else\n"
985 "#error No extrapolation method\n"
986 "#endif\n"
// THREADS sizes the local column-sum arrays and the horizontal tiling;
// presumably it equals the work-group width -- TODO confirm in the host code.
987 "#define THREADS 256\n"
// Kernel parameters: Dx and Dy are float derivative images, each with a byte
// step, byte offset and whole-buffer extent; dst receives float responses;
// k is the coefficient used by the CORNER_HARRIS formula.
988 "__kernel void corner(__global const float * Dx, int dx_step, int dx_offset, int dx_whole_rows, int dx_whole_cols,\n"
989 "__global const float * Dy, int dy_step, int dy_offset, int dy_whole_rows, int dy_whole_cols,\n"
990 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float k)\n"
991 "{\n"
992 "int col = get_local_id(0);\n"
993 "int gX = get_group_id(0);\n"
994 "int gY = get_group_id(1);\n"
995 "int gly = get_global_id(1);\n"
// Split every byte offset into a column (in 4-byte elements) and a row.
996 "int dx_x_off = (dx_offset % dx_step) >> 2;\n"
997 "int dx_y_off = dx_offset / dx_step;\n"
998 "int dy_x_off = (dy_offset % dy_step) >> 2;\n"
999 "int dy_y_off = dy_offset / dy_step;\n"
1000 "int dst_x_off = (dst_offset % dst_step) >> 2;\n"
1001 "int dst_y_off = dst_offset / dst_step;\n"
// Each group advances THREADS-ksX+1 columns and two rows (gY << 1); the
// input start is shifted back by the anchor (anX, anY).
1002 "int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;\n"
1003 "int dx_startY = (gY << 1) - anY + dx_y_off;\n"
1004 "int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;\n"
1005 "int dy_startY = (gY << 1) - anY + dy_y_off;\n"
1006 "int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;\n"
1007 "int dst_startY = (gY << 1) + dst_y_off;\n"
// data[0..2][i] holds dx*dx, dx*dy and dy*dy for row i of this work-item's
// column; temp holds the six per-column vertical sums shared within the group.
1008 "float data[3][ksY+1];\n"
1009 "__local float temp[6][THREADS];\n"
1010 "#ifdef BORDER_CONSTANT\n"
// Constant border: samples outside the whole buffer read as 0.0f.
1011 "for (int i=0; i < ksY+1; i++)\n"
1012 "{\n"
1013 "bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;\n"
1014 "int indexDx = mad24(dx_startY+i, dx_step>>2, dx_startX+col);\n"
1015 "float dx_s = dx_con ? Dx[indexDx] : 0.0f;\n"
1016 "bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;\n"
1017 "int indexDy = mad24(dy_startY+i, dy_step>>2, dy_startX+col);\n"
1018 "float dy_s = dy_con ? Dy[indexDy] : 0.0f;\n"
1019 "data[0][i] = dx_s * dx_s;\n"
1020 "data[1][i] = dx_s * dy_s;\n"
1021 "data[2][i] = dy_s * dy_s;\n"
1022 "}\n"
1023 "#else\n"
// Other borders: coordinates are remapped by EXTRAPOLATE before the read.
// The local column is capped at 2*dst_cols first, presumably to bound the
// reflect loop -- TODO confirm.
1024 "int clamped_col = min(2*dst_cols, col);\n"
1025 "for (int i=0; i < ksY+1; i++)\n"
1026 "{\n"
1027 "int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col;\n"
1028 "EXTRAPOLATE(dx_selected_row, dx_whole_rows)\n"
1029 "EXTRAPOLATE(dx_selected_col, dx_whole_cols)\n"
1030 "float dx_s = Dx[mad24(dx_selected_row, dx_step>>2, dx_selected_col)];\n"
1031 "int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col;\n"
1032 "EXTRAPOLATE(dy_selected_row, dy_whole_rows)\n"
1033 "EXTRAPOLATE(dy_selected_col, dy_whole_cols)\n"
1034 "float dy_s = Dy[mad24(dy_selected_row, dy_step>>2, dy_selected_col)];\n"
1035 "data[0][i] = dx_s * dx_s;\n"
1036 "data[1][i] = dx_s * dy_s;\n"
1037 "data[2][i] = dy_s * dy_s;\n"
1038 "}\n"
1039 "#endif\n"
// Rows 1..ksY-1 are common to both output rows; adding row 0 gives the
// vertical sum for the first output row, adding row ksY the sum for the second.
1040 "float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;\n"
1041 "for (int i=1; i < ksY; i++)\n"
1042 "{\n"
1043 "sum0 += data[0][i];\n"
1044 "sum1 += data[1][i];\n"
1045 "sum2 += data[2][i];\n"
1046 "}\n"
1047 "float sum01 = sum0 + data[0][0];\n"
1048 "float sum02 = sum0 + data[0][ksY];\n"
1049 "temp[0][col] = sum01;\n"
1050 "temp[1][col] = sum02;\n"
1051 "float sum11 = sum1 + data[1][0];\n"
1052 "float sum12 = sum1 + data[1][ksY];\n"
1053 "temp[2][col] = sum11;\n"
1054 "temp[3][col] = sum12;\n"
1055 "float sum21 = sum2 + data[2][0];\n"
1056 "float sum22 = sum2 + data[2][ksY];\n"
1057 "temp[4][col] = sum21;\n"
1058 "temp[5][col] = sum22;\n"
// All column sums must be visible before the horizontal pass reads them.
1059 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Only the first THREADS-(ksX-1) work-items produce output; the remaining
// columns exist only to supply neighbours.
1060 "if (col < (THREADS - (ksX - 1)))\n"
1061 "{\n"
1062 "col += anX;\n"
1063 "int posX = dst_startX - dst_x_off + col - anX;\n"
1064 "int posY = (gly << 1);\n"
// till is 1 when ksX is even and 0 when it is odd; it shortens the right
// side of the horizontal window.
1065 "int till = (ksX + 1) & 1;\n"
1066 "float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };\n"
// The loop index `k` shadows the kernel argument `k` only inside this loop.
1067 "for (int k=0; k<6; k++)\n"
1068 "{\n"
1069 "float temp_sum = 0;\n"
1070 "for (int i=-anX; i<=anX - till; i++)\n"
1071 "temp_sum += temp[k][col+i];\n"
1072 "tmp_sum[k] = temp_sum;\n"
1073 "}\n"
1074 "#ifdef CORNER_HARRIS\n"
// With a, b, c the window sums of dx*dx, dx*dy, dy*dy, each write stores
// a*c - b*b - k*(a+c)*(a+c). Even tmp_sum indices feed the first output row,
// odd indices the second.
1075 "if (posX < dst_cols && (posY) < dst_rows)\n"
1076 "{\n"
1077 "int dst_index = mad24(dst_step, dst_startY, (int)sizeof(float) * (dst_startX + col - anX));\n"
1078 "*(__global float *)(dst + dst_index) =\n"
1079 "tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);\n"
1080 "}\n"
1081 "if (posX < dst_cols && (posY + 1) < dst_rows)\n"
1082 "{\n"
1083 "int dst_index = mad24(dst_step, dst_startY + 1, (int)sizeof(float) * (dst_startX + col - anX));\n"
1084 "*(__global float *)(dst + dst_index) =\n"
1085 "tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);\n"
1086 "}\n"
1087 "#elif defined CORNER_MINEIGENVAL\n"
// With a and c the halved window sums of dx*dx and dy*dy and b the sum of
// dx*dy, each write stores (a+c) - sqrt((a-c)*(a-c) + b*b).
1088 "if (posX < dst_cols && (posY) < dst_rows)\n"
1089 "{\n"
1090 "int dst_index = mad24(dst_step, dst_startY, (int)sizeof(float) * (dst_startX + col - anX));\n"
1091 "float a = tmp_sum[0] * 0.5f;\n"
1092 "float b = tmp_sum[2];\n"
1093 "float c = tmp_sum[4] * 0.5f;\n"
1094 "*(__global float *)(dst + dst_index) = (float)((a+c) - native_sqrt((a-c)*(a-c) + b*b));\n"
1095 "}\n"
1096 "if (posX < dst_cols && (posY + 1) < dst_rows)\n"
1097 "{\n"
1098 "int dst_index = mad24(dst_step, dst_startY + 1, (int)sizeof(float) * (dst_startX + col - anX));\n"
1099 "float a = tmp_sum[1] * 0.5f;\n"
1100 "float b = tmp_sum[3];\n"
1101 "float c = tmp_sum[5] * 0.5f;\n"
1102 "*(__global float *)(dst + dst_index) = (float)((a+c) - native_sqrt((a-c)*(a-c) + b*b));\n"
1103 "}\n"
1104 "#else\n"
1105 "#error \"No such corners type\"\n"
1106 "#endif\n"
1107 "}\n"
1108 "}\n"
1109 , "0b0ba9ee4305009cb2433737f7ed5bcd"};
// Constructs corner_oclsrc from the program text above.
1110 ProgramSource corner_oclsrc(corner.programStr);
// Program entry "covardata": OpenCL source text of the kernels sobel3, sobel5
// and sobel7 as adjacent string literals, followed by a 32-hex-digit string
// (presumably a digest of the text), and the ProgramSource object built from
// the text. Each kernel writes a horizontal (DstX) and a vertical (DstY)
// derivative of Src as floats.
// The text uses SRCTYPE, BLK_X and BLK_Y without defining them, so they must
// be supplied when the program is built, together with one BORDER_* define.
1111 const struct ProgramEntry covardata={"covardata",
// Border handling: EXTRAPOLATE(x, maxV) rewrites x in place. It expands to
// nothing for BORDER_CONSTANT, where ELEM performs the range check instead.
1112 "#ifdef BORDER_CONSTANT\n"
1113 "#define EXTRAPOLATE(x, maxV)\n"
1114 "#elif defined BORDER_REPLICATE\n"
1115 "#define EXTRAPOLATE(x, maxV) \\\n"
1116 "{ \\\n"
1117 "(x) = clamp((x), 0, (maxV)-1); \\\n"
1118 "}\n"
1119 "#elif defined BORDER_WRAP\n"
// A single addition of maxV followed by a modulo.
1120 "#define EXTRAPOLATE(x, maxV) \\\n"
1121 "{ \\\n"
1122 "(x) = ( (x) + (maxV) ) % (maxV); \\\n"
1123 "}\n"
1124 "#elif defined BORDER_REFLECT\n"
// One mirror step about either edge, with the edge sample repeated.
1125 "#define EXTRAPOLATE(x, maxV) \\\n"
1126 "{ \\\n"
1127 "(x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \\\n"
1128 "}\n"
1129 "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n"
// One mirror step about either edge, without repeating the edge sample.
1130 "#define EXTRAPOLATE(x, maxV) \\\n"
1131 "{ \\\n"
1132 "(x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \\\n"
1133 "}\n"
1134 "#else\n"
1135 "#error No extrapolation method\n"
1136 "#endif\n"
// SRC reads element _x of row _y of Src as SRCTYPE and converts it to float.
1137 "#define SRC(_x,_y) convert_float(((global SRCTYPE*)(Src+(_y)*src_step))[_x])\n"
// ELEM yields const_v for out-of-range coordinates under BORDER_CONSTANT and
// is a plain SRC read otherwise.
1138 "#ifdef BORDER_CONSTANT\n"
1139 "#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))\n"
1140 "#else\n"
1141 "#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))\n"
1142 "#endif\n"
// DSTX / DSTY address float element _x of row _y of the two outputs, using
// the byte offset and pitch passed to the kernel.
1143 "#define DSTX(_x,_y) (((global float*)(DstX+DstXOffset+(_y)*DstXPitch))[_x])\n"
1144 "#define DSTY(_x,_y) (((global float*)(DstY+DstYOffset+(_y)*DstYPitch))[_x])\n"
// INIT_AND_READ_LOCAL_SOURCE fills the local tile lsmem: every work-item
// loads the sample at (x, y) shifted back by kernel_border; work-items whose
// local index is below 2*kernel_border additionally load the sample BLK_X to
// the right, BLK_Y below, or both. It relies on x, y, lix, liy, lsmem,
// srcOffsetX and srcOffsetY being in scope at the expansion site.
1145 "#define INIT_AND_READ_LOCAL_SOURCE(width, height, fill_const, kernel_border) \\\n"
1146 "int srcX = x + srcOffsetX - (kernel_border); \\\n"
1147 "int srcY = y + srcOffsetY - (kernel_border); \\\n"
1148 "int xb = srcX; \\\n"
1149 "int yb = srcY; \\\n"
1150 "\\\n"
1151 "EXTRAPOLATE(xb, (width)); \\\n"
1152 "EXTRAPOLATE(yb, (height)); \\\n"
1153 "lsmem[liy][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \\\n"
1154 "\\\n"
1155 "if(lix < ((kernel_border)*2)) \\\n"
1156 "{ \\\n"
1157 "int xb = srcX+BLK_X; \\\n"
1158 "EXTRAPOLATE(xb,(width)); \\\n"
1159 "lsmem[liy][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \\\n"
1160 "} \\\n"
1161 "if(liy< ((kernel_border)*2)) \\\n"
1162 "{ \\\n"
1163 "int yb = srcY+BLK_Y; \\\n"
1164 "EXTRAPOLATE(yb, (height)); \\\n"
1165 "lsmem[liy+BLK_Y][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \\\n"
1166 "} \\\n"
1167 "if(lix<((kernel_border)*2) && liy<((kernel_border)*2)) \\\n"
1168 "{ \\\n"
1169 "int xb = srcX+BLK_X; \\\n"
1170 "int yb = srcY+BLK_Y; \\\n"
1171 "EXTRAPOLATE(xb,(width)); \\\n"
1172 "EXTRAPOLATE(yb,(height)); \\\n"
1173 "lsmem[liy+BLK_Y][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \\\n"
1174 "}\n"
// sobel3: 3x3 neighbourhood, kernel_border 1, fill value 0.
1175 "__kernel void sobel3(__global const uchar * Src, int src_step, int srcOffsetX, int srcOffsetY,\n"
1176 "__global uchar * DstX, int DstXPitch, int DstXOffset,\n"
1177 "__global uchar * DstY, int DstYPitch, int DstYOffset, int dstHeight, int dstWidth,\n"
1178 "int height, int width, float scale)\n"
1179 "{\n"
1180 "__local float lsmem[BLK_Y+2][BLK_X+2];\n"
1181 "int lix = get_local_id(0);\n"
1182 "int liy = get_local_id(1);\n"
1183 "int x = (int)get_global_id(0);\n"
1184 "int y = (int)get_global_id(1);\n"
1185 "INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 1)\n"
1186 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// The bounds check comes after the barrier, so every work-item takes part in
// filling the tile.
1187 "if( x >= dstWidth || y >=dstHeight )  return;\n"
// u* = top row, m* = middle row, b* = bottom row; 1..3 = left to right.
1188 "float u1 = lsmem[liy][lix];\n"
1189 "float u2 = lsmem[liy][lix+1];\n"
1190 "float u3 = lsmem[liy][lix+2];\n"
1191 "float m1 = lsmem[liy+1][lix];\n"
1192 "float m3 = lsmem[liy+1][lix+2];\n"
1193 "float b1 = lsmem[liy+2][lix];\n"
1194 "float b2 = lsmem[liy+2][lix+1];\n"
1195 "float b3 = lsmem[liy+2][lix+2];\n"
// SCHARR selects weights 10 and 3; the default uses 2 and 1.
1196 "#ifdef SCHARR\n"
1197 "DSTX(x,y) = mad(10.0f, m3 - m1, 3.0f * (u3 - u1 + b3 - b1)) * scale;\n"
1198 "DSTY(x,y) = mad(10.0f, b2 - u2, 3.0f * (b1 - u1 + b3 - u3)) * scale;\n"
1199 "#else\n"
1200 "DSTX(x,y) = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1) * scale;\n"
1201 "DSTY(x,y) = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3) * scale;\n"
1202 "#endif\n"
1203 "}\n"
// sobel5: 5x5 neighbourhood, kernel_border 2, fill value 0.
1204 "__kernel void sobel5(__global const uchar * Src, int src_step, int srcOffsetX, int srcOffsetY,\n"
1205 "__global uchar * DstX, int DstXPitch, int DstXOffset,\n"
1206 "__global uchar * DstY, int DstYPitch, int DstYOffset, int dstHeight, int dstWidth,\n"
1207 "int height, int width, float scale)\n"
1208 "{\n"
1209 "__local float lsmem[BLK_Y+4][BLK_X+4];\n"
1210 "int lix = get_local_id(0);\n"
1211 "int liy = get_local_id(1);\n"
1212 "int x = (int)get_global_id(0);\n"
1213 "int y = (int)get_global_id(1);\n"
1214 "INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 2)\n"
1215 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1216 "if( x >= dstWidth || y >=dstHeight )  return;\n"
// Rows from top to bottom are t, u, m, l, b; the centre sample is not read.
1217 "float t1 = lsmem[liy][lix];\n"
1218 "float t2 = lsmem[liy][lix+1];\n"
1219 "float t3 = lsmem[liy][lix+2];\n"
1220 "float t4 = lsmem[liy][lix+3];\n"
1221 "float t5 = lsmem[liy][lix+4];\n"
1222 "float u1 = lsmem[liy+1][lix];\n"
1223 "float u2 = lsmem[liy+1][lix+1];\n"
1224 "float u3 = lsmem[liy+1][lix+2];\n"
1225 "float u4 = lsmem[liy+1][lix+3];\n"
1226 "float u5 = lsmem[liy+1][lix+4];\n"
1227 "float m1 = lsmem[liy+2][lix];\n"
1228 "float m2 = lsmem[liy+2][lix+1];\n"
1229 "float m4 = lsmem[liy+2][lix+3];\n"
1230 "float m5 = lsmem[liy+2][lix+4];\n"
1231 "float l1 = lsmem[liy+3][lix];\n"
1232 "float l2 = lsmem[liy+3][lix+1];\n"
1233 "float l3 = lsmem[liy+3][lix+2];\n"
1234 "float l4 = lsmem[liy+3][lix+3];\n"
1235 "float l5 = lsmem[liy+3][lix+4];\n"
1236 "float b1 = lsmem[liy+4][lix];\n"
1237 "float b2 = lsmem[liy+4][lix+1];\n"
1238 "float b3 = lsmem[liy+4][lix+2];\n"
1239 "float b4 = lsmem[liy+4][lix+3];\n"
1240 "float b5 = lsmem[liy+4][lix+4];\n"
// Horizontal derivative: nested mad() calls accumulate weighted differences
// of columns placed symmetrically about the centre column.
1241 "DSTX(x,y) = scale *\n"
1242 "mad(12.0f, m4 - m2,\n"
1243 "mad(6.0f, m5 - m1,\n"
1244 "mad(8.0f, u4 - u2 + l4 - l2,\n"
1245 "mad(4.0f, u5 - u1 + l5 - l1,\n"
1246 "mad(2.0f, t4 - t2 + b4 - b2, t5 - t1 + b5 - b1 )\n"
1247 ")\n"
1248 ")\n"
1249 ")\n"
1250 ");\n"
// Vertical derivative: the same weights applied to row differences.
1251 "DSTY(x,y) = scale *\n"
1252 "mad(12.0f, l3 - u3,\n"
1253 "mad(6.0f, b3 - t3,\n"
1254 "mad(8.0f, l2 - u2 + l4 - u4,\n"
1255 "mad(4.0f, b2 - t2 + b4 - t4,\n"
1256 "mad(2.0f, l1 - u1 + l5 - u5, b1 - t1 + b5 - t5 )\n"
1257 ")\n"
1258 ")\n"
1259 ")\n"
1260 ");\n"
1261 "}\n"
// sobel7: 7x7 neighbourhood, kernel_border 3, fill value 0.
1262 "__kernel void sobel7(__global const uchar * Src, int src_step, int srcOffsetX, int srcOffsetY,\n"
1263 "__global uchar * DstX, int DstXPitch, int DstXOffset,\n"
1264 "__global uchar * DstY, int DstYPitch, int DstYOffset, int dstHeight, int dstWidth,\n"
1265 "int height, int width, float scale)\n"
1266 "{\n"
1267 "__local float lsmem[BLK_Y+6][BLK_X+6];\n"
1268 "int lix = get_local_id(0);\n"
1269 "int liy = get_local_id(1);\n"
1270 "int x = (int)get_global_id(0);\n"
1271 "int y = (int)get_global_id(1);\n"
1272 "INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 3)\n"
1273 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1274 "if( x >= dstWidth || y >=dstHeight )  return;\n"
// Rows from top to bottom are tt, t, u, m, l, b, bb; the centre sample is
// not read.
1275 "float tt1 = lsmem[liy][lix];\n"
1276 "float tt2 = lsmem[liy][lix+1];\n"
1277 "float tt3 = lsmem[liy][lix+2];\n"
1278 "float tt4 = lsmem[liy][lix+3];\n"
1279 "float tt5 = lsmem[liy][lix+4];\n"
1280 "float tt6 = lsmem[liy][lix+5];\n"
1281 "float tt7 = lsmem[liy][lix+6];\n"
1282 "float t1 = lsmem[liy+1][lix];\n"
1283 "float t2 = lsmem[liy+1][lix+1];\n"
1284 "float t3 = lsmem[liy+1][lix+2];\n"
1285 "float t4 = lsmem[liy+1][lix+3];\n"
1286 "float t5 = lsmem[liy+1][lix+4];\n"
1287 "float t6 = lsmem[liy+1][lix+5];\n"
1288 "float t7 = lsmem[liy+1][lix+6];\n"
1289 "float u1 = lsmem[liy+2][lix];\n"
1290 "float u2 = lsmem[liy+2][lix+1];\n"
1291 "float u3 = lsmem[liy+2][lix+2];\n"
1292 "float u4 = lsmem[liy+2][lix+3];\n"
1293 "float u5 = lsmem[liy+2][lix+4];\n"
1294 "float u6 = lsmem[liy+2][lix+5];\n"
1295 "float u7 = lsmem[liy+2][lix+6];\n"
1296 "float m1 = lsmem[liy+3][lix];\n"
1297 "float m2 = lsmem[liy+3][lix+1];\n"
1298 "float m3 = lsmem[liy+3][lix+2];\n"
1299 "float m5 = lsmem[liy+3][lix+4];\n"
1300 "float m6 = lsmem[liy+3][lix+5];\n"
1301 "float m7 = lsmem[liy+3][lix+6];\n"
1302 "float l1 = lsmem[liy+4][lix];\n"
1303 "float l2 = lsmem[liy+4][lix+1];\n"
1304 "float l3 = lsmem[liy+4][lix+2];\n"
1305 "float l4 = lsmem[liy+4][lix+3];\n"
1306 "float l5 = lsmem[liy+4][lix+4];\n"
1307 "float l6 = lsmem[liy+4][lix+5];\n"
1308 "float l7 = lsmem[liy+4][lix+6];\n"
1309 "float b1 = lsmem[liy+5][lix];\n"
1310 "float b2 = lsmem[liy+5][lix+1];\n"
1311 "float b3 = lsmem[liy+5][lix+2];\n"
1312 "float b4 = lsmem[liy+5][lix+3];\n"
1313 "float b5 = lsmem[liy+5][lix+4];\n"
1314 "float b6 = lsmem[liy+5][lix+5];\n"
1315 "float b7 = lsmem[liy+5][lix+6];\n"
1316 "float bb1 = lsmem[liy+6][lix];\n"
1317 "float bb2 = lsmem[liy+6][lix+1];\n"
1318 "float bb3 = lsmem[liy+6][lix+2];\n"
1319 "float bb4 = lsmem[liy+6][lix+3];\n"
1320 "float bb5 = lsmem[liy+6][lix+4];\n"
1321 "float bb6 = lsmem[liy+6][lix+5];\n"
1322 "float bb7 = lsmem[liy+6][lix+6];\n"
// Horizontal derivative: weighted column differences, accumulated by nested
// mad() calls from the outermost rows inwards.
1323 "DSTX(x,y) = scale *\n"
1324 "mad(100.0f, m5 - m3,\n"
1325 "mad(80.0f, m6 - m2,\n"
1326 "mad(20.0f, m7 - m1,\n"
1327 "mad(75.0f, u5 - u3 + l5 - l3,\n"
1328 "mad(60.0f, u6 - u2 + l6 - l2,\n"
1329 "mad(15.0f, u7 - u1 + l7 - l1,\n"
1330 "mad(30.0f, t5 - t3 + b5 - b3,\n"
1331 "mad(24.0f, t6 - t2 + b6 - b2,\n"
1332 "mad(6.0f, t7 - t1 + b7 - b1,\n"
1333 "mad(5.0f, tt5 - tt3 + bb5 - bb3,\n"
1334 "mad(4.0f, tt6 - tt2 + bb6 - bb2, tt7 - tt1 + bb7 - bb1 )\n"
1335 ")\n"
1336 ")\n"
1337 ")\n"
1338 ")\n"
1339 ")\n"
1340 ")\n"
1341 ")\n"
1342 ")\n"
1343 ")\n"
1344 ");\n"
// Vertical derivative: the same weights applied to row differences.
1345 "DSTY(x,y) = scale *\n"
1346 "mad(100.0f, l4 - u4,\n"
1347 "mad(80.0f, b4 - t4,\n"
1348 "mad(20.0f, bb4 - tt4,\n"
1349 "mad(75.0f, l5 - u5 + l3 - u3,\n"
1350 "mad(60.0f, b5 - t5 + b3 - t3,\n"
1351 "mad(15.0f, bb5 - tt5 + bb3 - tt3,\n"
1352 "mad(30.0f, l6 - u6 + l2 - u2,\n"
1353 "mad(24.0f, b6 - t6 + b2 - t2,\n"
1354 "mad(6.0f, bb6 - tt6 + bb2 - tt2,\n"
1355 "mad(5.0f, l7 - u7 + l1 - u1,\n"
1356 "mad(4.0f, b7 - t7 + b1 - t1, bb7 - tt7 + bb1 - tt1 )\n"
1357 ")\n"
1358 ")\n"
1359 ")\n"
1360 ")\n"
1361 ")\n"
1362 ")\n"
1363 ")\n"
1364 ")\n"
1365 ")\n"
1366 ");\n"
1367 "}\n"
1368 , "97cb1ffd4e7c1bc93caba596bf9c6e55"};
// Constructs covardata_oclsrc from the program text above.
1369 ProgramSource covardata_oclsrc(covardata.programStr);
// Program entry "cvtcolor": start of the OpenCL source text for the colour
// conversion kernels. This first part is the configuration shared by all of
// them. The text tests or uses depth, scn, dcn, bidx and PIX_PER_WI_Y without
// defining them, so they must be supplied when the program is built.
1370 const struct ProgramEntry cvtcolor={"cvtcolor",
// `depth` selects the element type and its constants:
// 0 -> uchar, 2 -> ushort, 5 -> float; anything else is a build error.
1371 "#if depth == 0\n"
1372 "#define DATA_TYPE uchar\n"
1373 "#define MAX_NUM  255\n"
1374 "#define HALF_MAX 128\n"
1375 "#define COEFF_TYPE int\n"
1376 "#define SAT_CAST(num) convert_uchar_sat(num)\n"
1377 "#define DEPTH_0\n"
1378 "#elif depth == 2\n"
1379 "#define DATA_TYPE ushort\n"
1380 "#define MAX_NUM  65535\n"
1381 "#define HALF_MAX 32768\n"
1382 "#define COEFF_TYPE int\n"
1383 "#define SAT_CAST(num) convert_ushort_sat(num)\n"
1384 "#define DEPTH_2\n"
1385 "#elif depth == 5\n"
1386 "#define DATA_TYPE float\n"
1387 "#define MAX_NUM  1.0f\n"
1388 "#define HALF_MAX 0.5f\n"
1389 "#define COEFF_TYPE float\n"
1390 "#define SAT_CAST(num) (num)\n"
1391 "#define DEPTH_5\n"
1392 "#else\n"
1393 "#error \"invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)\"\n"
1394 "#endif\n"
// Right shift by n with rounding: half of 1 << n is added first.
1395 "#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))\n"
// Fixed-point constants. R2Y + G2Y + B2Y equals 1 << yuv_shift (16384).
1396 "enum\n"
1397 "{\n"
1398 "yuv_shift  = 14,\n"
1399 "xyz_shift  = 12,\n"
1400 "hsv_shift  = 12,\n"
1401 "R2Y        = 4899,\n"
1402 "G2Y        = 9617,\n"
1403 "B2Y        = 1868,\n"
1404 "BLOCK_SIZE = 256\n"
1405 "};\n"
// Bytes per source / destination pixel.
1406 "#define scnbytes ((int)sizeof(DATA_TYPE)*scn)\n"
1407 "#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)\n"
// Defaults for options a particular build may leave undefined.
1408 "#ifndef hscale\n"
1409 "#define hscale 0\n"
1410 "#endif\n"
1411 "#ifndef hrange\n"
1412 "#define hrange 0\n"
1413 "#endif\n"
// `bidx` maps colour names to vector components: 0 puts blue in .x and red
// in .z, 2 puts red in .x and blue in .z, 3 maps all three names to .w.
// Other values leave the *_COMP macros undefined.
1414 "#if bidx == 0\n"
1415 "#define R_COMP z\n"
1416 "#define G_COMP y\n"
1417 "#define B_COMP x\n"
1418 "#elif bidx == 2\n"
1419 "#define R_COMP x\n"
1420 "#define G_COMP y\n"
1421 "#define B_COMP z\n"
1422 "#elif bidx == 3\n"
1423 "#define R_COMP w\n"
1424 "#define G_COMP w\n"
1425 "#define B_COMP w\n"
1426 "#endif\n"
1427 "#ifndef uidx\n"
1428 "#define uidx 0\n"
1429 "#endif\n"
1430 "#ifndef yidx\n"
1431 "#define yidx 0\n"
1432 "#endif\n"
1433 "#ifndef PIX_PER_WI_X\n"
1434 "#define PIX_PER_WI_X 1\n"
1435 "#endif\n"
// Two-level concatenation so DATA_TYPE is expanded before pasting; this
// yields the four-component vector type of the element type.
1436 "#define __CAT(x, y) x##y\n"
1437 "#define CAT(x, y) __CAT(x, y)\n"
1438 "#define DATA_TYPE_4 CAT(DATA_TYPE, 4)\n"
// OpenCL source text of kernel RGB2Gray: writes one weighted-sum value per
// source pixel. Each work-item handles column x and up to PIX_PER_WI_Y
// consecutive rows starting at get_global_id(1) * PIX_PER_WI_Y.
1439 "__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,\n"
1440 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
1441 "int rows, int cols)\n"
1442 "{\n"
1443 "int x = get_global_id(0);\n"
1444 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1445 "if (x < cols)\n"
1446 "{\n"
1447 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
1448 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
1449 "#pragma unroll\n"
1450 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1451 "{\n"
1452 "if (y < rows)\n"
1453 "{\n"
1454 "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n"
1455 "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n"
// NOTE(review): vload4 always reads four elements; confirm that this stays
// in bounds for the last pixel when scn is 3.
1456 "DATA_TYPE_4 src_pix = vload4(0, src);\n"
// Float path uses the weights 0.114 / 0.587 / 0.299 for B / G / R; the
// integer path uses B2Y / G2Y / R2Y and descales by yuv_shift.
1457 "#ifdef DEPTH_5\n"
1458 "dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f));\n"
1459 "#else\n"
1460 "dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);\n"
1461 "#endif\n"
// Advance to the next row; this happens only while y is still inside the image.
1462 "++y;\n"
1463 "src_index += src_step;\n"
1464 "dst_index += dst_step;\n"
1465 "}\n"
1466 "}\n"
1467 "}\n"
1468 "}\n"
// OpenCL source text of kernel Gray2RGB: copies one source value into the
// first three channels of the destination pixel. Each work-item handles
// column x and up to PIX_PER_WI_Y consecutive rows.
1469 "__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,\n"
1470 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
1471 "int rows, int cols)\n"
1472 "{\n"
1473 "int x = get_global_id(0);\n"
1474 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1475 "if (x < cols)\n"
1476 "{\n"
1477 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
1478 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
1479 "#pragma unroll\n"
1480 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1481 "{\n"
1482 "if (y < rows)\n"
1483 "{\n"
1484 "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n"
1485 "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n"
1486 "DATA_TYPE val = src[0];\n"
// Three-channel output, or any float output, is written element by element
// (with MAX_NUM as fourth channel when dcn is 4); every other case writes all
// four channels with a single vector store.
1487 "#if dcn == 3 || defined DEPTH_5\n"
1488 "dst[0] = dst[1] = dst[2] = val;\n"
1489 "#if dcn == 4\n"
1490 "dst[3] = MAX_NUM;\n"
1491 "#endif\n"
1492 "#else\n"
1493 "*(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);\n"
1494 "#endif\n"
1495 "++y;\n"
1496 "dst_index += dst_step;\n"
1497 "src_index += src_step;\n"
1498 "}\n"
1499 "}\n"
1500 "}\n"
1501 "}\n"
// OpenCL source text: coefficient tables and kernel RGB2YUV.
// Entries 0..2 weight B, G, R for Y; entries 3 and 4 scale (B - Y) and
// (R - Y). The integer table holds the same factors scaled by 1 << yuv_shift.
1502 "__constant float c_RGB2YUVCoeffs_f[5]  = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };\n"
1503 "__constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, 8061, 14369 };\n"
// Each work-item handles column x and up to PIX_PER_WI_Y consecutive rows.
// The destination offset parameter is spelled dt_offset in this kernel.
1504 "__kernel void RGB2YUV(__global const uchar* srcptr, int src_step, int src_offset,\n"
1505 "__global uchar* dstptr, int dst_step, int dt_offset,\n"
1506 "int rows, int cols)\n"
1507 "{\n"
1508 "int x = get_global_id(0);\n"
1509 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1510 "if (x < cols)\n"
1511 "{\n"
1512 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
1513 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));\n"
1514 "#pragma unroll\n"
1515 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1516 "{\n"
1517 "if (y < rows)\n"
1518 "{\n"
1519 "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n"
1520 "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n"
1521 "DATA_TYPE_4 src_pix = vload4(0, src);\n"
1522 "DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n"
// Float path: U and V are the scaled differences offset by HALF_MAX.
1523 "#ifdef DEPTH_5\n"
1524 "__constant float * coeffs = c_RGB2YUVCoeffs_f;\n"
1525 "const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2]));\n"
1526 "const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX);\n"
1527 "const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX);\n"
1528 "#else\n"
// Integer path: `delta` is HALF_MAX in the fixed-point scale, added before
// descaling.
1529 "__constant int * coeffs = c_RGB2YUVCoeffs_i;\n"
1530 "const int delta = HALF_MAX * (1 << yuv_shift);\n"
1531 "const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift);\n"
1532 "const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);\n"
1533 "const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);\n"
1534 "#endif\n"
1535 "dst[0] = SAT_CAST( Y );\n"
1536 "dst[1] = SAT_CAST( U );\n"
1537 "dst[2] = SAT_CAST( V );\n"
1538 "++y;\n"
1539 "dst_index += dst_step;\n"
1540 "src_index += src_step;\n"
1541 "}\n"
1542 "}\n"
1543 "}\n"
1544 "}\n"
// OpenCL source text: coefficient tables and kernel YUV2RGB.
// Entry 0 scales (U - HALF_MAX) for blue, entries 1 and 2 scale the U and V
// terms for green, entry 3 scales (V - HALF_MAX) for red. The integer table
// holds the same factors scaled by 1 << yuv_shift.
1545 "__constant float c_YUV2RGBCoeffs_f[4] = { 2.032f, -0.395f, -0.581f, 1.140f };\n"
1546 "__constant int   c_YUV2RGBCoeffs_i[4] = { 33292, -6472, -9519, 18678 };\n"
// Each work-item handles column x and up to PIX_PER_WI_Y consecutive rows.
1547 "__kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,\n"
1548 "__global uchar* dstptr, int dst_step, int dt_offset,\n"
1549 "int rows, int cols)\n"
1550 "{\n"
1551 "int x = get_global_id(0);\n"
1552 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1553 "if (x < cols)\n"
1554 "{\n"
1555 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
1556 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));\n"
1557 "#pragma unroll\n"
1558 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1559 "{\n"
1560 "if (y < rows)\n"
1561 "{\n"
1562 "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n"
1563 "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n"
1564 "DATA_TYPE_4 src_pix = vload4(0, src);\n"
1565 "DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;\n"
1566 "#ifdef DEPTH_5\n"
1567 "__constant float * coeffs = c_YUV2RGBCoeffs_f;\n"
1568 "float r = fma(V - HALF_MAX, coeffs[3], Y);\n"
1569 "float g = fma(V - HALF_MAX, coeffs[2], fma(U - HALF_MAX, coeffs[1], Y));\n"
1570 "float b = fma(U - HALF_MAX, coeffs[0], Y);\n"
1571 "#else\n"
1572 "__constant int * coeffs = c_YUV2RGBCoeffs_i;\n"
1573 "const int r = Y + CV_DESCALE(mul24(V - HALF_MAX, coeffs[3]), yuv_shift);\n"
1574 "const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], mul24(U - HALF_MAX, coeffs[1])), yuv_shift);\n"
1575 "const int b = Y + CV_DESCALE(mul24(U - HALF_MAX, coeffs[0]), yuv_shift);\n"
1576 "#endif\n"
// Blue goes to channel bidx and red to channel bidx ^ 2; green is always
// channel 1. A fourth channel, when present, is set to MAX_NUM.
1577 "dst[bidx] = SAT_CAST( b );\n"
1578 "dst[1] = SAT_CAST( g );\n"
1579 "dst[bidx^2] = SAT_CAST( r );\n"
1580 "#if dcn == 4\n"
1581 "dst[3] = MAX_NUM;\n"
1582 "#endif\n"
1583 "++y;\n"
1584 "dst_index += dst_step;\n"
1585 "src_index += src_step;\n"
1586 "}\n"
1587 "}\n"
1588 "}\n"
1589 "}\n"
// OpenCL source text: coefficient table shared by the 4:2:0 decoders, and
// kernel YUV2RGB_NVx. Entry 0 scales the luma after 16 is subtracted;
// entries 1..4 scale the U term of blue, the U and V terms of green and the
// V term of red.
1590 "__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f,\n"
1591 "-0.812999725f, 1.5959997177f };\n"
// Each work-item converts 2x2 pixel blocks: x indexes the block column, and
// the loop covers up to PIX_PER_WI_Y block rows. Luma is read from rows
// 2y and 2y+1; the chroma pair is read from row `rows + y` at the same byte
// column, with uidx selecting which byte of the pair is U.
1592 "__kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_offset,\n"
1593 "__global uchar* dstptr, int dst_step, int dt_offset,\n"
1594 "int rows, int cols)\n"
1595 "{\n"
1596 "int x = get_global_id(0);\n"
1597 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1598 "if (x < cols / 2)\n"
1599 "{\n"
1600 "#pragma unroll\n"
1601 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1602 "{\n"
1603 "if (y < rows / 2 )\n"
1604 "{\n"
1605 "__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);\n"
1606 "__global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);\n"
1607 "__global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));\n"
1608 "__global uchar*       dst2 = dst1 + dst_step;\n"
1609 "float Y1 = ysrc[0];\n"
1610 "float Y2 = ysrc[1];\n"
1611 "float Y3 = ysrc[src_step];\n"
1612 "float Y4 = ysrc[src_step + 1];\n"
1613 "float U  = ((float)usrc[uidx]) - HALF_MAX;\n"
1614 "float V  = ((float)usrc[1-uidx]) - HALF_MAX;\n"
1615 "__constant float* coeffs = c_YUV2RGBCoeffs_420;\n"
// Chroma contributions are computed once and shared by the four pixels; the
// added 0.5f presumably provides rounding ahead of the saturating
// conversion -- TODO confirm.
1616 "float ruv = fma(coeffs[4], V, 0.5f);\n"
1617 "float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));\n"
1618 "float buv = fma(coeffs[1], U, 0.5f);\n"
// Top-left pixel.
1619 "Y1 = max(0.f, Y1 - 16.f) * coeffs[0];\n"
1620 "dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);\n"
1621 "dst1[1]        = convert_uchar_sat(Y1 + guv);\n"
1622 "dst1[bidx]     = convert_uchar_sat(Y1 + buv);\n"
1623 "#if dcn == 4\n"
1624 "dst1[3]        = 255;\n"
1625 "#endif\n"
// Top-right pixel, dcn bytes further along the same row.
1626 "Y2 = max(0.f, Y2 - 16.f) * coeffs[0];\n"
1627 "dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);\n"
1628 "dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);\n"
1629 "dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);\n"
1630 "#if dcn == 4\n"
1631 "dst1[7]        = 255;\n"
1632 "#endif\n"
// Bottom-left pixel.
1633 "Y3 = max(0.f, Y3 - 16.f) * coeffs[0];\n"
1634 "dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);\n"
1635 "dst2[1]        = convert_uchar_sat(Y3 + guv);\n"
1636 "dst2[bidx]     = convert_uchar_sat(Y3 + buv);\n"
1637 "#if dcn == 4\n"
1638 "dst2[3]        = 255;\n"
1639 "#endif\n"
// Bottom-right pixel.
1640 "Y4 = max(0.f, Y4 - 16.f) * coeffs[0];\n"
1641 "dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);\n"
1642 "dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);\n"
1643 "dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);\n"
1644 "#if dcn == 4\n"
1645 "dst2[7]        = 255;\n"
1646 "#endif\n"
1647 "}\n"
// Here the row counter advances on every iteration, in or out of range.
1648 "++y;\n"
1649 "}\n"
1650 "}\n"
1651 "}\n"
// OpenCL source text of kernel YUV2RGB_YV12_IYUV. It has the same 2x2-block
// structure and output stage as YUV2RGB_NVx, but the two chroma samples are
// read from two separate locations after the luma rows rather than from an
// interleaved pair. It uses c_YUV2RGBCoeffs_420, declared earlier in this
// program text.
1652 "__kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,\n"
1653 "__global uchar* dstptr, int dst_step, int dt_offset,\n"
1654 "int rows, int cols)\n"
1655 "{\n"
1656 "int x = get_global_id(0);\n"
1657 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1658 "if (x < cols / 2)\n"
1659 "{\n"
1660 "#pragma unroll\n"
1661 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1662 "{\n"
1663 "if (y < rows / 2 )\n"
1664 "{\n"
1665 "__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);\n"
1666 "__global uchar*       dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);\n"
1667 "__global uchar*       dst2 = dst1 + dst_step;\n"
1668 "float Y1 = ysrc[0];\n"
1669 "float Y2 = ysrc[1];\n"
1670 "float Y3 = ysrc[src_step];\n"
1671 "float Y4 = ysrc[src_step + 1];\n"
// SRC_CONT: chroma is addressed as one packed run starting at row `rows`;
// the first sample is at y * (cols / 2) + x and the second lies
// (rows * cols) / 4 bytes after it.
1672 "#ifdef SRC_CONT\n"
1673 "__global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);\n"
1674 "int u_ind = mad24(y, cols >> 1, x);\n"
1675 "float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX };\n"
1676 "#else\n"
// Otherwise two chroma rows share one source row: row parity (y % 2) picks
// the half, and the second sample's address gets an extra correction when
// rows is not a multiple of 4. NOTE(review): the layout this assumes is not
// stated here -- confirm against the host code.
1677 "int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};\n"
1678 "__global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);\n"
1679 "__global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);\n"
1680 "float uv[2] = { ((float)usrc[0]) - HALF_MAX, ((float)vsrc[0]) - HALF_MAX };\n"
1681 "#endif\n"
// uidx selects which of the two samples is U.
1682 "float U = uv[uidx];\n"
1683 "float V = uv[1-uidx];\n"
1684 "__constant float* coeffs = c_YUV2RGBCoeffs_420;\n"
// Chroma contributions shared by the four pixels of the block.
1685 "float ruv = fma(coeffs[4], V, 0.5f);\n"
1686 "float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));\n"
1687 "float buv = fma(coeffs[1], U, 0.5f);\n"
// Top-left pixel.
1688 "Y1 = max(0.f, Y1 - 16.f) * coeffs[0];\n"
1689 "dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);\n"
1690 "dst1[1]        = convert_uchar_sat(Y1 + guv);\n"
1691 "dst1[bidx]     = convert_uchar_sat(Y1 + buv);\n"
1692 "#if dcn == 4\n"
1693 "dst1[3]        = 255;\n"
1694 "#endif\n"
// Top-right pixel.
1695 "Y2 = max(0.f, Y2 - 16.f) * coeffs[0];\n"
1696 "dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);\n"
1697 "dst1[dcn + 1]        = convert_uchar_sat(Y2 + guv);\n"
1698 "dst1[dcn + bidx]     = convert_uchar_sat(Y2 + buv);\n"
1699 "#if dcn == 4\n"
1700 "dst1[7]        = 255;\n"
1701 "#endif\n"
// Bottom-left pixel.
1702 "Y3 = max(0.f, Y3 - 16.f) * coeffs[0];\n"
1703 "dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);\n"
1704 "dst2[1]        = convert_uchar_sat(Y3 + guv);\n"
1705 "dst2[bidx]     = convert_uchar_sat(Y3 + buv);\n"
1706 "#if dcn == 4\n"
1707 "dst2[3]        = 255;\n"
1708 "#endif\n"
// Bottom-right pixel.
1709 "Y4 = max(0.f, Y4 - 16.f) * coeffs[0];\n"
1710 "dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);\n"
1711 "dst2[dcn + 1]        = convert_uchar_sat(Y4 + guv);\n"
1712 "dst2[dcn + bidx]     = convert_uchar_sat(Y4 + buv);\n"
1713 "#if dcn == 4\n"
1714 "dst2[7]        = 255;\n"
1715 "#endif\n"
1716 "}\n"
1717 "++y;\n"
1718 "}\n"
1719 "}\n"
1720 "}\n"
// Coefficient table for RGB2YUV_YV12_IYUV: entries 0..2 weight the luma value,
// entries 3..5 the first chroma value and entries 5..7 the second chroma value
// (entry 5 is used by both, as the kernel below shows).
1721 "__constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,\n"
1722 "0.438999176f, -0.3679990768f, -0.0709991455f };\n"
// Kernel RGB2YUV_YV12_IYUV (OpenCL source held in string literals; literal text
// unchanged). Converts interleaved scn-channel pixels into a luma plane followed
// by two subsampled chroma planes in the same destination buffer.
// x advances in steps of PIX_PER_WI_X column pairs, y in steps of PIX_PER_WI_Y
// row pairs. The bounds use rows / 3 and y_rows = rows / 3 * 2, so `rows`
// appears to be the height of the whole planar destination (luma plus chroma)
// -- confirm against the host code.
1723 "__kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,\n"
1724 "__global uchar* dstptr, int dst_step, int dst_offset,\n"
1725 "int rows, int cols)\n"
1726 "{\n"
1727 "int x = get_global_id(0) * PIX_PER_WI_X;\n"
1728 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1729 "if (x < cols/2)\n"
1730 "{\n"
1731 "int src_index  = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));\n"
1732 "int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);\n"
1733 "int y_rows = rows / 3 * 2;\n"
1734 "int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};\n"
1735 "__constant float* coeffs = c_RGB2YUVCoeffs_420;\n"
1736 "#pragma unroll\n"
1737 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1738 "{\n"
1739 "if (y < rows / 3)\n"
1740 "{\n"
1741 "__global const uchar* src1 = srcptr + src_index;\n"
1742 "__global const uchar* src2 = src1 + src_step;\n"
1743 "__global uchar* ydst1 = dstptr + ydst_index;\n"
1744 "__global uchar* ydst2 = ydst1 + dst_step;\n"
// Chroma addressing: two chroma rows of cols/2 bytes are addressed per
// destination row (even y -> first half, odd y -> second half); the second
// plane is reached with an extra parity-dependent offset when y_rows % 4 != 0.
1745 "__global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);\n"
1746 "__global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? vsteps[y%2] : 0);\n"
// Wide path: four pixels per source row are fetched as three (scn == 3) or four
// (scn == 4) 32-bit words.
// NOTE(review): the int-typed loads and stores in this branch assume the
// addresses are suitably aligned for 32-bit access - confirm with the host side.
1747 "#if PIX_PER_WI_X == 2\n"
1748 "int s11 = *((__global const int*) src1);\n"
1749 "int s12 = *((__global const int*) src1 + 1);\n"
1750 "int s13 = *((__global const int*) src1 + 2);\n"
1751 "#if scn == 4\n"
1752 "int s14 = *((__global const int*) src1 + 3);\n"
1753 "#endif\n"
1754 "int s21 = *((__global const int*) src2);\n"
1755 "int s22 = *((__global const int*) src2 + 1);\n"
1756 "int s23 = *((__global const int*) src2 + 2);\n"
1757 "#if scn == 4\n"
1758 "int s24 = *((__global const int*) src2 + 3);\n"
1759 "#endif\n"
// Unpack the words into per-byte floats: src_pix1 holds the upper row, src_pix2
// the lower row, scn floats per pixel.
1760 "float src_pix1[scn * 4], src_pix2[scn * 4];\n"
1761 "*((float4*) src_pix1)     = convert_float4(as_uchar4(s11));\n"
1762 "*((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));\n"
1763 "*((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));\n"
1764 "#if scn == 4\n"
1765 "*((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));\n"
1766 "#endif\n"
1767 "*((float4*) src_pix2)     = convert_float4(as_uchar4(s21));\n"
1768 "*((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));\n"
1769 "*((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));\n"
1770 "#if scn == 4\n"
1771 "*((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));\n"
1772 "#endif\n"
// Luma for the four pixels of each row; 16.5f is the constant term of the
// weighted sum. Each row's four results are written with one 32-bit store.
1773 "uchar4 y1, y2;\n"
1774 "y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[      2-bidx], fma(coeffs[1], src_pix1[      1], fma(coeffs[2], src_pix1[      bidx], 16.5f))));\n"
1775 "y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[  scn+2-bidx], fma(coeffs[1], src_pix1[  scn+1], fma(coeffs[2], src_pix1[  scn+bidx], 16.5f))));\n"
1776 "y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));\n"
1777 "y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));\n"
1778 "y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[      2-bidx], fma(coeffs[1], src_pix2[      1], fma(coeffs[2], src_pix2[      bidx], 16.5f))));\n"
1779 "y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[  scn+2-bidx], fma(coeffs[1], src_pix2[  scn+1], fma(coeffs[2], src_pix2[  scn+bidx], 16.5f))));\n"
1780 "y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));\n"
1781 "y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));\n"
1782 "*((__global int*) ydst1) = as_int(y1);\n"
1783 "*((__global int*) ydst2) = as_int(y2);\n"
// Chroma is taken from pixels 0 and 2 of the upper row only (the top-left pixel
// of each 2x2 block); uidx decides which value goes to udst and which to vdst.
1784 "float uv[4] = { fma(coeffs[3], src_pix1[      2-bidx], fma(coeffs[4], src_pix1[      1], fma(coeffs[5], src_pix1[      bidx], 128.5f))),\n"
1785 "fma(coeffs[5], src_pix1[      2-bidx], fma(coeffs[6], src_pix1[      1], fma(coeffs[7], src_pix1[      bidx], 128.5f))),\n"
1786 "fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),\n"
1787 "fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };\n"
1788 "udst[0] = convert_uchar_sat(uv[uidx]    );\n"
1789 "vdst[0] = convert_uchar_sat(uv[1 - uidx]);\n"
1790 "udst[1] = convert_uchar_sat(uv[2 + uidx]);\n"
1791 "vdst[1] = convert_uchar_sat(uv[3 - uidx]);\n"
// Narrow path: one 2x2 block per iteration. vload4 always fetches four bytes
// per pixel; only the R_COMP/G_COMP/B_COMP components are used.
1792 "#else\n"
1793 "float4 src_pix1 = convert_float4(vload4(0, src1));\n"
1794 "float4 src_pix2 = convert_float4(vload4(0, src1+scn));\n"
1795 "float4 src_pix3 = convert_float4(vload4(0, src2));\n"
1796 "float4 src_pix4 = convert_float4(vload4(0, src2+scn));\n"
1797 "ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));\n"
1798 "ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));\n"
1799 "ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));\n"
1800 "ydst2[1] = convert_uchar_sat(fma(coeffs[0], src_pix4.R_COMP, fma(coeffs[1], src_pix4.G_COMP, fma(coeffs[2], src_pix4.B_COMP, 16.5f))));\n"
// Chroma from the top-left pixel of the block.
1801 "float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),\n"
1802 "fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };\n"
1803 "udst[0] = convert_uchar_sat(uv[uidx]  );\n"
1804 "vdst[0] = convert_uchar_sat(uv[1-uidx]);\n"
1805 "#endif\n"
// The advance sits inside the bounds check: once y is out of range the
// remaining loop iterations do nothing.
1806 "++y;\n"
1807 "src_index += 2*src_step;\n"
1808 "ydst_index += 2*dst_step;\n"
1809 "}\n"
1810 "}\n"
1811 "}\n"
1812 "}\n"
// Kernel YUV2RGB_422 (OpenCL source held in string literals; literal text
// unchanged). Each work-item converts one 4-byte source group per row into two
// output pixels of dcn channels, for PIX_PER_WI_Y consecutive rows.
// Within the group, U is the byte at index uidx, V at (2 + uidx) % 4, and the
// two luma bytes are at yidx and yidx + 2; both pixels share U and V.
1813 "__kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,\n"
1814 "__global uchar* dstptr, int dst_step, int dst_offset,\n"
1815 "int rows, int cols)\n"
1816 "{\n"
1817 "int x = get_global_id(0);\n"
1818 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1819 "if (x < cols / 2)\n"
1820 "{\n"
1821 "__global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);\n"
1822 "__global uchar*       dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));\n"
1823 "#pragma unroll\n"
1824 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1825 "{\n"
1826 "if (y < rows )\n"
1827 "{\n"
// The same coefficient table as the 4:2:0 kernels is used here.
1828 "__constant float* coeffs = c_YUV2RGBCoeffs_420;\n"
// Default path: four separate byte reads.
1829 "#ifndef USE_OPTIMIZED_LOAD\n"
1830 "float U = ((float) src[uidx]) - HALF_MAX;\n"
1831 "float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX;\n"
1832 "float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];\n"
1833 "float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];\n"
// USE_OPTIMIZED_LOAD: one 32-bit load, then the four bytes are extracted by
// shifting and masking.
// NOTE(review): assumes src is suitably aligned for an int load - confirm.
1834 "#else\n"
1835 "int load_src = *((__global int*) src);\n"
1836 "float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};\n"
1837 "float U = vec_src[uidx] - HALF_MAX;\n"
1838 "float V = vec_src[(2 + uidx) % 4] - HALF_MAX;\n"
1839 "float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];\n"
1840 "float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];\n"
1841 "#endif\n"
// Chroma terms (including the +0.5f) are shared by both output pixels.
1842 "float ruv = fma(coeffs[4], V, 0.5f);\n"
1843 "float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));\n"
1844 "float buv = fma(coeffs[1], U, 0.5f);\n"
// First pixel.
1845 "dst[2 - bidx] = convert_uchar_sat(y00 + ruv);\n"
1846 "dst[1]        = convert_uchar_sat(y00 + guv);\n"
1847 "dst[bidx]     = convert_uchar_sat(y00 + buv);\n"
1848 "#if dcn == 4\n"
1849 "dst[3]        = 255;\n"
1850 "#endif\n"
// Second pixel, dcn bytes further on.
1851 "dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);\n"
1852 "dst[dcn + 1]        = convert_uchar_sat(y01 + guv);\n"
1853 "dst[dcn + bidx]     = convert_uchar_sat(y01 + buv);\n"
1854 "#if dcn == 4\n"
1855 "dst[7]        = 255;\n"
1856 "#endif\n"
1857 "}\n"
// Row advance happens on every iteration, outside the bounds check.
1858 "++y;\n"
1859 "src += src_step;\n"
1860 "dst += dst_step;\n"
1861 "}\n"
1862 "}\n"
1863 "}\n"
// Coefficient tables for RGB2YCrCb: a float set (used when DEPTH_5 is defined)
// and an integer fixed-point set whose first three entries are the R2Y, G2Y and
// B2Y macros. Entries 0..2 weight r, g, b for Y; entry 3 scales (r - Y) for Cr
// and entry 4 scales (b - Y) for Cb.
1864 "__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};\n"
1865 "__constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};\n"
// Kernel RGB2YCrCb (OpenCL source held in string literals; literal text
// unchanged). One work-item converts one pixel per row for PIX_PER_WI_Y
// consecutive rows and writes Y, Cr, Cb to destination channels 0, 1, 2.
1866 "__kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,\n"
1867 "__global uchar* dstptr, int dst_step, int dt_offset,\n"
1868 "int rows, int cols)\n"
1869 "{\n"
1870 "int x = get_global_id(0);\n"
1871 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1872 "if (x < cols)\n"
1873 "{\n"
// Byte offsets of the first pixel handled by this work-item.
1874 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
1875 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));\n"
1876 "#pragma unroll\n"
1877 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1878 "{\n"
1879 "if (y < rows)\n"
1880 "{\n"
1881 "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n"
1882 "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n"
// Four elements are loaded regardless of the source channel count; only the
// B_COMP/G_COMP/R_COMP components are used.
1883 "DATA_TYPE_4 src_pix = vload4(0, src);\n"
1884 "DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n"
// Floating-point path.
1885 "#ifdef DEPTH_5\n"
1886 "__constant float * coeffs = c_RGB2YCrCbCoeffs_f;\n"
1887 "DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));\n"
1888 "DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX);\n"
1889 "DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX);\n"
// Integer path: fixed-point arithmetic with yuv_shift fractional bits; delta is
// HALF_MAX expressed in that fixed-point scale, and CV_DESCALE (defined
// elsewhere) brings results back to integer range.
1890 "#else\n"
1891 "__constant int * coeffs = c_RGB2YCrCbCoeffs_i;\n"
1892 "int delta = HALF_MAX * (1 << yuv_shift);\n"
1893 "int Y =  CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);\n"
1894 "int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);\n"
1895 "int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);\n"
1896 "#endif\n"
1897 "dst[0] = SAT_CAST( Y );\n"
1898 "dst[1] = SAT_CAST( Cr );\n"
1899 "dst[2] = SAT_CAST( Cb );\n"
// Advance to the next row; this sits inside the bounds check, so iterations
// after the last valid row do nothing.
1900 "++y;\n"
1901 "dst_index += dst_step;\n"
1902 "src_index += src_step;\n"
1903 "}\n"
1904 "}\n"
1905 "}\n"
1906 "}\n"
// Coefficient tables for YCrCb2RGB (float and integer fixed-point variants):
// entry 0 scales (cr - HALF_MAX) for r, entries 1 and 2 scale (cr - HALF_MAX)
// and (cb - HALF_MAX) for g, entry 3 scales (cb - HALF_MAX) for b.
1907 "__constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };\n"
1908 "__constant int   c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };\n"
// Kernel YCrCb2RGB (OpenCL source held in string literals; literal text
// unchanged). One work-item converts one pixel per row for PIX_PER_WI_Y
// consecutive rows. Source channels 0, 1, 2 are read as Y, Cr, Cb; r is written
// at index bidx ^ 2, g at index 1 and b at index bidx.
1909 "__kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,\n"
1910 "__global uchar* dst, int dst_step, int dst_offset,\n"
1911 "int rows, int cols)\n"
1912 "{\n"
1913 "int x = get_global_id(0);\n"
1914 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
1915 "if (x < cols)\n"
1916 "{\n"
1917 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
1918 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
1919 "#pragma unroll\n"
1920 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1921 "{\n"
1922 "if (y < rows)\n"
1923 "{\n"
1924 "__global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index);\n"
1925 "__global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index);\n"
1926 "DATA_TYPE_4 src_pix = vload4(0, srcptr);\n"
1927 "DATA_TYPE yp = src_pix.x, cr = src_pix.y, cb = src_pix.z;\n"
// Floating-point path.
1928 "#ifdef DEPTH_5\n"
1929 "__constant float * coeff = c_YCrCb2RGBCoeffs_f;\n"
1930 "float r = fma(coeff[0], cr - HALF_MAX, yp);\n"
1931 "float g = fma(coeff[1], cr - HALF_MAX, fma(coeff[2], cb - HALF_MAX, yp));\n"
1932 "float b = fma(coeff[3], cb - HALF_MAX, yp);\n"
// Integer path: products are in fixed point with yuv_shift fractional bits and
// are descaled before being added to the luma value.
1933 "#else\n"
1934 "__constant int * coeff = c_YCrCb2RGBCoeffs_i;\n"
1935 "int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);\n"
1936 "int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX, coeff[2] * (cb - HALF_MAX)), yuv_shift);\n"
1937 "int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);\n"
1938 "#endif\n"
1939 "dstptr[(bidx^2)] = SAT_CAST(r);\n"
1940 "dstptr[1] = SAT_CAST(g);\n"
1941 "dstptr[bidx] = SAT_CAST(b);\n"
// Four-channel output: the fourth channel is filled with MAX_NUM.
1942 "#if dcn == 4\n"
1943 "dstptr[3] = MAX_NUM;\n"
1944 "#endif\n"
// Row advance is inside the bounds check.
1945 "++y;\n"
1946 "dst_index += dst_step;\n"
1947 "src_index += src_step;\n"
1948 "}\n"
1949 "}\n"
1950 "}\n"
1951 "}\n"
// Kernel RGB2XYZ (OpenCL source held in string literals; literal text
// unchanged). Applies a 3x3 matrix, passed in `coeffs` as nine values in
// row-major order, to source channels 0, 1, 2 (named r, g, b here) and writes
// the three results to destination channels 0, 1, 2. One pixel per row is
// processed for PIX_PER_WI_Y consecutive rows.
1952 "__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,\n"
1953 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
1954 "int rows, int cols, __constant COEFF_TYPE * coeffs)\n"
1955 "{\n"
1956 "int dx = get_global_id(0);\n"
1957 "int dy = get_global_id(1) * PIX_PER_WI_Y;\n"
1958 "if (dx < cols)\n"
1959 "{\n"
1960 "int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));\n"
1961 "int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));\n"
1962 "#pragma unroll\n"
1963 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
1964 "{\n"
1965 "if (dy < rows)\n"
1966 "{\n"
1967 "__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);\n"
1968 "__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);\n"
1969 "DATA_TYPE_4 src_pix = vload4(0, src);\n"
1970 "DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;\n"
// Floating-point path: plain matrix-vector product.
1971 "#ifdef DEPTH_5\n"
1972 "float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));\n"
1973 "float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));\n"
1974 "float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));\n"
// Integer path: the same product in fixed point, descaled by xyz_shift.
1975 "#else\n"
1976 "int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);\n"
1977 "int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);\n"
1978 "int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);\n"
1979 "#endif\n"
1980 "dst[0] = SAT_CAST(x);\n"
1981 "dst[1] = SAT_CAST(y);\n"
1982 "dst[2] = SAT_CAST(z);\n"
// Row advance is inside the bounds check.
1983 "++dy;\n"
1984 "dst_index += dst_step;\n"
1985 "src_index += src_step;\n"
1986 "}\n"
1987 "}\n"
1988 "}\n"
1989 "}\n"
// Kernel XYZ2RGB (OpenCL source held in string literals; literal text
// unchanged). Applies a 3x3 matrix from `coeffs` (nine values, row-major) to
// source channels 0, 1, 2 (x, y, z); the three matrix rows yield the values
// named b, g and r, written to destination channels 0, 1 and 2 in that order.
// One pixel per row is processed for PIX_PER_WI_Y consecutive rows.
1990 "__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,\n"
1991 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
1992 "int rows, int cols, __constant COEFF_TYPE * coeffs)\n"
1993 "{\n"
1994 "int dx = get_global_id(0);\n"
1995 "int dy = get_global_id(1) * PIX_PER_WI_Y;\n"
1996 "if (dx < cols)\n"
1997 "{\n"
1998 "int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));\n"
1999 "int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));\n"
2000 "#pragma unroll\n"
2001 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2002 "{\n"
2003 "if (dy < rows)\n"
2004 "{\n"
2005 "__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);\n"
2006 "__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);\n"
2007 "DATA_TYPE_4 src_pix = vload4(0, src);\n"
2008 "DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;\n"
// Floating-point path.
2009 "#ifdef DEPTH_5\n"
2010 "float b = fma(x, coeffs[0], fma(y, coeffs[1], z * coeffs[2]));\n"
2011 "float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));\n"
2012 "float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));\n"
// Integer path: fixed point, descaled by xyz_shift.
2013 "#else\n"
2014 "int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);\n"
2015 "int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);\n"
2016 "int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);\n"
2017 "#endif\n"
2018 "DATA_TYPE dst0 = SAT_CAST(b);\n"
2019 "DATA_TYPE dst1 = SAT_CAST(g);\n"
2020 "DATA_TYPE dst2 = SAT_CAST(r);\n"
// Store strategy: scalar stores for three-channel output and for the float
// depth (with MAX_NUM alpha when dcn == 4); otherwise, i.e. four-channel
// non-float output, a single vector store of all four channels.
2021 "#if dcn == 3 || defined DEPTH_5\n"
2022 "dst[0] = dst0;\n"
2023 "dst[1] = dst1;\n"
2024 "dst[2] = dst2;\n"
2025 "#if dcn == 4\n"
2026 "dst[3] = MAX_NUM;\n"
2027 "#endif\n"
2028 "#else\n"
2029 "*(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);\n"
2030 "#endif\n"
// Row advance is inside the bounds check.
2031 "++dy;\n"
2032 "dst_index += dst_step;\n"
2033 "src_index += src_step;\n"
2034 "}\n"
2035 "}\n"
2036 "}\n"
2037 "}\n"
// Kernel RGB (OpenCL source held in string literals; literal text unchanged).
// Copies the first three channels of each pixel from source to destination,
// swapping channels 0 and 2 when REVERSE is defined. When the destination has
// four channels, the fourth is MAX_NUM for a three-channel source and a copy of
// the source's fourth channel otherwise. One pixel per row is processed for
// PIX_PER_WI_Y consecutive rows.
2038 "__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,\n"
2039 "__global uchar* dstptr, int dst_step, int dst_offset,\n"
2040 "int rows, int cols)\n"
2041 "{\n"
2042 "int x = get_global_id(0);\n"
2043 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2044 "if (x < cols)\n"
2045 "{\n"
2046 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2047 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2048 "#pragma unroll\n"
2049 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2050 "{\n"
2051 "if (y < rows)\n"
2052 "{\n"
2053 "__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);\n"
2054 "__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);\n"
// vload4 reads four elements even when scn == 3; the fourth is not used from
// this vector (alpha, when needed, is re-read through src[3] below).
2055 "DATA_TYPE_4 src_pix = vload4(0, src);\n"
2056 "#ifdef REVERSE\n"
2057 "dst[0] = src_pix.z;\n"
2058 "dst[1] = src_pix.y;\n"
2059 "dst[2] = src_pix.x;\n"
2060 "#else\n"
2061 "dst[0] = src_pix.x;\n"
2062 "dst[1] = src_pix.y;\n"
2063 "dst[2] = src_pix.z;\n"
2064 "#endif\n"
2065 "#if dcn == 4\n"
2066 "#if scn == 3\n"
2067 "dst[3] = MAX_NUM;\n"
2068 "#else\n"
2069 "dst[3] = src[3];\n"
2070 "#endif\n"
2071 "#endif\n"
// Row advance is inside the bounds check.
2072 "++y;\n"
2073 "dst_index += dst_step;\n"
2074 "src_index += src_step;\n"
2075 "}\n"
2076 "}\n"
2077 "}\n"
2078 "}\n"
// Kernel RGB5x52RGB (OpenCL source held in string literals; literal text
// unchanged). Expands one 16-bit packed pixel per row into 8-bit channels, for
// PIX_PER_WI_Y consecutive rows. The low five bits go to channel bidx, the
// middle field to channel 1 and the high field to channel bidx ^ 2; each field
// is left-aligned in its byte with the low bits zero.
2079 "__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,\n"
2080 "__global uchar* dst, int dst_step, int dst_offset,\n"
2081 "int rows, int cols)\n"
2082 "{\n"
2083 "int x = get_global_id(0);\n"
2084 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2085 "if (x < cols)\n"
2086 "{\n"
2087 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2088 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2089 "#pragma unroll\n"
2090 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2091 "{\n"
2092 "if (y < rows)\n"
2093 "{\n"
2094 "ushort t = *((__global const ushort*)(src + src_index));\n"
// greenbits == 6: 5-6-5 layout (bits 0-4, 5-10, 11-15).
2095 "#if greenbits == 6\n"
2096 "dst[dst_index + bidx] = (uchar)(t << 3);\n"
2097 "dst[dst_index + 1] = (uchar)((t >> 3) & ~3);\n"
2098 "dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);\n"
// Otherwise: 5-5-5 layout (bits 0-4, 5-9, 10-14); bit 15 is handled below.
2099 "#else\n"
2100 "dst[dst_index + bidx] = (uchar)(t << 3);\n"
2101 "dst[dst_index + 1] = (uchar)((t >> 2) & ~7);\n"
2102 "dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);\n"
2103 "#endif\n"
// Four-channel output: alpha is always 255 for 5-6-5 input; for 5-5-5 input it
// is 255 when bit 15 is set and 0 otherwise.
2104 "#if dcn == 4\n"
2105 "#if greenbits == 6\n"
2106 "dst[dst_index + 3] = 255;\n"
2107 "#else\n"
2108 "dst[dst_index + 3] = t & 0x8000 ? 255 : 0;\n"
2109 "#endif\n"
2110 "#endif\n"
// Row advance is inside the bounds check.
2111 "++y;\n"
2112 "dst_index += dst_step;\n"
2113 "src_index += src_step;\n"
2114 "}\n"
2115 "}\n"
2116 "}\n"
2117 "}\n"
// Kernel RGB2RGB5x5 (OpenCL source held in string literals; literal text
// unchanged). Packs one 8-bit-per-channel pixel per row into a 16-bit value,
// for PIX_PER_WI_Y consecutive rows. B_COMP lands in the low five bits, G_COMP
// in the middle field and R_COMP in the high field.
2118 "__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,\n"
2119 "__global uchar* dst, int dst_step, int dst_offset,\n"
2120 "int rows, int cols)\n"
2121 "{\n"
2122 "int x = get_global_id(0);\n"
2123 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2124 "if (x < cols)\n"
2125 "{\n"
2126 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2127 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2128 "#pragma unroll\n"
2129 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2130 "{\n"
2131 "if (y < rows)\n"
2132 "{\n"
// Four bytes are loaded regardless of scn; .w is only consulted in the
// four-channel 5-5-5 case below.
2133 "uchar4 src_pix = vload4(0, src + src_index);\n"
// 5-6-5 packing: the low bits of each channel are masked off before shifting.
2134 "#if greenbits == 6\n"
2135 "*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));\n"
// 5-5-5 packing from a three-channel source: bit 15 stays clear.
2136 "#elif scn == 3\n"
2137 "*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));\n"
// 5-5-5 packing from a four-channel source: bit 15 is set when the fourth
// channel is non-zero.
2138 "#else\n"
2139 "*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|\n"
2140 "((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0));\n"
2141 "#endif\n"
// Row advance is inside the bounds check.
2142 "++y;\n"
2143 "dst_index += dst_step;\n"
2144 "src_index += src_step;\n"
2145 "}\n"
2146 "}\n"
2147 "}\n"
2148 "}\n"
// Kernel BGR5x52Gray (OpenCL source held in string literals; literal text
// unchanged). Converts one 16-bit packed pixel per row to a single 8-bit gray
// value, for PIX_PER_WI_Y consecutive rows. The three fields are expanded to
// left-aligned 8-bit values, weighted with B2Y, G2Y and R2Y (macros defined
// elsewhere) and descaled by yuv_shift.
2149 "__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,\n"
2150 "__global uchar* dst, int dst_step, int dst_offset,\n"
2151 "int rows, int cols)\n"
2152 "{\n"
2153 "int x = get_global_id(0);\n"
2154 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2155 "if (x < cols)\n"
2156 "{\n"
// The destination is one byte per pixel, hence the plain `+ x`.
2157 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2158 "int dst_index = mad24(y, dst_step, dst_offset + x);\n"
2159 "#pragma unroll\n"
2160 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2161 "{\n"
2162 "if (y < rows)\n"
2163 "{\n"
2164 "int t = *((__global const ushort*)(src + src_index));\n"
// 5-6-5 layout: the middle field keeps six bits (mask 0xfc).
2165 "#if greenbits == 6\n"
2166 "dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);\n"
// 5-5-5 layout: all three fields keep five bits (mask 0xf8).
2167 "#else\n"
2168 "dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);\n"
2169 "#endif\n"
// Row advance is inside the bounds check.
2170 "++y;\n"
2171 "dst_index += dst_step;\n"
2172 "src_index += src_step;\n"
2173 "}\n"
2174 "}\n"
2175 "}\n"
2176 "}\n"
// Kernel Gray2BGR5x5 (OpenCL source held in string literals; literal text
// unchanged). Converts one 8-bit gray value per row into a 16-bit packed pixel
// whose three colour fields all carry the same (truncated) gray level, for
// PIX_PER_WI_Y consecutive rows.
2177 "__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,\n"
2178 "__global uchar* dst, int dst_step, int dst_offset,\n"
2179 "int rows, int cols)\n"
2180 "{\n"
2181 "int x = get_global_id(0);\n"
2182 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2183 "if (x < cols)\n"
2184 "{\n"
// The source is one byte per pixel, hence the plain `+ x`.
2185 "int src_index = mad24(y, src_step, src_offset + x);\n"
2186 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2187 "#pragma unroll\n"
2188 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2189 "{\n"
2190 "if (y < rows)\n"
2191 "{\n"
2192 "int t = src[src_index];\n"
// 5-6-5 layout: top five bits in the low and high fields, top six bits in the
// middle field.
2193 "#if greenbits == 6\n"
2194 "*((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));\n"
// 5-5-5 layout: the top five bits are replicated into all three fields; bit 15
// stays clear.
2195 "#else\n"
2196 "t >>= 3;\n"
2197 "*((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));\n"
2198 "#endif\n"
// Row advance is inside the bounds check.
2199 "++y;\n"
2200 "dst_index += dst_step;\n"
2201 "src_index += src_step;\n"
2202 "}\n"
2203 "}\n"
2204 "}\n"
2205 "}\n"
// Lookup table used by the HSV2RGB / HLS2RGB kernels: for each of the six hue
// sectors it gives the indices into the kernels' local four-entry `tab` array
// from which b, g and r (columns 0, 1, 2) are taken.
2206 "__constant int sector_data[][3] = { { 1, 3, 0 },\n"
2207 "{ 1, 0, 2 },\n"
2208 "{ 3, 0, 1 },\n"
2209 "{ 0, 2, 1 },\n"
2210 "{ 0, 1, 3 },\n"
2211 "{ 2, 1, 0 } };\n"
// Kernel RGB2HSV, variant compiled when DEPTH_0 is defined (OpenCL source held
// in string literals; literal text unchanged). Integer conversion of one pixel
// per row, for PIX_PER_WI_Y consecutive rows, using two lookup tables supplied
// by the caller: sdiv_table (indexed by v) and hdiv_table (indexed by diff).
// Output channels 0, 1, 2 receive h, s and v.
2212 "#ifdef DEPTH_0\n"
2213 "__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,\n"
2214 "__global uchar* dst, int dst_step, int dst_offset,\n"
2215 "int rows, int cols,\n"
2216 "__constant int * sdiv_table, __constant int * hdiv_table)\n"
2217 "{\n"
2218 "int x = get_global_id(0);\n"
2219 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2220 "if (x < cols)\n"
2221 "{\n"
2222 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2223 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2224 "#pragma unroll\n"
2225 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2226 "{\n"
2227 "if (y < rows)\n"
2228 "{\n"
2229 "uchar4 src_pix = vload4(0, src + src_index);\n"
2230 "int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n"
// v becomes the maximum and vmin the minimum of the three channels.
2231 "int h, s, v = b;\n"
2232 "int vmin = b, diff;\n"
2233 "int vr, vg;\n"
2234 "v = max(v, g);\n"
2235 "v = max(v, r);\n"
2236 "vmin = min(vmin, g);\n"
2237 "vmin = min(vmin, r);\n"
2238 "diff = v - vmin;\n"
// vr / vg are all-ones masks (-1) when the maximum equals r / g, else 0; they
// select the hue formula below without branching.
2239 "vr = v == r ? -1 : 0;\n"
2240 "vg = v == g ? -1 : 0;\n"
// Fixed-point multiply by the table entry, with a rounding term of half an
// LSB, then shift back by hsv_shift.
2241 "s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;\n"
// Hue numerator: (g - b) if max is r, else 2*diff + (b - r) if max is g, else
// 4*diff + (r - g).
2242 "h = (vr & (g - b)) +\n"
2243 "(~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));\n"
2244 "h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;\n"
// Negative hues are wrapped by adding hrange.
2245 "h += h < 0 ? hrange : 0;\n"
2246 "dst[dst_index] = convert_uchar_sat_rte(h);\n"
2247 "dst[dst_index + 1] = (uchar)s;\n"
2248 "dst[dst_index + 2] = (uchar)v;\n"
// Row advance is inside the bounds check.
2249 "++y;\n"
2250 "dst_index += dst_step;\n"
2251 "src_index += src_step;\n"
2252 "}\n"
2253 "}\n"
2254 "}\n"
2255 "}\n"
// Kernel HSV2RGB, 8-bit variant (inside the DEPTH_0 section; OpenCL source held
// in string literals, literal text unchanged). Converts one pixel per row, for
// PIX_PER_WI_Y consecutive rows. Source channels 0, 1, 2 are h, s, v; s and v
// are scaled by 1/255 on input and the results by 255 on output. b is written
// at index bidx, g at index 1 and r at index bidx ^ 2.
2256 "__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,\n"
2257 "__global uchar* dst, int dst_step, int dst_offset,\n"
2258 "int rows, int cols)\n"
2259 "{\n"
2260 "int x = get_global_id(0);\n"
2261 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2262 "if (x < cols)\n"
2263 "{\n"
2264 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2265 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2266 "#pragma unroll\n"
2267 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2268 "{\n"
2269 "if (y < rows)\n"
2270 "{\n"
2271 "uchar4 src_pix = vload4(0, src + src_index);\n"
2272 "float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);\n"
2273 "float b, g, r;\n"
// Zero saturation is handled in the else branch below (gray output).
2274 "if (s != 0)\n"
2275 "{\n"
2276 "float tab[4];\n"
2277 "int sector;\n"
// Scale the hue so that one unit is one sector, then wrap it into [0, 6).
2278 "h *= hscale;\n"
2279 "if( h < 0 )\n"
2280 "do h += 6; while( h < 0 );\n"
2281 "else if( h >= 6 )\n"
2282 "do h -= 6; while( h >= 6 );\n"
// Split into integer sector (rounded toward negative infinity) and fraction.
2283 "sector = convert_int_sat_rtn(h);\n"
2284 "h -= sector;\n"
// Guard against a sector index outside 0..5 before it is used as a table index.
2285 "if( (unsigned)sector >= 6u )\n"
2286 "{\n"
2287 "sector = 0;\n"
2288 "h = 0.f;\n"
2289 "}\n"
// Four candidate channel values; sector_data picks which one each of b, g, r
// receives for the current sector.
2290 "tab[0] = v;\n"
2291 "tab[1] = v*(1.f - s);\n"
2292 "tab[2] = v*(1.f - s*h);\n"
2293 "tab[3] = v*(1.f - s*(1.f - h));\n"
2294 "b = tab[sector_data[sector][0]];\n"
2295 "g = tab[sector_data[sector][1]];\n"
2296 "r = tab[sector_data[sector][2]];\n"
2297 "}\n"
2298 "else\n"
2299 "b = g = r = v;\n"
2300 "dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);\n"
2301 "dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);\n"
2302 "dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);\n"
// Four-channel output: the fourth channel is filled with MAX_NUM.
2303 "#if dcn == 4\n"
2304 "dst[dst_index + 3] = MAX_NUM;\n"
2305 "#endif\n"
// Row advance is inside the bounds check.
2306 "++y;\n"
2307 "dst_index += dst_step;\n"
2308 "src_index += src_step;\n"
2309 "}\n"
2310 "}\n"
2311 "}\n"
2312 "}\n"
// Kernel RGB2HSV, variant compiled when DEPTH_5 is defined (OpenCL source held
// in string literals; literal text unchanged). Floating-point conversion of one
// pixel per row, for PIX_PER_WI_Y consecutive rows. Output channels 0, 1, 2
// receive h * hscale, s and v, where h is first computed on a 0..360 scale.
2313 "#elif defined DEPTH_5\n"
2314 "__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,\n"
2315 "__global uchar* dstptr, int dst_step, int dst_offset,\n"
2316 "int rows, int cols)\n"
2317 "{\n"
2318 "int x = get_global_id(0);\n"
2319 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2320 "if (x < cols)\n"
2321 "{\n"
2322 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2323 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2324 "#pragma unroll\n"
2325 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2326 "{\n"
2327 "if (y < rows)\n"
2328 "{\n"
2329 "__global const float * src = (__global const float *)(srcptr + src_index);\n"
2330 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
2331 "float4 src_pix = vload4(0, src);\n"
2332 "float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n"
2333 "float h, s, v;\n"
2334 "float vmin, diff;\n"
// v becomes the maximum and vmin the minimum of the three channels.
2335 "v = vmin = r;\n"
2336 "if( v < g ) v = g;\n"
2337 "if( v < b ) v = b;\n"
2338 "if( vmin > g ) vmin = g;\n"
2339 "if( vmin > b ) vmin = b;\n"
2340 "diff = v - vmin;\n"
// FLT_EPSILON in both denominators keeps the divisions finite when v or diff
// is zero. After the second line, diff holds the scale factor 60 / range.
2341 "s = diff/(float)(fabs(v) + FLT_EPSILON);\n"
2342 "diff = (float)(60.f/(diff + FLT_EPSILON));\n"
// Hue formula depends on which channel is the maximum.
2343 "if( v == r )\n"
2344 "h = (g - b)*diff;\n"
2345 "else if( v == g )\n"
2346 "h = fma(b - r, diff, 120.f);\n"
2347 "else\n"
2348 "h = fma(r - g, diff, 240.f);\n"
// Negative hues are wrapped by adding 360.
2349 "if( h < 0 )\n"
2350 "h += 360.f;\n"
2351 "dst[0] = h*hscale;\n"
2352 "dst[1] = s;\n"
2353 "dst[2] = v;\n"
// Row advance is inside the bounds check.
2354 "++y;\n"
2355 "dst_index += dst_step;\n"
2356 "src_index += src_step;\n"
2357 "}\n"
2358 "}\n"
2359 "}\n"
2360 "}\n"
// Kernel HSV2RGB, float variant (inside the DEPTH_5 section; OpenCL source held
// in string literals, literal text unchanged). Same algorithm as the 8-bit
// variant but reading and writing floats directly, without the 1/255 and 255
// scaling. b is written at index bidx, g at index 1 and r at index bidx ^ 2.
2361 "__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,\n"
2362 "__global uchar* dstptr, int dst_step, int dst_offset,\n"
2363 "int rows, int cols)\n"
2364 "{\n"
2365 "int x = get_global_id(0);\n"
2366 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2367 "if (x < cols)\n"
2368 "{\n"
2369 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2370 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2371 "#pragma unroll\n"
2372 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2373 "{\n"
2374 "if (y < rows)\n"
2375 "{\n"
2376 "__global const float * src = (__global const float *)(srcptr + src_index);\n"
2377 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
2378 "float4 src_pix = vload4(0, src);\n"
2379 "float h = src_pix.x, s = src_pix.y, v = src_pix.z;\n"
2380 "float b, g, r;\n"
// Zero saturation is handled in the else branch below (gray output).
2381 "if (s != 0)\n"
2382 "{\n"
2383 "float tab[4];\n"
2384 "int sector;\n"
// Scale the hue so that one unit is one sector, then wrap it into [0, 6).
2385 "h *= hscale;\n"
2386 "if(h < 0)\n"
2387 "do h += 6; while (h < 0);\n"
2388 "else if (h >= 6)\n"
2389 "do h -= 6; while (h >= 6);\n"
// Split into integer sector (rounded toward negative infinity) and fraction.
2390 "sector = convert_int_sat_rtn(h);\n"
2391 "h -= sector;\n"
// Guard against a sector index outside 0..5 before it is used as a table index.
2392 "if ((unsigned)sector >= 6u)\n"
2393 "{\n"
2394 "sector = 0;\n"
2395 "h = 0.f;\n"
2396 "}\n"
// Four candidate channel values; sector_data picks which one each of b, g, r
// receives for the current sector.
2397 "tab[0] = v;\n"
2398 "tab[1] = v*(1.f - s);\n"
2399 "tab[2] = v*(1.f - s*h);\n"
2400 "tab[3] = v*(1.f - s*(1.f - h));\n"
2401 "b = tab[sector_data[sector][0]];\n"
2402 "g = tab[sector_data[sector][1]];\n"
2403 "r = tab[sector_data[sector][2]];\n"
2404 "}\n"
2405 "else\n"
2406 "b = g = r = v;\n"
2407 "dst[bidx] = b;\n"
2408 "dst[1] = g;\n"
2409 "dst[bidx^2] = r;\n"
// Four-channel output: the fourth channel is filled with MAX_NUM.
2410 "#if dcn == 4\n"
2411 "dst[3] = MAX_NUM;\n"
2412 "#endif\n"
// Row advance is inside the bounds check.
2413 "++y;\n"
2414 "dst_index += dst_step;\n"
2415 "src_index += src_step;\n"
2416 "}\n"
2417 "}\n"
2418 "}\n"
2419 "}\n"
// Closes the DEPTH_0 / DEPTH_5 selection of the HSV kernels.
2420 "#endif\n"
// Kernel RGB2HLS, variant compiled when DEPTH_0 is defined (OpenCL source held
// in string literals; literal text unchanged). Converts one 8-bit pixel per
// row, for PIX_PER_WI_Y consecutive rows, working in floats scaled by 1/255.
// Output channels 0, 1, 2 receive h * hscale, l * 255 and s * 255, each
// converted with saturation and round-to-nearest-even.
2421 "#ifdef DEPTH_0\n"
2422 "__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,\n"
2423 "__global uchar* dst, int dst_step, int dst_offset,\n"
2424 "int rows, int cols)\n"
2425 "{\n"
2426 "int x = get_global_id(0);\n"
2427 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2428 "if (x < cols)\n"
2429 "{\n"
2430 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2431 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2432 "#pragma unroll\n"
2433 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2434 "{\n"
2435 "if (y < rows)\n"
2436 "{\n"
2437 "uchar4 src_pix = vload4(0, src + src_index);\n"
2438 "float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);\n"
// h and s default to 0 and keep that value when the pixel is (nearly) gray.
2439 "float h = 0.f, s = 0.f, l;\n"
2440 "float vmin, vmax, diff;\n"
// vmax / vmin become the maximum / minimum of the three channels.
2441 "vmax = vmin = r;\n"
2442 "if (vmax < g) vmax = g;\n"
2443 "if (vmax < b) vmax = b;\n"
2444 "if (vmin > g) vmin = g;\n"
2445 "if (vmin > b) vmin = b;\n"
2446 "diff = vmax - vmin;\n"
2447 "l = (vmax + vmin)*0.5f;\n"
// Hue and saturation are only computed when the channel range exceeds
// FLT_EPSILON, which also avoids dividing by a zero range.
2448 "if (diff > FLT_EPSILON)\n"
2449 "{\n"
2450 "s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);\n"
// From here on diff holds the scale factor 60 / range.
2451 "diff = 60.f/diff;\n"
// Hue formula depends on which channel is the maximum.
2452 "if( vmax == r )\n"
2453 "h = (g - b)*diff;\n"
2454 "else if( vmax == g )\n"
2455 "h = fma(b - r, diff, 120.f);\n"
2456 "else\n"
2457 "h = fma(r - g, diff, 240.f);\n"
// Negative hues are wrapped by adding 360.
2458 "if( h < 0.f )\n"
2459 "h += 360.f;\n"
2460 "}\n"
2461 "dst[dst_index] = convert_uchar_sat_rte(h*hscale);\n"
2462 "dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);\n"
2463 "dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);\n"
// Row advance is inside the bounds check.
2464 "++y;\n"
2465 "dst_index += dst_step;\n"
2466 "src_index += src_step;\n"
2467 "}\n"
2468 "}\n"
2469 "}\n"
2470 "}\n"
2471 "__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,\n"
2472 "__global uchar* dst, int dst_step, int dst_offset,\n"
2473 "int rows, int cols)\n"
2474 "{\n"
2475 "int x = get_global_id(0);\n"
2476 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2477 "if (x < cols)\n"
2478 "{\n"
2479 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2480 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2481 "#pragma unroll\n"
2482 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2483 "{\n"
2484 "if (y < rows)\n"
2485 "{\n"
2486 "uchar4 src_pix = vload4(0, src + src_index);\n"
2487 "float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);\n"
2488 "float b, g, r;\n"
2489 "if (s != 0)\n"
2490 "{\n"
2491 "float tab[4];\n"
2492 "float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;\n"
2493 "float p1 = 2*l - p2;\n"
2494 "h *= hscale;\n"
2495 "if( h < 0 )\n"
2496 "do h += 6; while( h < 0 );\n"
2497 "else if( h >= 6 )\n"
2498 "do h -= 6; while( h >= 6 );\n"
2499 "int sector = convert_int_sat_rtn(h);\n"
2500 "h -= sector;\n"
2501 "tab[0] = p2;\n"
2502 "tab[1] = p1;\n"
2503 "tab[2] = fma(p2 - p1, 1-h, p1);\n"
2504 "tab[3] = fma(p2 - p1, h, p1);\n"
2505 "b = tab[sector_data[sector][0]];\n"
2506 "g = tab[sector_data[sector][1]];\n"
2507 "r = tab[sector_data[sector][2]];\n"
2508 "}\n"
2509 "else\n"
2510 "b = g = r = l;\n"
2511 "dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);\n"
2512 "dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);\n"
2513 "dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);\n"
2514 "#if dcn == 4\n"
2515 "dst[dst_index + 3] = MAX_NUM;\n"
2516 "#endif\n"
2517 "++y;\n"
2518 "dst_index += dst_step;\n"
2519 "src_index += src_step;\n"
2520 "}\n"
2521 "}\n"
2522 "}\n"
2523 "}\n"
2524 "#elif defined DEPTH_5\n"
2525 "__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,\n"
2526 "__global uchar* dstptr, int dst_step, int dst_offset,\n"
2527 "int rows, int cols)\n"
2528 "{\n"
2529 "int x = get_global_id(0);\n"
2530 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2531 "if (x < cols)\n"
2532 "{\n"
2533 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2534 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2535 "#pragma unroll\n"
2536 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2537 "{\n"
2538 "if (y < rows)\n"
2539 "{\n"
2540 "__global const float * src = (__global const float *)(srcptr + src_index);\n"
2541 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
2542 "float4 src_pix = vload4(0, src);\n"
2543 "float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n"
2544 "float h = 0.f, s = 0.f, l;\n"
2545 "float vmin, vmax, diff;\n"
2546 "vmax = vmin = r;\n"
2547 "if (vmax < g) vmax = g;\n"
2548 "if (vmax < b) vmax = b;\n"
2549 "if (vmin > g) vmin = g;\n"
2550 "if (vmin > b) vmin = b;\n"
2551 "diff = vmax - vmin;\n"
2552 "l = (vmax + vmin)*0.5f;\n"
2553 "if (diff > FLT_EPSILON)\n"
2554 "{\n"
2555 "s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);\n"
2556 "diff = 60.f/diff;\n"
2557 "if( vmax == r )\n"
2558 "h = (g - b)*diff;\n"
2559 "else if( vmax == g )\n"
2560 "h = fma(b - r, diff, 120.f);\n"
2561 "else\n"
2562 "h = fma(r - g, diff, 240.f);\n"
2563 "if( h < 0.f ) h += 360.f;\n"
2564 "}\n"
2565 "dst[0] = h*hscale;\n"
2566 "dst[1] = l;\n"
2567 "dst[2] = s;\n"
2568 "++y;\n"
2569 "dst_index += dst_step;\n"
2570 "src_index += src_step;\n"
2571 "}\n"
2572 "}\n"
2573 "}\n"
2574 "}\n"
2575 "__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,\n"
2576 "__global uchar* dstptr, int dst_step, int dst_offset,\n"
2577 "int rows, int cols)\n"
2578 "{\n"
2579 "int x = get_global_id(0);\n"
2580 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2581 "if (x < cols)\n"
2582 "{\n"
2583 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2584 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2585 "#pragma unroll\n"
2586 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2587 "{\n"
2588 "if (y < rows)\n"
2589 "{\n"
2590 "__global const float * src = (__global const float *)(srcptr + src_index);\n"
2591 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
2592 "float4 src_pix = vload4(0, src);\n"
2593 "float h = src_pix.x, l = src_pix.y, s = src_pix.z;\n"
2594 "float b, g, r;\n"
2595 "if (s != 0)\n"
2596 "{\n"
2597 "float tab[4];\n"
2598 "int sector;\n"
2599 "float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;\n"
2600 "float p1 = 2*l - p2;\n"
2601 "h *= hscale;\n"
2602 "if( h < 0 )\n"
2603 "do h += 6; while( h < 0 );\n"
2604 "else if( h >= 6 )\n"
2605 "do h -= 6; while( h >= 6 );\n"
2606 "sector = convert_int_sat_rtn(h);\n"
2607 "h -= sector;\n"
2608 "tab[0] = p2;\n"
2609 "tab[1] = p1;\n"
2610 "tab[2] = fma(p2 - p1, 1-h, p1);\n"
2611 "tab[3] = fma(p2 - p1, h, p1);\n"
2612 "b = tab[sector_data[sector][0]];\n"
2613 "g = tab[sector_data[sector][1]];\n"
2614 "r = tab[sector_data[sector][2]];\n"
2615 "}\n"
2616 "else\n"
2617 "b = g = r = l;\n"
2618 "dst[bidx] = b;\n"
2619 "dst[1] = g;\n"
2620 "dst[bidx^2] = r;\n"
2621 "#if dcn == 4\n"
2622 "dst[3] = MAX_NUM;\n"
2623 "#endif\n"
2624 "++y;\n"
2625 "dst_index += dst_step;\n"
2626 "src_index += src_step;\n"
2627 "}\n"
2628 "}\n"
2629 "}\n"
2630 "}\n"
2631 "#endif\n"
2632 "#ifdef DEPTH_0\n"
2633 "__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,\n"
2634 "__global uchar* dst, int dst_step, int dst_offset,\n"
2635 "int rows, int cols)\n"
2636 "{\n"
2637 "int x = get_global_id(0);\n"
2638 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2639 "if (x < cols)\n"
2640 "{\n"
2641 "int src_index = mad24(y, src_step, src_offset + (x << 2));\n"
2642 "int dst_index = mad24(y, dst_step, dst_offset + (x << 2));\n"
2643 "#pragma unroll\n"
2644 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2645 "{\n"
2646 "if (y < rows)\n"
2647 "{\n"
2648 "uchar4 src_pix = *(__global const uchar4 *)(src + src_index);\n"
2649 "*(__global uchar4 *)(dst + dst_index) =\n"
2650 "(uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX) / MAX_NUM,\n"
2651 "mad24(src_pix.y, src_pix.w, HALF_MAX) / MAX_NUM,\n"
2652 "mad24(src_pix.z, src_pix.w, HALF_MAX) / MAX_NUM, src_pix.w);\n"
2653 "++y;\n"
2654 "dst_index += dst_step;\n"
2655 "src_index += src_step;\n"
2656 "}\n"
2657 "}\n"
2658 "}\n"
2659 "}\n"
2660 "__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,\n"
2661 "__global uchar* dst, int dst_step, int dst_offset,\n"
2662 "int rows, int cols)\n"
2663 "{\n"
2664 "int x = get_global_id(0);\n"
2665 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2666 "if (x < cols)\n"
2667 "{\n"
2668 "int src_index = mad24(y, src_step, mad24(x, 4, src_offset));\n"
2669 "int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));\n"
2670 "#pragma unroll\n"
2671 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2672 "{\n"
2673 "if (y < rows)\n"
2674 "{\n"
2675 "uchar4 src_pix = *(__global const uchar4 *)(src + src_index);\n"
2676 "uchar v3 = src_pix.w, v3_half = v3 / 2;\n"
2677 "if (v3 == 0)\n"
2678 "*(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);\n"
2679 "else\n"
2680 "*(__global uchar4 *)(dst + dst_index) =\n"
2681 "(uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,\n"
2682 "mad24(src_pix.y, MAX_NUM, v3_half) / v3,\n"
2683 "mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);\n"
2684 "++y;\n"
2685 "dst_index += dst_step;\n"
2686 "src_index += src_step;\n"
2687 "}\n"
2688 "}\n"
2689 "}\n"
2690 "}\n"
2691 "#endif\n"
2692 "#define lab_shift xyz_shift\n"
2693 "#define gamma_shift 3\n"
2694 "#define lab_shift2 (lab_shift + gamma_shift)\n"
2695 "#define GAMMA_TAB_SIZE 1024\n"
2696 "#define GammaTabScale (float)GAMMA_TAB_SIZE\n"
2697 "inline float splineInterpolate(float x, __global const float * tab, int n)\n"
2698 "{\n"
2699 "int ix = clamp(convert_int_sat_rtn(x), 0, n-1);\n"
2700 "x -= ix;\n"
2701 "tab += ix << 2;\n"
2702 "return fma(fma(fma(tab[3], x, tab[2]), x, tab[1]), x, tab[0]);\n"
2703 "}\n"
2704 "#ifdef DEPTH_0\n"
2705 "__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,\n"
2706 "__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,\n"
2707 "__global const ushort * gammaTab, __global ushort * LabCbrtTab_b,\n"
2708 "__constant int * coeffs, int Lscale, int Lshift)\n"
2709 "{\n"
2710 "int x = get_global_id(0);\n"
2711 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2712 "if (x < cols)\n"
2713 "{\n"
2714 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2715 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2716 "#pragma unroll\n"
2717 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2718 "{\n"
2719 "if (y < rows)\n"
2720 "{\n"
2721 "__global const uchar* src_ptr = src + src_index;\n"
2722 "__global uchar* dst_ptr = dst + dst_index;\n"
2723 "uchar4 src_pix = vload4(0, src_ptr);\n"
2724 "int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\n"
2725 "C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\n"
2726 "C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\n"
2727 "int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z];\n"
2728 "int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];\n"
2729 "int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];\n"
2730 "int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];\n"
2731 "int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );\n"
2732 "int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );\n"
2733 "int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );\n"
2734 "dst_ptr[0] = SAT_CAST(L);\n"
2735 "dst_ptr[1] = SAT_CAST(a);\n"
2736 "dst_ptr[2] = SAT_CAST(b);\n"
2737 "++y;\n"
2738 "dst_index += dst_step;\n"
2739 "src_index += src_step;\n"
2740 "}\n"
2741 "}\n"
2742 "}\n"
2743 "}\n"
2744 "#elif defined DEPTH_5\n"
2745 "__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,\n"
2746 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,\n"
2747 "#ifdef SRGB\n"
2748 "__global const float * gammaTab,\n"
2749 "#endif\n"
2750 "__constant float * coeffs, float _1_3, float _a)\n"
2751 "{\n"
2752 "int x = get_global_id(0);\n"
2753 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2754 "if (x < cols)\n"
2755 "{\n"
2756 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2757 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2758 "#pragma unroll\n"
2759 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2760 "{\n"
2761 "if (y < rows)\n"
2762 "{\n"
2763 "__global const float * src = (__global const float *)(srcptr + src_index);\n"
2764 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
2765 "float4 src_pix = vload4(0, src);\n"
2766 "float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\n"
2767 "C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\n"
2768 "C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\n"
2769 "float R = clamp(src_pix.x, 0.0f, 1.0f);\n"
2770 "float G = clamp(src_pix.y, 0.0f, 1.0f);\n"
2771 "float B = clamp(src_pix.z, 0.0f, 1.0f);\n"
2772 "#ifdef SRGB\n"
2773 "R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2774 "G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2775 "B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2776 "#endif\n"
2777 "float X = fma(R, C0, fma(G, C1, B*C2));\n"
2778 "float Y = fma(R, C3, fma(G, C4, B*C5));\n"
2779 "float Z = fma(R, C6, fma(G, C7, B*C8));\n"
2780 "float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);\n"
2781 "float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);\n"
2782 "float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);\n"
2783 "float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);\n"
2784 "float a = 500.f * (FX - FY);\n"
2785 "float b = 200.f * (FY - FZ);\n"
2786 "dst[0] = L;\n"
2787 "dst[1] = a;\n"
2788 "dst[2] = b;\n"
2789 "++y;\n"
2790 "dst_index += dst_step;\n"
2791 "src_index += src_step;\n"
2792 "}\n"
2793 "}\n"
2794 "}\n"
2795 "}\n"
2796 "#endif\n"
2797 "inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,\n"
2798 "#ifdef SRGB\n"
2799 "__global const float * gammaTab,\n"
2800 "#endif\n"
2801 "__constant float * coeffs, float lThresh, float fThresh)\n"
2802 "{\n"
2803 "float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];\n"
2804 "float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\n"
2805 "C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\n"
2806 "C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\n"
2807 "float y, fy;\n"
2808 "if (li <= lThresh)\n"
2809 "{\n"
2810 "y = li / 903.3f;\n"
2811 "fy = fma(7.787f, y, 16.0f / 116.0f);\n"
2812 "}\n"
2813 "else\n"
2814 "{\n"
2815 "fy = (li + 16.0f) / 116.0f;\n"
2816 "y = fy * fy * fy;\n"
2817 "}\n"
2818 "float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };\n"
2819 "#pragma unroll\n"
2820 "for (int j = 0; j < 2; j++)\n"
2821 "if (fxz[j] <= fThresh)\n"
2822 "fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;\n"
2823 "else\n"
2824 "fxz[j] = fxz[j] * fxz[j] * fxz[j];\n"
2825 "float x = fxz[0], z = fxz[1];\n"
2826 "float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);\n"
2827 "float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);\n"
2828 "float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);\n"
2829 "#ifdef SRGB\n"
2830 "ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2831 "go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2832 "bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2833 "#endif\n"
2834 "dstbuf[0] = ro, dstbuf[1] = go, dstbuf[2] = bo;\n"
2835 "}\n"
2836 "#ifdef DEPTH_0\n"
2837 "__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,\n"
2838 "__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,\n"
2839 "#ifdef SRGB\n"
2840 "__global const float * gammaTab,\n"
2841 "#endif\n"
2842 "__constant float * coeffs, float lThresh, float fThresh)\n"
2843 "{\n"
2844 "int x = get_global_id(0);\n"
2845 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2846 "if (x < cols)\n"
2847 "{\n"
2848 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2849 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2850 "#pragma unroll\n"
2851 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2852 "{\n"
2853 "if (y < rows)\n"
2854 "{\n"
2855 "__global const uchar* src_ptr = src + src_index;\n"
2856 "__global uchar * dst_ptr = dst + dst_index;\n"
2857 "uchar4 src_pix = vload4(0, src_ptr);\n"
2858 "float srcbuf[3], dstbuf[3];\n"
2859 "srcbuf[0] = src_pix.x*(100.f/255.f);\n"
2860 "srcbuf[1] = convert_float(src_pix.y - 128);\n"
2861 "srcbuf[2] = convert_float(src_pix.z - 128);\n"
2862 "Lab2BGR_f(&srcbuf[0], &dstbuf[0],\n"
2863 "#ifdef SRGB\n"
2864 "gammaTab,\n"
2865 "#endif\n"
2866 "coeffs, lThresh, fThresh);\n"
2867 "#if dcn == 3\n"
2868 "dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);\n"
2869 "dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);\n"
2870 "dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);\n"
2871 "#else\n"
2872 "*(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),\n"
2873 "SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);\n"
2874 "#endif\n"
2875 "++y;\n"
2876 "dst_index += dst_step;\n"
2877 "src_index += src_step;\n"
2878 "}\n"
2879 "}\n"
2880 "}\n"
2881 "}\n"
2882 "#elif defined DEPTH_5\n"
2883 "__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,\n"
2884 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,\n"
2885 "#ifdef SRGB\n"
2886 "__global const float * gammaTab,\n"
2887 "#endif\n"
2888 "__constant float * coeffs, float lThresh, float fThresh)\n"
2889 "{\n"
2890 "int x = get_global_id(0);\n"
2891 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2892 "if (x < cols)\n"
2893 "{\n"
2894 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2895 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2896 "#pragma unroll\n"
2897 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2898 "{\n"
2899 "if (y < rows)\n"
2900 "{\n"
2901 "__global const float * src = (__global const float *)(srcptr + src_index);\n"
2902 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
2903 "float4 src_pix = vload4(0, src);\n"
2904 "float srcbuf[3], dstbuf[3];\n"
2905 "srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;\n"
2906 "Lab2BGR_f(&srcbuf[0], &dstbuf[0],\n"
2907 "#ifdef SRGB\n"
2908 "gammaTab,\n"
2909 "#endif\n"
2910 "coeffs, lThresh, fThresh);\n"
2911 "dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];\n"
2912 "#if dcn == 4\n"
2913 "dst[3] = MAX_NUM;\n"
2914 "#endif\n"
2915 "++y;\n"
2916 "dst_index += dst_step;\n"
2917 "src_index += src_step;\n"
2918 "}\n"
2919 "}\n"
2920 "}\n"
2921 "}\n"
2922 "#endif\n"
2923 "#define LAB_CBRT_TAB_SIZE 1024\n"
2924 "#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))\n"
2925 "__constant float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;\n"
2926 "#ifdef DEPTH_5\n"
2927 "__kernel void BGR2Luv(__global const uchar * srcptr, int src_step, int src_offset,\n"
2928 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,\n"
2929 "#ifdef SRGB\n"
2930 "__global const float * gammaTab,\n"
2931 "#endif\n"
2932 "__global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)\n"
2933 "{\n"
2934 "int x = get_global_id(0);\n"
2935 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2936 "if (x < cols)\n"
2937 "{\n"
2938 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2939 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2940 "#pragma unroll\n"
2941 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2942 "if (y < rows)\n"
2943 "{\n"
2944 "__global const float * src = (__global const float *)(srcptr + src_index);\n"
2945 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
2946 "float R = src[0], G = src[1], B = src[2];\n"
2947 "#ifdef SRGB\n"
2948 "R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2949 "G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2950 "B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2951 "#endif\n"
2952 "float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));\n"
2953 "float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));\n"
2954 "float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));\n"
2955 "float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);\n"
2956 "L = fma(116.f, L, -16.f);\n"
2957 "float d = 52.0f / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);\n"
2958 "float u = L*fma(X, d, -_un);\n"
2959 "float v = L*fma(2.25f, Y*d, -_vn);\n"
2960 "dst[0] = L;\n"
2961 "dst[1] = u;\n"
2962 "dst[2] = v;\n"
2963 "++y;\n"
2964 "dst_index += dst_step;\n"
2965 "src_index += src_step;\n"
2966 "}\n"
2967 "}\n"
2968 "}\n"
2969 "#elif defined DEPTH_0\n"
2970 "__kernel void BGR2Luv(__global const uchar * src, int src_step, int src_offset,\n"
2971 "__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,\n"
2972 "#ifdef SRGB\n"
2973 "__global const float * gammaTab,\n"
2974 "#endif\n"
2975 "__global const float * LabCbrtTab, __constant float * coeffs, float _un, float _vn)\n"
2976 "{\n"
2977 "int x = get_global_id(0);\n"
2978 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
2979 "if (x < cols)\n"
2980 "{\n"
2981 "src += mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
2982 "dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
2983 "#pragma unroll\n"
2984 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
2985 "if (y < rows)\n"
2986 "{\n"
2987 "float scale = 1.0f / 255.0f;\n"
2988 "float R = src[0]*scale, G = src[1]*scale, B = src[2]*scale;\n"
2989 "#ifdef SRGB\n"
2990 "R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2991 "G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2992 "B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
2993 "#endif\n"
2994 "float X = fma(R, coeffs[0], fma(G, coeffs[1], B*coeffs[2]));\n"
2995 "float Y = fma(R, coeffs[3], fma(G, coeffs[4], B*coeffs[5]));\n"
2996 "float Z = fma(R, coeffs[6], fma(G, coeffs[7], B*coeffs[8]));\n"
2997 "float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);\n"
2998 "L = 116.f*L - 16.f;\n"
2999 "float d = (4*13) / fmax(fma(15.0f, Y, fma(3.0f, Z, X)), FLT_EPSILON);\n"
3000 "float u = L*(X*d - _un);\n"
3001 "float v = L*fma(2.25f, Y*d, -_vn);\n"
3002 "dst[0] = SAT_CAST(L * 2.55f);\n"
3003 "dst[1] = SAT_CAST(fma(u, 0.72033898305084743f, 96.525423728813564f));\n"
3004 "dst[2] = SAT_CAST(fma(v, 0.9732824427480916f, 136.259541984732824f));\n"
3005 "++y;\n"
3006 "dst += dst_step;\n"
3007 "src += src_step;\n"
3008 "}\n"
3009 "}\n"
3010 "}\n"
3011 "#endif\n"
3012 "#ifdef DEPTH_5\n"
3013 "__kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offset,\n"
3014 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,\n"
3015 "#ifdef SRGB\n"
3016 "__global const float * gammaTab,\n"
3017 "#endif\n"
3018 "__constant float * coeffs, float _un, float _vn)\n"
3019 "{\n"
3020 "int x = get_global_id(0);\n"
3021 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
3022 "if (x < cols)\n"
3023 "{\n"
3024 "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
3025 "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
3026 "#pragma unroll\n"
3027 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
3028 "if (y < rows)\n"
3029 "{\n"
3030 "__global const float * src = (__global const float *)(srcptr + src_index);\n"
3031 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
3032 "float L = src[0], u = src[1], v = src[2], d, X, Y, Z;\n"
3033 "Y = (L + 16.f) * (1.f/116.f);\n"
3034 "Y = Y*Y*Y;\n"
3035 "d = (1.f/13.f)/L;\n"
3036 "u = fma(u, d, _un);\n"
3037 "v = fma(v, d, _vn);\n"
3038 "float iv = 1.f/v;\n"
3039 "X = 2.25f * u * Y * iv;\n"
3040 "Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv;\n"
3041 "float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));\n"
3042 "float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));\n"
3043 "float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));\n"
3044 "R = clamp(R, 0.f, 1.f);\n"
3045 "G = clamp(G, 0.f, 1.f);\n"
3046 "B = clamp(B, 0.f, 1.f);\n"
3047 "#ifdef SRGB\n"
3048 "R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
3049 "G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
3050 "B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
3051 "#endif\n"
3052 "dst[0] = R;\n"
3053 "dst[1] = G;\n"
3054 "dst[2] = B;\n"
3055 "#if dcn == 4\n"
3056 "dst[3] = MAX_NUM;\n"
3057 "#endif\n"
3058 "++y;\n"
3059 "dst_index += dst_step;\n"
3060 "src_index += src_step;\n"
3061 "}\n"
3062 "}\n"
3063 "}\n"
3064 "#elif defined DEPTH_0\n"
3065 "__kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,\n"
3066 "__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,\n"
3067 "#ifdef SRGB\n"
3068 "__global const float * gammaTab,\n"
3069 "#endif\n"
3070 "__constant float * coeffs, float _un, float _vn)\n"
3071 "{\n"
3072 "int x = get_global_id(0);\n"
3073 "int y = get_global_id(1) * PIX_PER_WI_Y;\n"
3074 "if (x < cols)\n"
3075 "{\n"
3076 "src += mad24(y, src_step, mad24(x, scnbytes, src_offset));\n"
3077 "dst += mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n"
3078 "#pragma unroll\n"
3079 "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n"
3080 "if (y < rows)\n"
3081 "{\n"
3082 "float d, X, Y, Z;\n"
3083 "float L = src[0]*(100.f/255.f);\n"
3084 "float u = fma(convert_float(src[1]), 1.388235294117647f, -134.f);\n"
3085 "float v = fma(convert_float(src[2]), 1.027450980392157f, - 140.f);\n"
3086 "Y = (L + 16.f) * (1.f/116.f);\n"
3087 "Y = Y*Y*Y;\n"
3088 "d = (1.f/13.f)/L;\n"
3089 "u = fma(u, d, _un);\n"
3090 "v = fma(v, d, _vn);\n"
3091 "float iv = 1.f/v;\n"
3092 "X = 2.25f * u * Y * iv ;\n"
3093 "Z = (12 - fma(3.0f, u, 20.0f * v)) * Y * 0.25f * iv;\n"
3094 "float R = fma(X, coeffs[0], fma(Y, coeffs[1], Z * coeffs[2]));\n"
3095 "float G = fma(X, coeffs[3], fma(Y, coeffs[4], Z * coeffs[5]));\n"
3096 "float B = fma(X, coeffs[6], fma(Y, coeffs[7], Z * coeffs[8]));\n"
3097 "R = clamp(R, 0.f, 1.f);\n"
3098 "G = clamp(G, 0.f, 1.f);\n"
3099 "B = clamp(B, 0.f, 1.f);\n"
3100 "#ifdef SRGB\n"
3101 "R = splineInterpolate(R*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
3102 "G = splineInterpolate(G*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
3103 "B = splineInterpolate(B*GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n"
3104 "#endif\n"
3105 "uchar dst0 = SAT_CAST(R * 255.0f);\n"
3106 "uchar dst1 = SAT_CAST(G * 255.0f);\n"
3107 "uchar dst2 = SAT_CAST(B * 255.0f);\n"
3108 "#if dcn == 4\n"
3109 "*(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);\n"
3110 "#else\n"
3111 "dst[0] = dst0;\n"
3112 "dst[1] = dst1;\n"
3113 "dst[2] = dst2;\n"
3114 "#endif\n"
3115 "++y;\n"
3116 "dst += dst_step;\n"
3117 "src += src_step;\n"
3118 "}\n"
3119 "}\n"
3120 "}\n"
3121 "#endif\n"
3122 , "4cef6d86e62644944b49b945ea0fc356"};
// ProgramSource object constructed from the embedded cvtcolor kernel text
// (cvtcolor.programStr).
3123 ProgramSource cvtcolor_oclsrc(cvtcolor.programStr);
// Embedded OpenCL C source for the "filter2D" program.
// Initialiser order: program name, program text (adjacent string literals
// concatenated by the compiler), then a 32-hex-digit string.
// NOTE(review): the last string is presumably a digest of the program text
// emitted by the generator -- confirm before changing any literal below.
// The kernel text relies on many macros that are not defined in this string
// (srcT, dstT, srcT1, dstT1, WT, WT1, cn, convertToWT, convertToDstT, COEFF,
// LOCAL_SIZE, KERNEL_SIZE_X, KERNEL_SIZE_Y, KERNEL_SIZE_Y2_ALIGNED, ANCHOR_X,
// ANCHOR_Y and the BORDER_* selectors); presumably they come from the build
// options supplied by the host -- TODO confirm against the caller.
3124 const struct ProgramEntry filter2D={"filter2D",
// --- EXTRAPOLATE(x, minV, maxV), EXTRA_EXTRAPOLATION variants ---
// Rewrites x in place according to the selected BORDER_* mode.
// BORDER_CONSTANT expands to nothing; the kernel handles that mode itself.
3125 "#ifdef EXTRA_EXTRAPOLATION\n"
3126 "#ifdef BORDER_CONSTANT\n"
3127 "#define EXTRAPOLATE(x, minV, maxV)\n"
3128 "#elif defined BORDER_REPLICATE\n"
3129 "#define EXTRAPOLATE(x, minV, maxV) \\\n"
3130 "{ \\\n"
3131 "(x) = clamp((x), (minV), (maxV)-1); \\\n"
3132 "}\n"
// Wrap: shifts x by one range width in whichever direction it is out of range.
3133 "#elif defined BORDER_WRAP\n"
3134 "#define EXTRAPOLATE(x, minV, maxV) \\\n"
3135 "{ \\\n"
3136 "if ((x) < (minV)) \\\n"
3137 "(x) += ((maxV) - (minV)); \\\n"
3138 "if ((x) >= (maxV)) \\\n"
3139 "(x) -= ((maxV) - (minV)); \\\n"
3140 "}\n"
// Reflect: a range of width 1 collapses to minV; otherwise x is mirrored
// repeatedly until it lies in [minV, maxV).
3141 "#elif defined BORDER_REFLECT\n"
3142 "#define EXTRAPOLATE(x, minV, maxV) \\\n"
3143 "{ \\\n"
3144 "if ((maxV) - (minV) == 1) \\\n"
3145 "(x) = (minV); \\\n"
3146 "else \\\n"
3147 "while ((x) >= (maxV) || (x) < (minV)) \\\n"
3148 "{ \\\n"
3149 "if ((x) < (minV)) \\\n"
3150 "(x) = (minV) - ((x) - (minV)) - 1; \\\n"
3151 "else \\\n"
3152 "(x) = (maxV) - 1 - ((x) - (maxV)); \\\n"
3153 "} \\\n"
3154 "}\n"
// Reflect-101: same loop as above, but the mirror formulas differ by one so
// the edge sample itself is not repeated.
3155 "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n"
3156 "#define EXTRAPOLATE(x, minV, maxV) \\\n"
3157 "{ \\\n"
3158 "if ((maxV) - (minV) == 1) \\\n"
3159 "(x) = (minV); \\\n"
3160 "else \\\n"
3161 "while ((x) >= (maxV) || (x) < (minV)) \\\n"
3162 "{ \\\n"
3163 "if ((x) < (minV)) \\\n"
3164 "(x) = (minV) - ((x) - (minV)); \\\n"
3165 "else \\\n"
3166 "(x) = (maxV) - 1 - ((x) - (maxV)) - 1; \\\n"
3167 "} \\\n"
3168 "}\n"
3169 "#else\n"
3170 "#error No extrapolation method\n"
3171 "#endif\n"
// --- EXTRAPOLATE(x, minV, maxV), variants without EXTRA_EXTRAPOLATION ---
// No loops here: wrap uses division/modulo by the range width, and the two
// reflect modes are written as a single clamp whose bounds depend on x.
3172 "#else\n"
3173 "#ifdef BORDER_CONSTANT\n"
3174 "#define EXTRAPOLATE(x, minV, maxV)\n"
3175 "#elif defined BORDER_REPLICATE\n"
3176 "#define EXTRAPOLATE(x, minV, maxV) \\\n"
3177 "{ \\\n"
3178 "(x) = clamp((x), (minV), (maxV)-1); \\\n"
3179 "}\n"
3180 "#elif defined BORDER_WRAP\n"
3181 "#define EXTRAPOLATE(x, minV, maxV) \\\n"
3182 "{ \\\n"
3183 "if ((x) < (minV)) \\\n"
3184 "(x) += (((minV) - (x)) / ((maxV) - (minV)) + 1) * ((maxV) - (minV)); \\\n"
3185 "if ((x) >= (maxV)) \\\n"
3186 "(x) = ((x) - (minV)) % ((maxV) - (minV)) + (minV); \\\n"
3187 "}\n"
3188 "#elif defined BORDER_REFLECT\n"
3189 "#define EXTRAPOLATE(x, minV, maxV) \\\n"
3190 "{ \\\n"
3191 "(x) = clamp((x), 2 * (minV) - (x) - 1, 2 * (maxV) - (x) - 1); \\\n"
3192 "}\n"
3193 "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n"
3194 "#define EXTRAPOLATE(x, minV, maxV) \\\n"
3195 "{ \\\n"
3196 "(x) = clamp((x), 2 * (minV) - (x), 2 * (maxV) - (x) - 2); \\\n"
3197 "}\n"
3198 "#else\n"
3199 "#error No extrapolation method\n"
3200 "#endif\n"
3201 "#endif\n"
// Enables an fp64 extension (AMD or Khronos) when DOUBLE_SUPPORT is defined.
3202 "#ifdef DOUBLE_SUPPORT\n"
3203 "#ifdef cl_amd_fp64\n"
3204 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
3205 "#elif defined (cl_khr_fp64)\n"
3206 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
3207 "#endif\n"
3208 "#endif\n"
// Pixel access helpers. For cn != 3 a pixel is read/written through a srcT /
// dstT pointer; for cn == 3 it goes through vload3 / vstore3 on the scalar
// element types, and the per-pixel byte sizes are element size times cn.
3209 "#if cn != 3\n"
3210 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
3211 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
3212 "#define SRCSIZE (int)sizeof(srcT)\n"
3213 "#define DSTSIZE (int)sizeof(dstT)\n"
3214 "#else\n"
3215 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n"
3216 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
3217 "#define SRCSIZE (int)sizeof(srcT1) * cn\n"
3218 "#define DSTSIZE (int)sizeof(dstT1) * cn\n"
3219 "#endif\n"
// UPDATE_COLUMN_SUM(col): weights this work-item's data[] column with the
// KERNEL_SIZE_Y coefficients starting at kernelData[KERNEL_SIZE_Y2_ALIGNED * col],
// publishes the result in sumOfCols[local_id], then hits a local-memory barrier.
// It declares locals (k, tmp_sum), so it must be expanded inside its own scope.
3220 "#define UPDATE_COLUMN_SUM(col) \\\n"
3221 "__constant WT1 * k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * col]; \\\n"
3222 "WT tmp_sum = 0;                                                 \\\n"
3223 "for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)                      \\\n"
3224 "tmp_sum += data[sy] * k[sy];                                \\\n"
3225 "sumOfCols[local_id] = tmp_sum;                                  \\\n"
3226 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// UPDATE_TOTAL_SUM(col): adds the column sum published by the work-item at
// local index (local_id + col - ANCHOR_X) to total_sum when that index is
// inside [0, LOCAL_SIZE), then hits a second barrier. The barrier sits outside
// the if, so every work-item reaches it.
3227 "#define UPDATE_TOTAL_SUM(col) \\\n"
3228 "int id = local_id + col - ANCHOR_X; \\\n"
3229 "if (id >= 0 && id < LOCAL_SIZE)     \\\n"
3230 "total_sum += sumOfCols[id];     \\\n"
3231 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// kernelData is initialised from COEFF. DIG(a) expands to "a,".
// NOTE(review): COEFF presumably expands to a sequence of DIG(...) items
// supplied at build time -- confirm against the host code.
3232 "#define noconvert\n"
3233 "#define DIG(a) a,\n"
3234 "__constant WT1 kernelData[] = { COEFF };\n"
// --- kernel filter2D ---
// x: each work-group advances by LOCAL_SIZE - (KERNEL_SIZE_X - 1) columns and
// is shifted left by ANCHOR_X, so neighbouring groups overlap by
// KERNEL_SIZE_X - 1 columns. y comes straight from get_global_id(1).
// NOTE(review): `rows` is never read in the body and y is not range-checked;
// presumably the host sizes dimension 1 to the row count -- confirm.
3235 "__kernel void filter2D(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,\n"
3236 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, float delta)\n"
3237 "{\n"
3238 "int local_id = get_local_id(0);\n"
3239 "int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;\n"
3240 "int y = get_global_id(1);\n"
3241 "WT data[KERNEL_SIZE_Y];\n"
3242 "__local WT sumOfCols[LOCAL_SIZE];\n"
// Lower source bounds: the given offsets under BORDER_ISOLATED, otherwise 0.
3243 "#ifdef BORDER_ISOLATED\n"
3244 "int srcBeginX = srcOffsetX;\n"
3245 "int srcBeginY = srcOffsetY;\n"
3246 "#else\n"
3247 "int srcBeginX = 0;\n"
3248 "int srcBeginY = 0;\n"
3249 "#endif\n"
3250 "int srcX = srcOffsetX + x;\n"
3251 "int srcY = srcOffsetY + y - ANCHOR_Y;\n"
// dst is only computed here; it is dereferenced solely under the guard at the
// end of the kernel.
3252 "__global dstT *dst = (__global dstT *)(dstptr + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));\n"
// Fill data[] with KERNEL_SIZE_Y vertically consecutive source pixels of
// column srcX. sy_index is initialised to 0 and never changed, so
// data[sy + sy_index] is data[sy].
// BORDER_CONSTANT: any coordinate outside [begin, end) contributes zero.
3253 "#ifdef BORDER_CONSTANT\n"
3254 "if (srcX >= srcBeginX && srcX < srcEndX)\n"
3255 "{\n"
3256 "for (int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y; sy++, srcY++)\n"
3257 "{\n"
3258 "if (srcY >= srcBeginY && srcY < srcEndY)\n"
3259 "data[sy + sy_index] = convertToWT(loadpix(srcptr + mad24(srcY, src_step, srcX * SRCSIZE)));\n"
3260 "else\n"
3261 "data[sy + sy_index] = (WT)(0);\n"
3262 "}\n"
3263 "}\n"
3264 "else\n"
3265 "{\n"
3266 "for (int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y; sy++, srcY++)\n"
3267 "{\n"
3268 "data[sy + sy_index] = (WT)(0);\n"
3269 "}\n"
3270 "}\n"
// Other border modes: remap srcX once, and remap a copy of srcY per row so the
// running srcY itself keeps advancing linearly.
3271 "#else\n"
3272 "EXTRAPOLATE(srcX, srcBeginX, srcEndX);\n"
3273 "for (int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y; sy++, srcY++)\n"
3274 "{\n"
3275 "int tempY = srcY;\n"
3276 "EXTRAPOLATE(tempY, srcBeginY, srcEndY);\n"
3277 "data[sy + sy_index] = convertToWT(loadpix(srcptr + mad24(tempY, src_step, srcX * SRCSIZE)));\n"
3278 "}\n"
3279 "#endif\n"
// Accumulate over kernel columns left of the anchor; these go through local
// memory because they use other work-items' column data.
3280 "WT total_sum = 0;\n"
3281 "for (int sx = 0; sx < ANCHOR_X; sx++)\n"
3282 "{\n"
3283 "UPDATE_COLUMN_SUM(sx);\n"
3284 "UPDATE_TOTAL_SUM(sx);\n"
3285 "}\n"
// Anchor column: uses this work-item's own data[], so no local-memory exchange.
3286 "__constant WT1 * k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * ANCHOR_X];\n"
3287 "for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)\n"
3288 "total_sum += data[sy] * k[sy];\n"
// Kernel columns right of the anchor.
3289 "for (int sx = ANCHOR_X + 1; sx < KERNEL_SIZE_X; sx++)\n"
3290 "{\n"
3291 "UPDATE_COLUMN_SUM(sx);\n"
3292 "UPDATE_TOTAL_SUM(sx);\n"
3293 "}\n"
// Only work-items whose local index is at least ANCHOR_X from the left edge and
// KERNEL_SIZE_X - 1 - ANCHOR_X from the right edge of the group, and whose x is
// inside [0, cols), write a result (total_sum plus delta, converted to dstT).
3294 "if (local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) && x >= 0 && x < cols)\n"
3295 "storepix(convertToDstT(total_sum + (WT)(delta)), dst);\n"
3296 "}\n"
// Third initialiser: 32 hex digits (see the NOTE(review) at the top of this entry).
3297 , "77e935928055f243ff9082b1879a0b2c"};
// Global ProgramSource constructed from the program text (programStr) of the
// filter2D entry defined directly above.
3298 ProgramSource filter2D_oclsrc(filter2D.programStr);
// Program entry filter2DSmall: the OpenCL C program text is stored as adjacent
// string literals (one per program line), followed by a 32-hex-digit string.
// NOTE(review): the hex string is presumably a digest of the program text written
// by the generator -- confirm before relying on it.
3299 const struct ProgramEntry filter2DSmall={"filter2DSmall",
// ADDR_L / ADDR_R (columns) and ADDR_H / ADDR_B (rows): one-step index remapping
// at the low and high edge, with one set of definitions per border mode.
3300 "#ifdef BORDER_REPLICATE\n"
3301 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))\n"
3302 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))\n"
3303 "#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))\n"
3304 "#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))\n"
3305 "#endif\n"
3306 "#ifdef BORDER_REFLECT\n"
3307 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))\n"
3308 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))\n"
3309 "#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))\n"
3310 "#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))\n"
3311 "#endif\n"
3312 "#ifdef BORDER_REFLECT_101\n"
3313 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))\n"
3314 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))\n"
3315 "#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))\n"
3316 "#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))\n"
3317 "#endif\n"
3318 "#ifdef BORDER_WRAP\n"
3319 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))\n"
3320 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))\n"
3321 "#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))\n"
3322 "#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))\n"
3323 "#endif\n"
// ISOLATED_MIN(VAL) expands to VAL when BORDER_ISOLATED is defined and to 0 otherwise.
3324 "#ifdef BORDER_ISOLATED\n"
3325 "#define ISOLATED_MIN(VAL) (VAL)\n"
3326 "#else\n"
3327 "#define ISOLATED_MIN(VAL) 0\n"
3328 "#endif\n"
// EXTRAPOLATE(x, y, minX, minY, maxX, maxY) rewrites x and y in place.
// With EXTRA_EXTRAPOLATION defined: clamp (REPLICATE), modulo (WRAP) or a
// repeat-until-in-range reflection (REFLECT / REFLECT_101); nothing is defined
// for BORDER_CONSTANT in this branch. Without it: one ADDR_* pass per axis.
3329 "#ifdef EXTRA_EXTRAPOLATION\n"
3330 "#ifdef BORDER_CONSTANT\n"
3331 "#elif defined BORDER_REPLICATE\n"
3332 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n"
3333 "{ \\\n"
3334 "x = max(min(x, maxX - 1), minX); \\\n"
3335 "y = max(min(y, maxY - 1), minY); \\\n"
3336 "}\n"
3337 "#elif defined BORDER_WRAP\n"
3338 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n"
3339 "{ \\\n"
3340 "if (x < minX) \\\n"
3341 "x -= ((x - maxX + 1) / maxX) * maxX; \\\n"
3342 "if (x >= maxX) \\\n"
3343 "x %= maxX; \\\n"
3344 "if (y < minY) \\\n"
3345 "y -= ((y - maxY + 1) / maxY) * maxY; \\\n"
3346 "if (y >= maxY) \\\n"
3347 "y %= maxY; \\\n"
3348 "}\n"
3349 "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n"
// EXTRAPOLATE_ takes an extra delta argument: 0 for REFLECT, 1 for REFLECT_101
// (see the two wrappers below). A one-element range collapses to its minimum.
3350 "#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \\\n"
3351 "{ \\\n"
3352 "if (maxX - minX == 1) \\\n"
3353 "x = minX; \\\n"
3354 "else \\\n"
3355 "do \\\n"
3356 "{ \\\n"
3357 "if (x < minX) \\\n"
3358 "x = minX - (x - minX) - 1 + delta; \\\n"
3359 "else \\\n"
3360 "x = maxX - 1 - (x - maxX) - delta; \\\n"
3361 "} \\\n"
3362 "while (x >= maxX || x < minX); \\\n"
3363 "\\\n"
3364 "if (maxY - minY == 1) \\\n"
3365 "y = minY; \\\n"
3366 "else \\\n"
3367 "do \\\n"
3368 "{ \\\n"
3369 "if (y < minY) \\\n"
3370 "y = minY - (y - minY) - 1 + delta; \\\n"
3371 "else \\\n"
3372 "y = maxY - 1 - (y - maxY) - delta; \\\n"
3373 "} \\\n"
3374 "while (y >= maxY || y < minY); \\\n"
3375 "}\n"
3376 "#ifdef BORDER_REFLECT\n"
3377 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)\n"
// NOTE(review): the next test also accepts BORDER_REFLECT101, but the enclosing
// #elif above only tests BORDER_REFLECT / BORDER_REFLECT_101, so that spelling on
// its own would reach the #error branch instead of this line.
3378 "#elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)\n"
3379 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)\n"
3380 "#endif\n"
3381 "#else\n"
3382 "#error No extrapolation method\n"
3383 "#endif\n"
3384 "#else\n"
// Default path: shift into a zero-based range using ISOLATED_MIN, remap once at
// the low edge and once at the high edge per axis, then shift back.
3385 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n"
3386 "{ \\\n"
3387 "int _row = y - ISOLATED_MIN(minY), _col = x - ISOLATED_MIN(minX); \\\n"
3388 "_row = ADDR_H(_row, 0, maxY - ISOLATED_MIN(minY)); \\\n"
3389 "_row = ADDR_B(_row, maxY - ISOLATED_MIN(minY), _row); \\\n"
3390 "y = _row + ISOLATED_MIN(minY); \\\n"
3391 "\\\n"
3392 "_col = ADDR_L(_col, 0, maxX - ISOLATED_MIN(minX)); \\\n"
3393 "_col = ADDR_R(_col, maxX - ISOLATED_MIN(minX), _col); \\\n"
3394 "x = _col + ISOLATED_MIN(minX); \\\n"
3395 "}\n"
3396 "#endif\n"
// Enable an fp64 extension (AMD or Khronos) when DOUBLE_SUPPORT is defined.
3397 "#ifdef DOUBLE_SUPPORT\n"
3398 "#ifdef cl_amd_fp64\n"
3399 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
3400 "#elif defined (cl_khr_fp64)\n"
3401 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
3402 "#endif\n"
3403 "#endif\n"
// Pixel access: direct dereference of srcT / dstT unless cn == 3, in which case
// vload3 / vstore3 are used on element (srcT1 / dstT1) pointers.
3404 "#if cn != 3\n"
3405 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
3406 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
3407 "#define SRCSIZE (int)sizeof(srcT)\n"
3408 "#define DSTSIZE (int)sizeof(dstT)\n"
3409 "#else\n"
3410 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n"
3411 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
3412 "#define SRCSIZE (int)sizeof(srcT1) * cn\n"
3413 "#define DSTSIZE (int)sizeof(dstT1) * cn\n"
3414 "#endif\n"
// noconvert expands to nothing, so a conversion macro set to noconvert is a no-op.
3415 "#define noconvert\n"
// RectCoords is filled by the kernel as { srcOffsetX, srcOffsetY, srcEndX, srcEndY }.
3416 "struct RectCoords\n"
3417 "{\n"
3418 "int x1, y1, x2, y2;\n"
3419 "};\n"
// isBorder: true when the numPixels-wide horizontal run starting at coord is not
// fully inside the bounds. The lower bounds are (x1, y1) with BORDER_ISOLATED
// and (0, 0) otherwise; the upper bounds are (x2, y2) in both variants.
3420 "#ifdef BORDER_ISOLATED\n"
3421 "inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)\n"
3422 "{\n"
3423 "return (coord.x < bounds.x1 || coord.y < bounds.y1 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2);\n"
3424 "}\n"
3425 "#else\n"
3426 "inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)\n"
3427 "{\n"
3428 "return (coord.x < 0 || coord.y < 0 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2);\n"
3429 "}\n"
3430 "#endif\n"
// getBorderPixel: returns zero for BORDER_CONSTANT; otherwise extrapolates the
// coordinate with EXTRAPOLATE and loads the pixel found there.
3431 "inline WT getBorderPixel(const struct RectCoords bounds, int2 coord,\n"
3432 "__global const uchar* srcptr, int srcstep)\n"
3433 "{\n"
3434 "#ifdef BORDER_CONSTANT\n"
3435 "return (WT)(0);\n"
3436 "#else\n"
3437 "int selected_col = coord.x;\n"
3438 "int selected_row = coord.y;\n"
3439 "EXTRAPOLATE(selected_col, selected_row,\n"
3440 "bounds.x1, bounds.y1,\n"
3441 "bounds.x2, bounds.y2\n"
3442 ");\n"
3443 "coord = (int2)(selected_col, selected_row);\n"
3444 "__global const uchar* ptr = srcptr + mul24(coord.y, srcstep) +\n"
3445 "coord.x * SRCSIZE;\n"
3446 "return convertToWT(loadpix(ptr));\n"
3447 "#endif\n"
3448 "}\n"
// readSrcPixelSingle: loads one pixel directly when it is not on the border and
// defers to getBorderPixel otherwise.
3449 "inline WT readSrcPixelSingle(int2 pos, __global const uchar* srcptr,\n"
3450 "int srcstep, const struct RectCoords srcCoords)\n"
3451 "{\n"
3452 "if (!isBorder(srcCoords, pos, 1))\n"
3453 "{\n"
3454 "__global const uchar* ptr = srcptr + mul24(pos.y, srcstep) +\n"
3455 "pos.x * SRCSIZE;\n"
3456 "return convertToWT(loadpix(ptr));\n"
3457 "}\n"
3458 "else\n"
3459 "{\n"
3460 "return getBorderPixel(srcCoords, pos, srcptr, srcstep);\n"
3461 "}\n"
3462 "}\n"
// Token pasting helpers. vload1 is defined here so that PX_LOAD, which pastes
// vload with PX_LOAD_VEC_SIZE, also resolves when the size is 1.
3463 "#define __CAT(x, y) x##y\n"
3464 "#define CAT(x, y) __CAT(x, y)\n"
3465 "#define vload1(OFFSET, PTR) (*(PTR + OFFSET))\n"
3466 "#define PX_LOAD_VEC_TYPE CAT(srcT1, PX_LOAD_VEC_SIZE)\n"
3467 "#define PX_LOAD_FLOAT_VEC_TYPE CAT(WT1, PX_LOAD_VEC_SIZE)\n"
// NOTE(review): the conversion macros below hard-code float, while
// PX_LOAD_FLOAT_VEC_TYPE is built from WT1 -- presumably WT1 is float whenever
// this program is selected; confirm against the host code.
3468 "#if PX_LOAD_VEC_SIZE == 1\n"
3469 "#define PX_LOAD_FLOAT_VEC_CONV (float)\n"
3470 "#elif PX_LOAD_VEC_SIZE == 2\n"
3471 "#define PX_LOAD_FLOAT_VEC_CONV convert_float2\n"
3472 "#elif PX_LOAD_VEC_SIZE == 3\n"
3473 "#define PX_LOAD_FLOAT_VEC_CONV convert_float3\n"
3474 "#elif PX_LOAD_VEC_SIZE == 4\n"
3475 "#define PX_LOAD_FLOAT_VEC_CONV convert_float4\n"
3476 "#endif\n"
3477 "#define PX_LOAD CAT(vload, PX_LOAD_VEC_SIZE)\n"
3478 "#define float1 float\n"
// readSrcPixelGroup: loads PX_LOAD_VEC_SIZE elements starting at pos and converts
// them. It performs no bounds check and does not reference srcCoords; the kernel
// only calls it after isBorder has returned false for the group.
3479 "inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar* srcptr,\n"
3480 "int srcstep, const struct RectCoords srcCoords)\n"
3481 "{\n"
3482 "__global const srcT1* ptr = (__global const srcT1*)\n"
3483 "(srcptr + mul24(pos.y, srcstep) +\n"
3484 "pos.x * SRCSIZE);\n"
3485 "return PX_LOAD_FLOAT_VEC_CONV(PX_LOAD(0, ptr));\n"
3486 "}\n"
// LOOPn(VAR, STMT): preprocessor unrolling -- emits STMT n times, incrementing
// VAR after each copy. LOOP(N, VAR, STMT) selects LOOPn by pasting N (1..13).
3487 "#define LOOP1(VAR, STMT) (STMT); (VAR)++;\n"
3488 "#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;\n"
3489 "#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;\n"
3490 "#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;\n"
3491 "#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;\n"
3492 "#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;\n"
3493 "#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;\n"
3494 "#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;\n"
3495 "#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;\n"
3496 "#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;\n"
3497 "#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;\n"
3498 "#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;\n"
3499 "#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;\n"
3500 "#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))\n"
// Filter coefficients come from the COEFF build-time macro.
3501 "#define DIG(a) a,\n"
3502 "__constant WT1 kernelData[] = { COEFF };\n"
// Kernel filter2DSmall: each work-item produces a PX_PER_WI_X by PX_PER_WI_Y
// block of output pixels whose top-left corner is (startX, startY).
3503 "__kernel void filter2DSmall(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,\n"
3504 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, float delta)\n"
3505 "{\n"
3506 "const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY };\n"
3507 "const int startX = get_global_id(0) * PX_PER_WI_X;\n"
3508 "const int startY = get_global_id(1) * PX_PER_WI_Y;\n"
// Only the block origin is range-checked here.
// NOTE(review): the stores further down are not checked per pixel against
// rows / cols -- presumably the host guarantees whole blocks; confirm.
3509 "if ((startX >= cols) || (startY >= rows))\n"
3510 "{\n"
3511 "return;\n"
3512 "}\n"
// Phase 1: fetch the input neighbourhood of the block into private memory.
// Groups of PX_LOAD_NUM_PX pixels are loaded with one vector read when the
// whole group is inside the source, and pixel by pixel otherwise.
3513 "WT privateData[PX_PER_WI_Y + KERNEL_SIZE_Y - 1][PRIV_DATA_WIDTH];\n"
3514 "int py = 0;\n"
3515 "LOOP(PX_LOAD_Y_ITERATIONS, py,\n"
3516 "{\n"
3517 "int y = startY + py;\n"
3518 "int px = 0;\n"
3519 "LOOP(PX_LOAD_X_ITERATIONS, px,\n"
3520 "{\n"
3521 "int x = startX + (px * PX_LOAD_NUM_PX);\n"
3522 "int2 srcPos = (int2)(srcCoords.x1 + x - ANCHOR_X, srcCoords.y1 + y - ANCHOR_Y);\n"
3523 "if (!isBorder(srcCoords, srcPos, PX_LOAD_NUM_PX))\n"
3524 "{\n"
3525 "PX_LOAD_FLOAT_VEC_TYPE p = readSrcPixelGroup(srcPos, srcptr, src_step, srcCoords);\n"
3526 "*((PX_LOAD_FLOAT_VEC_TYPE*)&privateData[py][px * PX_LOAD_NUM_PX]) = p;\n"
3527 "}\n"
3528 "else\n"
3529 "{\n"
3530 "int lx = 0;\n"
3531 "LOOP(PX_LOAD_NUM_PX, lx,\n"
3532 "{\n"
3533 "WT p = readSrcPixelSingle(srcPos, srcptr, src_step, srcCoords);\n"
3534 "*((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p;\n"
3535 "srcPos.x++;\n"
3536 "});\n"
3537 "}\n"
3538 "});\n"
3539 "});\n"
// Phase 2: for every output pixel of the block, accumulate the
// KERNEL_SIZE_Y by KERNEL_SIZE_X window (kernelData is walked row by row via
// kernelIndex), add delta, convert and store.
3540 "py = 0;\n"
3541 "LOOP(PX_PER_WI_Y, py,\n"
3542 "{\n"
3543 "int y = startY + py;\n"
3544 "int px = 0;\n"
3545 "LOOP(PX_PER_WI_X, px,\n"
3546 "{\n"
3547 "int x = startX + px;\n"
3548 "WT total_sum = 0;\n"
3549 "int sy = 0;\n"
3550 "int kernelIndex = 0;\n"
3551 "LOOP(KERNEL_SIZE_Y, sy,\n"
3552 "{\n"
3553 "int sx = 0;\n"
3554 "LOOP(KERNEL_SIZE_X, sx,\n"
3555 "{\n"
3556 "total_sum = mad(kernelData[kernelIndex++], privateData[py + sy][px + sx], total_sum);\n"
3557 "});\n"
3558 "});\n"
3559 "__global dstT* dstPtr = (__global dstT*)(dstptr + y * dst_step + dst_offset + x * DSTSIZE);\n"
3560 "storepix(convertToDstT(total_sum + (WT)(delta)), dstPtr);\n"
3561 "});\n"
3562 "});\n"
3563 "}\n"
3564 , "030d23b1d64d51e6485f8941af1e3fc3"};
// Global ProgramSource constructed from the program text of the entry above.
3565 ProgramSource filter2DSmall_oclsrc(filter2DSmall.programStr);
// Program entry filterSepCol: OpenCL C text for the col_filter kernel (vertical
// pass of a separable filter), stored as adjacent string literals and followed by
// a 32-hex-digit string.
// NOTE(review): the hex string is presumably a digest of the program text written
// by the generator -- confirm before relying on it.
3566 const struct ProgramEntry filterSepCol={"filterSepCol",
// Enable an fp64 extension (AMD or Khronos) when DOUBLE_SUPPORT is defined.
3567 "#ifdef DOUBLE_SUPPORT\n"
3568 "#ifdef cl_amd_fp64\n"
3569 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
3570 "#elif defined (cl_khr_fp64)\n"
3571 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
3572 "#endif\n"
3573 "#endif\n"
// READ_TIMES_COL: number of loads each work-item issues, spaced LSIZE1 rows apart
// (see the first loop in the kernel).
3574 "#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)\n"
// RADIUS is defined but not referenced anywhere else in this program text.
3575 "#define RADIUS 1\n"
3576 "#define noconvert\n"
// Pixel access: direct dereference of srcT / dstT unless CN == 3, in which case
// vload3 / vstore3 are used on element (srcT1 / dstT1) pointers.
3577 "#if CN != 3\n"
3578 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
3579 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
3580 "#define SRCSIZE (int)sizeof(srcT)\n"
3581 "#define DSTSIZE (int)sizeof(dstT)\n"
3582 "#else\n"
3583 "#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))\n"
3584 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
3585 "#define SRCSIZE (int)sizeof(srcT1)*3\n"
3586 "#define DSTSIZE (int)sizeof(dstT1)*3\n"
3587 "#endif\n"
// Filter coefficients come from the COEFF build-time macro.
3588 "#define DIG(a) a,\n"
3589 "__constant srcT1 mat_kernel[] = { COEFF };\n"
// Kernel col_filter: one output pixel per work-item at (x, y).
// The src_offset parameter is not referenced in the body.
3590 "__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,\n"
3591 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)\n"
3592 "{\n"
3593 "int x = get_global_id(0);\n"
3594 "int y = get_global_id(1);\n"
3595 "int l_x = get_local_id(0);\n"
3596 "int l_y = get_local_id(1);\n"
3597 "int start_addr = mad24(y, src_step, x * SRCSIZE);\n"
3598 "int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);\n"
3599 "srcT sum, temp[READ_TIMES_COL];\n"
3600 "__local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];\n"
// Load READ_TIMES_COL pixels, LSIZE1 rows apart. Byte offsets at or beyond
// end_addr are redirected to offset 0 so the read stays inside the buffer.
3601 "for (int i = 0; i < READ_TIMES_COL; ++i)\n"
3602 "{\n"
3603 "int current_addr = mad24(i, LSIZE1 * src_step, start_addr);\n"
3604 "current_addr = current_addr < end_addr ? current_addr : 0;\n"
3605 "temp[i] = loadpix(src + current_addr);\n"
3606 "}\n"
// Publish the loaded pixels to local memory and wait for the whole work-group.
3607 "for (int i = 0; i < READ_TIMES_COL; ++i)\n"
3608 "LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i];\n"
3609 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Centre tap first, then the taps at distance i above and below, one pair per
// iteration. temp[0] and temp[1] are reused as scratch here.
3610 "sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];\n"
3611 "for (int i = 1; i <= RADIUSY; ++i)\n"
3612 "{\n"
3613 "temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];\n"
3614 "temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];\n"
// NOTE(review): INTEL_DEVICE is tested by value here but with #ifdef further
// down; the two agree only if it is either undefined or defined non-zero.
3615 "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n"
3616 "sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);\n"
3617 "#else\n"
3618 "sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);\n"
3619 "#endif\n"
3620 "}\n"
// INTEGER_ARITHMETIC: add half of (1 << SHIFT_BITS) and scale down by
// (1 << SHIFT_BITS), using a division on INTEL_DEVICE and a shift otherwise.
3621 "#ifdef INTEGER_ARITHMETIC\n"
3622 "#ifdef INTEL_DEVICE\n"
3623 "sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);\n"
3624 "#else\n"
3625 "sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;\n"
3626 "#endif\n"
3627 "#endif\n"
// Only work-items inside the destination write; delta is added before conversion.
3628 "if (x < dst_cols && y < dst_rows)\n"
3629 "{\n"
3630 "start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));\n"
3631 "storepix(convertToDstT(sum + (srcT)(delta)), dst + start_addr);\n"
3632 "}\n"
3633 "}\n"
3634 , "83a29b40287a01ffdb496951c71bc7cd"};
// Global ProgramSource constructed from the program text of the entry above.
3635 ProgramSource filterSepCol_oclsrc(filterSepCol.programStr);
3636 const struct ProgramEntry filterSepRow={"filterSepRow",
3637 "#ifdef DOUBLE_SUPPORT\n"
3638 "#ifdef cl_amd_fp64\n"
3639 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
3640 "#elif defined (cl_khr_fp64)\n"
3641 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
3642 "#endif\n"
3643 "#endif\n"
3644 "#define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0)\n"
3645 "#define RADIUS 1\n"
3646 "#ifdef BORDER_REPLICATE\n"
3647 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))\n"
3648 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))\n"
3649 "#endif\n"
3650 "#ifdef BORDER_REFLECT\n"
3651 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))\n"
3652 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))\n"
3653 "#endif\n"
3654 "#ifdef BORDER_REFLECT_101\n"
3655 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))\n"
3656 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))\n"
3657 "#endif\n"
3658 "#ifdef BORDER_WRAP\n"
3659 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))\n"
3660 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))\n"
3661 "#endif\n"
3662 "#ifdef EXTRA_EXTRAPOLATION\n"
3663 "#ifdef BORDER_CONSTANT\n"
3664 "#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)\n"
3665 "#elif defined BORDER_REPLICATE\n"
3666 "#define EXTRAPOLATE(t, minT, maxT) \\\n"
3667 "{ \\\n"
3668 "t = max(min(t, (maxT) - 1), (minT)); \\\n"
3669 "}\n"
3670 "#elif defined BORDER_WRAP\n"
3671 "#define EXTRAPOLATE(x, minT, maxT) \\\n"
3672 "{ \\\n"
3673 "if (t < (minT)) \\\n"
3674 "t -= ((t - (maxT) + 1) / (maxT)) * (maxT); \\\n"
3675 "if (t >= (maxT)) \\\n"
3676 "t %= (maxT); \\\n"
3677 "}\n"
3678 "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n"
3679 "#define EXTRAPOLATE_(t, minT, maxT, delta) \\\n"
3680 "{ \\\n"
3681 "if ((maxT) - (minT) == 1) \\\n"
3682 "t = (minT); \\\n"
3683 "else \\\n"
3684 "do \\\n"
3685 "{ \\\n"
3686 "if (t < (minT)) \\\n"
3687 "t = (minT) - (t - (minT)) - 1 + delta; \\\n"
3688 "else \\\n"
3689 "t = (maxT) - 1 - (t - (maxT)) - delta; \\\n"
3690 "} \\\n"
3691 "while (t >= (maxT) || t < (minT)); \\\n"
3692 "\\\n"
3693 "}\n"
3694 "#ifdef BORDER_REFLECT\n"
3695 "#define EXTRAPOLATE(t, minT, maxT) EXTRAPOLATE_(t, minT, maxT, 0)\n"
3696 "#elif defined(BORDER_REFLECT_101)\n"
3697 "#define EXTRAPOLATE(t, minT, maxT) EXTRAPOLATE_(t, minT, maxT, 1)\n"
3698 "#endif\n"
3699 "#else\n"
3700 "#error No extrapolation method\n"
3701 "#endif\n"
3702 "#else\n"
3703 "#ifdef BORDER_CONSTANT\n"
3704 "#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)\n"
3705 "#else\n"
3706 "#define EXTRAPOLATE(t, minT, maxT) \\\n"
3707 "{ \\\n"
3708 "int _delta = t - (minT); \\\n"
3709 "_delta = ADDR_L(_delta, 0, (maxT) - (minT)); \\\n"
3710 "_delta = ADDR_R(_delta, (maxT) - (minT), _delta); \\\n"
3711 "t = _delta + (minT); \\\n"
3712 "}\n"
3713 "#endif\n"
3714 "#endif\n"
3715 "#define noconvert\n"
3716 "#if CN != 3\n"
3717 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
3718 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
3719 "#define SRCSIZE (int)sizeof(srcT)\n"
3720 "#define DSTSIZE (int)sizeof(dstT)\n"
3721 "#else\n"
3722 "#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))\n"
3723 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
3724 "#define SRCSIZE (int)sizeof(srcT1)*3\n"
3725 "#define DSTSIZE (int)sizeof(dstT1)*3\n"
3726 "#endif\n"
3727 "#define DIG(a) a,\n"
3728 "__constant dstT1 mat_kernel[] = { COEFF };\n"
3729 "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n"
3730 "#define dstT4 int4\n"
3731 "#define convertDstVec convert_int4\n"
3732 "#else\n"
3733 "#define dstT4 float4\n"
3734 "#define convertDstVec convert_float4\n"
3735 "#endif\n"
3736 "__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,\n"
3737 "int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,\n"
3738 "__global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,\n"
3739 "int radiusy)\n"
3740 "{\n"
3741 "int x = get_global_id(0)<<2;\n"
3742 "int y = get_global_id(1);\n"
3743 "int l_x = get_local_id(0);\n"
3744 "int l_y = get_local_id(1);\n"
3745 "int start_x = x + src_offset_x - RADIUSX & 0xfffffffc;\n"
3746 "int offset = src_offset_x - RADIUSX & 3;\n"
3747 "int start_y = y + src_offset_y - radiusy;\n"
3748 "int start_addr = mad24(start_y, src_step_in_pixel, start_x);\n"
3749 "dstT4 sum;\n"
3750 "uchar4 temp[READ_TIMES_ROW];\n"
3751 "__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];\n"
3752 "#ifdef BORDER_CONSTANT\n"
3753 "int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);\n"
3754 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3755 "{\n"
3756 "int current_addr = mad24(i, LSIZE0 << 2, start_addr);\n"
3757 "current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0;\n"
3758 "temp[i] = *(__global const uchar4 *)&src[current_addr];\n"
3759 "}\n"
3760 "#ifdef BORDER_ISOLATED\n"
3761 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3762 "{\n"
3763 "temp[i].x = ELEM(start_x+i*LSIZE0*4,   src_offset_x, src_offset_x + src_cols, 0,         temp[i].x);\n"
3764 "temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0,         temp[i].y);\n"
3765 "temp[i].z = ELEM(start_x+i*LSIZE0*4+2, src_offset_x, src_offset_x + src_cols, 0,         temp[i].z);\n"
3766 "temp[i].w = ELEM(start_x+i*LSIZE0*4+3, src_offset_x, src_offset_x + src_cols, 0,         temp[i].w);\n"
3767 "temp[i]   = ELEM(start_y,              src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);\n"
3768 "}\n"
3769 "#else\n"
3770 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3771 "{\n"
3772 "temp[i].x = ELEM(start_x+i*LSIZE0*4,   0, src_whole_cols, 0,         temp[i].x);\n"
3773 "temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0,         temp[i].y);\n"
3774 "temp[i].z = ELEM(start_x+i*LSIZE0*4+2, 0, src_whole_cols, 0,         temp[i].z);\n"
3775 "temp[i].w = ELEM(start_x+i*LSIZE0*4+3, 0, src_whole_cols, 0,         temp[i].w);\n"
3776 "temp[i]   = ELEM(start_y,              0, src_whole_rows, (uchar4)0, temp[i]);\n"
3777 "}\n"
3778 "#endif\n"
3779 "#else\n"
3780 "#ifdef BORDER_ISOLATED\n"
3781 "int not_all_in_range = (start_x<src_offset_x) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_offset_x + src_cols)| (start_y<src_offset_y) | (start_y >= src_offset_y + src_rows);\n"
3782 "#else\n"
3783 "int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);\n"
3784 "#endif\n"
3785 "int4 index[READ_TIMES_ROW], addr;\n"
3786 "int s_y;\n"
3787 "if (not_all_in_range)\n"
3788 "{\n"
3789 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3790 "{\n"
3791 "index[i] = (int4)(mad24(i, LSIZE0 << 2, start_x)) + (int4)(0, 1, 2, 3);\n"
3792 "#ifdef BORDER_ISOLATED\n"
3793 "EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols);\n"
3794 "EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols);\n"
3795 "EXTRAPOLATE(index[i].z, src_offset_x, src_offset_x + src_cols);\n"
3796 "EXTRAPOLATE(index[i].w, src_offset_x, src_offset_x + src_cols);\n"
3797 "#else\n"
3798 "EXTRAPOLATE(index[i].x, 0, src_whole_cols);\n"
3799 "EXTRAPOLATE(index[i].y, 0, src_whole_cols);\n"
3800 "EXTRAPOLATE(index[i].z, 0, src_whole_cols);\n"
3801 "EXTRAPOLATE(index[i].w, 0, src_whole_cols);\n"
3802 "#endif\n"
3803 "}\n"
3804 "s_y = start_y;\n"
3805 "#ifdef BORDER_ISOLATED\n"
3806 "EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);\n"
3807 "#else\n"
3808 "EXTRAPOLATE(s_y, 0, src_whole_rows);\n"
3809 "#endif\n"
3810 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3811 "{\n"
3812 "addr = mad24((int4)s_y, (int4)src_step_in_pixel, index[i]);\n"
3813 "temp[i].x = src[addr.x];\n"
3814 "temp[i].y = src[addr.y];\n"
3815 "temp[i].z = src[addr.z];\n"
3816 "temp[i].w = src[addr.w];\n"
3817 "}\n"
3818 "}\n"
3819 "else\n"
3820 "{\n"
3821 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3822 "temp[i] = *(__global uchar4*)&src[mad24(i, LSIZE0 << 2, start_addr)];\n"
3823 "}\n"
3824 "#endif\n"
3825 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3826 "LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];\n"
3827 "barrier(CLK_LOCAL_MEM_FENCE);\n"
3828 "sum = convertDstVec(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];\n"
3829 "for (int i = 1; i <= RADIUSX; ++i)\n"
3830 "{\n"
3831 "temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);\n"
3832 "temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);\n"
3833 "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n"
3834 "sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);\n"
3835 "#else\n"
3836 "sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);\n"
3837 "#endif\n"
3838 "}\n"
3839 "start_addr = mad24(y, dst_step_in_pixel, x);\n"
3840 "if ((x+3<dst_cols) & (y<dst_rows))\n"
3841 "*(__global dstT4*)&dst[start_addr] = sum;\n"
3842 "else if ((x+2<dst_cols) && (y<dst_rows))\n"
3843 "{\n"
3844 "dst[start_addr] = sum.x;\n"
3845 "dst[start_addr+1] = sum.y;\n"
3846 "dst[start_addr+2] = sum.z;\n"
3847 "}\n"
3848 "else if ((x+1<dst_cols) && (y<dst_rows))\n"
3849 "{\n"
3850 "dst[start_addr] = sum.x;\n"
3851 "dst[start_addr+1] = sum.y;\n"
3852 "}\n"
3853 "else if (x<dst_cols && y<dst_rows)\n"
3854 "dst[start_addr] = sum.x;\n"
3855 "}\n"
3856 "__kernel void row_filter(__global const uchar * src, int src_step, int src_offset_x, int src_offset_y,\n"
3857 "int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,\n"
3858 "__global uchar * dst, int dst_step, int dst_cols, int dst_rows,\n"
3859 "int radiusy)\n"
3860 "{\n"
3861 "int x = get_global_id(0);\n"
3862 "int y = get_global_id(1);\n"
3863 "int l_x = get_local_id(0);\n"
3864 "int l_y = get_local_id(1);\n"
3865 "int start_x = x + src_offset_x - RADIUSX;\n"
3866 "int start_y = y + src_offset_y - radiusy;\n"
3867 "int start_addr = mad24(start_y, src_step, start_x * SRCSIZE);\n"
3868 "dstT sum;\n"
3869 "srcT temp[READ_TIMES_ROW];\n"
3870 "__local srcT LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];\n"
3871 "#ifdef BORDER_CONSTANT\n"
3872 "int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);\n"
3873 "for (int i = 0; i < READ_TIMES_ROW; i++)\n"
3874 "{\n"
3875 "int current_addr = mad24(i, LSIZE0 * SRCSIZE, start_addr);\n"
3876 "current_addr = current_addr < end_addr && current_addr >= 0 ? current_addr : 0;\n"
3877 "temp[i] = loadpix(src + current_addr);\n"
3878 "}\n"
3879 "#ifdef BORDER_ISOLATED\n"
3880 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3881 "{\n"
3882 "temp[i] = ELEM(mad24(i, LSIZE0, start_x), src_offset_x, src_offset_x + src_cols, (srcT)(0), temp[i]);\n"
3883 "temp[i] = ELEM(start_y,                   src_offset_y, src_offset_y + src_rows, (srcT)(0), temp[i]);\n"
3884 "}\n"
3885 "#else\n"
3886 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3887 "{\n"
3888 "temp[i] = ELEM(mad24(i, LSIZE0, start_x), 0, src_whole_cols, (srcT)(0), temp[i]);\n"
3889 "temp[i] = ELEM(start_y,                   0, src_whole_rows, (srcT)(0), temp[i]);\n"
3890 "}\n"
3891 "#endif\n"
3892 "#else\n"
3893 "int index[READ_TIMES_ROW], s_x, s_y;\n"
3894 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3895 "{\n"
3896 "s_x = mad24(i, LSIZE0, start_x);\n"
3897 "s_y = start_y;\n"
3898 "#ifdef BORDER_ISOLATED\n"
3899 "EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);\n"
3900 "EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);\n"
3901 "#else\n"
3902 "EXTRAPOLATE(s_x, 0, src_whole_cols);\n"
3903 "EXTRAPOLATE(s_y, 0, src_whole_rows);\n"
3904 "#endif\n"
3905 "index[i] = mad24(s_y, src_step, s_x * SRCSIZE);\n"
3906 "}\n"
3907 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3908 "temp[i] = loadpix(src + index[i]);\n"
3909 "#endif\n"
3910 "for (int i = 0; i < READ_TIMES_ROW; ++i)\n"
3911 "LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];\n"
3912 "barrier(CLK_LOCAL_MEM_FENCE);\n"
3913 "sum = convertToDstT(LDS_DAT[l_y][l_x + RADIUSX]) * mat_kernel[RADIUSX];\n"
3914 "for (int i = 1; i <= RADIUSX; ++i)\n"
3915 "{\n"
3916 "temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];\n"
3917 "temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];\n"
3918 "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n"
3919 "sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);\n"
3920 "#else\n"
3921 "sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);\n"
3922 "#endif\n"
3923 "}\n"
3924 "if (x < dst_cols && y < dst_rows)\n"
3925 "{\n"
3926 "start_addr = mad24(y, dst_step, x * DSTSIZE);\n"
3927 "storepix(sum, dst + start_addr);\n"
3928 "}\n"
3929 "}\n"
3930 , "e99b92fca8604fe253f3c641802ce117"};
3931 ProgramSource filterSepRow_oclsrc(filterSepRow.programStr);
3932 const struct ProgramEntry filterSep_singlePass={"filterSep_singlePass",
3933 "#ifdef BORDER_CONSTANT\n"
3934 "#define EXTRAPOLATE(x, maxV)\n"
3935 "#elif defined BORDER_REPLICATE\n"
3936 "#define EXTRAPOLATE(x, maxV) \\\n"
3937 "{ \\\n"
3938 "(x) = clamp((x), 0, (maxV)-1); \\\n"
3939 "}\n"
3940 "#elif defined BORDER_WRAP\n"
3941 "#define EXTRAPOLATE(x, maxV) \\\n"
3942 "{ \\\n"
3943 "(x) = ( (x) + (maxV) ) % (maxV); \\\n"
3944 "}\n"
3945 "#elif defined BORDER_REFLECT\n"
3946 "#define EXTRAPOLATE(x, maxV) \\\n"
3947 "{ \\\n"
3948 "(x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \\\n"
3949 "}\n"
3950 "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n"
3951 "#define EXTRAPOLATE(x, maxV) \\\n"
3952 "{ \\\n"
3953 "(x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \\\n"
3954 "}\n"
3955 "#else\n"
3956 "#error No extrapolation method\n"
3957 "#endif\n"
3958 "#if CN != 3\n"
3959 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
3960 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
3961 "#define SRCSIZE (int)sizeof(srcT)\n"
3962 "#define DSTSIZE (int)sizeof(dstT)\n"
3963 "#else\n"
3964 "#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))\n"
3965 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
3966 "#define SRCSIZE (int)sizeof(srcT1)*3\n"
3967 "#define DSTSIZE (int)sizeof(dstT1)*3\n"
3968 "#endif\n"
3969 "#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))\n"
3970 "#ifdef BORDER_CONSTANT\n"
3971 "#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))\n"
3972 "#else\n"
3973 "#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))\n"
3974 "#endif\n"
3975 "#define noconvert\n"
3976 "#define DIG(a) a,\n"
3977 "__constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X };\n"
3978 "__constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y };\n"
3979 "__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,\n"
3980 "__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)\n"
3981 "{\n"
3982 "__local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];\n"
3983 "__local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];\n"
3984 "int lix = get_local_id(0);\n"
3985 "int liy = get_local_id(1);\n"
3986 "int x = get_global_id(0);\n"
3987 "int srcX = x + srcOffsetX - RADIUSX;\n"
3988 "int clocY = liy;\n"
3989 "do\n"
3990 "{\n"
3991 "int yb = clocY + srcOffsetY - RADIUSY;\n"
3992 "EXTRAPOLATE(yb, (height));\n"
3993 "int clocX = lix;\n"
3994 "int cSrcX = srcX;\n"
3995 "do\n"
3996 "{\n"
3997 "int xb = cSrcX;\n"
3998 "EXTRAPOLATE(xb,(width));\n"
3999 "lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );\n"
4000 "clocX += BLK_X;\n"
4001 "cSrcX += BLK_X;\n"
4002 "}\n"
4003 "while(clocX < BLK_X+(RADIUSX*2));\n"
4004 "clocY += BLK_Y;\n"
4005 "}\n"
4006 "while (clocY < BLK_Y+(RADIUSY*2));\n"
4007 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4008 "for (int y = 0; y < dst_rows; y+=BLK_Y)\n"
4009 "{\n"
4010 "int i, clocX = lix;\n"
4011 "WT sum = (WT) 0;\n"
4012 "do\n"
4013 "{\n"
4014 "sum = (WT) 0;\n"
4015 "for (i=0; i<=2*RADIUSY; i++)\n"
4016 "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n"
4017 "sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);\n"
4018 "#else\n"
4019 "sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);\n"
4020 "#endif\n"
4021 "lsmemDy[liy][clocX] = sum;\n"
4022 "clocX += BLK_X;\n"
4023 "}\n"
4024 "while(clocX < BLK_X+(RADIUSX*2));\n"
4025 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4026 "if ((x < dst_cols) && (y + liy < dst_rows))\n"
4027 "{\n"
4028 "sum = 0.0f;\n"
4029 "for (i=0; i<=2*RADIUSX; i++)\n"
4030 "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n"
4031 "sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);\n"
4032 "#else\n"
4033 "sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);\n"
4034 "#endif\n"
4035 "#ifdef INTEGER_ARITHMETIC\n"
4036 "#ifdef INTEL_DEVICE\n"
4037 "sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);\n"
4038 "#else\n"
4039 "sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;\n"
4040 "#endif\n"
4041 "#endif\n"
4042 "storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));\n"
4043 "}\n"
4044 "for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)\n"
4045 "{\n"
4046 "int clocX = i % (BLK_X+(RADIUSX*2));\n"
4047 "int clocY = i / (BLK_X+(RADIUSX*2));\n"
4048 "lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];\n"
4049 "}\n"
4050 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4051 "int yb = y + liy + BLK_Y + srcOffsetY + RADIUSY;\n"
4052 "EXTRAPOLATE(yb, (height));\n"
4053 "clocX = lix;\n"
4054 "int cSrcX = x + srcOffsetX - RADIUSX;\n"
4055 "do\n"
4056 "{\n"
4057 "int xb = cSrcX;\n"
4058 "EXTRAPOLATE(xb,(width));\n"
4059 "lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, yb, (width), (height), 0 );\n"
4060 "clocX += BLK_X;\n"
4061 "cSrcX += BLK_X;\n"
4062 "}\n"
4063 "while(clocX < BLK_X+(RADIUSX*2));\n"
4064 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4065 "}\n"
4066 "}\n"
4067 , "1335aadebf2523a98cb069063bdd2ba1"};
// Wraps the filterSep_singlePass kernel text (the entry's programStr member) in a ProgramSource object.
4068 ProgramSource filterSep_singlePass_oclsrc(filterSep_singlePass.programStr);
4069 const struct ProgramEntry filterSmall={"filterSmall",
4070 "#ifdef BORDER_REPLICATE\n"
4071 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))\n"
4072 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))\n"
4073 "#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))\n"
4074 "#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))\n"
4075 "#endif\n"
4076 "#ifdef BORDER_REFLECT\n"
4077 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))\n"
4078 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))\n"
4079 "#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))\n"
4080 "#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))\n"
4081 "#endif\n"
4082 "#ifdef BORDER_REFLECT_101\n"
4083 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))\n"
4084 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))\n"
4085 "#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))\n"
4086 "#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))\n"
4087 "#endif\n"
4088 "#ifdef BORDER_WRAP\n"
4089 "#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))\n"
4090 "#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))\n"
4091 "#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))\n"
4092 "#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))\n"
4093 "#endif\n"
4094 "#ifdef BORDER_ISOLATED\n"
4095 "#define ISOLATED_MIN(VAL) (VAL)\n"
4096 "#else\n"
4097 "#define ISOLATED_MIN(VAL) 0\n"
4098 "#endif\n"
4099 "#ifdef EXTRA_EXTRAPOLATION\n"
4100 "#ifdef BORDER_CONSTANT\n"
4101 "#elif defined BORDER_REPLICATE\n"
4102 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n"
4103 "{ \\\n"
4104 "x = max(min(x, maxX - 1), minX); \\\n"
4105 "y = max(min(y, maxY - 1), minY); \\\n"
4106 "}\n"
4107 "#elif defined BORDER_WRAP\n"
4108 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n"
4109 "{ \\\n"
4110 "if (x < minX) \\\n"
4111 "x -= ((x - maxX + 1) / maxX) * maxX; \\\n"
4112 "if (x >= maxX) \\\n"
4113 "x %= maxX; \\\n"
4114 "if (y < minY) \\\n"
4115 "y -= ((y - maxY + 1) / maxY) * maxY; \\\n"
4116 "if (y >= maxY) \\\n"
4117 "y %= maxY; \\\n"
4118 "}\n"
4119 "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n"
4120 "#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \\\n"
4121 "{ \\\n"
4122 "if (maxX - minX == 1) \\\n"
4123 "x = minX; \\\n"
4124 "else \\\n"
4125 "do \\\n"
4126 "{ \\\n"
4127 "if (x < minX) \\\n"
4128 "x = minX - (x - minX) - 1 + delta; \\\n"
4129 "else \\\n"
4130 "x = maxX - 1 - (x - maxX) - delta; \\\n"
4131 "} \\\n"
4132 "while (x >= maxX || x < minX); \\\n"
4133 "\\\n"
4134 "if (maxY - minY == 1) \\\n"
4135 "y = minY; \\\n"
4136 "else \\\n"
4137 "do \\\n"
4138 "{ \\\n"
4139 "if (y < minY) \\\n"
4140 "y = minY - (y - minY) - 1 + delta; \\\n"
4141 "else \\\n"
4142 "y = maxY - 1 - (y - maxY) - delta; \\\n"
4143 "} \\\n"
4144 "while (y >= maxY || y < minY); \\\n"
4145 "}\n"
4146 "#ifdef BORDER_REFLECT\n"
4147 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)\n"
4148 "#elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)\n"
4149 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)\n"
4150 "#endif\n"
4151 "#else\n"
4152 "#error No extrapolation method\n"
4153 "#endif\n"
4154 "#else\n"
4155 "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n"
4156 "{ \\\n"
4157 "int _row = y - ISOLATED_MIN(minY), _col = x - ISOLATED_MIN(minX); \\\n"
4158 "_row = ADDR_H(_row, 0, maxY - ISOLATED_MIN(minY)); \\\n"
4159 "_row = ADDR_B(_row, maxY - ISOLATED_MIN(minY), _row); \\\n"
4160 "y = _row + ISOLATED_MIN(minY); \\\n"
4161 "\\\n"
4162 "_col = ADDR_L(_col, 0, maxX - ISOLATED_MIN(minX)); \\\n"
4163 "_col = ADDR_R(_col, maxX - ISOLATED_MIN(minX), _col); \\\n"
4164 "x = _col + ISOLATED_MIN(minX); \\\n"
4165 "}\n"
4166 "#endif\n"
4167 "#ifdef DOUBLE_SUPPORT\n"
4168 "#ifdef cl_amd_fp64\n"
4169 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
4170 "#elif defined (cl_khr_fp64)\n"
4171 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
4172 "#endif\n"
4173 "#endif\n"
4174 "#if cn != 3\n"
4175 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
4176 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
4177 "#define SRCSIZE (int)sizeof(srcT)\n"
4178 "#define DSTSIZE (int)sizeof(dstT)\n"
4179 "#else\n"
4180 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n"
4181 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
4182 "#define SRCSIZE (int)sizeof(srcT1) * cn\n"
4183 "#define DSTSIZE (int)sizeof(dstT1) * cn\n"
4184 "#endif\n"
4185 "#define noconvert\n"
4186 "struct RectCoords\n"
4187 "{\n"
4188 "int x1, y1, x2, y2;\n"
4189 "};\n"
4190 "#ifdef BORDER_ISOLATED\n"
4191 "inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)\n"
4192 "{\n"
4193 "return coord.x < bounds.x1 || coord.y < bounds.y1 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2;\n"
4194 "}\n"
4195 "#else\n"
4196 "inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)\n"
4197 "{\n"
4198 "return coord.x < 0 || coord.y < 0 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2;\n"
4199 "}\n"
4200 "#endif\n"
4201 "#define float1 float\n"
4202 "#define uchar1 uchar\n"
4203 "#define int1 int\n"
4204 "#define uint1 unit\n"
4205 "#define __CAT(x, y) x##y\n"
4206 "#define CAT(x, y) __CAT(x, y)\n"
4207 "#define vload1(OFFSET, PTR) (*(PTR + OFFSET))\n"
4208 "#define PX_LOAD_VEC_TYPE CAT(srcT1, PX_LOAD_VEC_SIZE)\n"
4209 "#define PX_LOAD_FLOAT_VEC_TYPE CAT(WT1, PX_LOAD_VEC_SIZE)\n"
4210 "#define PX_LOAD CAT(vload, PX_LOAD_VEC_SIZE)\n"
4211 "inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar * srcptr,\n"
4212 "int srcstep, const struct RectCoords srcCoords)\n"
4213 "{\n"
4214 "__global const srcT1 * ptr = (__global const srcT1 *)\n"
4215 "(srcptr + mad24(pos.y, srcstep, pos.x * SRCSIZE));\n"
4216 "return PX_LOAD_FLOAT_VEC_CONV(PX_LOAD(0, ptr));\n"
4217 "}\n"
4218 "#define LOOP1(VAR, STMT) (STMT); (VAR)++;\n"
4219 "#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;\n"
4220 "#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;\n"
4221 "#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;\n"
4222 "#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;\n"
4223 "#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;\n"
4224 "#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;\n"
4225 "#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;\n"
4226 "#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;\n"
4227 "#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;\n"
4228 "#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;\n"
4229 "#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;\n"
4230 "#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;\n"
4231 "#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))\n"
4232 "#ifdef OP_BOX_FILTER\n"
4233 "#define PROCESS_ELEM \\\n"
4234 "WT total_sum = (WT)(0); \\\n"
4235 "int sy = 0; \\\n"
4236 "LOOP(KERNEL_SIZE_Y, sy, \\\n"
4237 "{ \\\n"
4238 "int sx = 0; \\\n"
4239 "LOOP(KERNEL_SIZE_X, sx, \\\n"
4240 "{ \\\n"
4241 "total_sum += privateData[py + sy][px + sx]; \\\n"
4242 "}); \\\n"
4243 "})\n"
4244 "#elif defined OP_FILTER2D\n"
4245 "#define DIG(a) a,\n"
4246 "__constant WT1 kernelData[] = { COEFF };\n"
4247 "#define PROCESS_ELEM \\\n"
4248 "WT total_sum = 0; \\\n"
4249 "int sy = 0; \\\n"
4250 "int kernelIndex = 0; \\\n"
4251 "LOOP(KERNEL_SIZE_Y, sy, \\\n"
4252 "{ \\\n"
4253 "int sx = 0; \\\n"
4254 "LOOP(KERNEL_SIZE_X, sx, \\\n"
4255 "{ \\\n"
4256 "total_sum = fma(kernelData[kernelIndex++], privateData[py + sy][px + sx], total_sum); \\\n"
4257 "}); \\\n"
4258 "})\n"
4259 "#elif defined OP_ERODE || defined OP_DILATE\n"
4260 "#ifdef DEPTH_0\n"
4261 "#define MIN_VAL 0\n"
4262 "#define MAX_VAL UCHAR_MAX\n"
4263 "#elif defined DEPTH_1\n"
4264 "#define MIN_VAL SCHAR_MIN\n"
4265 "#define MAX_VAL SCHAR_MAX\n"
4266 "#elif defined DEPTH_2\n"
4267 "#define MIN_VAL 0\n"
4268 "#define MAX_VAL USHRT_MAX\n"
4269 "#elif defined DEPTH_3\n"
4270 "#define MIN_VAL SHRT_MIN\n"
4271 "#define MAX_VAL SHRT_MAX\n"
4272 "#elif defined DEPTH_4\n"
4273 "#define MIN_VAL INT_MIN\n"
4274 "#define MAX_VAL INT_MAX\n"
4275 "#elif defined DEPTH_5\n"
4276 "#define MIN_VAL (-FLT_MAX)\n"
4277 "#define MAX_VAL FLT_MAX\n"
4278 "#elif defined DEPTH_6\n"
4279 "#define MIN_VAL (-DBL_MAX)\n"
4280 "#define MAX_VAL DBL_MAX\n"
4281 "#endif\n"
4282 "#ifdef OP_ERODE\n"
4283 "#define VAL (WT)MAX_VAL\n"
4284 "#elif defined OP_DILATE\n"
4285 "#define VAL (WT)MIN_VAL\n"
4286 "#else\n"
4287 "#error \"Unknown operation\"\n"
4288 "#endif\n"
4289 "#define convert_float1 convert_float\n"
4290 "#define convert_uchar1 convert_uchar\n"
4291 "#define convert_int1 convert_int\n"
4292 "#define convert_uint1 convert_uint\n"
4293 "#ifdef OP_ERODE\n"
4294 "#if defined INTEL_DEVICE && defined DEPTH_0\n"
4295 "#define WA_CONVERT_1 CAT(convert_uint, cn)\n"
4296 "#define WA_CONVERT_2 CAT(convert_, srcT)\n"
4297 "#define MORPH_OP(A, B) ((A) < (B) ? (A) : (B))\n"
4298 "#else\n"
4299 "#define MORPH_OP(A, B) min((A), (B))\n"
4300 "#endif\n"
4301 "#endif\n"
4302 "#ifdef OP_DILATE\n"
4303 "#define MORPH_OP(A, B) max((A), (B))\n"
4304 "#endif\n"
4305 "#define PROCESS(_y, _x) \\\n"
4306 "total_sum = convertToWT(MORPH_OP(convertToWT(total_sum), convertToWT(privateData[py + _y][px + _x])));\n"
4307 "#define PROCESS_ELEM \\\n"
4308 "WT total_sum = convertToWT(VAL); \\\n"
4309 "PROCESS_ELEM_\n"
4310 "#else\n"
4311 "#error \"No processing is specified\"\n"
4312 "#endif\n"
4313 "#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT\n"
4314 "#define EXTRA_PARAMS , __global const uchar * matptr, int mat_step, int mat_offset\n"
4315 "#else\n"
4316 "#define EXTRA_PARAMS\n"
4317 "#endif\n"
4318 "inline WT getBorderPixel(const struct RectCoords bounds, int2 coord,\n"
4319 "__global const uchar * srcptr, int srcstep)\n"
4320 "{\n"
4321 "#ifdef BORDER_CONSTANT\n"
4322 "#ifdef OP_ERODE\n"
4323 "return (WT)(MAX_VAL);\n"
4324 "#elif defined OP_DILATE\n"
4325 "return (WT)(MIN_VAL);\n"
4326 "#else\n"
4327 "return (WT)(0);\n"
4328 "#endif\n"
4329 "#else\n"
4330 "int selected_col = coord.x;\n"
4331 "int selected_row = coord.y;\n"
4332 "EXTRAPOLATE(selected_col, selected_row,\n"
4333 "bounds.x1, bounds.y1,\n"
4334 "bounds.x2, bounds.y2);\n"
4335 "__global const uchar* ptr = srcptr + mad24(selected_row, srcstep, selected_col * SRCSIZE);\n"
4336 "return convertToWT(loadpix(ptr));\n"
4337 "#endif\n"
4338 "}\n"
4339 "inline WT readSrcPixelSingle(int2 pos, __global const uchar * srcptr,\n"
4340 "int srcstep, const struct RectCoords srcCoords)\n"
4341 "{\n"
4342 "if (!isBorder(srcCoords, pos, 1))\n"
4343 "{\n"
4344 "__global const uchar * ptr = srcptr + mad24(pos.y, srcstep, pos.x * SRCSIZE);\n"
4345 "return convertToWT(loadpix(ptr));\n"
4346 "}\n"
4347 "else\n"
4348 "return getBorderPixel(srcCoords, pos, srcptr, srcstep);\n"
4349 "}\n"
4350 "__kernel void filterSmall(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,\n"
4351 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols\n"
4352 "#ifdef NORMALIZE\n"
4353 ", float alpha\n"
4354 "#endif\n"
4355 "EXTRA_PARAMS )\n"
4356 "{\n"
4357 "const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY };\n"
4358 "const int startX = get_global_id(0) * PX_PER_WI_X;\n"
4359 "const int startY = get_global_id(1) * PX_PER_WI_Y;\n"
4360 "if (startX >= cols || startY >= rows)\n"
4361 "return;\n"
4362 "WT privateData[PX_PER_WI_Y + KERNEL_SIZE_Y - 1][PRIV_DATA_WIDTH];\n"
4363 "int py = 0;\n"
4364 "LOOP(PX_LOAD_Y_ITERATIONS, py,\n"
4365 "{\n"
4366 "int y = startY + py;\n"
4367 "int px = 0;\n"
4368 "LOOP(PX_LOAD_X_ITERATIONS, px,\n"
4369 "{\n"
4370 "int x = startX + (px * PX_LOAD_NUM_PX);\n"
4371 "int2 srcPos = (int2)(srcCoords.x1 + x - ANCHOR_X, srcCoords.y1 + y - ANCHOR_Y);\n"
4372 "if (!isBorder(srcCoords, srcPos, PX_LOAD_NUM_PX))\n"
4373 "{\n"
4374 "PX_LOAD_FLOAT_VEC_TYPE p = readSrcPixelGroup(srcPos, srcptr, src_step, srcCoords);\n"
4375 "#ifdef SQR\n"
4376 "*((PX_LOAD_FLOAT_VEC_TYPE *)&privateData[py][px * PX_LOAD_NUM_PX]) = p * p;\n"
4377 "#else\n"
4378 "*((PX_LOAD_FLOAT_VEC_TYPE *)&privateData[py][px * PX_LOAD_NUM_PX]) = p;\n"
4379 "#endif\n"
4380 "}\n"
4381 "else\n"
4382 "{\n"
4383 "int lx = 0;\n"
4384 "LOOP(PX_LOAD_NUM_PX, lx,\n"
4385 "{\n"
4386 "WT p = readSrcPixelSingle(srcPos, srcptr, src_step, srcCoords);\n"
4387 "#ifdef SQR\n"
4388 "*((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p * p;\n"
4389 "#else\n"
4390 "*((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p;\n"
4391 "#endif\n"
4392 "srcPos.x++;\n"
4393 "});\n"
4394 "}\n"
4395 "});\n"
4396 "});\n"
4397 "py = 0;\n"
4398 "LOOP(PX_PER_WI_Y, py,\n"
4399 "{\n"
4400 "int y = startY + py;\n"
4401 "int px = 0;\n"
4402 "LOOP(PX_PER_WI_X, px,\n"
4403 "{\n"
4404 "int x = startX + px;\n"
4405 "PROCESS_ELEM;\n"
4406 "int dst_index = mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset));\n"
4407 "__global dstT * dstPtr = (__global dstT *)(dstptr + dst_index);\n"
4408 "#ifdef NORMALIZE\n"
4409 "total_sum *= (WT)(alpha);\n"
4410 "#endif\n"
4411 "#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT\n"
4412 "int mat_index = mad24(y, mat_step, mad24(x, SRCSIZE, mat_offset));\n"
4413 "WT value = convertToWT(loadpix(matptr + mat_index));\n"
4414 "#ifdef OP_GRADIENT\n"
4415 "storepix(convertToDstT(convertToWT(total_sum) - convertToWT(value)), dstPtr );\n"
4416 "#elif defined OP_TOPHAT\n"
4417 "storepix(convertToDstT(convertToWT(value) - convertToWT(total_sum)), dstPtr );\n"
4418 "#elif defined OP_BLACKHAT\n"
4419 "storepix(convertToDstT(convertToWT(total_sum) - convertToWT(value)), dstPtr );\n"
4420 "#endif\n"
4421 "#else\n"
4422 "storepix(convertToDstT(total_sum), dstPtr);\n"
4423 "#endif\n"
4424 "});\n"
4425 "});\n"
4426 "}\n"
4427 , "2aafc30dda5e658542c92a9ab2a63d4a"};
4428 ProgramSource filterSmall_oclsrc(filterSmall.programStr);
// Embedded OpenCL C source for the "gftt" program, followed by a digest string.
// OP_MAX_EIGEN_VAL selects the maxEigenVal / maxEigenValTask kernels; otherwise findCorners is
// compiled when OP_FIND_CORNERS evaluates to non-zero. WGS, WGS2_ALIGNED and groupnum are not
// defined in the text and must be supplied when the program is built.
4429 const struct ProgramEntry gftt={"gftt",
4430 "#ifdef OP_MAX_EIGEN_VAL\n"
// maxEigenVal: each work-item scans float elements id, id + groupnum * WGS, ... below `total`
// (skipping positions whose mask byte is zero when HAVE_MASK), then the work-group reduces the
// per-item maxima in local memory and work-item 0 stores the result at float index gid of dstptr.
4431 "__kernel void maxEigenVal(__global const uchar * srcptr, int src_step, int src_offset, int cols,\n"
4432 "int total, __global uchar * dstptr\n"
4433 "#ifdef HAVE_MASK\n"
4434 ", __global const uchar * maskptr, int mask_step, int mask_offset\n"
4435 "#endif\n"
4436 ")\n"
4437 "{\n"
4438 "int lid = get_local_id(0);\n"
4439 "int gid = get_group_id(0);\n"
4440 "int  id = get_global_id(0);\n"
4441 "__local float localmem_max[WGS2_ALIGNED];\n"
4442 "float maxval = -FLT_MAX;\n"
4443 "for (int grain = groupnum * WGS; id < total; id += grain)\n"
4444 "{\n"
4445 "int src_index = mad24(id / cols, src_step, mad24((id % cols), (int)sizeof(float), src_offset));\n"
4446 "#ifdef HAVE_MASK\n"
4447 "int mask_index = mad24(id / cols, mask_step, id % cols + mask_offset);\n"
4448 "if (maskptr[mask_index])\n"
4449 "#endif\n"
4450 "maxval = max(maxval, *(__global const float *)(srcptr + src_index));\n"
4451 "}\n"
// Work-items below WGS2_ALIGNED seed the local array; the remaining ones fold their value into
// slot (lid - WGS2_ALIGNED) after the barrier.
4452 "if (lid < WGS2_ALIGNED)\n"
4453 "localmem_max[lid] = maxval;\n"
4454 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4455 "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n"
4456 "localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid - WGS2_ALIGNED]);\n"
4457 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Halving reduction over localmem_max with a barrier after every step.
4458 "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n"
4459 "{\n"
4460 "if (lid < lsize)\n"
4461 "{\n"
4462 "int lid2 = lsize + lid;\n"
4463 "localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);\n"
4464 "}\n"
4465 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4466 "}\n"
4467 "if (lid == 0)\n"
4468 "*(__global float *)(dstptr + (int)sizeof(float) * gid) = localmem_max[0];\n"
4469 "}\n"
// maxEigenValTask: takes the maximum of dst[0 .. groupnum-1], overwrites dst[0] with
// max * qualityLevel, and resets cornersptr[0] to 0.
4470 "__kernel void maxEigenValTask(__global float * dst, float qualityLevel,\n"
4471 "__global int * cornersptr)\n"
4472 "{\n"
4473 "float maxval = -FLT_MAX;\n"
4474 "#pragma unroll\n"
4475 "for (int x = 0; x < groupnum; ++x)\n"
4476 "maxval = max(maxval, dst[x]);\n"
4477 "dst[0] = maxval * qualityLevel;\n"
4478 "cornersptr[0] = 0;\n"
4479 "}\n"
// NOTE(review): this branch tests the value of OP_FIND_CORNERS (no `defined`), unlike the
// `#ifdef OP_MAX_EIGEN_VAL` above; a symbol defined with an empty body would not compile here.
4480 "#elif OP_FIND_CORNERS\n"
// NOTE(review): GET_SRC_32F addresses eigptr without eig_offset, although findCorners receives
// it; presumably the eig buffer always starts at offset 0 — confirm against the caller.
4481 "#define GET_SRC_32F(_y, _x) *(__global const float *)(eigptr + (_y) * eig_step + (_x) * (int)sizeof(float) )\n"
// findCorners: for every (x, y) inside rows x cols (and with a non-zero mask byte when HAVE_MASK)
// the value at (x + 1, y + 1) is tested against threshold[0] and against its 8 neighbours.
// A value that is not smaller than any neighbour is appended through the atomic counter stored
// at the start of cornersptr; entries beyond max_corners are counted but not written.
4482 "__kernel void findCorners(__global const uchar * eigptr, int eig_step, int eig_offset,\n"
4483 "#ifdef HAVE_MASK\n"
4484 "__global const uchar * mask, int mask_step, int mask_offset,\n"
4485 "#endif\n"
4486 "__global uchar * cornersptr, int rows, int cols,\n"
4487 "__constant float * threshold, int max_corners)\n"
4488 "{\n"
4489 "int x = get_global_id(0);\n"
4490 "int y = get_global_id(1);\n"
// The counter occupies the first int; the float2 records start sizeof(float2) bytes in.
4491 "__global int* counter = (__global int*) cornersptr;\n"
4492 "__global float2 * corners = (__global float2 *)(cornersptr + (int)sizeof(float2));\n"
4493 "if (y < rows && x < cols\n"
4494 "#ifdef HAVE_MASK\n"
4495 "&& mask[mad24(y, mask_step, x + mask_offset)]\n"
4496 "#endif\n"
4497 ")\n"
4498 "{\n"
// Shift by one in both axes so the 3x3 neighbourhood reads below stay at indices >= 0.
4499 "++x, ++y;\n"
4500 "float val = GET_SRC_32F(y, x);\n"
4501 "if (val > threshold[0])\n"
4502 "{\n"
4503 "float maxVal = val;\n"
4504 "maxVal = max(GET_SRC_32F(y - 1, x - 1), maxVal);\n"
4505 "maxVal = max(GET_SRC_32F(y - 1, x    ), maxVal);\n"
4506 "maxVal = max(GET_SRC_32F(y - 1, x + 1), maxVal);\n"
4507 "maxVal = max(GET_SRC_32F(y    , x - 1), maxVal);\n"
4508 "maxVal = max(GET_SRC_32F(y    , x + 1), maxVal);\n"
4509 "maxVal = max(GET_SRC_32F(y + 1, x - 1), maxVal);\n"
4510 "maxVal = max(GET_SRC_32F(y + 1, x    ), maxVal);\n"
4511 "maxVal = max(GET_SRC_32F(y + 1, x + 1), maxVal);\n"
4512 "if (val == maxVal)\n"
4513 "{\n"
4514 "int ind = atomic_inc(counter);\n"
4515 "if (ind < max_corners)\n"
4516 "{\n"
// Record layout: .x holds the eig value, .y holds the bit pattern of (y | x << 16) using the
// already incremented coordinates.
4517 "corners[ind].x = val;\n"
4518 "corners[ind].y = as_float(y | (x << 16));\n"
4519 "}\n"
4520 "}\n"
4521 "}\n"
4522 "}\n"
4523 "}\n"
4524 "#endif\n"
4525 , "cb2cfd26f04e14ae047e2f5eb28c8e11"};
// ProgramSource is constructed from the kernel text alone (programStr); the digest is not passed.
4526 ProgramSource gftt_oclsrc(gftt.programStr);
4527 const struct ProgramEntry histogram={"histogram",
4528 "#ifndef kercn\n"
4529 "#define kercn 1\n"
4530 "#endif\n"
4531 "#ifndef T\n"
4532 "#define T uchar\n"
4533 "#endif\n"
4534 "#define noconvert\n"
4535 "__kernel void calculate_histogram(__global const uchar * src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
4536 "__global uchar * histptr, int total)\n"
4537 "{\n"
4538 "int lid = get_local_id(0);\n"
4539 "int id = get_global_id(0) * kercn;\n"
4540 "int gid = get_group_id(0);\n"
4541 "__local int localhist[BINS];\n"
4542 "#pragma unroll\n"
4543 "for (int i = lid; i < BINS; i += WGS)\n"
4544 "localhist[i] = 0;\n"
4545 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4546 "__global const uchar * src = src_ptr + src_offset;\n"
4547 "int src_index;\n"
4548 "for (int grain = HISTS_COUNT * WGS * kercn; id < total; id += grain)\n"
4549 "{\n"
4550 "#ifdef HAVE_SRC_CONT\n"
4551 "src_index = id;\n"
4552 "#else\n"
4553 "src_index = mad24(id / src_cols, src_step, id % src_cols);\n"
4554 "#endif\n"
4555 "#if kercn == 1\n"
4556 "atomic_inc(localhist + convert_int(src[src_index]));\n"
4557 "#elif kercn == 4\n"
4558 "int value = *(__global const int *)(src + src_index);\n"
4559 "atomic_inc(localhist + (value & 0xff));\n"
4560 "atomic_inc(localhist + ((value >> 8) & 0xff));\n"
4561 "atomic_inc(localhist + ((value >> 16) & 0xff));\n"
4562 "atomic_inc(localhist + ((value >> 24) & 0xff));\n"
4563 "#elif kercn >= 2\n"
4564 "T value = *(__global const T *)(src + src_index);\n"
4565 "atomic_inc(localhist + value.s0);\n"
4566 "atomic_inc(localhist + value.s1);\n"
4567 "#if kercn >= 4\n"
4568 "atomic_inc(localhist + value.s2);\n"
4569 "atomic_inc(localhist + value.s3);\n"
4570 "#if kercn >= 8\n"
4571 "atomic_inc(localhist + value.s4);\n"
4572 "atomic_inc(localhist + value.s5);\n"
4573 "atomic_inc(localhist + value.s6);\n"
4574 "atomic_inc(localhist + value.s7);\n"
4575 "#if kercn == 16\n"
4576 "atomic_inc(localhist + value.s8);\n"
4577 "atomic_inc(localhist + value.s9);\n"
4578 "atomic_inc(localhist + value.sA);\n"
4579 "atomic_inc(localhist + value.sB);\n"
4580 "atomic_inc(localhist + value.sC);\n"
4581 "atomic_inc(localhist + value.sD);\n"
4582 "atomic_inc(localhist + value.sE);\n"
4583 "atomic_inc(localhist + value.sF);\n"
4584 "#endif\n"
4585 "#endif\n"
4586 "#endif\n"
4587 "#endif\n"
4588 "}\n"
4589 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4590 "__global int * hist = (__global int *)(histptr + gid * BINS * (int)sizeof(int));\n"
4591 "#pragma unroll\n"
4592 "for (int i = lid; i < BINS; i += WGS)\n"
4593 "hist[i] = localhist[i];\n"
4594 "}\n"
4595 "#ifndef HT\n"
4596 "#define HT int\n"
4597 "#endif\n"
4598 "#ifndef convertToHT\n"
4599 "#define convertToHT noconvert\n"
4600 "#endif\n"
4601 "__kernel void merge_histogram(__global const int * ghist, __global uchar * histptr, int hist_step, int hist_offset)\n"
4602 "{\n"
4603 "int lid = get_local_id(0);\n"
4604 "__global HT * hist = (__global HT *)(histptr + hist_offset);\n"
4605 "#if WGS >= BINS\n"
4606 "HT res = (HT)(0);\n"
4607 "#else\n"
4608 "#pragma unroll\n"
4609 "for (int i = lid; i < BINS; i += WGS)\n"
4610 "hist[i] = (HT)(0);\n"
4611 "#endif\n"
4612 "#pragma unroll\n"
4613 "for (int i = 0; i < HISTS_COUNT; ++i)\n"
4614 "{\n"
4615 "#pragma unroll\n"
4616 "for (int j = lid; j < BINS; j += WGS)\n"
4617 "#if WGS >= BINS\n"
4618 "res += convertToHT(ghist[j]);\n"
4619 "#else\n"
4620 "hist[j] += convertToHT(ghist[j]);\n"
4621 "#endif\n"
4622 "ghist += BINS;\n"
4623 "}\n"
4624 "#if WGS >= BINS\n"
4625 "if (lid < BINS)\n"
4626 "*(__global HT *)(histptr + mad24(lid, hist_step, hist_offset)) = res;\n"
4627 "#endif\n"
4628 "}\n"
4629 "__kernel void calcLUT(__global uchar * dst, __global const int * ghist, int total)\n"
4630 "{\n"
4631 "int lid = get_local_id(0);\n"
4632 "__local int sumhist[BINS];\n"
4633 "__local float scale;\n"
4634 "#if WGS >= BINS\n"
4635 "int res = 0;\n"
4636 "#else\n"
4637 "#pragma unroll\n"
4638 "for (int i = lid; i < BINS; i += WGS)\n"
4639 "sumhist[i] = 0;\n"
4640 "#endif\n"
4641 "#pragma unroll\n"
4642 "for (int i = 0; i < HISTS_COUNT; ++i)\n"
4643 "{\n"
4644 "#pragma unroll\n"
4645 "for (int j = lid; j < BINS; j += WGS)\n"
4646 "#if WGS >= BINS\n"
4647 "res += ghist[j];\n"
4648 "#else\n"
4649 "sumhist[j] += ghist[j];\n"
4650 "#endif\n"
4651 "ghist += BINS;\n"
4652 "}\n"
4653 "#if WGS >= BINS\n"
4654 "if (lid < BINS)\n"
4655 "sumhist[lid] = res;\n"
4656 "#endif\n"
4657 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4658 "if (lid == 0)\n"
4659 "{\n"
4660 "int sum = 0, i = 0;\n"
4661 "while (!sumhist[i])\n"
4662 "++i;\n"
4663 "if (total == sumhist[i])\n"
4664 "{\n"
4665 "scale = 1;\n"
4666 "for (int j = 0; j < BINS; ++j)\n"
4667 "sumhist[i] = i;\n"
4668 "}\n"
4669 "else\n"
4670 "{\n"
4671 "scale = 255.f / (total - sumhist[i]);\n"
4672 "for (sumhist[i++] = 0; i < BINS; i++)\n"
4673 "{\n"
4674 "sum += sumhist[i];\n"
4675 "sumhist[i] = sum;\n"
4676 "}\n"
4677 "}\n"
4678 "}\n"
4679 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4680 "#pragma unroll\n"
4681 "for (int i = lid; i < BINS; i += WGS)\n"
4682 "dst[i]= convert_uchar_sat_rte(convert_float(sumhist[i]) * scale);\n"
4683 "}\n"
4684 , "3bfd6703e639c8a36eb7cdd5f3eefda6"};
4685 ProgramSource histogram_oclsrc(histogram.programStr);
4686 const struct ProgramEntry hough_lines={"hough_lines",
4687 "#define ACCUM(ptr) *((__global int*)(ptr))\n"
4688 "#ifdef MAKE_POINTS_LIST\n"
4689 "__kernel void make_point_list(__global const uchar * src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
4690 "__global uchar * list_ptr, int list_step, int list_offset, __global int* global_offset)\n"
4691 "{\n"
4692 "int x = get_local_id(0);\n"
4693 "int y = get_group_id(1);\n"
4694 "__local int l_index, l_offset;\n"
4695 "__local int l_points[LOCAL_SIZE];\n"
4696 "__global const uchar * src = src_ptr + mad24(y, src_step, src_offset);\n"
4697 "__global int * list = (__global int*)(list_ptr + list_offset);\n"
4698 "if (x == 0)\n"
4699 "l_index = 0;\n"
4700 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4701 "if (y < src_rows)\n"
4702 "{\n"
4703 "y <<= 16;\n"
4704 "for (int i=x; i < src_cols; i+=GROUP_SIZE)\n"
4705 "{\n"
4706 "if (src[i])\n"
4707 "{\n"
4708 "int val = y | i;\n"
4709 "int index = atomic_inc(&l_index);\n"
4710 "l_points[index] = val;\n"
4711 "}\n"
4712 "}\n"
4713 "}\n"
4714 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4715 "if (x == 0)\n"
4716 "l_offset = atomic_add(global_offset, l_index);\n"
4717 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4718 "list += l_offset;\n"
4719 "for (int i=x; i < l_index; i+=GROUP_SIZE)\n"
4720 "{\n"
4721 "list[i] = l_points[i];\n"
4722 "}\n"
4723 "}\n"
4724 "#elif defined FILL_ACCUM_GLOBAL\n"
4725 "__kernel void fill_accum_global(__global const uchar * list_ptr, int list_step, int list_offset,\n"
4726 "__global uchar * accum_ptr, int accum_step, int accum_offset,\n"
4727 "int total_points, float irho, float theta, int numrho, int numangle)\n"
4728 "{\n"
4729 "int theta_idx = get_global_id(1);\n"
4730 "int count_idx = get_global_id(0);\n"
4731 "int glob_size = get_global_size(0);\n"
4732 "float cosVal;\n"
4733 "float sinVal = sincos(theta * ((float)theta_idx), &cosVal);\n"
4734 "sinVal *= irho;\n"
4735 "cosVal *= irho;\n"
4736 "__global const int * list = (__global const int*)(list_ptr + list_offset);\n"
4737 "__global int* accum = (__global int*)(accum_ptr + mad24(theta_idx + 1, accum_step, accum_offset));\n"
4738 "const int shift = (numrho - 1) / 2;\n"
4739 "if (theta_idx < numangle)\n"
4740 "{\n"
4741 "for (int i = count_idx; i < total_points; i += glob_size)\n"
4742 "{\n"
4743 "const int val = list[i];\n"
4744 "const int x = (val & 0xFFFF);\n"
4745 "const int y = (val >> 16) & 0xFFFF;\n"
4746 "int r = convert_int_rte(mad(x, cosVal, y * sinVal)) + shift;\n"
4747 "atomic_inc(accum + r + 1);\n"
4748 "}\n"
4749 "}\n"
4750 "}\n"
4751 "#elif defined FILL_ACCUM_LOCAL\n"
4752 "__kernel void fill_accum_local(__global const uchar * list_ptr, int list_step, int list_offset,\n"
4753 "__global uchar * accum_ptr, int accum_step, int accum_offset,\n"
4754 "int total_points, float irho, float theta, int numrho, int numangle)\n"
4755 "{\n"
4756 "int theta_idx = get_group_id(1);\n"
4757 "int count_idx = get_local_id(0);\n"
4758 "if (theta_idx > 0 && theta_idx < numangle + 1)\n"
4759 "{\n"
4760 "float cosVal;\n"
4761 "float sinVal = sincos(theta * (float) (theta_idx-1), &cosVal);\n"
4762 "sinVal *= irho;\n"
4763 "cosVal *= irho;\n"
4764 "__local int l_accum[BUFFER_SIZE];\n"
4765 "for (int i=count_idx; i<BUFFER_SIZE; i+=LOCAL_SIZE)\n"
4766 "l_accum[i] = 0;\n"
4767 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4768 "__global const int * list = (__global const int*)(list_ptr + list_offset);\n"
4769 "const int shift = (numrho - 1) / 2;\n"
4770 "for (int i = count_idx; i < total_points; i += LOCAL_SIZE)\n"
4771 "{\n"
4772 "const int point = list[i];\n"
4773 "const int x = (point & 0xFFFF);\n"
4774 "const int y = point >> 16;\n"
4775 "int r = convert_int_rte(mad(x, cosVal, y * sinVal)) + shift;\n"
4776 "atomic_inc(l_accum + r + 1);\n"
4777 "}\n"
4778 "barrier(CLK_LOCAL_MEM_FENCE);\n"
4779 "__global int* accum = (__global int*)(accum_ptr + mad24(theta_idx, accum_step, accum_offset));\n"
4780 "for (int i=count_idx; i<BUFFER_SIZE; i+=LOCAL_SIZE)\n"
4781 "accum[i] = l_accum[i];\n"
4782 "}\n"
4783 "else if (theta_idx < numangle + 2)\n"
4784 "{\n"
4785 "__global int* accum = (__global int*)(accum_ptr + mad24(theta_idx, accum_step, accum_offset));\n"
4786 "for (int i=count_idx; i<BUFFER_SIZE; i+=LOCAL_SIZE)\n"
4787 "accum[i] = 0;\n"
4788 "}\n"
4789 "}\n"
4790 "#elif defined GET_LINES\n"
4791 "__kernel void get_lines(__global uchar * accum_ptr, int accum_step, int accum_offset, int accum_rows, int accum_cols,\n"
4792 "__global uchar * lines_ptr, int lines_step, int lines_offset, __global int* lines_index_ptr,\n"
4793 "int linesMax, int threshold, float rho, float theta)\n"
4794 "{\n"
4795 "int x0 = get_global_id(0);\n"
4796 "int y = get_global_id(1);\n"
4797 "int glob_size = get_global_size(0);\n"
4798 "if (y < accum_rows-2)\n"
4799 "{\n"
4800 "__global uchar* accum = accum_ptr + mad24(y+1, accum_step, mad24(x0+1, (int) sizeof(int), accum_offset));\n"
4801 "__global float2* lines = (__global float2*)(lines_ptr + lines_offset);\n"
4802 "__global int* lines_index = lines_index_ptr + 1;\n"
4803 "for (int x=x0; x<accum_cols-2; x+=glob_size)\n"
4804 "{\n"
4805 "int curVote = ACCUM(accum);\n"
4806 "if (curVote > threshold && curVote > ACCUM(accum - sizeof(int)) && curVote >= ACCUM(accum + sizeof(int)) &&\n"
4807 "curVote > ACCUM(accum - accum_step) && curVote >= ACCUM(accum + accum_step))\n"
4808 "{\n"
4809 "int index = atomic_inc(lines_index);\n"
4810 "if (index < linesMax)\n"
4811 "{\n"
4812 "float radius = (x - (accum_cols - 3) * 0.5f) * rho;\n"
4813 "float angle = y * theta;\n"
4814 "lines[index] = (float2)(radius, angle);\n"
4815 "}\n"
4816 "}\n"
4817 "accum += glob_size * (int) sizeof(int);\n"
4818 "}\n"
4819 "}\n"
4820 "}\n"
4821 "#elif GET_LINES_PROBABOLISTIC\n"
4822 "__kernel void get_lines(__global const uchar * accum_ptr, int accum_step, int accum_offset, int accum_rows, int accum_cols,\n"
4823 "__global const uchar * src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
4824 "__global uchar * lines_ptr, int lines_step, int lines_offset, __global int* lines_index_ptr,\n"
4825 "int linesMax, int threshold, int lineLength, int lineGap, float rho, float theta)\n"
4826 "{\n"
4827 "int x = get_global_id(0);\n"
4828 "int y = get_global_id(1);\n"
4829 "if (y < accum_rows-2)\n"
4830 "{\n"
4831 "__global uchar* accum = accum_ptr + mad24(y+1, accum_step, mad24(x+1, (int) sizeof(int), accum_offset));\n"
4832 "__global int4* lines = (__global int4*)(lines_ptr + lines_offset);\n"
4833 "__global int* lines_index = lines_index_ptr + 1;\n"
4834 "int curVote = ACCUM(accum);\n"
4835 "if (curVote >= threshold &&\n"
4836 "curVote > ACCUM(accum - accum_step - sizeof(int)) &&\n"
4837 "curVote > ACCUM(accum - accum_step) &&\n"
4838 "curVote > ACCUM(accum - accum_step + sizeof(int)) &&\n"
4839 "curVote > ACCUM(accum - sizeof(int)) &&\n"
4840 "curVote > ACCUM(accum + sizeof(int)) &&\n"
4841 "curVote > ACCUM(accum + accum_step - sizeof(int)) &&\n"
4842 "curVote > ACCUM(accum + accum_step) &&\n"
4843 "curVote > ACCUM(accum + accum_step + sizeof(int)))\n"
4844 "{\n"
4845 "const float radius = (x - (accum_cols - 2 - 1) * 0.5f) * rho;\n"
4846 "const float angle = y * theta;\n"
4847 "float cosa;\n"
4848 "float sina = sincos(angle, &cosa);\n"
4849 "float2 p0 = (float2)(cosa * radius, sina * radius);\n"
4850 "float2 dir = (float2)(-sina, cosa);\n"
4851 "float2 pb[4] = { (float2)(-1, -1), (float2)(-1, -1), (float2)(-1, -1), (float2)(-1, -1) };\n"
4852 "float a;\n"
4853 "if (dir.x != 0)\n"
4854 "{\n"
4855 "a = -p0.x / dir.x;\n"
4856 "pb[0].x = 0;\n"
4857 "pb[0].y = p0.y + a * dir.y;\n"
4858 "a = (src_cols - 1 - p0.x) / dir.x;\n"
4859 "pb[1].x = src_cols - 1;\n"
4860 "pb[1].y = p0.y + a * dir.y;\n"
4861 "}\n"
4862 "if (dir.y != 0)\n"
4863 "{\n"
4864 "a = -p0.y / dir.y;\n"
4865 "pb[2].x = p0.x + a * dir.x;\n"
4866 "pb[2].y = 0;\n"
4867 "a = (src_rows - 1 - p0.y) / dir.y;\n"
4868 "pb[3].x = p0.x + a * dir.x;\n"
4869 "pb[3].y = src_rows - 1;\n"
4870 "}\n"
4871 "if (pb[0].x == 0 && (pb[0].y >= 0 && pb[0].y < src_rows))\n"
4872 "{\n"
4873 "p0 = pb[0];\n"
4874 "if (dir.x < 0)\n"
4875 "dir = -dir;\n"
4876 "}\n"
4877 "else if (pb[1].x == src_cols - 1 && (pb[1].y >= 0 && pb[1].y < src_rows))\n"
4878 "{\n"
4879 "p0 = pb[1];\n"
4880 "if (dir.x > 0)\n"
4881 "dir = -dir;\n"
4882 "}\n"
4883 "else if (pb[2].y == 0 && (pb[2].x >= 0 && pb[2].x < src_cols))\n"
4884 "{\n"
4885 "p0 = pb[2];\n"
4886 "if (dir.y < 0)\n"
4887 "dir = -dir;\n"
4888 "}\n"
4889 "else if (pb[3].y == src_rows - 1 && (pb[3].x >= 0 && pb[3].x < src_cols))\n"
4890 "{\n"
4891 "p0 = pb[3];\n"
4892 "if (dir.y > 0)\n"
4893 "dir = -dir;\n"
4894 "}\n"
4895 "dir /= max(fabs(dir.x), fabs(dir.y));\n"
4896 "float2 line_end[2];\n"
4897 "int gap;\n"
4898 "bool inLine = false;\n"
4899 "if (p0.x < 0 || p0.x >= src_cols || p0.y < 0 || p0.y >= src_rows)\n"
4900 "return;\n"
4901 "for (;;)\n"
4902 "{\n"
4903 "if (*(src_ptr + mad24(p0.y, src_step, p0.x + src_offset)))\n"
4904 "{\n"
4905 "gap = 0;\n"
4906 "if (!inLine)\n"
4907 "{\n"
4908 "line_end[0] = p0;\n"
4909 "line_end[1] = p0;\n"
4910 "inLine = true;\n"
4911 "}\n"
4912 "else\n"
4913 "{\n"
4914 "line_end[1] = p0;\n"
4915 "}\n"
4916 "}\n"
4917 "else if (inLine)\n"
4918 "{\n"
4919 "if (++gap > lineGap)\n"
4920 "{\n"
4921 "bool good_line = fabs(line_end[1].x - line_end[0].x) >= lineLength ||\n"
4922 "fabs(line_end[1].y - line_end[0].y) >= lineLength;\n"
4923 "if (good_line)\n"
4924 "{\n"
4925 "int index = atomic_inc(lines_index);\n"
4926 "if (index < linesMax)\n"
4927 "lines[index] = (int4)(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);\n"
4928 "}\n"
4929 "gap = 0;\n"
4930 "inLine = false;\n"
4931 "}\n"
4932 "}\n"
4933 "p0 = p0 + dir;\n"
4934 "if (p0.x < 0 || p0.x >= src_cols || p0.y < 0 || p0.y >= src_rows)\n"
4935 "{\n"
4936 "if (inLine)\n"
4937 "{\n"
4938 "bool good_line = fabs(line_end[1].x - line_end[0].x) >= lineLength ||\n"
4939 "fabs(line_end[1].y - line_end[0].y) >= lineLength;\n"
4940 "if (good_line)\n"
4941 "{\n"
4942 "int index = atomic_inc(lines_index);\n"
4943 "if (index < linesMax)\n"
4944 "lines[index] = (int4)(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);\n"
4945 "}\n"
4946 "}\n"
4947 "break;\n"
4948 "}\n"
4949 "}\n"
4950 "}\n"
4951 "}\n"
4952 "}\n"
4953 "#endif\n"
4954 , "1a16d01d003274c100d23519d745047f"};
// Global ProgramSource object constructed from the kernel text held in
// hough_lines.programStr.
4955 ProgramSource hough_lines_oclsrc(hough_lines.programStr);
// Embedded OpenCL program "integral_sum". The kernel text is stored as a run of
// adjacent string literals (one per source line), followed by a 32-hex-digit
// string (presumably a digest of the kernel text - confirm against the generator).
// The top of this file says it is auto-generated, so lasting changes presumably
// belong in the generator input rather than here.
// The program defines two kernels: integral_sum_cols and integral_sum_rows.
// sumT / sumSQT are not defined in this text; they are expected from build options.
4956 const struct ProgramEntry integral_sum={"integral_sum",
// When DOUBLE_SUPPORT is defined, enable the fp64 extension that the device
// advertises (cl_amd_fp64 is checked first, then cl_khr_fp64).
4957 "#ifdef DOUBLE_SUPPORT\n"
4958 "#ifdef cl_amd_fp64\n"
4959 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
4960 "#elif defined (cl_khr_fp64)\n"
4961 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
4962 "#endif\n"
4963 "#endif\n"
// Tile edge defaults to 16 unless supplied by the build. The stride is one
// element larger than the tile edge (presumably padding against local-memory
// bank conflicts - not established by this text).
4964 "#ifndef LOCAL_SUM_SIZE\n"
4965 "#define LOCAL_SUM_SIZE      16\n"
4966 "#endif\n"
4967 "#define LOCAL_SUM_STRIDE    (LOCAL_SUM_SIZE + 1)\n"
// integral_sum_cols: work-item x walks down source column x (single bytes read
// from src_ptr), keeping a running sum in accum and, under SUM_SQUARE, a
// running sum of squares in accum_sq. Rows are consumed LOCAL_SUM_SIZE at a
// time and staged through local memory before being written to buf.
4968 "kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols,\n"
4969 "__global uchar *buf_ptr, int buf_step, int buf_offset\n"
4970 "#ifdef SUM_SQUARE\n"
4971 ",__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset\n"
4972 "#endif\n"
4973 ")\n"
4974 "{\n"
4975 "__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];\n"
4976 "#ifdef SUM_SQUARE\n"
4977 "__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];\n"
4978 "#endif\n"
4979 "int lid = get_local_id(0);\n"
4980 "int gid = get_group_id(0);\n"
4981 "int x = get_global_id(0);\n"
// Byte offset of the first element of column x; advanced by src_step per row.
4982 "int src_index = x + src_offset;\n"
4983 "sumT accum = 0;\n"
4984 "#ifdef SUM_SQUARE\n"
4985 "sumSQT accum_sq = 0;\n"
4986 "#endif\n"
4987 "for (int y = 0; y < rows; y += LOCAL_SUM_SIZE)\n"
4988 "{\n"
// Staging store: slot index is lid + yin * LOCAL_SUM_STRIDE.
4989 "int lsum_index = lid;\n"
4990 "#pragma unroll\n"
4991 "for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE)\n"
4992 "{\n"
// Only in-range pixels contribute to the running sums.
4993 "if ((x < cols) && (y + yin < rows))\n"
4994 "{\n"
4995 "__global const uchar *src = src_ptr + src_index;\n"
4996 "accum += src[0];\n"
4997 "#ifdef SUM_SQUARE\n"
4998 "sumSQT temp = src[0] * src[0];\n"
4999 "accum_sq += temp;\n"
5000 "#endif\n"
5001 "}\n"
// The store is outside the bounds check, so out-of-range slots receive the
// last running value instead of being left unwritten.
5002 "lm_sum[lsum_index] = accum;\n"
5003 "#ifdef SUM_SQUARE\n"
5004 "lm_sum_sq[lsum_index] = accum_sq;\n"
5005 "#endif\n"
5006 "}\n"
5007 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Write-out: byte offset into buf is
// buf_step * (LOCAL_SUM_SIZE * gid) + sizeof(sumT) * (y + lid) + buf_offset,
// then advanced by buf_step for each yin.
5008 "int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset));\n"
5009 "#ifdef SUM_SQUARE\n"
5010 "int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset));\n"
5011 "#endif\n"
// Read the tile back with the index roles swapped (LOCAL_SUM_STRIDE * lid + yin),
// i.e. transposed relative to how it was stored above.
5012 "lsum_index = LOCAL_SUM_STRIDE * lid;\n"
5013 "#pragma unroll\n"
5014 "for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++)\n"
5015 "{\n"
// NOTE(review): these buf writes carry no bounds check in this kernel;
// presumably the host allocates buf padded to the tile size - confirm.
5016 "__global sumT *buf = (__global sumT *)(buf_ptr + buf_index);\n"
5017 "buf[0] = lm_sum[lsum_index];\n"
5018 "buf_index += buf_step;\n"
5019 "#ifdef SUM_SQUARE\n"
5020 "__global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index);\n"
5021 "bufsq[0] = lm_sum_sq[lsum_index];\n"
5022 "buf_sq_index += buf_sq_step;\n"
5023 "#endif\n"
5024 "}\n"
// Second barrier keeps the next iteration's stores from overwriting the tile
// while other work-items may still be reading it.
5025 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5026 "}\n"
5027 "}\n"
// integral_sum_rows: second pass. Zeroes the first row and the first element
// of the following rows of dst, then accumulates running sums over buf and
// writes them, transposed through local memory, into dst starting one row down.
5028 "kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset,\n"
5029 "#ifdef SUM_SQUARE\n"
5030 "__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset,\n"
5031 "#endif\n"
5032 "__global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols\n"
5033 "#ifdef SUM_SQUARE\n"
5034 ",__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset\n"
5035 "#endif\n"
5036 ")\n"
5037 "{\n"
5038 "__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];\n"
5039 "#ifdef SUM_SQUARE\n"
5040 "__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];\n"
5041 "#endif\n"
5042 "int lid = get_local_id(0);\n"
5043 "int gid = get_group_id(0);\n"
5044 "int gs = get_global_size(0);\n"
5045 "int x = get_global_id(0);\n"
// Zero the first dst row; each work-item handles elements x, x + gs, ...
5046 "__global sumT *dst = (__global sumT *)(dst_ptr + dst_offset);\n"
5047 "for (int xin = x; xin < cols; xin += gs)\n"
5048 "{\n"
5049 "dst[xin] = 0;\n"
5050 "}\n"
// From here on dst_offset refers to the second dst row.
5051 "dst_offset += dst_step;\n"
// Zero element 0 of row x + 1 (rows 1 .. rows - 1).
5052 "if (x < rows - 1)\n"
5053 "{\n"
5054 "dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset));\n"
5055 "dst[0] = 0;\n"
5056 "}\n"
5057 "int buf_index = mad24((int)sizeof(sumT), x, buf_offset);\n"
5058 "sumT accum = 0;\n"
5059 "#ifdef SUM_SQUARE\n"
// NOTE(review): the next line declares a sumSQT pointer but casts to sumT;
// every other dst_sq assignment in this kernel casts to sumSQT. When the two
// types differ this is an incompatible pointer conversion. Left unchanged here
// because it is part of the emitted kernel text; fix in the generator input.
5060 "__global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset);\n"
5061 "for (int xin = x; xin < cols; xin += gs)\n"
5062 "{\n"
5063 "dst_sq[xin] = 0;\n"
5064 "}\n"
5065 "dst_sq_offset += dst_sq_step;\n"
5066 "if (x < rows - 1)\n"
5067 "{\n"
5068 "dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset));\n"
5069 "dst_sq[0] = 0;\n"
5070 "}\n"
5071 "int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset);\n"
5072 "sumSQT accum_sq = 0;\n"
5073 "#endif\n"
// y starts at 1, so destination elements begin at index 1 of each row;
// index 0 was zeroed above.
5074 "for (int y = 1; y < cols; y += LOCAL_SUM_SIZE)\n"
5075 "{\n"
5076 "int lsum_index = lid;\n"
5077 "#pragma unroll\n"
5078 "for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE)\n"
5079 "{\n"
// buf_index is never reset, so the running sum continues across outer
// iterations, stepping by buf_step per element. Reads are not bounds-checked.
5080 "__global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index);\n"
5081 "accum += buf[0];\n"
5082 "lm_sum[lsum_index] = accum;\n"
5083 "buf_index += buf_step;\n"
5084 "#ifdef SUM_SQUARE\n"
5085 "__global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index);\n"
5086 "accum_sq += buf_sq[0];\n"
5087 "lm_sum_sq[lsum_index] = accum_sq;\n"
5088 "buf_sq_index += buf_sq_step;\n"
5089 "#endif\n"
5090 "}\n"
5091 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Only work-items whose destination element index y + lid is in range write.
5092 "if (y + lid < cols)\n"
5093 "{\n"
5094 "int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset));\n"
5095 "#ifdef SUM_SQUARE\n"
5096 "int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset));\n"
5097 "#endif\n"
5098 "lsum_index = LOCAL_SUM_STRIDE * lid;\n"
// Limit the number of rows written by this group to what remains below
// row LOCAL_SUM_SIZE * gid (at most LOCAL_SUM_SIZE).
5099 "int yin_max = min(rows - 1 -  LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE);\n"
5100 "#pragma unroll\n"
5101 "for (int yin = 0; yin < yin_max; yin++, lsum_index++)\n"
5102 "{\n"
5103 "dst = (__global sumT *)(dst_ptr + dst_index);\n"
5104 "dst[0] = lm_sum[lsum_index];\n"
5105 "dst_index += dst_step;\n"
5106 "#ifdef SUM_SQUARE\n"
5107 "dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index);\n"
5108 "dst_sq[0] = lm_sum_sq[lsum_index];\n"
5109 "dst_sq_index += dst_sq_step;\n"
5110 "#endif\n"
5111 "}\n"
5112 "}\n"
// The barrier sits outside the conditional, so every work-item reaches it.
5113 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5114 "}\n"
5115 "}\n"
5116 , "ce49fba6c7a369504177acc108203a38"};
// Global ProgramSource object constructed from the kernel text above.
5117 ProgramSource integral_sum_oclsrc(integral_sum.programStr);
// Embedded OpenCL program "laplacian5". The kernel text is stored as a run of
// adjacent string literals (one per source line), followed by a 32-hex-digit
// string (presumably a digest of the kernel text - confirm against the generator).
// The top of this file says it is auto-generated, so lasting changes presumably
// belong in the generator input rather than here.
// Depending on ONLY_SUM_CONVERT the program contains either the sumConvert
// kernel or the laplacian kernel. Types and conversion macros (srcT, dstT, WT,
// WT1, coeffT, convertToWT, convertToDT, ...) are not defined in this text and
// are expected from build options.
5118 const struct ProgramEntry laplacian5={"laplacian5",
// Empty macro; presumably lets a convertTo* build option expand to nothing
// when no conversion is needed - confirm with the host code.
5119 "#define noconvert\n"
5120 "#ifdef ONLY_SUM_CONVERT\n"
// sumConvert: per pixel, dst = convertToDT(scale * (src1 + src2) + delta),
// computed in the WT working type.
5121 "__kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1_offset,\n"
5122 "__global const uchar * src2ptr, int src2_step, int src2_offset,\n"
5123 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5124 "coeffT scale, coeffT delta)\n"
5125 "{\n"
5126 "int x = get_global_id(0);\n"
5127 "int y = get_global_id(1);\n"
5128 "if (y < dst_rows && x < dst_cols)\n"
5129 "{\n"
// Byte offsets: row * step + column * element size + base offset.
5130 "int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(srcT), src1_offset));\n"
5131 "int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(srcT), src2_offset));\n"
5132 "int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));\n"
5133 "__global const srcT * src1 = (__global const srcT *)(src1ptr + src1_index);\n"
5134 "__global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);\n"
5135 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n"
// wdepth <= 4 selects the mad24 form, otherwise mad is used (presumably
// integer vs. floating-point working depth - confirm the wdepth encoding).
5136 "#if wdepth <= 4\n"
5137 "dst[0] = convertToDT( mad24((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );\n"
5138 "#else\n"
5139 "dst[0] = convertToDT( mad((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );\n"
5140 "#endif\n"
5141 "}\n"
5142 "}\n"
5143 "#else\n"
// EXTRAPOLATE(x, maxV) rewrites coordinate x in place according to the border
// mode chosen at build time; exactly one BORDER_* macro must be defined.
// BORDER_CONSTANT: expands to nothing (out-of-range handling is done in ELEM).
5144 "#ifdef BORDER_CONSTANT\n"
5145 "#define EXTRAPOLATE(x, maxV)\n"
// BORDER_REPLICATE: clamp to [0, maxV - 1].
5146 "#elif defined BORDER_REPLICATE\n"
5147 "#define EXTRAPOLATE(x, maxV) \\\n"
5148 "{ \\\n"
5149 "(x) = clamp((x), 0, (maxV)-1); \\\n"
5150 "}\n"
// BORDER_WRAP: (x + maxV) % maxV.
5151 "#elif defined BORDER_WRAP\n"
5152 "#define EXTRAPOLATE(x, maxV) \\\n"
5153 "{ \\\n"
5154 "(x) = ( (x) + (maxV) ) % (maxV); \\\n"
5155 "}\n"
// BORDER_REFLECT: mirror with the edge sample repeated (-1 maps to 0).
5156 "#elif defined BORDER_REFLECT\n"
5157 "#define EXTRAPOLATE(x, maxV) \\\n"
5158 "{ \\\n"
5159 "(x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \\\n"
5160 "}\n"
// BORDER_REFLECT_101: mirror without repeating the edge sample (-1 maps to 1).
5161 "#elif defined BORDER_REFLECT_101\n"
5162 "#define EXTRAPOLATE(x, maxV) \\\n"
5163 "{ \\\n"
5164 "(x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \\\n"
5165 "}\n"
5166 "#else\n"
5167 "#error No extrapolation method\n"
5168 "#endif\n"
// Pixel access: direct pointer load/store unless CN is 3, in which case
// vload3 / vstore3 on the scalar element type are used.
5169 "#if CN != 3\n"
5170 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
5171 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
5172 "#define SRCSIZE (int)sizeof(srcT)\n"
5173 "#define DSTSIZE (int)sizeof(dstT)\n"
5174 "#else\n"
5175 "#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))\n"
5176 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
5177 "#define SRCSIZE (int)sizeof(srcT1)*3\n"
5178 "#define DSTSIZE (int)sizeof(dstT1)*3\n"
5179 "#endif\n"
// SRC reads pixel (_x, _y) relative to Src and converts it to WT.
5180 "#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))\n"
// ELEM: with BORDER_CONSTANT, coordinates outside [0, r_edge) x [0, t_edge)
// yield const_v; in the other modes the coordinates were already remapped by
// EXTRAPOLATE, so the pixel is read directly.
5181 "#ifdef BORDER_CONSTANT\n"
5182 "#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))\n"
5183 "#else\n"
5184 "#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))\n"
5185 "#endif\n"
// DIG(a) appends a comma; presumably KERNEL_MATRIX_X / KERNEL_MATRIX_Y are
// supplied by the build as DIG(...) sequences - confirm with the host code.
5186 "#define DIG(a) a,\n"
5187 "__constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X };\n"
5188 "__constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y };\n"
// laplacian: two coefficient vectors are each applied vertically and then
// horizontally in swapped order; the two results are summed, scaled, offset
// and stored. get_global_id(1) is not used: the kernel loops over all
// dst_rows in steps of BLK_Y, sliding a local tile down the image.
5189 "__kernel void laplacian(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,\n"
5190 "__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5191 "WT1 scale, WT1 delta)\n"
5192 "{\n"
// lsmem holds the source tile with a RADIUS-wide apron on every side;
// lsmemDy1 / lsmemDy2 hold the two vertical-pass results.
5193 "__local WT lsmem[BLK_Y + 2 * RADIUS][BLK_X + 2 * RADIUS];\n"
5194 "__local WT lsmemDy1[BLK_Y][BLK_X + 2 * RADIUS];\n"
5195 "__local WT lsmemDy2[BLK_Y][BLK_X + 2 * RADIUS];\n"
5196 "int lix = get_local_id(0);\n"
5197 "int liy = get_local_id(1);\n"
5198 "int x = get_global_id(0);\n"
5199 "int srcX = x + srcOffsetX - RADIUS;\n"
// Initial fill of the whole tile: each work-item loads the elements at
// (liy + k * BLK_Y, lix + m * BLK_X) that fall inside the tile.
5200 "int clocY = liy;\n"
5201 "do\n"
5202 "{\n"
5203 "int yb = clocY + srcOffsetY - RADIUS;\n"
5204 "EXTRAPOLATE(yb, (height));\n"
5205 "int clocX = lix;\n"
5206 "int cSrcX = srcX;\n"
5207 "do\n"
5208 "{\n"
5209 "int xb = cSrcX;\n"
5210 "EXTRAPOLATE(xb,(width));\n"
5211 "lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );\n"
5212 "clocX += BLK_X;\n"
5213 "cSrcX += BLK_X;\n"
5214 "}\n"
5215 "while(clocX < BLK_X+(RADIUS*2));\n"
5216 "clocY += BLK_Y;\n"
5217 "}\n"
5218 "while (clocY < BLK_Y+(RADIUS*2));\n"
5219 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5220 "WT scale_v = (WT)scale;\n"
5221 "WT delta_v = (WT)delta;\n"
5222 "for (int y = 0; y < dst_rows; y+=BLK_Y)\n"
5223 "{\n"
5224 "int i, clocX = lix;\n"
5225 "WT sum1 = (WT) 0;\n"
5226 "WT sum2 = (WT) 0;\n"
// Vertical pass over every tile column handled by this work-item, apron
// included: sum1 uses mat_kernelY, sum2 uses mat_kernelX.
5227 "do\n"
5228 "{\n"
5229 "sum1 = (WT) 0;\n"
5230 "sum2 = (WT) 0;\n"
5231 "for (i=0; i<=2*RADIUS; i++)\n"
5232 "{\n"
5233 "sum1 = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum1);\n"
5234 "sum2 = mad(lsmem[liy + i][clocX], mat_kernelX[i], sum2);\n"
5235 "}\n"
5236 "lsmemDy1[liy][clocX] = sum1;\n"
5237 "lsmemDy2[liy][clocX] = sum2;\n"
5238 "clocX += BLK_X;\n"
5239 "}\n"
5240 "while(clocX < BLK_X+(RADIUS*2));\n"
5241 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Horizontal pass and store, only for in-range output pixels. The kernels
// are swapped relative to the vertical pass (Dy1 with X, Dy2 with Y).
5242 "if ((x < dst_cols) && (y + liy < dst_rows))\n"
5243 "{\n"
5244 "sum1 = (WT) 0;\n"
5245 "sum2 = (WT) 0;\n"
5246 "for (i=0; i<=2*RADIUS; i++)\n"
5247 "{\n"
5248 "sum1 = mad(lsmemDy1[liy][lix+i], mat_kernelX[i], sum1);\n"
5249 "sum2 = mad(lsmemDy2[liy][lix+i], mat_kernelY[i], sum2);\n"
5250 "}\n"
5251 "WT sum = mad(scale_v, (sum1 + sum2), delta_v);\n"
5252 "storepix(convertToDT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));\n"
5253 "}\n"
// Slide the tile: copy rows [BLK_Y, BLK_Y + 2*RADIUS) to rows [0, 2*RADIUS).
// The work is spread over the group by a flattened index; this inner i
// shadows the outer loop counter i.
5254 "for (int i = liy * BLK_X + lix; i < (RADIUS*2) * (BLK_X+(RADIUS*2)); i += BLK_X * BLK_Y)\n"
5255 "{\n"
5256 "int clocX = i % (BLK_X+(RADIUS*2));\n"
5257 "int clocY = i / (BLK_X+(RADIUS*2));\n"
5258 "lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];\n"
5259 "}\n"
5260 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Load the next BLK_Y source rows into tile rows [2*RADIUS, 2*RADIUS + BLK_Y).
5261 "int yb = y + liy + BLK_Y + srcOffsetY + RADIUS;\n"
5262 "EXTRAPOLATE(yb, (height));\n"
5263 "clocX = lix;\n"
5264 "int cSrcX = x + srcOffsetX - RADIUS;\n"
5265 "do\n"
5266 "{\n"
5267 "int xb = cSrcX;\n"
5268 "EXTRAPOLATE(xb,(width));\n"
5269 "lsmem[liy + 2*RADIUS][clocX] = ELEM(xb, yb, (width), (height), 0 );\n"
5270 "clocX += BLK_X;\n"
5271 "cSrcX += BLK_X;\n"
5272 "}\n"
5273 "while(clocX < BLK_X+(RADIUS*2));\n"
5274 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5275 "}\n"
5276 "}\n"
5277 "#endif\n"
5278 , "3ce3fc1a1c2e6be3a8fd0d2f51afeaf1"};
// Global ProgramSource object constructed from the kernel text above.
5279 ProgramSource laplacian5_oclsrc(laplacian5.programStr);
5280 const struct ProgramEntry match_template={"match_template",
5281 "#if cn != 3\n"
5282 "#define loadpix(addr) *(__global const T *)(addr)\n"
5283 "#define TSIZE (int)sizeof(T)\n"
5284 "#else\n"
5285 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
5286 "#define TSIZE ((int)sizeof(T1)*3)\n"
5287 "#endif\n"
5288 "#define SQSUMS_PTR(ox, oy) mad24(y + oy, src_sqsums_step, mad24(x + ox, cn, src_sqsums_offset))\n"
5289 "#define SUMS_PTR(ox, oy) mad24(y + oy, src_sums_step, mad24(x + ox, cn, src_sums_offset))\n"
5290 "#define SUMS(ox, oy)    mad24(y+oy, src_sums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sums_offset))\n"
5291 "#define SQ_SUMS(ox, oy) mad24(y+oy, src_sqsums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sqsums_offset))\n"
5292 "inline float normAcc(float num, float denum)\n"
5293 "{\n"
5294 "if (fabs(num) < denum)\n"
5295 "return num / denum;\n"
5296 "if (fabs(num) < denum * 1.125f)\n"
5297 "return num > 0 ? 1 : -1;\n"
5298 "return 0;\n"
5299 "}\n"
5300 "inline float normAcc_SQDIFF(float num, float denum)\n"
5301 "{\n"
5302 "if (fabs(num) < denum)\n"
5303 "return num / denum;\n"
5304 "if (fabs(num) < denum * 1.125f)\n"
5305 "return num > 0 ? 1 : -1;\n"
5306 "return 1;\n"
5307 "}\n"
5308 "#define noconvert\n"
5309 "#if cn == 1\n"
5310 "#define convertToDT(value) (float)(value)\n"
5311 "#elif cn == 2\n"
5312 "#define convertToDT(value) (float)(value.x + value.y)\n"
5313 "#elif cn == 3\n"
5314 "#define convertToDT(value) (float)(value.x + value.y + value.z)\n"
5315 "#elif cn == 4\n"
5316 "#define convertToDT(value) (float)(value.x + value.y + value.z + value.w)\n"
5317 "#else\n"
5318 "#error \"cn should be 1-4\"\n"
5319 "#endif\n"
5320 "#ifdef CALC_SUM\n"
5321 "__kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset,\n"
5322 "int cols, int total, __global float * dst)\n"
5323 "{\n"
5324 "int lid = get_local_id(0), id = get_global_id(0);\n"
5325 "__local WT localmem[WGS2_ALIGNED];\n"
5326 "WT accumulator = (WT)(0), tmp;\n"
5327 "for ( ; id < total; id += WGS)\n"
5328 "{\n"
5329 "int src_index = mad24(id / cols, src_step, mad24(id % cols, TSIZE, src_offset));\n"
5330 "T src = loadpix(srcptr + src_index);\n"
5331 "tmp = convertToWT(src);\n"
5332 "accumulator = mad(tmp, tmp, accumulator);\n"
5333 "}\n"
5334 "if (lid < WGS2_ALIGNED)\n"
5335 "localmem[lid] = accumulator;\n"
5336 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5337 "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n"
5338 "localmem[lid - WGS2_ALIGNED] += accumulator;\n"
5339 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5340 "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n"
5341 "{\n"
5342 "if (lid < lsize)\n"
5343 "{\n"
5344 "int lid2 = lsize + lid;\n"
5345 "localmem[lid] += localmem[lid2];\n"
5346 "}\n"
5347 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5348 "}\n"
5349 "if (lid == 0)\n"
5350 "dst[0] = convertToDT(localmem[0]);\n"
5351 "}\n"
5352 "#elif defined FIRST_CHANNEL\n"
5353 "__kernel void extractFirstChannel( const __global uchar* img, int img_step, int img_offset,\n"
5354 "__global uchar* res, int res_step, int res_offset, int rows, int cols)\n"
5355 "{\n"
5356 "int x = get_global_id(0);\n"
5357 "int y = get_global_id(1)*PIX_PER_WI_Y;\n"
5358 "if(x < cols )\n"
5359 "{\n"
5360 "#pragma unroll\n"
5361 "for (int cy=0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y)\n"
5362 "{\n"
5363 "T1 image = *(__global const T1*)(img + mad24(y, img_step, mad24(x, (int)sizeof(T1)*cn, img_offset)));;\n"
5364 "int res_idx = mad24(y, res_step, mad24(x, (int)sizeof(float), res_offset));\n"
5365 "*(__global float *)(res + res_idx) = image;\n"
5366 "}\n"
5367 "}\n"
5368 "}\n"
5369 "#elif defined CCORR\n"
5370 "#if cn==1 && PIX_PER_WI_X==4\n"
5371 "__kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset,\n"
5372 "__global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,\n"
5373 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
5374 "{\n"
5375 "int x0 = get_global_id(0)*PIX_PER_WI_X;\n"
5376 "int y = get_global_id(1);\n"
5377 "if (y < dst_rows)\n"
5378 "{\n"
5379 "if (x0 + PIX_PER_WI_X <= dst_cols)\n"
5380 "{\n"
5381 "WT sum = (WT)(0);\n"
5382 "int ind = mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset));\n"
5383 "__global const T1 * template = (__global const T1*)(templateptr + template_offset);\n"
5384 "for (int i = 0; i < template_rows; ++i)\n"
5385 "{\n"
5386 "for (int j = 0; j < template_cols; ++j)\n"
5387 "{\n"
5388 "T temp = (T)(template[j]);\n"
5389 "T src = vload4(0, (__global const T1*)(srcptr + ind + j*(int)sizeof(T1)));\n"
5390 "sum = mad(convertToWT(src), convertToWT(temp), sum);\n"
5391 "}\n"
5392 "ind += src_step;\n"
5393 "template = (__global const T1 *)((__global const uchar *)template + template_step);\n"
5394 "}\n"
5395 "T temp = (T)(template[0]);\n"
5396 "int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset));\n"
5397 "*(__global float4 *)(dst + dst_idx) = convert_float4(sum);\n"
5398 "}\n"
5399 "else\n"
5400 "{\n"
5401 "WT1 sum [PIX_PER_WI_X];\n"
5402 "#pragma unroll\n"
5403 "for (int i=0; i < PIX_PER_WI_X; i++) sum[i] = 0;\n"
5404 "__global const T1 * src = (__global const T1 *)(srcptr + mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset)));\n"
5405 "__global const T1 * template = (__global const T1 *)(templateptr + template_offset);\n"
5406 "for (int i = 0; i < template_rows; ++i)\n"
5407 "{\n"
5408 "for (int j = 0; j < template_cols; ++j)\n"
5409 "{\n"
5410 "#pragma unroll\n"
5411 "for (int cx=0, x = x0; cx < PIX_PER_WI_X && x < dst_cols; ++cx, ++x)\n"
5412 "{\n"
5413 "sum[cx] = mad(convertToWT1(src[j+cx]), convertToWT1(template[j]), sum[cx]);\n"
5414 "}\n"
5415 "}\n"
5416 "src = (__global const T1 *)((__global const uchar *)src + src_step);\n"
5417 "template = (__global const T1 *)((__global const uchar *)template + template_step);\n"
5418 "}\n"
5419 "#pragma unroll\n"
5420 "for (int cx=0; cx < PIX_PER_WI_X && x0 < dst_cols; ++cx, ++x0)\n"
5421 "{\n"
5422 "int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset));\n"
5423 "*(__global float *)(dst + dst_idx) = convertToDT(sum[cx]);\n"
5424 "}\n"
5425 "}\n"
5426 "}\n"
5427 "}\n"
5428 "#else\n"
5429 "__kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset,\n"
5430 "__global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,\n"
5431 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
5432 "{\n"
5433 "int x = get_global_id(0);\n"
5434 "int y = get_global_id(1);\n"
5435 "if (x < dst_cols && y < dst_rows)\n"
5436 "{\n"
5437 "WT sum = (WT)(0);\n"
5438 "for (int i = 0; i < template_rows; ++i)\n"
5439 "{\n"
5440 "for (int j = 0; j < template_cols; ++j)\n"
5441 "{\n"
5442 "T src      = loadpix(srcptr      + mad24(y+i, src_step,    mad24(x+j, TSIZE, src_offset)));\n"
5443 "T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset)));\n"
5444 "sum = mad(convertToWT(src), convertToWT(template), sum);\n"
5445 "}\n"
5446 "}\n"
5447 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5448 "*(__global float *)(dst + dst_idx) = convertToDT(sum);\n"
5449 "}\n"
5450 "}\n"
5451 "#endif\n"
5452 "#elif defined CCORR_NORMED\n"
5453 "__kernel void matchTemplate_CCORR_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
5454 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5455 "int template_rows, int template_cols, __global const float * template_sqsum)\n"
5456 "{\n"
5457 "int x = get_global_id(0);\n"
5458 "int y = get_global_id(1);\n"
5459 "if (x < dst_cols && y < dst_rows)\n"
5460 "{\n"
5461 "__global const float * sqsum = (__global const float *)(src_sqsums);\n"
5462 "src_sqsums_step /= sizeof(float);\n"
5463 "src_sqsums_offset /= sizeof(float);\n"
5464 "float image_sqsum_ = (float)(sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)] -\n"
5465 "sqsum[SQSUMS_PTR(0, template_rows)] + sqsum[SQSUMS_PTR(0, 0)]);\n"
5466 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5467 "__global float * dstult = (__global float *)(dst + dst_idx);\n"
5468 "*dstult = normAcc(*dstult, sqrt(image_sqsum_ * template_sqsum[0]));\n"
5469 "}\n"
5470 "}\n"
5471 "#elif defined SQDIFF\n"
5472 "__kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_step, int src_offset,\n"
5473 "__global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,\n"
5474 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
5475 "{\n"
5476 "int x = get_global_id(0);\n"
5477 "int y = get_global_id(1);\n"
5478 "if (x < dst_cols && y < dst_rows)\n"
5479 "{\n"
5480 "WT sum = (WT)(0), value;\n"
5481 "for (int i = 0; i < template_rows; ++i)\n"
5482 "{\n"
5483 "for (int j = 0; j < template_cols; ++j)\n"
5484 "{\n"
5485 "T src      = loadpix(srcptr      + mad24(y+i, src_step,    mad24(x+j, TSIZE, src_offset)));\n"
5486 "T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset)));\n"
5487 "value = convertToWT(src) - convertToWT(template);\n"
5488 "sum = mad(value, value, sum);\n"
5489 "}\n"
5490 "}\n"
5491 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5492 "*(__global float *)(dst + dst_idx) = convertToDT(sum);\n"
5493 "}\n"
5494 "}\n"
5495 "#elif defined SQDIFF_PREPARED\n"
5496 "__kernel void matchTemplate_Prepared_SQDIFF(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
5497 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5498 "int template_rows, int template_cols, __global const float * template_sqsum)\n"
5499 "{\n"
5500 "int x = get_global_id(0);\n"
5501 "int y = get_global_id(1);\n"
5502 "if (x < dst_cols && y < dst_rows)\n"
5503 "{\n"
5504 "src_sqsums_step /= sizeof(float);\n"
5505 "src_sqsums_offset /= sizeof(float);\n"
5506 "__global const float * sqsum = (__global const float *)(src_sqsums);\n"
5507 "float image_sqsum_ = (float)(\n"
5508 "(sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) -\n"
5509 "(sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)]));\n"
5510 "float template_sqsum_value = template_sqsum[0];\n"
5511 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5512 "__global float * dstult = (__global float *)(dst + dst_idx);\n"
5513 "*dstult = image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value;\n"
5514 "}\n"
5515 "}\n"
5516 "#elif defined SQDIFF_NORMED\n"
5517 "__kernel void matchTemplate_SQDIFF_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
5518 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5519 "int template_rows, int template_cols, __global const float * template_sqsum)\n"
5520 "{\n"
5521 "int x = get_global_id(0);\n"
5522 "int y = get_global_id(1);\n"
5523 "if (x < dst_cols && y < dst_rows)\n"
5524 "{\n"
5525 "src_sqsums_step /= sizeof(float);\n"
5526 "src_sqsums_offset /= sizeof(float);\n"
5527 "__global const float * sqsum = (__global const float *)(src_sqsums);\n"
5528 "float image_sqsum_ = (float)(\n"
5529 "(sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) -\n"
5530 "(sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)]));\n"
5531 "float template_sqsum_value = template_sqsum[0];\n"
5532 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5533 "__global float * dstult = (__global float *)(dst + dst_idx);\n"
5534 "*dstult = normAcc_SQDIFF(image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value, sqrt(image_sqsum_ * template_sqsum_value));\n"
5535 "}\n"
5536 "}\n"
5537 "#elif defined CCOEFF\n"
5538 "#if cn == 1\n"
5539 "__kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
5540 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5541 "int template_rows, int template_cols, float template_sum)\n"
5542 "{\n"
5543 "int x = get_global_id(0);\n"
5544 "int y = get_global_id(1);\n"
5545 "if (x < dst_cols && y < dst_rows)\n"
5546 "{\n"
5547 "__global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset)));\n"
5548 "int step = src_sums_step/(int)sizeof(T);\n"
5549 "T image_sum = (T)(0), value;\n"
5550 "value = (T)(sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]);\n"
5551 "image_sum = mad(value, template_sum , image_sum);\n"
5552 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5553 "*(__global float *)(dst + dst_idx) -= convertToDT(image_sum);\n"
5554 "}\n"
5555 "}\n"
5556 "#elif cn==3\n"
5557 "__kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
5558 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5559 "int template_rows, int template_cols, float4 template_sum)\n"
5560 "{\n"
5561 "int x = get_global_id(0);\n"
5562 "int y = get_global_id(1);\n"
5563 "if (x < dst_cols && y < dst_rows)\n"
5564 "{\n"
5565 "T image_sum = (T)(0), value, temp_sum;\n"
5566 "temp_sum.x = template_sum.x;\n"
5567 "temp_sum.y = template_sum.y;\n"
5568 "temp_sum.z = template_sum.z;\n"
5569 "value  = vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, template_rows)));\n"
5570 "value -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, template_rows)));\n"
5571 "value -= vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, 0)));\n"
5572 "value += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0)));\n"
5573 "image_sum = mad(value, temp_sum , 0);\n"
5574 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5575 "*(__global float *)(dst + dst_idx) -= convertToDT(image_sum);\n"
5576 "}\n"
5577 "}\n"
5578 "#elif (cn==2 || cn==4)\n"
5579 "__kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
5580 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5581 "int template_rows, int template_cols, float4 template_sum)\n"
5582 "{\n"
5583 "int x = get_global_id(0);\n"
5584 "int y = get_global_id(1);\n"
5585 "if (x < dst_cols && y < dst_rows)\n"
5586 "{\n"
5587 "__global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset)));\n"
5588 "int step = src_sums_step/(int)sizeof(T);\n"
5589 "T image_sum = (T)(0), value, temp_sum;\n"
5590 "#if cn==2\n"
5591 "temp_sum.x = template_sum.x;\n"
5592 "temp_sum.y = template_sum.y;\n"
5593 "#else\n"
5594 "temp_sum = template_sum;\n"
5595 "#endif\n"
5596 "value = (sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]);\n"
5597 "image_sum = mad(value, temp_sum , image_sum);\n"
5598 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5599 "*(__global float *)(dst + dst_idx) -= convertToDT(image_sum);\n"
5600 "}\n"
5601 "}\n"
5602 "#else\n"
5603 "#error \"cn should be 1-4\"\n"
5604 "#endif\n"
5605 "#elif defined CCOEFF_NORMED\n"
5606 "#if cn == 1\n"
5607 "__kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
5608 "__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
5609 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5610 "int t_rows, int t_cols, float weight, float template_sum, float template_sqsum)\n"
5611 "{\n"
5612 "int x = get_global_id(0);\n"
5613 "int y = get_global_id(1);\n"
5614 "float sum_[2];\n"
5615 "float sqsum_[2];\n"
5616 "if (x < dst_cols && y < dst_rows)\n"
5617 "{\n"
5618 "int step = src_sums_step/(int)sizeof(T);\n"
5619 "__global const T* sum   = (__global const T*)(src_sums + mad24(y, src_sums_step,     mad24(x, (int)sizeof(T), src_sums_offset)));\n"
5620 "__global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset)));\n"
5621 "T value_sum   = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0];\n"
5622 "T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0];\n"
5623 "float num = convertToDT(mad(value_sum, template_sum, 0));\n"
5624 "value_sqsum -= weight * value_sum * value_sum;\n"
5625 "float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));\n"
5626 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5627 "__global float * dstult = (__global float *)(dst+dst_idx);\n"
5628 "*dstult = normAcc((*dstult) - num, denum);\n"
5629 "}\n"
5630 "}\n"
5631 "#elif cn==3\n"
5632 "__kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
5633 "__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
5634 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5635 "int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum)\n"
5636 "{\n"
5637 "int x = get_global_id(0);\n"
5638 "int y = get_global_id(1);\n"
5639 "if (x < dst_cols && y < dst_rows)\n"
5640 "{\n"
5641 "int step = src_sums_step/(int)sizeof(T);\n"
5642 "T temp_sum, value_sum, value_sqsum;\n"
5643 "temp_sum.x = template_sum.x;\n"
5644 "temp_sum.y = template_sum.y;\n"
5645 "temp_sum.z = template_sum.z;\n"
5646 "value_sum  = vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, t_rows)));\n"
5647 "value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, t_rows)));\n"
5648 "value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, 0)));\n"
5649 "value_sum += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0)));\n"
5650 "value_sqsum  = vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, t_rows)));\n"
5651 "value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, t_rows)));\n"
5652 "value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, 0)));\n"
5653 "value_sqsum += vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, 0)));\n"
5654 "float num = convertToDT(mad(value_sum, temp_sum, 0));\n"
5655 "value_sqsum -= weight * value_sum * value_sum;\n"
5656 "float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));\n"
5657 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5658 "__global float * dstult = (__global float *)(dst+dst_idx);\n"
5659 "*dstult = normAcc((*dstult) - num, denum);\n"
5660 "}\n"
5661 "}\n"
5662 "#elif (cn==2 || cn==4)\n"
5663 "__kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
5664 "__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
5665 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
5666 "int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum)\n"
5667 "{\n"
5668 "int x = get_global_id(0);\n"
5669 "int y = get_global_id(1);\n"
5670 "if (x < dst_cols && y < dst_rows)\n"
5671 "{\n"
5672 "int step = src_sums_step/(int)sizeof(T);\n"
5673 "T temp_sum;\n"
5674 "__global const T* sum   = (__global const T*)(src_sums + mad24(y, src_sums_step,     mad24(x, (int)sizeof(T), src_sums_offset)));\n"
5675 "__global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset)));\n"
5676 "T value_sum   = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0];\n"
5677 "T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0];\n"
5678 "#if cn==2\n"
5679 "temp_sum.x = template_sum.x;\n"
5680 "temp_sum.y = template_sum.y;\n"
5681 "#else\n"
5682 "temp_sum = template_sum;\n"
5683 "#endif\n"
5684 "float num = convertToDT(mad(value_sum, temp_sum, 0));\n"
5685 "value_sqsum -= weight * value_sum * value_sum;\n"
5686 "float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));\n"
5687 "int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
5688 "__global float * dstult = (__global float *)(dst+dst_idx);\n"
5689 "*dstult = normAcc((*dstult) - num, denum);\n"
5690 "}\n"
5691 "}\n"
5692 "#else\n"
5693 "#error \"cn should be 1-4\"\n"
5694 "#endif\n"
5695 "#endif\n"
5696 , "b3c29b8efeb2ed66a052794cb7d162cb"};
// Constructs the ProgramSource object for the match_template entry from its
// programStr member.
5697 ProgramSource match_template_oclsrc(match_template.programStr);
// OpenCL program "medianFilter": 3x3 and 5x5 median filter kernels.
// The initializer holds the program name, the kernel source text (adjacent
// string literals, concatenated by the compiler) and a trailing 32-hex-digit
// string (presumably a digest of the source text -- TODO confirm against the
// generator).
// Names the source uses but does not define, so they must be supplied as build
// options: T, T1, cn, and (only on the USE_4OPT path) T4 and USE_4OPT itself.
5698 const struct ProgramEntry medianFilter={"medianFilter",
// Pixel access helpers: for cn == 3 a pixel is moved with vload3/vstore3 over
// T1 elements; for every other cn it is read/written as a single T.
5699 "#if cn != 3\n"
5700 "#define loadpix(addr) *(__global const T *)(addr)\n"
5701 "#define storepix(val, addr)  *(__global T *)(addr) = val\n"
5702 "#define TSIZE (int)sizeof(T)\n"
5703 "#else\n"
5704 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
5705 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
5706 "#define TSIZE (int)sizeof(T1) * cn\n"
5707 "#endif\n"
// OP(a,b): compare-exchange step -- afterwards a holds the minimum and b the
// maximum. It relies on a variable named 'mid' declared by the caller.
5708 "#define OP(a,b) {    mid=a; a=min(a,b); b=max(mid,b);}\n"
// USE_4OPT path. LOAD4/STORE4 move four horizontally adjacent pixels in one
// vector access (vload4/8/16 over T1 for cn = 1/2/4; there is no cn == 3
// variant). SHUFFLE4_3 / SHUFFLE4_5 combine the single edge pixels with the
// central 4-pixel vector to form the shifted neighbour vectors.
5709 "#ifdef USE_4OPT\n"
5710 "#if cn == 1\n"
5711 "#define LOAD4(val, offs) (val) = vload4(0, (__global T1 *)(srcptr + src_index + (offs)))\n"
5712 "#define STORE4(val, offs) vstore4((val), 0, (__global T1 *)(dstptr + (offs)))\n"
5713 "#define SHUFFLE4_3(src0, src1, src2, dst0, dst1, dst2) { dst1 = src1; \\\n"
5714 "dst0 = (T4)(src0, dst1.xyz); \\\n"
5715 "dst2 = (T4)(dst1.yzw, src2); }\n"
5716 "#define SHUFFLE4_5(src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, dst4) { dst2 = src2; \\\n"
5717 "dst0 = (T4)(src0, src1, dst2.xy); \\\n"
5718 "dst1 = (T4)(src1, dst2.xyz); \\\n"
5719 "dst3 = (T4)(dst2.yzw, src3); \\\n"
5720 "dst4 = (T4)(dst2.zw, src3, src4); }\n"
5721 "#elif cn == 2\n"
5722 "#define LOAD4(val, offs) (val) = vload8(0, (__global T1 *)(srcptr + src_index + (offs)))\n"
5723 "#define STORE4(val, offs) vstore8((val), 0, (__global T1 *)(dstptr + (offs)))\n"
5724 "#define SHUFFLE4_3(src0, src1, src2, dst0, dst1, dst2) { dst1 = src1; \\\n"
5725 "dst0 = (T4)(src0, dst1.s012345); \\\n"
5726 "dst2 = (T4)(dst1.s234567, src2); }\n"
5727 "#define SHUFFLE4_5(src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, dst4) { dst2 = src2; \\\n"
5728 "dst0 = (T4)(src0, src1, dst2.s0123); \\\n"
5729 "dst1 = (T4)(src1, dst2.s012345); \\\n"
5730 "dst3 = (T4)(dst2.s234567, src3); \\\n"
5731 "dst4 = (T4)(dst2.s4567, src3, src4); }\n"
5732 "#elif cn == 4\n"
5733 "#define LOAD4(val, offs) (val) = vload16(0, (__global T1 *)(srcptr + src_index + (offs)))\n"
5734 "#define STORE4(val, offs) vstore16((val), 0, (__global T1 *)(dstptr + (offs)))\n"
5735 "#define SHUFFLE4_3(src0, src1, src2, dst0, dst1, dst2) { dst1 = src1; \\\n"
5736 "dst0 = (T4)(src0, dst1.s0123456789ab ); \\\n"
5737 "dst2 = (T4)(dst1.s456789abcdef, src2); }\n"
5738 "#define SHUFFLE4_5(src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, dst4) { dst2 = src2; \\\n"
5739 "dst0 = (T4)(src0, src1, dst2.s01234567); \\\n"
5740 "dst1 = (T4)(src1, dst2.s0123456789ab); \\\n"
5741 "dst3 = (T4)(dst2.s456789abcdef, src3); \\\n"
5742 "dst4 = (T4)(dst2.s89abcdef, src3, src4); }\n"
5743 "#endif\n"
// medianFilter3_u: 3x3 median; each work-item starts at (gx, gy) =
// (4 * global id 0, 4 * global id 1) and emits four output rows of four pixels
// (ITER3(0..3)). Three input rows are kept in c0..c8 and slid down by one row
// per iteration. Row indices and the single left/right pixels are clamped with
// max/min; the 4-wide LOAD4/STORE4 at gx are not clamped.
// NOTE(review): presumably the host only picks this kernel when cols is a
// multiple of 4 -- confirm against the caller.
5744 "__kernel void medianFilter3_u(__global const uchar* srcptr, int srcStep, int srcOffset,\n"
5745 "__global uchar*       dstptr, int dstStep, int dstOffset,\n"
5746 "int rows, int cols)\n"
5747 "{\n"
5748 "int gx= get_global_id(0) << 2;\n"
5749 "int gy= get_global_id(1) << 2;\n"
5750 "if( gy >= rows || gx >= cols)\n"
5751 "return;\n"
5752 "T c0; T4 c1; T c2;\n"
5753 "T c3; T4 c4; T c5;\n"
5754 "T c6; T4 c7; T c8;\n"
5755 "int x_left     = mad24(max(gx-1, 0), TSIZE, srcOffset);\n"
5756 "int x_central  = mad24(gx, TSIZE, srcOffset);\n"
5757 "int x_right    = mad24(min(gx+4, cols-1), TSIZE, srcOffset);\n"
5758 "int xdst = mad24(gx, TSIZE, dstOffset);\n"
5759 "int src_index = max(gy-1, 0)*srcStep;\n"
5760 "c0 = *(__global T *)(srcptr + src_index + x_left);\n"
5761 "LOAD4(c1, x_central);\n"
5762 "c2 = *(__global T *)(srcptr + src_index + x_right);\n"
5763 "src_index = gy*srcStep;\n"
5764 "c3 = *(__global T *)(srcptr + src_index + x_left);\n"
5765 "LOAD4(c4, x_central);\n"
5766 "c5 = *(__global T *)(srcptr + src_index + x_right);\n"
// ITER3(k): load the row below (clamped to rows-1), run the exchange network
// on the nine neighbour vectors, store p4 to output row gy+k, then shift the
// row window down.
5767 "#define ITER3(k) { \\\n"
5768 "src_index = min(gy+k+1, rows-1)*srcStep; \\\n"
5769 "c6 = *(__global T *)(srcptr + src_index + x_left); \\\n"
5770 "LOAD4(c7, x_central); \\\n"
5771 "c8 = *(__global T *)(srcptr + src_index + x_right); \\\n"
5772 "T4 p0, p1, p2, p3, p4, p5, p6, p7, p8; \\\n"
5773 "SHUFFLE4_3(c0, c1, c2, p0, p1, p2); \\\n"
5774 "SHUFFLE4_3(c3, c4, c5, p3, p4, p5); \\\n"
5775 "SHUFFLE4_3(c6, c7, c8, p6, p7, p8); \\\n"
5776 "T4 mid; \\\n"
5777 "OP(p1, p2); OP(p4, p5); OP(p7, p8); OP(p0, p1); \\\n"
5778 "OP(p3, p4); OP(p6, p7); OP(p1, p2); OP(p4, p5); \\\n"
5779 "OP(p7, p8); OP(p0, p3); OP(p5, p8); OP(p4, p7); \\\n"
5780 "OP(p3, p6); OP(p1, p4); OP(p2, p5); OP(p4, p7); \\\n"
5781 "OP(p4, p2); OP(p6, p4); OP(p4, p2); \\\n"
5782 "int dst_index = mad24( gy+k, dstStep, xdst); \\\n"
5783 "STORE4(p4, dst_index); \\\n"
5784 "c0 = c3; c1 = c4; c2 = c5; \\\n"
5785 "c3 = c6; c4 = c7; c5 = c8; \\\n"
5786 "}\n"
5787 "ITER3(0);\n"
5788 "ITER3(1);\n"
5789 "ITER3(2);\n"
5790 "ITER3(3);\n"
5791 "}\n"
// medianFilter5_u: 5x5 counterpart of medianFilter3_u. Five input rows are
// kept in c0..c24; four are preloaded and the fifth is loaded inside the
// k-loop, which stores p12 to output row gy+k.
// NOTE(review): the preload of row gy+1 is not clamped to rows-1 (the rows
// above use max(..., 0) and the loop uses min(..., rows-1)), and the output row
// gy+k is not checked against rows; presumably safe only when rows is a
// multiple of 4 -- confirm against the host-side dispatch.
5792 "__kernel void medianFilter5_u(__global const uchar* srcptr, int srcStep, int srcOffset,\n"
5793 "__global uchar*       dstptr, int dstStep, int dstOffset,\n"
5794 "int rows, int cols)\n"
5795 "{\n"
5796 "int gx= get_global_id(0) << 2;\n"
5797 "int gy= get_global_id(1) << 2;\n"
5798 "if( gy >= rows || gx >= cols)\n"
5799 "return;\n"
5800 "T  c0; T  c1; T4  c2; T  c3; T  c4;\n"
5801 "T  c5; T  c6; T4  c7; T  c8; T  c9;\n"
5802 "T c10; T c11; T4 c12; T c13; T c14;\n"
5803 "T c15; T c16; T4 c17; T c18; T c19;\n"
5804 "T c20; T c21; T4 c22; T c23; T c24;\n"
5805 "int x_leftmost = mad24(max(gx-2, 0), TSIZE, srcOffset);\n"
5806 "int x_left     = mad24(max(gx-1, 0), TSIZE, srcOffset);\n"
5807 "int x_central  = mad24(gx, TSIZE, srcOffset);\n"
5808 "int x_right    = mad24(min(gx+4, cols-1), TSIZE, srcOffset);\n"
5809 "int x_rightmost= mad24(min(gx+5, cols-1), TSIZE, srcOffset);\n"
5810 "int xdst = mad24(gx, TSIZE, dstOffset);\n"
5811 "int src_index = max(gy-2, 0)*srcStep;\n"
5812 "c0 = *(__global T *)(srcptr + src_index + x_leftmost);\n"
5813 "c1 = *(__global T *)(srcptr + src_index + x_left);\n"
5814 "LOAD4(c2, x_central);\n"
5815 "c3 = *(__global T *)(srcptr + src_index + x_right);\n"
5816 "c4 = *(__global T *)(srcptr + src_index + x_rightmost);\n"
5817 "src_index = max(gy-1, 0)*srcStep;\n"
5818 "c5 = *(__global T *)(srcptr + src_index + x_leftmost);\n"
5819 "c6 = *(__global T *)(srcptr + src_index + x_left);\n"
5820 "LOAD4(c7, x_central);\n"
5821 "c8 = *(__global T *)(srcptr + src_index + x_right);\n"
5822 "c9 = *(__global T *)(srcptr + src_index + x_rightmost);\n"
5823 "src_index = gy*srcStep;\n"
5824 "c10 = *(__global T *)(srcptr + src_index + x_leftmost);\n"
5825 "c11 = *(__global T *)(srcptr + src_index + x_left);\n"
5826 "LOAD4(c12, x_central);\n"
5827 "c13 = *(__global T *)(srcptr + src_index + x_right);\n"
5828 "c14 = *(__global T *)(srcptr + src_index + x_rightmost);\n"
5829 "src_index = (gy+1)*srcStep;\n"
5830 "c15 = *(__global T *)(srcptr + src_index + x_leftmost);\n"
5831 "c16 = *(__global T *)(srcptr + src_index + x_left);\n"
5832 "LOAD4(c17, x_central);\n"
5833 "c18 = *(__global T *)(srcptr + src_index + x_right);\n"
5834 "c19 = *(__global T *)(srcptr + src_index + x_rightmost);\n"
5835 "for(int k = 0; k < 4; k++)\n"
5836 "{\n"
5837 "src_index = min(gy+k+2, rows-1) * srcStep;\n"
5838 "c20 = *(__global T *)(srcptr + src_index + x_leftmost);\n"
5839 "c21 = *(__global T *)(srcptr + src_index + x_left);\n"
5840 "LOAD4(c22, x_central);\n"
5841 "c23 = *(__global T *)(srcptr + src_index + x_right);\n"
5842 "c24 = *(__global T *)(srcptr + src_index + x_rightmost);\n"
5843 "T4 p0,  p1,  p2,  p3,  p4,\n"
5844 "p5,  p6,  p7,  p8,  p9,\n"
5845 "p10, p11, p12, p13, p14,\n"
5846 "p15, p16, p17, p18, p19,\n"
5847 "p20, p21, p22, p23, p24;\n"
5848 "SHUFFLE4_5(c0, c1, c2, c3, c4, p0, p1, p2, p3, p4);\n"
5849 "SHUFFLE4_5(c5, c6, c7, c8, c9, p5, p6, p7, p8, p9);\n"
5850 "SHUFFLE4_5(c10, c11, c12, c13, c14, p10, p11, p12, p13, p14);\n"
5851 "SHUFFLE4_5(c15, c16, c17, c18, c19, p15, p16, p17, p18, p19);\n"
5852 "SHUFFLE4_5(c20, c21, c22, c23, c24, p20, p21, p22, p23, p24);\n"
5853 "T4 mid;\n"
5854 "OP(p1, p2); OP(p0, p1); OP(p1, p2); OP(p4, p5); OP(p3, p4);\n"
5855 "OP(p4, p5); OP(p0, p3); OP(p2, p5); OP(p2, p3); OP(p1, p4);\n"
5856 "OP(p1, p2); OP(p3, p4); OP(p7, p8); OP(p6, p7); OP(p7, p8);\n"
5857 "OP(p10, p11); OP(p9, p10); OP(p10, p11); OP(p6, p9); OP(p8, p11);\n"
5858 "OP(p8, p9); OP(p7, p10); OP(p7, p8); OP(p9, p10); OP(p0, p6);\n"
5859 "OP(p4, p10); OP(p4, p6); OP(p2, p8); OP(p2, p4); OP(p6, p8);\n"
5860 "OP(p1, p7); OP(p5, p11); OP(p5, p7); OP(p3, p9); OP(p3, p5);\n"
5861 "OP(p7, p9); OP(p1, p2); OP(p3, p4); OP(p5, p6); OP(p7, p8);\n"
5862 "OP(p9, p10); OP(p13, p14); OP(p12, p13); OP(p13, p14); OP(p16, p17);\n"
5863 "OP(p15, p16); OP(p16, p17); OP(p12, p15); OP(p14, p17); OP(p14, p15);\n"
5864 "OP(p13, p16); OP(p13, p14); OP(p15, p16); OP(p19, p20); OP(p18, p19);\n"
5865 "OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p21, p23); OP(p22, p24);\n"
5866 "OP(p22, p23); OP(p18, p21); OP(p20, p23); OP(p20, p21); OP(p19, p22);\n"
5867 "OP(p22, p24); OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p12, p18);\n"
5868 "OP(p16, p22); OP(p16, p18); OP(p14, p20); OP(p20, p24); OP(p14, p16);\n"
5869 "OP(p18, p20); OP(p22, p24); OP(p13, p19); OP(p17, p23); OP(p17, p19);\n"
5870 "OP(p15, p21); OP(p15, p17); OP(p19, p21); OP(p13, p14); OP(p15, p16);\n"
5871 "OP(p17, p18); OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p0, p12);\n"
5872 "OP(p8, p20);  OP(p8, p12); OP(p4, p16); OP(p16, p24); OP(p12, p16);\n"
5873 "OP(p2, p14);  OP(p10, p22); OP(p10, p14); OP(p6, p18); OP(p6, p10);\n"
5874 "OP(p10, p12); OP(p1, p13); OP(p9, p21); OP(p9, p13); OP(p5, p17);\n"
5875 "OP(p13, p17); OP(p3, p15); OP(p11, p23); OP(p11, p15); OP(p7, p19);\n"
5876 "OP(p7, p11);  OP(p11, p13); OP(p11, p12);\n"
5877 "int dst_index = mad24( gy+k, dstStep, xdst);\n"
5878 "STORE4(p12, dst_index);\n"
5879 "c0=c5;   c1=c6;   c2=c7;   c3=c8;   c4=c9;\n"
5880 "c5=c10;  c6=c11;  c7=c12;  c8=c13;  c9=c14;\n"
5881 "c10=c15; c11=c16; c12=c17; c13=c18; c14=c19;\n"
5882 "c15=c20; c16=c21; c17=c22; c18=c23; c19=c24;\n"
5883 "}\n"
5884 "}\n"
5885 "#endif\n"
// medianFilter3: 3x3 median, one output pixel per work-item. Each work-item
// copies two pixels (rows dr and dr+9 of an 18x18 __local tile) with source
// coordinates clamped to [0, dst_cols-1] x [0, dst_rows-1]; after the barrier
// it reads its 3x3 window from the tile, runs the exchange network and stores
// p4 if (gx, gy) is inside the destination.
// NOTE(review): the 18x18 tile and the x*16+y linear id suggest a 16x16
// work-group -- confirm against the host launch parameters.
5886 "__kernel void medianFilter3(__global const uchar * srcptr, int src_step, int src_offset,\n"
5887 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
5888 "{\n"
5889 "__local T data[18][18];\n"
5890 "int x = get_local_id(0);\n"
5891 "int y = get_local_id(1);\n"
5892 "int gx = get_global_id(0);\n"
5893 "int gy = get_global_id(1);\n"
5894 "int dx = gx - x - 1;\n"
5895 "int dy = gy - y - 1;\n"
5896 "int id = min(mad24(x, 16, y), 9*18-1);\n"
5897 "int dr = id / 18;\n"
5898 "int dc = id % 18;\n"
5899 "int c = clamp(dx + dc, 0, dst_cols - 1);\n"
5900 "int r = clamp(dy + dr, 0, dst_rows - 1);\n"
5901 "int index1 = mad24(r, src_step, mad24(c, TSIZE, src_offset));\n"
5902 "r = clamp(dy + dr + 9, 0, dst_rows - 1);\n"
5903 "int index9 = mad24(r, src_step, mad24(c, TSIZE, src_offset));\n"
5904 "data[dr][dc] = loadpix(srcptr + index1);\n"
5905 "data[dr+9][dc] = loadpix(srcptr + index9);\n"
5906 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5907 "T p0 = data[y][x], p1 = data[y][(x+1)], p2 = data[y][(x+2)];\n"
5908 "T p3 = data[y+1][x], p4 = data[y+1][(x+1)], p5 = data[y+1][(x+2)];\n"
5909 "T p6 = data[y+2][x], p7 = data[y+2][(x+1)], p8 = data[y+2][(x+2)];\n"
5910 "T mid;\n"
5911 "OP(p1, p2); OP(p4, p5); OP(p7, p8); OP(p0, p1);\n"
5912 "OP(p3, p4); OP(p6, p7); OP(p1, p2); OP(p4, p5);\n"
5913 "OP(p7, p8); OP(p0, p3); OP(p5, p8); OP(p4, p7);\n"
5914 "OP(p3, p6); OP(p1, p4); OP(p2, p5); OP(p4, p7);\n"
5915 "OP(p4, p2); OP(p6, p4); OP(p4, p2);\n"
5916 "int dst_index = mad24( gy, dst_step, mad24(gx, TSIZE, dst_offset));\n"
5917 "if (gy < dst_rows && gx < dst_cols)\n"
5918 "storepix(p4, dstptr + dst_index);\n"
5919 "}\n"
// medianFilter5: 5x5 median, one output pixel per work-item. Same scheme as
// medianFilter3 with a 20x20 __local tile (each work-item fills rows dr and
// dr+10) and a 25-element exchange network whose result is p12.
5920 "__kernel void medianFilter5(__global const uchar * srcptr, int src_step, int src_offset,\n"
5921 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
5922 "{\n"
5923 "__local T data[20][20];\n"
5924 "int x = get_local_id(0);\n"
5925 "int y = get_local_id(1);\n"
5926 "int gx = get_global_id(0);\n"
5927 "int gy = get_global_id(1);\n"
5928 "int dx = gx - x - 2;\n"
5929 "int dy = gy - y - 2;\n"
5930 "int id = min(mad24(x, 16, y), 10*20-1);\n"
5931 "int dr = id / 20;\n"
5932 "int dc = id % 20;\n"
5933 "int c = clamp(dx + dc, 0, dst_cols - 1);\n"
5934 "int r = clamp(dy + dr, 0, dst_rows - 1);\n"
5935 "int index1 = mad24(r, src_step, mad24(c, TSIZE, src_offset));\n"
5936 "r = clamp(dy + dr + 10, 0, dst_rows - 1);\n"
5937 "int index10 = mad24(r, src_step, mad24(c, TSIZE, src_offset));\n"
5938 "data[dr][dc] = loadpix(srcptr + index1);\n"
5939 "data[dr+10][dc] = loadpix(srcptr + index10);\n"
5940 "barrier(CLK_LOCAL_MEM_FENCE);\n"
5941 "T p0 = data[y][x], p1 = data[y][x+1], p2 = data[y][x+2], p3 = data[y][x+3], p4 = data[y][x+4];\n"
5942 "T p5 = data[y+1][x], p6 = data[y+1][x+1], p7 = data[y+1][x+2], p8 = data[y+1][x+3], p9 = data[y+1][x+4];\n"
5943 "T p10 = data[y+2][x], p11 = data[y+2][x+1], p12 = data[y+2][x+2], p13 = data[y+2][x+3], p14 = data[y+2][x+4];\n"
5944 "T p15 = data[y+3][x], p16 = data[y+3][x+1], p17 = data[y+3][x+2], p18 = data[y+3][x+3], p19 = data[y+3][x+4];\n"
5945 "T p20 = data[y+4][x], p21 = data[y+4][x+1], p22 = data[y+4][x+2], p23 = data[y+4][x+3], p24 = data[y+4][x+4];\n"
5946 "T mid;\n"
5947 "OP(p1, p2); OP(p0, p1); OP(p1, p2); OP(p4, p5); OP(p3, p4);\n"
5948 "OP(p4, p5); OP(p0, p3); OP(p2, p5); OP(p2, p3); OP(p1, p4);\n"
5949 "OP(p1, p2); OP(p3, p4); OP(p7, p8); OP(p6, p7); OP(p7, p8);\n"
5950 "OP(p10, p11); OP(p9, p10); OP(p10, p11); OP(p6, p9); OP(p8, p11);\n"
5951 "OP(p8, p9); OP(p7, p10); OP(p7, p8); OP(p9, p10); OP(p0, p6);\n"
5952 "OP(p4, p10); OP(p4, p6); OP(p2, p8); OP(p2, p4); OP(p6, p8);\n"
5953 "OP(p1, p7); OP(p5, p11); OP(p5, p7); OP(p3, p9); OP(p3, p5);\n"
5954 "OP(p7, p9); OP(p1, p2); OP(p3, p4); OP(p5, p6); OP(p7, p8);\n"
5955 "OP(p9, p10); OP(p13, p14); OP(p12, p13); OP(p13, p14); OP(p16, p17);\n"
5956 "OP(p15, p16); OP(p16, p17); OP(p12, p15); OP(p14, p17); OP(p14, p15);\n"
5957 "OP(p13, p16); OP(p13, p14); OP(p15, p16); OP(p19, p20); OP(p18, p19);\n"
5958 "OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p21, p23); OP(p22, p24);\n"
5959 "OP(p22, p23); OP(p18, p21); OP(p20, p23); OP(p20, p21); OP(p19, p22);\n"
5960 "OP(p22, p24); OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p12, p18);\n"
5961 "OP(p16, p22); OP(p16, p18); OP(p14, p20); OP(p20, p24); OP(p14, p16);\n"
5962 "OP(p18, p20); OP(p22, p24); OP(p13, p19); OP(p17, p23); OP(p17, p19);\n"
5963 "OP(p15, p21); OP(p15, p17); OP(p19, p21); OP(p13, p14); OP(p15, p16);\n"
5964 "OP(p17, p18); OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p0, p12);\n"
5965 "OP(p8, p20); OP(p8, p12); OP(p4, p16); OP(p16, p24); OP(p12, p16);\n"
5966 "OP(p2, p14); OP(p10, p22); OP(p10, p14); OP(p6, p18); OP(p6, p10);\n"
5967 "OP(p10, p12); OP(p1, p13); OP(p9, p21); OP(p9, p13); OP(p5, p17);\n"
5968 "OP(p13, p17); OP(p3, p15); OP(p11, p23); OP(p11, p15); OP(p7, p19);\n"
5969 "OP(p7, p11); OP(p11, p13); OP(p11, p12);\n"
5970 "int dst_index = mad24(gy, dst_step, mad24(gx, TSIZE, dst_offset));\n"
5971 "if (gy < dst_rows && gx < dst_cols)\n"
5972 "storepix(p12, dstptr + dst_index);\n"
5973 "}\n"
5974 , "f082457348bfbcb2e2de3014f46093a8"};
// Constructs the ProgramSource object from the kernel text stored above.
5975 ProgramSource medianFilter_oclsrc(medianFilter.programStr);
// OpenCL program "moments": per-tile partial sums for image moments over a
// uchar image.
// The initializer holds the program name, the kernel source text and a trailing
// 32-hex-digit string (presumably a digest of the source text -- TODO confirm
// against the generator).
// Build options used by the source: TILE_SIZE (any value other than 32 hits
// the #error) and the optional OP_MOMENTS_BINARY, which caps every pixel value
// at 1 via min(p, 1).
5976 const struct ProgramEntry moments={"moments",
5977 "#if TILE_SIZE != 32\n"
5978 "#error \"TILE SIZE should be 32\"\n"
5979 "#endif\n"
// Kernel layout: x0 (global id 0) selects the tile column, y0 (group id 1) the
// tile row, y (local id 1) the row inside the tile. Each work-item scans up to
// TILE_SIZE pixels of its row and accumulates
//   S = (sum p, sum x*p, sum x^2*p, sum x^3*p)
// with x counted from the tile's left edge; the weights in the unrolled code
// below are the literal powers of x for x = 0..31.
5980 "__kernel void moments(__global const uchar* src, int src_step, int src_offset,\n"
5981 "int src_rows, int src_cols, __global int* mom0, int xtiles)\n"
5982 "{\n"
5983 "int x0 = get_global_id(0);\n"
5984 "int y0 = get_group_id(1);\n"
5985 "int x, y = get_local_id(1);\n"
5986 "int x_min = x0*TILE_SIZE;\n"
5987 "int ypix = y0*TILE_SIZE + y;\n"
5988 "__local int mom[TILE_SIZE][10];\n"
5989 "if (x_min < src_cols && y0*TILE_SIZE < src_rows)\n"
5990 "{\n"
5991 "if (ypix < src_rows)\n"
5992 "{\n"
5993 "int x_max = min(src_cols - x_min, TILE_SIZE);\n"
5994 "__global const uchar* ptr = src + src_offset + ypix*src_step + x_min;\n"
5995 "int4 S = (int4)(0, 0, 0, 0), p;\n"
5996 "#define SUM_ELEM(elem, ofs) \\\n"
5997 "(int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem\n"
// x = x_max rounded down to a multiple of 4: the index where the scalar tail
// (at most three pixels) starts after the vload4 groups below.
5998 "x = x_max & -4;\n"
5999 "if (x_max >= 4)\n"
6000 "{\n"
6001 "p = convert_int4(vload4(0, ptr));\n"
6002 "#ifdef OP_MOMENTS_BINARY\n"
6003 "p = min(p, 1);\n"
6004 "#endif\n"
6005 "S += (int4)(p.s0, 0, 0, 0) + (int4)(p.s1, p.s1, p.s1, p.s1) +\n"
6006 "(int4)(p.s2, p.s2 * 2, p.s2 * 4, p.s2 * 8) + (int4)(p.s3, p.s3 * 3, p.s3 * 9, p.s3 * 27);\n"
6007 "if (x_max >= 8)\n"
6008 "{\n"
6009 "p = convert_int4(vload4(0, ptr + 4));\n"
6010 "#ifdef OP_MOMENTS_BINARY\n"
6011 "p = min(p, 1);\n"
6012 "#endif\n"
6013 "S += (int4)(p.s0, p.s0 * 4, p.s0 * 16, p.s0 * 64) + (int4)(p.s1, p.s1 * 5, p.s1 * 25, p.s1 * 125) +\n"
6014 "(int4)(p.s2, p.s2 * 6, p.s2 * 36, p.s2 * 216) + (int4)(p.s3, p.s3 * 7, p.s3 * 49, p.s3 * 343);\n"
6015 "if (x_max >= 12)\n"
6016 "{\n"
6017 "p = convert_int4(vload4(0, ptr + 8));\n"
6018 "#ifdef OP_MOMENTS_BINARY\n"
6019 "p = min(p, 1);\n"
6020 "#endif\n"
6021 "S += (int4)(p.s0, p.s0 * 8, p.s0 * 64, p.s0 * 512) + (int4)(p.s1, p.s1 * 9, p.s1 * 81, p.s1 * 729) +\n"
6022 "(int4)(p.s2, p.s2 * 10, p.s2 * 100, p.s2 * 1000) + (int4)(p.s3, p.s3 * 11, p.s3 * 121, p.s3 * 1331);\n"
6023 "if (x_max >= 16)\n"
6024 "{\n"
6025 "p = convert_int4(vload4(0, ptr + 12));\n"
6026 "#ifdef OP_MOMENTS_BINARY\n"
6027 "p = min(p, 1);\n"
6028 "#endif\n"
6029 "S += (int4)(p.s0, p.s0 * 12, p.s0 * 144, p.s0 * 1728) + (int4)(p.s1, p.s1 * 13, p.s1 * 169, p.s1 * 2197) +\n"
6030 "(int4)(p.s2, p.s2 * 14, p.s2 * 196, p.s2 * 2744) + (int4)(p.s3, p.s3 * 15, p.s3 * 225, p.s3 * 3375);\n"
6031 "}\n"
6032 "}\n"
6033 "}\n"
6034 "}\n"
// Second half of the tile (x = 16..31), handled by a separate nest of ifs.
6035 "if (x_max >= 20)\n"
6036 "{\n"
6037 "p = convert_int4(vload4(0, ptr + 16));\n"
6038 "#ifdef OP_MOMENTS_BINARY\n"
6039 "p = min(p, 1);\n"
6040 "#endif\n"
6041 "S += (int4)(p.s0, p.s0 * 16, p.s0 * 256, p.s0 * 4096) + (int4)(p.s1, p.s1 * 17, p.s1 * 289, p.s1 * 4913) +\n"
6042 "(int4)(p.s2, p.s2 * 18, p.s2 * 324, p.s2 * 5832) + (int4)(p.s3, p.s3 * 19, p.s3 * 361, p.s3 * 6859);\n"
6043 "if (x_max >= 24)\n"
6044 "{\n"
6045 "p = convert_int4(vload4(0, ptr + 20));\n"
6046 "#ifdef OP_MOMENTS_BINARY\n"
6047 "p = min(p, 1);\n"
6048 "#endif\n"
6049 "S += (int4)(p.s0, p.s0 * 20, p.s0 * 400, p.s0 * 8000) + (int4)(p.s1, p.s1 * 21, p.s1 * 441, p.s1 * 9261) +\n"
6050 "(int4)(p.s2, p.s2 * 22, p.s2 * 484, p.s2 * 10648) + (int4)(p.s3, p.s3 * 23, p.s3 * 529, p.s3 * 12167);\n"
6051 "if (x_max >= 28)\n"
6052 "{\n"
6053 "p = convert_int4(vload4(0, ptr + 24));\n"
6054 "#ifdef OP_MOMENTS_BINARY\n"
6055 "p = min(p, 1);\n"
6056 "#endif\n"
6057 "S += (int4)(p.s0, p.s0 * 24, p.s0 * 576, p.s0 * 13824) + (int4)(p.s1, p.s1 * 25, p.s1 * 625, p.s1 * 15625) +\n"
6058 "(int4)(p.s2, p.s2 * 26, p.s2 * 676, p.s2 * 17576) + (int4)(p.s3, p.s3 * 27, p.s3 * 729, p.s3 * 19683);\n"
6059 "if (x_max >= 32)\n"
6060 "{\n"
6061 "p = convert_int4(vload4(0, ptr + 28));\n"
6062 "#ifdef OP_MOMENTS_BINARY\n"
6063 "p = min(p, 1);\n"
6064 "#endif\n"
6065 "S += (int4)(p.s0, p.s0 * 28, p.s0 * 784, p.s0 * 21952) + (int4)(p.s1, p.s1 * 29, p.s1 * 841, p.s1 * 24389) +\n"
6066 "(int4)(p.s2, p.s2 * 30, p.s2 * 900, p.s2 * 27000) + (int4)(p.s3, p.s3 * 31, p.s3 * 961, p.s3 * 29791);\n"
6067 "}\n"
6068 "}\n"
6069 "}\n"
6070 "}\n"
// Scalar tail: the up-to-three pixels at x, x+1, x+2 that did not fill a
// complete group of four.
6071 "if (x < x_max)\n"
6072 "{\n"
6073 "int ps = ptr[x];\n"
6074 "#ifdef OP_MOMENTS_BINARY\n"
6075 "ps = min(ps, 1);\n"
6076 "#endif\n"
6077 "S += SUM_ELEM(ps, x);\n"
6078 "if (x + 1 < x_max)\n"
6079 "{\n"
6080 "ps = ptr[x + 1];\n"
6081 "#ifdef OP_MOMENTS_BINARY\n"
6082 "ps = min(ps, 1);\n"
6083 "#endif\n"
6084 "S += SUM_ELEM(ps, x + 1);\n"
6085 "if (x + 2 < x_max)\n"
6086 "{\n"
6087 "ps = ptr[x + 2];\n"
6088 "#ifdef OP_MOMENTS_BINARY\n"
6089 "ps = min(ps, 1);\n"
6090 "#endif\n"
6091 "S += SUM_ELEM(ps, x + 2);\n"
6092 "}\n"
6093 "}\n"
6094 "}\n"
// Per-row contribution to the ten sums, weighted by powers of the in-tile row
// index y: [0]=S0, [1]=S1, [2]=y*S0, [3]=S2, [4]=y*S1, [5]=y^2*S0, [6]=S3,
// [7]=y*S2, [8]=y^2*S1, [9]=y^3*S0.
6095 "int sy = y*y;\n"
6096 "mom[y][0] = S.s0;\n"
6097 "mom[y][1] = S.s1;\n"
6098 "mom[y][2] = y*S.s0;\n"
6099 "mom[y][3] = S.s2;\n"
6100 "mom[y][4] = y*S.s1;\n"
6101 "mom[y][5] = sy*S.s0;\n"
6102 "mom[y][6] = S.s3;\n"
6103 "mom[y][7] = y*S.s2;\n"
6104 "mom[y][8] = sy*S.s1;\n"
6105 "mom[y][9] = y*sy*S.s0;\n"
6106 "}\n"
// Rows past the bottom of the image contribute zeros so the reduction below
// can run over all TILE_SIZE rows.
6107 "else\n"
6108 "mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] =\n"
6109 "mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0;\n"
6110 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// REDUCE(d): rows y < d add in row y + d, followed by a local-memory barrier.
// Applied with d = 16, 8, 4, 2; the last halving (rows 0 and 1) is folded into
// the final store.
6111 "#define REDUCE(d) \\\n"
6112 "if (y < d) \\\n"
6113 "{ \\\n"
6114 "mom[y][0] += mom[y + d][0]; \\\n"
6115 "mom[y][1] += mom[y + d][1]; \\\n"
6116 "mom[y][2] += mom[y + d][2]; \\\n"
6117 "mom[y][3] += mom[y + d][3]; \\\n"
6118 "mom[y][4] += mom[y + d][4]; \\\n"
6119 "mom[y][5] += mom[y + d][5]; \\\n"
6120 "mom[y][6] += mom[y + d][6]; \\\n"
6121 "mom[y][7] += mom[y + d][7]; \\\n"
6122 "mom[y][8] += mom[y + d][8]; \\\n"
6123 "mom[y][9] += mom[y + d][9]; \\\n"
6124 "} \\\n"
6125 "barrier(CLK_LOCAL_MEM_FENCE)\n"
6126 "REDUCE(16);\n"
6127 "REDUCE(8);\n"
6128 "REDUCE(4);\n"
6129 "REDUCE(2);\n"
// Work-items y = 0..9 each write one of the ten sums for this tile; the tile's
// record starts at mom0 + (y0*xtiles + x0) * 10.
6130 "if (y < 10)\n"
6131 "{\n"
6132 "__global int* momout = mom0 + (y0*xtiles + x0) * 10;\n"
6133 "momout[y] = mom[0][y] + mom[1][y];\n"
6134 "}\n"
6135 "}\n"
6136 "}\n"
6137 , "1d0545282b5860ed7eeeb6860fa9edc3"};
// Constructs the ProgramSource object from the kernel text stored above.
6138 ProgramSource moments_oclsrc(moments.programStr);
// OpenCL program "morph": erode/dilate kernel with optional post-processing
// for gradient / top-hat / black-hat.
// The initializer holds the program name, the kernel source text and a trailing
// 32-hex-digit string (presumably a digest of the source text -- TODO confirm
// against the generator).
// Names the source uses but does not define, so they must come from build
// options: T, T1, cn, one of DEPTH_0..DEPTH_6, OP_ERODE or OP_DILATE (anything
// else hits the #error), LSIZE0, LSIZE1, RADIUSX, RADIUSY, PROCESS_ELEMS, and
// -- when OP_GRADIENT / OP_TOPHAT / OP_BLACKHAT is set -- convertToT and
// convertToWT. DOUBLE_SUPPORT and INTEL_DEVICE are optional switches.
6139 const struct ProgramEntry morph={"morph",
6140 "#ifdef DOUBLE_SUPPORT\n"
6141 "#ifdef cl_amd_fp64\n"
6142 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
6143 "#elif defined (cl_khr_fp64)\n"
6144 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
6145 "#endif\n"
6146 "#endif\n"
6147 "#define noconvert\n"
// Pixel access helpers: 3-channel pixels go through vload3/vstore3 over T1,
// every other channel count is accessed as a single T.
6148 "#if cn != 3\n"
6149 "#define loadpix(addr) *(__global const T *)(addr)\n"
6150 "#define storepix(val, addr)  *(__global T *)(addr) = val\n"
6151 "#define TSIZE (int)sizeof(T)\n"
6152 "#else\n"
6153 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
6154 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
6155 "#define TSIZE ((int)sizeof(T1)*3)\n"
6156 "#endif\n"
// MIN_VAL / MAX_VAL: value range limits selected by the DEPTH_n option.
6157 "#ifdef DEPTH_0\n"
6158 "#define MIN_VAL 0\n"
6159 "#define MAX_VAL UCHAR_MAX\n"
6160 "#elif defined DEPTH_1\n"
6161 "#define MIN_VAL SCHAR_MIN\n"
6162 "#define MAX_VAL SCHAR_MAX\n"
6163 "#elif defined DEPTH_2\n"
6164 "#define MIN_VAL 0\n"
6165 "#define MAX_VAL USHRT_MAX\n"
6166 "#elif defined DEPTH_3\n"
6167 "#define MIN_VAL SHRT_MIN\n"
6168 "#define MAX_VAL SHRT_MAX\n"
6169 "#elif defined DEPTH_4\n"
6170 "#define MIN_VAL INT_MIN\n"
6171 "#define MAX_VAL INT_MAX\n"
6172 "#elif defined DEPTH_5\n"
6173 "#define MIN_VAL (-FLT_MAX)\n"
6174 "#define MAX_VAL FLT_MAX\n"
6175 "#elif defined DEPTH_6\n"
6176 "#define MIN_VAL (-DBL_MAX)\n"
6177 "#define MAX_VAL DBL_MAX\n"
6178 "#endif\n"
// VAL is the neutral element of the reduction: MAX_VAL for erode (a min
// reduction), MIN_VAL for dilate (a max reduction). It seeds the result and
// replaces samples that fall outside the whole source image.
6179 "#ifdef OP_ERODE\n"
6180 "#define VAL MAX_VAL\n"
6181 "#elif defined OP_DILATE\n"
6182 "#define VAL MIN_VAL\n"
6183 "#else\n"
6184 "#error \"Unknown operation\"\n"
6185 "#endif\n"
6186 "#ifdef OP_ERODE\n"
6187 "#if defined INTEL_DEVICE && defined DEPTH_0\n"
6188 "#define MORPH_OP(A, B) ((A) < (B) ? (A) : (B))\n"
6189 "#else\n"
6190 "#define MORPH_OP(A, B) min((A), (B))\n"
6191 "#endif\n"
6192 "#endif\n"
6193 "#ifdef OP_DILATE\n"
6194 "#define MORPH_OP(A, B) max((A), (B))\n"
6195 "#endif\n"
// PROCESS(y, x): fold the local-memory sample at offset (y, x) from this
// work-item into 'res'. Not referenced directly in this source; presumably the
// externally supplied PROCESS_ELEMS expands to a sequence of PROCESS calls --
// TODO confirm against the host code that builds the options.
6196 "#define PROCESS(y, x) \\\n"
6197 "temp = LDS_DAT[mad24(l_y + y, width, l_x + x)]; \\\n"
6198 "res = MORPH_OP(res, temp);\n"
// ELEM: yields elem1 when i lies outside [l_edge, r_edge), otherwise elem2.
6199 "#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) < (l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)\n"
6200 "#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT\n"
6201 "#define EXTRA_PARAMS , __global const uchar * matptr, int mat_step, int mat_offset\n"
6202 "#else\n"
6203 "#define EXTRA_PARAMS\n"
6204 "#endif\n"
// Kernel: the work-group stages a tile of row pitch 'width' =
// 2*RADIUSX + LSIZE0 + 1 in LDS_DAT; every work-item loads two elements
// (linear slots point1 and point2 = point1 + LSIZE0*LSIZE1). After the
// barrier, in-range work-items reduce their neighbourhood with PROCESS_ELEMS
// and store the result.
6205 "__kernel void morph(__global const uchar * srcptr, int src_step, int src_offset,\n"
6206 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
6207 "int src_offset_x, int src_offset_y, int cols, int rows,\n"
6208 "int src_whole_cols, int src_whole_rows EXTRA_PARAMS)\n"
6209 "{\n"
6210 "int gidx = get_global_id(0), gidy = get_global_id(1);\n"
6211 "int l_x = get_local_id(0), l_y = get_local_id(1);\n"
6212 "int x = get_group_id(0) * LSIZE0, y = get_group_id(1) * LSIZE1;\n"
6213 "int start_x = x + src_offset_x - RADIUSX;\n"
6214 "int width = mad24(RADIUSX, 2, LSIZE0 + 1);\n"
6215 "int start_y = y + src_offset_y - RADIUSY;\n"
6216 "int point1 = mad24(l_y, LSIZE0, l_x);\n"
6217 "int point2 = point1 + LSIZE0 * LSIZE1;\n"
6218 "int tl_x = point1 % width, tl_y = point1 / width;\n"
6219 "int tl_x2 = point2 % width, tl_y2 = point2 / width;\n"
6220 "int cur_x = start_x + tl_x, cur_y = start_y + tl_y;\n"
6221 "int cur_x2 = start_x + tl_x2, cur_y2 = start_y + tl_y2;\n"
6222 "int start_addr = mad24(cur_y, src_step, cur_x * TSIZE);\n"
6223 "int start_addr2 = mad24(cur_y2, src_step, cur_x2 * TSIZE);\n"
6224 "__local T LDS_DAT[2 * LSIZE1 * LSIZE0];\n"
// Byte offsets outside (0, end_addr) are redirected to offset 0 so the load
// itself uses an in-range address; the value read there is then discarded by
// ELEM below whenever the coordinates are outside the whole image.
6225 "int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * TSIZE);\n"
6226 "start_addr = start_addr < end_addr && start_addr > 0 ? start_addr : 0;\n"
6227 "start_addr2 = start_addr2 < end_addr && start_addr2 > 0 ? start_addr2 : 0;\n"
6228 "T temp0 = loadpix(srcptr + start_addr);\n"
6229 "T temp1 = loadpix(srcptr + start_addr2);\n"
6230 "temp0 = ELEM(cur_x, 0, src_whole_cols, (T)(VAL), temp0);\n"
6231 "temp0 = ELEM(cur_y, 0, src_whole_rows, (T)(VAL), temp0);\n"
6232 "temp1 = ELEM(cur_x2, 0, src_whole_cols, (T)(VAL), temp1);\n"
6233 "temp1 = ELEM(cur_y2, 0, src_whole_rows, (T)(VAL), temp1);\n"
6234 "LDS_DAT[point1] = temp0;\n"
6235 "LDS_DAT[point2] = temp1;\n"
6236 "barrier(CLK_LOCAL_MEM_FENCE);\n"
6237 "if (gidx < cols && gidy < rows)\n"
6238 "{\n"
6239 "T res = (T)(VAL), temp;\n"
6240 "PROCESS_ELEMS;\n"
6241 "int dst_index = mad24(gidy, dst_step, mad24(gidx, TSIZE, dst_offset));\n"
// Optional combination with a second image 'mat': gradient and black-hat store
// res - value, top-hat stores value - res, computed through convertToWT and
// converted back with convertToT.
6242 "#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT\n"
6243 "int mat_index =  mad24(gidy, mat_step, mad24(gidx, TSIZE, mat_offset));\n"
6244 "T value = loadpix(matptr + mat_index);\n"
6245 "#ifdef OP_GRADIENT\n"
6246 "storepix(convertToT(convertToWT(res) - convertToWT(value)), dstptr + dst_index);\n"
6247 "#elif defined OP_TOPHAT\n"
6248 "storepix(convertToT(convertToWT(value) - convertToWT(res)), dstptr + dst_index);\n"
6249 "#elif defined OP_BLACKHAT\n"
6250 "storepix(convertToT(convertToWT(res) - convertToWT(value)), dstptr + dst_index);\n"
6251 "#endif\n"
6252 "#else\n"
6253 "storepix(res, dstptr + dst_index);\n"
6254 "#endif\n"
6255 "}\n"
6256 "}\n"
6257 , "232e712bff362e53c55027da6e1e1584"};
// Constructs the ProgramSource object from the kernel text stored above.
6258 ProgramSource morph_oclsrc(morph.programStr);
// OpenCL program "precornerdetect": one kernel, preCornerDetect, that combines
// five float derivative images into one float output image.
// The initializer holds the program name, the kernel source text and a trailing
// 32-hex-digit string (presumably a digest of the source text -- TODO confirm
// against the generator).
6259 const struct ProgramEntry precornerdetect={"precornerdetect",
// Each work-item handles one pixel (x, y) inside dst_cols x dst_rows. Every
// image is addressed as base + step*y + sizeof(float)*x + offset, so steps and
// offsets are in bytes.
6260 "__kernel void preCornerDetect(__global const uchar * Dxptr, int dx_step, int dx_offset,\n"
6261 "__global const uchar * Dyptr, int dy_step, int dy_offset,\n"
6262 "__global const uchar * D2xptr, int d2x_step, int d2x_offset,\n"
6263 "__global const uchar * D2yptr, int d2y_step, int d2y_offset,\n"
6264 "__global const uchar * Dxyptr, int dxy_step, int dxy_offset,\n"
6265 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
6266 "int dst_rows, int dst_cols, float factor)\n"
6267 "{\n"
6268 "int x = get_global_id(0);\n"
6269 "int y = get_global_id(1);\n"
6270 "if (x < dst_cols && y < dst_rows)\n"
6271 "{\n"
6272 "int dx_index = mad24(dx_step, y, (int)sizeof(float) * x + dx_offset);\n"
6273 "int dy_index = mad24(dy_step, y, (int)sizeof(float) * x + dy_offset);\n"
6274 "int d2x_index = mad24(d2x_step, y, (int)sizeof(float) * x + d2x_offset);\n"
6275 "int d2y_index = mad24(d2y_step, y, (int)sizeof(float) * x + d2y_offset);\n"
6276 "int dxy_index = mad24(dxy_step, y, (int)sizeof(float) * x + dxy_offset);\n"
6277 "int dst_index = mad24(dst_step, y, (int)sizeof(float) * x + dst_offset);\n"
6278 "float dx = *(__global const float *)(Dxptr + dx_index);\n"
6279 "float dy = *(__global const float *)(Dyptr + dy_index);\n"
6280 "float d2x = *(__global const float *)(D2xptr + d2x_index);\n"
6281 "float d2y = *(__global const float *)(D2yptr + d2y_index);\n"
6282 "float dxy = *(__global const float *)(Dxyptr + dxy_index);\n"
6283 "__global float * dst = (__global float *)(dstptr + dst_index);\n"
// Output: factor * (dx^2 * d2y + dy^2 * d2x - 2 * dx * dy * dxy).
6284 "dst[0] = factor * (dx*dx*d2y + dy*dy*d2x - 2*dx*dy*dxy);\n"
6285 "}\n"
6286 "}\n"
6287 , "14a94db70b88aa76ff8840f03f3ad556"};
// Constructs the ProgramSource object from the kernel text stored above.
6288 ProgramSource precornerdetect_oclsrc(precornerdetect.programStr);
// Program entry "pyr_down": the entry name, the OpenCL C text of the pyrDown
// kernel (adjacent string literals concatenated into one string), and a
// trailing 32-hex-digit string (presumably a hash of the kernel text --
// TODO confirm against the ProgramEntry declaration).
//
// The kernel text uses, but does not define, the following names, so they
// must be supplied from outside it: T, T1, FT, cn, kercn, fdepth, LOCAL_SIZE,
// convertToT, convertToFT and one BORDER_* selector. DOUBLE_SUPPORT and
// INTEL_DEVICE are optional switches.
6289 const struct ProgramEntry pyr_down={"pyr_down",
// Enable an fp64 extension (AMD or Khronos flavour) when DOUBLE_SUPPORT is defined.
6290 "#ifdef DOUBLE_SUPPORT\n"
6291 "#ifdef cl_amd_fp64\n"
6292 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
6293 "#elif defined (cl_khr_fp64)\n"
6294 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
6295 "#endif\n"
6296 "#endif\n"
// EXTRAPOLATE(x, maxV) remaps a coordinate according to the selected BORDER_*
// mode; building without any of the listed modes is a hard error.
6297 "#if defined BORDER_REPLICATE\n"
6298 "#define EXTRAPOLATE(x, maxV) clamp((x), 0, (maxV)-1)\n"
6299 "#elif defined BORDER_WRAP\n"
6300 "#define EXTRAPOLATE(x, maxV) ( (x) + (maxV) ) % (maxV)\n"
6301 "#elif defined BORDER_REFLECT\n"
6302 "#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1)\n"
6303 "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n"
6304 "#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1)\n"
6305 "#else\n"
6306 "#error No extrapolation method\n"
6307 "#endif\n"
// Pixel access helpers: when cn == 3 pixels go through vload3/vstore3 on the
// element type T1; otherwise a whole T is read or written directly.
6308 "#if cn != 3\n"
6309 "#define loadpix(addr)  *(__global const T*)(addr)\n"
6310 "#define storepix(val, addr)  *(__global T*)(addr) = (val)\n"
6311 "#define PIXSIZE ((int)sizeof(T))\n"
6312 "#else\n"
6313 "#define loadpix(addr)  vload3(0, (__global const T1*)(addr))\n"
6314 "#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr))\n"
6315 "#define PIXSIZE ((int)sizeof(T1)*3)\n"
6316 "#endif\n"
// SRC(_x,_y): pixel at column _x, row _y of srcData, converted with convertToFT.
// SRC4 (kercn == 4 only): vload4 through the uchar pointer srcData at the same
// byte address, converted to float4.
6317 "#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))\n"
6318 "#if kercn == 4\n"
6319 "#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))\n"
6320 "#endif\n"
// MAD maps to fma() when INTEL_DEVICE is defined and to mad() otherwise.
6321 "#ifdef INTEL_DEVICE\n"
6322 "#define MAD(x,y,z) fma((x),(y),(z))\n"
6323 "#else\n"
6324 "#define MAD(x,y,z) mad((x),(y),(z))\n"
6325 "#endif\n"
// LOAD_LOCAL(col_gl, col_lcl): vertical 5-tap pass for source column col_gl.
//   smem[0][col_lcl] = co3*row(src_y-2) + co2*row(src_y-1) + co1*row(src_y)
//                    + co2*row(src_y+1) + co3*row(src_y+2)
//   smem[1][col_lcl] = the same weights applied to rows src_y .. src_y+4
// Rows src_y, src_y+1 and src_y+2 are loaded once into temp and reused for both sums.
// Row indexes go through EXTRAPOLATE_, which the kernel body defines before
// each use of this macro. Relies on sum0, sum1, temp, co1..co3, smem, srcData,
// src_step, src_y and src_rows being in scope at the expansion site.
6326 "#define LOAD_LOCAL(col_gl, col_lcl) \\\n"
6327 "sum0 =     co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));         \\\n"
6328 "sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0);  \\\n"
6329 "temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows));                      \\\n"
6330 "sum0 = MAD(co1, temp, sum0);                                            \\\n"
6331 "sum1 = co3 * temp;                                                      \\\n"
6332 "temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows));                  \\\n"
6333 "sum0 = MAD(co2, temp, sum0);                                            \\\n"
6334 "sum1 = MAD(co2, temp, sum1);                                            \\\n"
6335 "temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows));                  \\\n"
6336 "sum0 = MAD(co3, temp, sum0);                                            \\\n"
6337 "sum1 = MAD(co1, temp, sum1);                                            \\\n"
6338 "smem[0][col_lcl] = sum0;                                                \\\n"
6339 "sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1);  \\\n"
6340 "sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1);  \\\n"
6341 "smem[1][col_lcl] = sum1;\n"
// LOAD_LOCAL4: the same vertical pass on float4 values obtained via SRC4.
// The results are written with vstore4 at vector offset col_lcl relative to
// &smem[r][2], i.e. float elements 2 + 4*col_lcl .. 2 + 4*col_lcl + 3 of row r.
6342 "#if kercn == 4\n"
6343 "#define LOAD_LOCAL4(col_gl, col_lcl) \\\n"
6344 "sum40 =     co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));           \\\n"
6345 "sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40);   \\\n"
6346 "temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y, src_rows));                       \\\n"
6347 "sum40 = MAD(co1, temp4, sum40);                                             \\\n"
6348 "sum41 = co3 * temp4;                                                        \\\n"
6349 "temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 1, src_rows));                   \\\n"
6350 "sum40 = MAD(co2, temp4, sum40);                                             \\\n"
6351 "sum41 = MAD(co2, temp4, sum41);                                             \\\n"
6352 "temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 2, src_rows));                   \\\n"
6353 "sum40 = MAD(co3, temp4, sum40);                                             \\\n"
6354 "sum41 = MAD(co1, temp4, sum41);                                             \\\n"
6355 "vstore4(sum40, col_lcl, (__local float*) &smem[0][2]);                      \\\n"
6356 "sum41 = MAD(co2, SRC4(col_gl,  EXTRAPOLATE_(src_y + 3, src_rows)), sum41);  \\\n"
6357 "sum41 = MAD(co3, SRC4(col_gl,  EXTRAPOLATE_(src_y + 4, src_rows)), sum41);  \\\n"
6358 "vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);\n"
6359 "#endif\n"
// noconvert expands to nothing (presumably so that a conversion macro supplied
// as "noconvert" becomes a no-op -- confirm with the host-side build options).
6360 "#define noconvert\n"
// pyrDown kernel. Each work item covers kercn source columns starting at x and
// two destination rows starting at y; the matching first source row is src_y = 2*y.
6361 "__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
6362 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
6363 "{\n"
6364 "const int x = get_global_id(0)*kercn;\n"
6365 "const int y = 2*get_global_id(1);\n"
// Two rows of vertically filtered columns, with 2 extra columns on each side.
6366 "__local FT smem[2][LOCAL_SIZE + 4];\n"
6367 "__global uchar * dstData = dst + dst_offset;\n"
6368 "__global const uchar * srcData = src + src_offset;\n"
6369 "FT sum0, sum1, temp;\n"
// 5-tap weights applied as co3, co2, co1, co2, co3 = 0.0625, 0.25, 0.375, 0.25, 0.0625.
6370 "FT co1 = 0.375f;\n"
6371 "FT co2 = 0.25f;\n"
6372 "FT co3 = 0.0625f;\n"
6373 "const int src_y = 2*y;\n"
6374 "int col;\n"
// Interior rows: src_y-2 .. src_y+4 all lie inside [0, src_rows), so
// EXTRAPOLATE_ is defined to pass the row index through unchanged.
6375 "if (src_y >= 2 && src_y < src_rows - 4)\n"
6376 "{\n"
6377 "#define EXTRAPOLATE_(val, maxVal)   val\n"
6378 "#if kercn == 1\n"
6379 "col = EXTRAPOLATE(x, src_cols);\n"
6380 "LOAD_LOCAL(col, 2 + get_local_id(0))\n"
6381 "#else\n"
// kercn != 1: vector path when x < src_cols-4, otherwise four scalar loads
// with column extrapolation.
6382 "if (x < src_cols-4)\n"
6383 "{\n"
6384 "float4 sum40, sum41, temp4;\n"
6385 "LOAD_LOCAL4(x, get_local_id(0))\n"
6386 "}\n"
6387 "else\n"
6388 "{\n"
6389 "for (int i=0; i<4; i++)\n"
6390 "{\n"
6391 "col = EXTRAPOLATE(x+i, src_cols);\n"
6392 "LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)\n"
6393 "}\n"
6394 "}\n"
6395 "#endif\n"
// Extra columns: local ids 0 and 1 fill smem columns 0 and 1 from the two
// source columns before group_id*LOCAL_SIZE; local ids 2 and 3 fill smem
// columns LOCAL_SIZE+2 and LOCAL_SIZE+3 from the two columns starting at
// (group_id+1)*LOCAL_SIZE.
6396 "if (get_local_id(0) < 2)\n"
6397 "{\n"
6398 "col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);\n"
6399 "LOAD_LOCAL(col, get_local_id(0))\n"
6400 "}\n"
6401 "else if (get_local_id(0) < 4)\n"
6402 "{\n"
6403 "col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);\n"
6404 "LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))\n"
6405 "}\n"
6406 "}\n"
// Rows near the top or bottom edge: same loads as above, but row indexes are
// remapped with EXTRAPOLATE.
// NOTE(review): EXTRAPOLATE_ is redefined below with a different body and no
// intervening #undef.
6407 "else\n"
6408 "{\n"
6409 "#define EXTRAPOLATE_(val, maxVal)   EXTRAPOLATE(val, maxVal)\n"
6410 "#if kercn == 1\n"
6411 "col = EXTRAPOLATE(x, src_cols);\n"
6412 "LOAD_LOCAL(col, 2 + get_local_id(0))\n"
6413 "#else\n"
6414 "if (x < src_cols-4)\n"
6415 "{\n"
6416 "float4 sum40, sum41, temp4;\n"
6417 "LOAD_LOCAL4(x, get_local_id(0))\n"
6418 "}\n"
6419 "else\n"
6420 "{\n"
6421 "for (int i=0; i<4; i++)\n"
6422 "{\n"
6423 "col = EXTRAPOLATE(x+i, src_cols);\n"
6424 "LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)\n"
6425 "}\n"
6426 "}\n"
6427 "#endif\n"
6428 "if (get_local_id(0) < 2)\n"
6429 "{\n"
6430 "col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);\n"
6431 "LOAD_LOCAL(col, get_local_id(0))\n"
6432 "}\n"
6433 "else if (get_local_id(0) < 4)\n"
6434 "{\n"
6435 "col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);\n"
6436 "LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))\n"
6437 "}\n"
6438 "}\n"
// All smem writes must be complete before the horizontal pass reads them.
6439 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Horizontal pass, kercn == 1: only the first LOCAL_SIZE/2 work items write,
// one destination column each, for up to two destination rows (y and y+1,
// limited by dst_rows).
6440 "#if kercn == 1\n"
6441 "if (get_local_id(0) < LOCAL_SIZE / 2)\n"
6442 "{\n"
6443 "const int tid2 = get_local_id(0) * 2;\n"
6444 "const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;\n"
6445 "if (dst_x < dst_cols)\n"
6446 "{\n"
6447 "for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)\n"
6448 "{\n"
// cn == 1: the first four taps (smem columns tid2 .. tid2+3 of row yin-y) are
// combined with one dot(); float when fdepth <= 5, double otherwise.
6449 "#if cn == 1\n"
6450 "#if fdepth <= 5\n"
6451 "FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));\n"
6452 "#else\n"
6453 "FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));\n"
6454 "#endif\n"
6455 "#else\n"
6456 "FT sum = co3 * smem[yin - y][2 + tid2 - 2];\n"
6457 "sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);\n"
6458 "sum = MAD(co1, smem[yin - y][2 + tid2    ], sum);\n"
6459 "sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);\n"
6460 "#endif\n"
// Fifth tap, shared by both variants above.
6461 "sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);\n"
6462 "storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);\n"
6463 "}\n"
6464 "}\n"
6465 "}\n"
// Horizontal pass, kercn != 1: each work item produces two adjacent destination
// columns (taps centred on smem columns 2+tid4 and 2+tid4+2), or only the
// first one when dst_x is the last destination column.
6466 "#else\n"
6467 "int tid4 = get_local_id(0) * 4;\n"
6468 "int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;\n"
6469 "if (dst_x < dst_cols - 1)\n"
6470 "{\n"
6471 "for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)\n"
6472 "{\n"
6473 "FT sum =  co3* smem[yin - y][2 + tid4 + 2];\n"
6474 "sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);\n"
6475 "sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);\n"
6476 "sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);\n"
6477 "sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);\n"
6478 "storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));\n"
// dst_x is advanced for the second output and restored before the next row.
6479 "dst_x ++;\n"
6480 "sum =     co3* smem[yin - y][2 + tid4 + 4];\n"
6481 "sum = MAD(co3, smem[yin - y][2 + tid4    ], sum);\n"
6482 "sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);\n"
6483 "sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);\n"
6484 "sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);\n"
6485 "storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));\n"
6486 "dst_x --;\n"
6487 "}\n"
6488 "}\n"
6489 "else if (dst_x < dst_cols)\n"
6490 "{\n"
6491 "for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)\n"
6492 "{\n"
6493 "FT sum =  co3* smem[yin - y][2 + tid4 + 2];\n"
6494 "sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);\n"
6495 "sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);\n"
6496 "sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);\n"
6497 "sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);\n"
6498 "storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));\n"
6499 "}\n"
6500 "}\n"
6501 "#endif\n"
6502 "}\n"
6503 , "3266de56ccdc2bcb8226bf97c932e272"};
// ProgramSource object constructed from the kernel text held in pyr_down.programStr.
6504 ProgramSource pyr_down_oclsrc(pyr_down.programStr);
// Program entry "pyr_up": the entry name, the OpenCL C text of the kernels
// pyrUp and pyrUp_unrolled (adjacent string literals concatenated into one
// string), and a trailing 32-hex-digit string (presumably a hash of the kernel
// text -- TODO confirm against the ProgramEntry declaration).
//
// The kernel text uses, but does not define, the following names, so they
// must be supplied from outside it: T, T1, FT, cn, LOCAL_SIZE, convertToT and
// convertToFT. DOUBLE_SUPPORT is an optional switch.
6505 const struct ProgramEntry pyr_up={"pyr_up",
// Enable an fp64 extension (AMD or Khronos flavour) when DOUBLE_SUPPORT is defined.
6506 "#ifdef DOUBLE_SUPPORT\n"
6507 "#ifdef cl_amd_fp64\n"
6508 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
6509 "#elif defined (cl_khr_fp64)\n"
6510 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
6511 "#endif\n"
6512 "#endif\n"
// Pixel access helpers: when cn == 3 pixels go through vload3/vstore3 on the
// element type T1; otherwise a whole T is read or written directly.
6513 "#if cn != 3\n"
6514 "#define loadpix(addr)  *(__global const T*)(addr)\n"
6515 "#define storepix(val, addr)  *(__global T*)(addr) = (val)\n"
6516 "#define PIXSIZE ((int)sizeof(T))\n"
6517 "#else\n"
6518 "#define loadpix(addr)  vload3(0, (__global const T1*)(addr))\n"
6519 "#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr))\n"
6520 "#define PIXSIZE ((int)sizeof(T1)*3)\n"
6521 "#endif\n"
// EXTRAPOLATE: negative coordinates are mirrored with abs(), and coordinates
// at or beyond maxV are clamped to maxV - 1.
6522 "#define EXTRAPOLATE(x, maxV) min(maxV - 1, (int) abs(x))\n"
// noconvert expands to nothing (presumably so that a conversion macro supplied
// as "noconvert" becomes a no-op -- confirm with the host-side build options).
6523 "#define noconvert\n"
// pyrUp kernel: one destination pixel per work item, addressed by the global
// ids (x, y). The work-group first stages a source tile in local memory, then
// filters it horizontally and vertically.
6524 "__kernel void pyrUp(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
6525 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
6526 "{\n"
6527 "const int x = get_global_id(0);\n"
6528 "const int y = get_global_id(1);\n"
6529 "const int tidx = get_local_id(0);\n"
6530 "const int tidy = get_local_id(1);\n"
// s_srcPatch: (LOCAL_SIZE/2 + 2)^2 source pixels; s_dstPatch: horizontally
// filtered rows, LOCAL_SIZE columns wide.
6531 "__local FT s_srcPatch[LOCAL_SIZE/2 + 2][LOCAL_SIZE/2 + 2];\n"
6532 "__local FT s_dstPatch[LOCAL_SIZE/2 + 2][LOCAL_SIZE];\n"
6533 "__global uchar * dstData = dst + dst_offset;\n"
6534 "__global const uchar * srcData = src + src_offset;\n"
// Work items whose local ids fit inside the source tile each load one pixel;
// the tile starts one pixel before group_id * LOCAL_SIZE/2 in both directions.
6535 "if( tidx < (LOCAL_SIZE/2 + 2) && tidy < LOCAL_SIZE/2 + 2 )\n"
6536 "{\n"
6537 "int srcx = EXTRAPOLATE(mad24((int)get_group_id(0), LOCAL_SIZE/2, tidx) - 1, src_cols);\n"
6538 "int srcy = EXTRAPOLATE(mad24((int)get_group_id(1), LOCAL_SIZE/2, tidy) - 1, src_rows);\n"
6539 "s_srcPatch[tidy][tidx] = convertToFT(loadpix(srcData + srcy * src_step + srcx * PIXSIZE));\n"
6540 "}\n"
6541 "barrier(CLK_LOCAL_MEM_FENCE);\n"
6542 "FT sum = 0.f;\n"
6543 "const FT co1 = 0.75f;\n"
6544 "const FT co2 = 0.5f;\n"
6545 "const FT co3 = 0.125f;\n"
// Weights depend on parity of the local id: even ids use (co3, co1, co3),
// odd ids use (co2, 0, co2).
6546 "const FT coef1 = (tidx & 1) == 0 ? co1 : (FT) 0;\n"
6547 "const FT coef2 = (tidx & 1) == 0 ? co3 : co2;\n"
6548 "const FT coefy1 = (tidy & 1) == 0 ? co1 : (FT) 0;\n"
6549 "const FT coefy2 = (tidy & 1) == 0 ? co3 : co2;\n"
// Horizontal pass: one s_dstPatch element per work item, for the rows that
// exist in the source tile.
6550 "if(tidy < LOCAL_SIZE/2 + 2)\n"
6551 "{\n"
6552 "sum =     coef2* s_srcPatch[tidy][1 + ((tidx - 1) >> 1)];\n"
6553 "sum = mad(coef1, s_srcPatch[tidy][1 + ((tidx    ) >> 1)], sum);\n"
6554 "sum = mad(coef2, s_srcPatch[tidy][1 + ((tidx + 2) >> 1)], sum);\n"
6555 "s_dstPatch[tidy][tidx] = sum;\n"
6556 "}\n"
6557 "barrier(CLK_LOCAL_MEM_FENCE);\n"
// Vertical pass over s_dstPatch; the result is stored only for in-range (x, y).
6558 "sum =     coefy2* s_dstPatch[1 + ((tidy - 1) >> 1)][tidx];\n"
6559 "sum = mad(coefy1, s_dstPatch[1 + ((tidy    ) >> 1)][tidx], sum);\n"
6560 "sum = mad(coefy2, s_dstPatch[1 + ((tidy + 2) >> 1)][tidx], sum);\n"
6561 "if ((x < dst_cols) && (y < dst_rows))\n"
6562 "storepix(convertToT(sum), dstData + y * dst_step + x * PIXSIZE);\n"
6563 "}\n"
// pyrUp_unrolled kernel: each work item produces a 2x2 block of destination
// pixels whose top-left corner is (2*global_id(0), 2*global_id(1)).
6564 "__kernel void pyrUp_unrolled(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
6565 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
6566 "{\n"
6567 "const int lx = 2*get_local_id(0);\n"
6568 "const int ly = 2*get_local_id(1);\n"
6569 "__local FT s_srcPatch[LOCAL_SIZE+2][LOCAL_SIZE+2];\n"
6570 "__local FT s_dstPatch[LOCAL_SIZE+2][2*LOCAL_SIZE];\n"
6571 "__global uchar * dstData = dst + dst_offset;\n"
6572 "__global const uchar * srcData = src + src_offset;\n"
// Work items with lx, ly inside the tile each load a 2x2 group of source
// pixels; the tile starts one pixel before group_id * LOCAL_SIZE.
6573 "if( lx < (LOCAL_SIZE+2) && ly < (LOCAL_SIZE+2) )\n"
6574 "{\n"
6575 "int srcx = mad24((int)get_group_id(0), LOCAL_SIZE, lx) - 1;\n"
6576 "int srcy = mad24((int)get_group_id(1), LOCAL_SIZE, ly) - 1;\n"
6577 "int srcx1 = EXTRAPOLATE(srcx, src_cols);\n"
6578 "int srcx2 = EXTRAPOLATE(srcx+1, src_cols);\n"
6579 "int srcy1 = EXTRAPOLATE(srcy, src_rows);\n"
6580 "int srcy2 = EXTRAPOLATE(srcy+1, src_rows);\n"
6581 "s_srcPatch[ly][lx] = convertToFT(loadpix(srcData + srcy1 * src_step + srcx1 * PIXSIZE));\n"
6582 "s_srcPatch[ly+1][lx] = convertToFT(loadpix(srcData + srcy2 * src_step + srcx1 * PIXSIZE));\n"
6583 "s_srcPatch[ly][lx+1] = convertToFT(loadpix(srcData + srcy1 * src_step + srcx2 * PIXSIZE));\n"
6584 "s_srcPatch[ly+1][lx+1] = convertToFT(loadpix(srcData + srcy2 * src_step + srcx2 * PIXSIZE));\n"
6585 "}\n"
6586 "barrier(CLK_LOCAL_MEM_FENCE);\n"
6587 "FT sum;\n"
6588 "const FT co1 = 0.75f;\n"
6589 "const FT co2 = 0.5f;\n"
6590 "const FT co3 = 0.125f;\n"
// Horizontal pass for source-tile row 1 + (ly >> 1): column lx uses weights
// (co3, co1, co3), column lx+1 uses (co2, co2).
6591 "sum =       co3 * s_srcPatch[1 + (ly >> 1)][1 + ((lx - 2) >> 1)];\n"
6592 "sum = mad(co1, s_srcPatch[1 + (ly >> 1)][1 + ((lx    ) >> 1)], sum);\n"
6593 "sum = mad(co3, s_srcPatch[1 + (ly >> 1)][1 + ((lx + 2) >> 1)], sum);\n"
6594 "s_dstPatch[1 + get_local_id(1)][lx] = sum;\n"
6595 "sum =       co2 * s_srcPatch[1 + (ly >> 1)][1 + ((lx + 1 - 1) >> 1)];\n"
6596 "sum = mad(co2, s_srcPatch[1 + (ly >> 1)][1 + ((lx + 1 + 1) >> 1)], sum);\n"
6597 "s_dstPatch[1 + get_local_id(1)][lx+1] = sum;\n"
// Work items with ly < 1 (local id 0 in dimension 1) additionally fill row 0
// of s_dstPatch.
6598 "if (ly < 1)\n"
6599 "{\n"
6600 "sum =       co3 * s_srcPatch[0][1 + ((lx - 2) >> 1)];\n"
6601 "sum = mad(co1, s_srcPatch[0][1 + ((lx    ) >> 1)], sum);\n"
6602 "sum = mad(co3, s_srcPatch[0][1 + ((lx + 2) >> 1)], sum);\n"
6603 "s_dstPatch[0][lx] = sum;\n"
6604 "sum =       co2 * s_srcPatch[0][1 + ((lx + 1 - 1) >> 1)];\n"
6605 "sum = mad(co2, s_srcPatch[0][1 + ((lx + 1 + 1) >> 1)], sum);\n"
6606 "s_dstPatch[0][lx+1] = sum;\n"
6607 "}\n"
// Work items with ly > 2*LOCAL_SIZE-3 additionally fill row LOCAL_SIZE+1 of
// s_dstPatch.
6608 "if (ly > 2*LOCAL_SIZE-3)\n"
6609 "{\n"
6610 "sum =       co3 * s_srcPatch[LOCAL_SIZE+1][1 + ((lx - 2) >> 1)];\n"
6611 "sum = mad(co1, s_srcPatch[LOCAL_SIZE+1][1 + ((lx    ) >> 1)], sum);\n"
6612 "sum = mad(co3, s_srcPatch[LOCAL_SIZE+1][1 + ((lx + 2) >> 1)], sum);\n"
6613 "s_dstPatch[LOCAL_SIZE+1][lx] = sum;\n"
6614 "sum =       co2 * s_srcPatch[LOCAL_SIZE+1][1 + ((lx + 1 - 1) >> 1)];\n"
6615 "sum = mad(co2, s_srcPatch[LOCAL_SIZE+1][1 + ((lx + 1 + 1) >> 1)], sum);\n"
6616 "s_dstPatch[LOCAL_SIZE+1][lx+1] = sum;\n"
6617 "}\n"
6618 "barrier(CLK_LOCAL_MEM_FENCE);\n"
6619 "int dst_x = 2*get_global_id(0);\n"
6620 "int dst_y = 2*get_global_id(1);\n"
// Vertical pass and store of the 2x2 output block.
// NOTE(review): only (dst_x, dst_y) is range-checked; the stores to dst_x+1
// and dst_y+1 are not checked separately -- confirm the caller guarantees
// they are in range.
6621 "if ((dst_x < dst_cols) && (dst_y < dst_rows))\n"
6622 "{\n"
// Row dst_y: weights (co3, co1, co3) over three consecutive s_dstPatch rows.
6623 "sum =       co3 * s_dstPatch[1 + get_local_id(1) - 1][lx];\n"
6624 "sum = mad(co1, s_dstPatch[1 + get_local_id(1)    ][lx], sum);\n"
6625 "sum = mad(co3, s_dstPatch[1 + get_local_id(1) + 1][lx], sum);\n"
6626 "storepix(convertToT(sum), dstData + dst_y * dst_step + dst_x * PIXSIZE);\n"
6627 "sum =       co3 * s_dstPatch[1 + get_local_id(1) - 1][lx+1];\n"
6628 "sum = mad(co1, s_dstPatch[1 + get_local_id(1)    ][lx+1], sum);\n"
6629 "sum = mad(co3, s_dstPatch[1 + get_local_id(1) + 1][lx+1], sum);\n"
6630 "storepix(convertToT(sum), dstData + dst_y * dst_step + (dst_x+1) * PIXSIZE);\n"
// Row dst_y+1: weights (co2, co2) over two consecutive s_dstPatch rows.
6631 "sum =       co2 * s_dstPatch[1 + get_local_id(1)    ][lx];\n"
6632 "sum = mad(co2, s_dstPatch[1 + get_local_id(1) + 1][lx], sum);\n"
6633 "storepix(convertToT(sum), dstData + (dst_y+1) * dst_step + dst_x * PIXSIZE);\n"
6634 "sum =       co2 * s_dstPatch[1 + get_local_id(1)    ][lx+1];\n"
6635 "sum = mad(co2, s_dstPatch[1 + get_local_id(1) + 1][lx+1], sum);\n"
6636 "storepix(convertToT(sum), dstData + (dst_y+1) * dst_step + (dst_x+1) * PIXSIZE);\n"
6637 "}\n"
6638 "}\n"
6639 , "e48abb0036bd5e090ad06600b018eec9"};
// ProgramSource object constructed from the kernel text held in pyr_up.programStr.
6640 ProgramSource pyr_up_oclsrc(pyr_up.programStr);
6641 const struct ProgramEntry remap={"remap",
6642 "#ifdef DOUBLE_SUPPORT\n"
6643 "#ifdef cl_amd_fp64\n"
6644 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
6645 "#elif defined (cl_khr_fp64)\n"
6646 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
6647 "#endif\n"
6648 "#endif\n"
6649 "#define noconvert\n"
6650 "#if cn != 3\n"
6651 "#define loadpix(addr)  *(__global const T*)(addr)\n"
6652 "#define storepix(val, addr)  *(__global T*)(addr) = val\n"
6653 "#define TSIZE ((int)sizeof(T))\n"
6654 "#define convertScalar(a) (a)\n"
6655 "#else\n"
6656 "#define loadpix(addr)  vload3(0, (__global const T1*)(addr))\n"
6657 "#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))\n"
6658 "#define TSIZE ((int)sizeof(T1)*3)\n"
6659 "#define convertScalar(a) (T)(a.x, a.y, a.z)\n"
6660 "#endif\n"
6661 "enum\n"
6662 "{\n"
6663 "INTER_BITS = 5,\n"
6664 "INTER_TAB_SIZE = 1 << INTER_BITS,\n"
6665 "INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE\n"
6666 "};\n"
6667 "#ifdef INTER_NEAREST\n"
6668 "#define convertToWT\n"
6669 "#endif\n"
6670 "#ifdef BORDER_CONSTANT\n"
6671 "#define EXTRAPOLATE(v2, v) v = scalar;\n"
6672 "#elif defined BORDER_REPLICATE\n"
6673 "#define EXTRAPOLATE(v2, v) \\\n"
6674 "{ \\\n"
6675 "v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \\\n"
6676 "v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \\\n"
6677 "}\n"
6678 "#elif defined BORDER_WRAP\n"
6679 "#define EXTRAPOLATE(v2, v) \\\n"
6680 "{ \\\n"
6681 "if (v2.x < 0) \\\n"
6682 "v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \\\n"
6683 "if (v2.x >= src_cols) \\\n"
6684 "v2.x %= src_cols; \\\n"
6685 "\\\n"
6686 "if (v2.y < 0) \\\n"
6687 "v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \\\n"
6688 "if( v2.y >= src_rows ) \\\n"
6689 "v2.y %= src_rows; \\\n"
6690 "v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \\\n"
6691 "}\n"
6692 "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n"
6693 "#ifdef BORDER_REFLECT\n"
6694 "#define DELTA int delta = 0\n"
6695 "#else\n"
6696 "#define DELTA int delta = 1\n"
6697 "#endif\n"
6698 "#define EXTRAPOLATE(v2, v) \\\n"
6699 "{ \\\n"
6700 "DELTA; \\\n"
6701 "if (src_cols == 1) \\\n"
6702 "v2.x = 0; \\\n"
6703 "else \\\n"
6704 "do \\\n"
6705 "{ \\\n"
6706 "if( v2.x < 0 ) \\\n"
6707 "v2.x = -v2.x - 1 + delta; \\\n"
6708 "else \\\n"
6709 "v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \\\n"
6710 "} \\\n"
6711 "while (v2.x >= src_cols || v2.x < 0); \\\n"
6712 "\\\n"
6713 "if (src_rows == 1) \\\n"
6714 "v2.y = 0; \\\n"
6715 "else \\\n"
6716 "do \\\n"
6717 "{ \\\n"
6718 "if( v2.y < 0 ) \\\n"
6719 "v2.y = -v2.y - 1 + delta; \\\n"
6720 "else \\\n"
6721 "v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \\\n"
6722 "} \\\n"
6723 "while (v2.y >= src_rows || v2.y < 0); \\\n"
6724 "v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \\\n"
6725 "}\n"
6726 "#else\n"
6727 "#error No extrapolation method\n"
6728 "#endif\n"
6729 "#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)\n"
6730 "#ifdef INTER_NEAREST\n"
6731 "__kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
6732 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
6733 "__global const uchar * map1ptr, int map1_step, int map1_offset,\n"
6734 "__global const uchar * map2ptr, int map2_step, int map2_offset,\n"
6735 "ST nVal)\n"
6736 "{\n"
6737 "int x = get_global_id(0);\n"
6738 "int y = get_global_id(1) * rowsPerWI;\n"
6739 "if (x < dst_cols)\n"
6740 "{\n"
6741 "T scalar = convertScalar(nVal);\n"
6742 "int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));\n"
6743 "int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));\n"
6744 "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n"
6745 "#pragma unroll\n"
6746 "for (int i = 0; i < rowsPerWI; ++i, ++y,\n"
6747 "map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)\n"
6748 "if (y < dst_rows)\n"
6749 "{\n"
6750 "__global const float * map1 = (__global const float *)(map1ptr + map1_index);\n"
6751 "__global const float * map2 = (__global const float *)(map2ptr + map2_index);\n"
6752 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
6753 "int gx = convert_int_sat_rte(map1[0]);\n"
6754 "int gy = convert_int_sat_rte(map2[0]);\n"
6755 "if (NEED_EXTRAPOLATION(gx, gy))\n"
6756 "{\n"
6757 "#ifndef BORDER_CONSTANT\n"
6758 "int2 gxy = (int2)(gx, gy);\n"
6759 "#endif\n"
6760 "T v;\n"
6761 "EXTRAPOLATE(gxy, v)\n"
6762 "storepix(v, dst);\n"
6763 "}\n"
6764 "else\n"
6765 "{\n"
6766 "int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));\n"
6767 "storepix(loadpix((__global const T*)(srcptr + src_index)), dst);\n"
6768 "}\n"
6769 "}\n"
6770 "}\n"
6771 "}\n"
6772 "__kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
6773 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
6774 "__global const uchar * mapptr, int map_step, int map_offset,\n"
6775 "ST nVal)\n"
6776 "{\n"
6777 "int x = get_global_id(0);\n"
6778 "int y = get_global_id(1) * rowsPerWI;\n"
6779 "if (x < dst_cols)\n"
6780 "{\n"
6781 "T scalar = convertScalar(nVal);\n"
6782 "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n"
6783 "int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));\n"
6784 "#pragma unroll\n"
6785 "for (int i = 0; i < rowsPerWI; ++i, ++y,\n"
6786 "map_index += map_step, dst_index += dst_step)\n"
6787 "if (y < dst_rows)\n"
6788 "{\n"
6789 "__global const float2 * map = (__global const float2 *)(mapptr + map_index);\n"
6790 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
6791 "int2 gxy = convert_int2_sat_rte(map[0]);\n"
6792 "int gx = gxy.x, gy = gxy.y;\n"
6793 "if (NEED_EXTRAPOLATION(gx, gy))\n"
6794 "{\n"
6795 "T v;\n"
6796 "EXTRAPOLATE(gxy, v)\n"
6797 "storepix(v, dst);\n"
6798 "}\n"
6799 "else\n"
6800 "{\n"
6801 "int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));\n"
6802 "storepix(loadpix((__global const T *)(srcptr + src_index)), dst);\n"
6803 "}\n"
6804 "}\n"
6805 "}\n"
6806 "}\n"
6807 "__kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
6808 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
6809 "__global const uchar * mapptr, int map_step, int map_offset,\n"
6810 "ST nVal)\n"
6811 "{\n"
6812 "int x = get_global_id(0);\n"
6813 "int y = get_global_id(1) * rowsPerWI;\n"
6814 "if (x < dst_cols)\n"
6815 "{\n"
6816 "T scalar = convertScalar(nVal);\n"
6817 "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n"
6818 "int map_index = mad24(y, map_step, mad24(x, (int)sizeof(short2), map_offset));\n"
6819 "#pragma unroll\n"
6820 "for (int i = 0; i < rowsPerWI; ++i, ++y,\n"
6821 "map_index += map_step, dst_index += dst_step)\n"
6822 "if (y < dst_rows)\n"
6823 "{\n"
6824 "__global const short2 * map = (__global const short2 *)(mapptr + map_index);\n"
6825 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
6826 "int2 gxy = convert_int2(map[0]);\n"
6827 "int gx = gxy.x, gy = gxy.y;\n"
6828 "if (NEED_EXTRAPOLATION(gx, gy))\n"
6829 "{\n"
6830 "T v;\n"
6831 "EXTRAPOLATE(gxy, v)\n"
6832 "storepix(v, dst);\n"
6833 "}\n"
6834 "else\n"
6835 "{\n"
6836 "int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));\n"
6837 "storepix(loadpix((__global const T *)(srcptr + src_index)), dst);\n"
6838 "}\n"
6839 "}\n"
6840 "}\n"
6841 "}\n"
6842 "__kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
6843 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
6844 "__global const uchar * map1ptr, int map1_step, int map1_offset,\n"
6845 "__global const uchar * map2ptr, int map2_step, int map2_offset,\n"
6846 "ST nVal)\n"
6847 "{\n"
6848 "int x = get_global_id(0);\n"
6849 "int y = get_global_id(1) * rowsPerWI;\n"
6850 "if (x < dst_cols)\n"
6851 "{\n"
6852 "T scalar = convertScalar(nVal);\n"
6853 "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n"
6854 "int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));\n"
6855 "int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));\n"
6856 "#pragma unroll\n"
6857 "for (int i = 0; i < rowsPerWI; ++i, ++y,\n"
6858 "map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)\n"
6859 "if (y < dst_rows)\n"
6860 "{\n"
6861 "__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);\n"
6862 "__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);\n"
6863 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
6864 "int map2Value = convert_int(map2[0]) & (INTER_TAB_SIZE2 - 1);\n"
6865 "int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0;\n"
6866 "int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 1 : 0;\n"
6867 "int2 gxy = convert_int2(map1[0]) + (int2)(dx, dy);\n"
6868 "int gx = gxy.x, gy = gxy.y;\n"
6869 "if (NEED_EXTRAPOLATION(gx, gy))\n"
6870 "{\n"
6871 "T v;\n"
6872 "EXTRAPOLATE(gxy, v)\n"
6873 "storepix(v, dst);\n"
6874 "}\n"
6875 "else\n"
6876 "{\n"
6877 "int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));\n"
6878 "storepix(loadpix((__global const T *)(srcptr + src_index)), dst);\n"
6879 "}\n"
6880 "}\n"
6881 "}\n"
6882 "}\n"
6883 "#elif defined INTER_LINEAR\n"
6884 "__constant float coeffs[64] =\n"
6885 "{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,\n"
6886 "0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,\n"
6887 "0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,\n"
6888 "0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,\n"
6889 "0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,\n"
6890 "0.062500f, 0.937500f, 0.031250f, 0.968750f };\n"
6891 "__kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
6892 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
6893 "__global const uchar * map1ptr, int map1_step, int map1_offset,\n"
6894 "__global const uchar * map2ptr, int map2_step, int map2_offset,\n"
6895 "ST nVal)\n"
6896 "{\n"
6897 "int x = get_global_id(0);\n"
6898 "int y = get_global_id(1) * rowsPerWI;\n"
6899 "if (x < dst_cols)\n"
6900 "{\n"
6901 "WT scalar = convertToWT(convertScalar(nVal));\n"
6902 "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n"
6903 "int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));\n"
6904 "int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));\n"
6905 "#pragma unroll\n"
6906 "for (int i = 0; i < rowsPerWI; ++i, ++y,\n"
6907 "map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)\n"
6908 "if (y < dst_rows)\n"
6909 "{\n"
6910 "__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);\n"
6911 "__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);\n"
6912 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
6913 "int2 map_dataA = convert_int2(map1[0]);\n"
6914 "int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);\n"
6915 "int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);\n"
6916 "int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);\n"
6917 "ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1));\n"
6918 "WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE);\n"
6919 "WT a = scalar, b = scalar, c = scalar, d = scalar;\n"
6920 "if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))\n"
6921 "a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));\n"
6922 "else\n"
6923 "EXTRAPOLATE(map_dataA, a);\n"
6924 "if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))\n"
6925 "b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));\n"
6926 "else\n"
6927 "EXTRAPOLATE(map_dataB, b);\n"
6928 "if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))\n"
6929 "c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));\n"
6930 "else\n"
6931 "EXTRAPOLATE(map_dataC, c);\n"
6932 "if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))\n"
6933 "d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));\n"
6934 "else\n"
6935 "EXTRAPOLATE(map_dataD, d);\n"
6936 "WT dst_data = a * (1 - u.x) * (1 - u.y) +\n"
6937 "b * (u.x)     * (1 - u.y) +\n"
6938 "c * (1 - u.x) * (u.y) +\n"
6939 "d * (u.x)     * (u.y);\n"
6940 "storepix(convertToT(dst_data), dst);\n"
6941 "}\n"
6942 "}\n"
6943 "}\n"
// OpenCL kernel remap_2_32FC1 (program text; each literal below is one line of the kernel).
// Writes one destination pixel per (x, row) from a 2x2 neighbourhood of the source,
// where the source coordinates are read from two separate float maps:
// map1 supplies the x coordinate and map2 the y coordinate.
// Each work-item handles column x = get_global_id(0) and up to rowsPerWI
// consecutive rows starting at get_global_id(1) * rowsPerWI.
// Parameters (as declared below): *_step/*_offset are used as byte strides/offsets
// added to the uchar base pointers; nVal is the value used for out-of-range taps.
6944 "__kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
6945 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
6946 "__global const uchar * map1ptr, int map1_step, int map1_offset,\n"
6947 "__global const uchar * map2ptr, int map2_step, int map2_offset,\n"
6948 "ST nVal)\n"
6949 "{\n"
6950 "int x = get_global_id(0);\n"
6951 "int y = get_global_id(1) * rowsPerWI;\n"
6952 "if (x < dst_cols)\n"
6953 "{\n"
6954 "WT scalar = convertToWT(convertScalar(nVal));\n"
6955 "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n"
6956 "int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));\n"
6957 "int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));\n"
// Row loop: the three byte indices advance by one row step per iteration; rows
// at or beyond dst_rows are skipped by the guard that follows.
6958 "#pragma unroll\n"
6959 "for (int i = 0; i < rowsPerWI; ++i, ++y,\n"
6960 "map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)\n"
6961 "if (y < dst_rows)\n"
6962 "{\n"
6963 "__global const float * map1 = (__global const float *)(map1ptr + map1_index);\n"
6964 "__global const float * map2 = (__global const float *)(map2ptr + map2_index);\n"
6965 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
// BORDER_CONSTANT path: the integer source position is the coordinate scaled by
// INTER_TAB_SIZE (plus 0.5, truncated) and shifted right by INTER_BITS; the
// fractional index selects a pair of weights from the coeffs table.
// NOTE(review): coeffs, INTER_TAB_SIZE and INTER_BITS for this program are
// defined outside the visible part of the file.
6966 "#if defined BORDER_CONSTANT\n"
6967 "float xf = map1[0], yf = map2[0];\n"
6968 "int sx = convert_int_sat_rtz(mad(xf, INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;\n"
6969 "int sy = convert_int_sat_rtz(mad(yf, INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;\n"
6970 "__constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);\n"
6971 "__constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);\n"
6972 "WT sum = (WT)(0), xsum;\n"
6973 "int src_index = mad24(sy, src_step, mad24(sx, TSIZE, src_offset));\n"
// Two source rows; a row outside [0, src_rows) contributes scalar * coeffs_y[yp].
6974 "#pragma unroll\n"
6975 "for (int yp = 0; yp < 2; ++yp, src_index += src_step)\n"
6976 "{\n"
6977 "if (sy + yp >= 0 && sy + yp < src_rows)\n"
6978 "{\n"
6979 "xsum = (WT)(0);\n"
// Unchecked path, taken only when sx >= 0 and sx + 2 < src_cols; for
// depth == 0 && cn == 1 both taps are fetched with a single vload2.
6980 "if (sx >= 0 && sx + 2 < src_cols)\n"
6981 "{\n"
6982 "#if depth == 0 && cn == 1\n"
6983 "uchar2 value = vload2(0, srcptr + src_index);\n"
6984 "xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));\n"
6985 "#else\n"
6986 "#pragma unroll\n"
6987 "for (int xp = 0; xp < 2; ++xp)\n"
6988 "xsum = fma(convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);\n"
6989 "#endif\n"
6990 "}\n"
6991 "else\n"
6992 "{\n"
// Checked path: each tap is tested individually and replaced by scalar when
// its column is outside [0, src_cols).
6993 "#pragma unroll\n"
6994 "for (int xp = 0; xp < 2; ++xp)\n"
6995 "xsum = fma(sx + xp >= 0 && sx + xp < src_cols ?\n"
6996 "convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);\n"
6997 "}\n"
6998 "sum = fma(xsum, coeffs_y[yp], sum);\n"
6999 "}\n"
7000 "else\n"
7001 "sum = fma(scalar, coeffs_y[yp], sum);\n"
7002 "}\n"
7003 "storepix(convertToT(sum), dst);\n"
// Any other border mode: A is the map coordinate rounded toward negative
// infinity, and B, C, D are its (x+1, y), (x, y+1), (x+1, y+1) neighbours.
// The fractional part u is quantized to multiples of 1/INTER_TAB_SIZE.
// NEED_EXTRAPOLATION / EXTRAPOLATE are macros defined outside this view.
7004 "#else\n"
7005 "float2 map_data = (float2)(map1[0], map2[0]);\n"
7006 "int2 map_dataA = convert_int2_sat_rtn(map_data);\n"
7007 "int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);\n"
7008 "int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);\n"
7009 "int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);\n"
7010 "float2 _u = map_data - convert_float2(map_dataA);\n"
7011 "WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;\n"
// NOTE(review): this re-declares scalar inside the loop body, shadowing the
// identically-initialized scalar declared before the loop.
7012 "WT scalar = convertToWT(convertScalar(nVal));\n"
7013 "WT a = scalar, b = scalar, c = scalar, d = scalar;\n"
7014 "if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))\n"
7015 "a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));\n"
7016 "else\n"
7017 "EXTRAPOLATE(map_dataA, a);\n"
7018 "if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))\n"
7019 "b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));\n"
7020 "else\n"
7021 "EXTRAPOLATE(map_dataB, b);\n"
7022 "if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))\n"
7023 "c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));\n"
7024 "else\n"
7025 "EXTRAPOLATE(map_dataC, c);\n"
7026 "if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))\n"
7027 "d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));\n"
7028 "else\n"
7029 "EXTRAPOLATE(map_dataD, d);\n"
// Bilinear blend of the four neighbours with weights built from u.x and u.y.
7030 "WT dst_data = a * (1 - u.x) * (1 - u.y) +\n"
7031 "b * (u.x)     * (1 - u.y) +\n"
7032 "c * (1 - u.x) * (u.y) +\n"
7033 "d * (u.x)     * (u.y);\n"
7034 "storepix(convertToT(dst_data), dst);\n"
7035 "#endif\n"
7036 "}\n"
7037 "}\n"
7038 "}\n"
// OpenCL kernel remap_32FC2 (program text; each literal below is one line of the kernel).
// Same four-neighbour bilinear blend as the non-BORDER_CONSTANT branch of
// remap_2_32FC1, but both source coordinates come from a single map whose
// elements are read as float2 (x in .x, y in .y).
// Each work-item handles column x = get_global_id(0) and up to rowsPerWI
// consecutive rows starting at get_global_id(1) * rowsPerWI.
7039 "__kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7040 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7041 "__global const uchar * mapptr, int map_step, int map_offset,\n"
7042 "ST nVal)\n"
7043 "{\n"
7044 "int x = get_global_id(0);\n"
7045 "int y = get_global_id(1) * rowsPerWI;\n"
7046 "if (x < dst_cols)\n"
7047 "{\n"
// scalar: nVal converted to the working type; it is the initial value of
// all four taps below.
7048 "WT scalar = convertToWT(convertScalar(nVal));\n"
7049 "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n"
7050 "int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));\n"
// Row loop: byte indices advance by one row step per iteration; rows at or
// beyond dst_rows are skipped by the guard that follows.
7051 "#pragma unroll\n"
7052 "for (int i = 0; i < rowsPerWI; ++i, ++y,\n"
7053 "map_index += map_step, dst_index += dst_step)\n"
7054 "if (y < dst_rows)\n"
7055 "{\n"
7056 "__global const float2 * map = (__global const float2 *)(mapptr + map_index);\n"
7057 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
// A is the map coordinate rounded toward negative infinity; B, C, D are its
// (x+1, y), (x, y+1), (x+1, y+1) neighbours.
7058 "float2 map_data = map[0];\n"
7059 "int2 map_dataA = convert_int2_sat_rtn(map_data);\n"
7060 "int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);\n"
7061 "int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);\n"
7062 "int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);\n"
// Fractional part, quantized to multiples of 1/INTER_TAB_SIZE.
7063 "float2 _u = map_data - convert_float2(map_dataA);\n"
7064 "WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;\n"
7065 "WT a = scalar, b = scalar, c = scalar, d = scalar;\n"
// Each tap is loaded directly unless NEED_EXTRAPOLATION is true for it, in
// which case the EXTRAPOLATE macro (defined outside this view) handles it.
7066 "if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))\n"
7067 "a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));\n"
7068 "else\n"
7069 "EXTRAPOLATE(map_dataA, a);\n"
7070 "if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))\n"
7071 "b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));\n"
7072 "else\n"
7073 "EXTRAPOLATE(map_dataB, b);\n"
7074 "if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))\n"
7075 "c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));\n"
7076 "else\n"
7077 "EXTRAPOLATE(map_dataC, c);\n"
7078 "if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))\n"
7079 "d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));\n"
7080 "else\n"
7081 "EXTRAPOLATE(map_dataD, d);\n"
// Bilinear blend of the four neighbours with weights built from u.x and u.y.
7082 "WT dst_data = a * (1 - u.x) * (1 - u.y) +\n"
7083 "b * (u.x)     * (1 - u.y) +\n"
7084 "c * (1 - u.x) * (u.y) +\n"
7085 "d * (u.x)     * (u.y);\n"
7086 "storepix(convertToT(dst_data), dst);\n"
7087 "}\n"
7088 "}\n"
7089 "}\n"
// Last line of the remap program text; this #endif closes a conditional that
// was opened before the visible part of the file.
7090 "#endif\n"
// Second initializer field of the remap ProgramEntry: a 32-hex-digit string.
// NOTE(review): presumably a digest of the program text above — confirm against the generator.
7091 , "6833b9a226d061c1ff80509eed0dd178"};
// ProgramSource object constructed from the remap entry's programStr member.
7092 ProgramSource remap_oclsrc(remap.programStr);
// ProgramEntry for the OpenCL program named "resize". The adjacent string
// literals that follow are concatenated by the compiler into one program text;
// this first part holds the definitions shared by every resize kernel variant.
7093 const struct ProgramEntry resize={"resize",
// Enable a double-precision extension when DOUBLE_SUPPORT is defined
// (cl_amd_fp64 is preferred over cl_khr_fp64 when both are available).
7094 "#ifdef DOUBLE_SUPPORT\n"
7095 "#ifdef cl_amd_fp64\n"
7096 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
7097 "#elif defined (cl_khr_fp64)\n"
7098 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
7099 "#endif\n"
7100 "#endif\n"
// Fixed-point helpers: CAST_BITS is twice INTER_RESIZE_COEF_BITS (which is not
// defined in this program text); INC(x,l) is x+1 capped at l-1; noconvert
// expands to nothing.
7101 "#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)\n"
7102 "#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)\n"
7103 "#define INC(x,l) min(x+1,l-1)\n"
7104 "#define noconvert\n"
// Pixel access helpers: for cn != 3 a pixel is accessed as a single T; for
// cn == 3 it is accessed with vload3/vstore3 on the element type T1, and the
// pixel size is sizeof(T1)*cn instead of sizeof(T).
7105 "#if cn != 3\n"
7106 "#define loadpix(addr)  *(__global const T *)(addr)\n"
7107 "#define storepix(val, addr)  *(__global T *)(addr) = val\n"
7108 "#define TSIZE (int)sizeof(T)\n"
7109 "#else\n"
7110 "#define loadpix(addr)  vload3(0, (__global const T1 *)(addr))\n"
7111 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
7112 "#define TSIZE (int)sizeof(T1)*cn\n"
7113 "#endif\n"
// USE_SAMPLER variant of the resize program: helper macros followed by the
// resizeSampler kernel, which reads the source through an image sampler.
// READ_IMAGE keeps only the first cn components of read_imagef's result and
// INTERMEDIATE_TYPE is the matching float vector type.
7114 "#if defined USE_SAMPLER\n"
7115 "#if cn == 1\n"
7116 "#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).x\n"
7117 "#define INTERMEDIATE_TYPE  float\n"
7118 "#elif cn == 2\n"
7119 "#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xy\n"
7120 "#define INTERMEDIATE_TYPE  float2\n"
7121 "#elif cn == 3\n"
7122 "#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xyz\n"
7123 "#define INTERMEDIATE_TYPE  float3\n"
7124 "#elif cn == 4\n"
7125 "#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z)\n"
7126 "#define INTERMEDIATE_TYPE  float4\n"
7127 "#endif\n"
7128 "#define __CAT(x, y) x##y\n"
7129 "#define CAT(x, y) __CAT(x, y)\n"
7130 "#define float1 float\n"
// RESULT_SCALE: factor applied to the sampled value before conversion,
// selected by the depth macro (0 -> 255, 1 -> 127, 2 -> 65535, 3 -> 32767,
// anything else -> 1).
// NOTE(review): presumably depth follows OpenCV's depth-code numbering — confirm.
7131 "#if depth == 0\n"
7132 "#define RESULT_SCALE    255.0f\n"
7133 "#elif depth == 1\n"
7134 "#define RESULT_SCALE    127.0f\n"
7135 "#elif depth == 2\n"
7136 "#define RESULT_SCALE    65535.0f\n"
7137 "#elif depth == 3\n"
7138 "#define RESULT_SCALE    32767.0f\n"
7139 "#else\n"
7140 "#define RESULT_SCALE    1.0f\n"
7141 "#endif\n"
// Kernel resizeSampler: one work-item per destination pixel (dx, dy).
// ifx / ify multiply the destination pixel centre to give the sample position.
7142 "__kernel void resizeSampler(__read_only image2d_t srcImage,\n"
7143 "__global uchar* dstptr, int dststep, int dstoffset,\n"
7144 "int dstrows, int dstcols,\n"
7145 "float ifx, float ify)\n"
7146 "{\n"
// Sampler: unnormalized coordinates, clamp-to-edge addressing, linear filtering.
7147 "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
7148 "CLK_ADDRESS_CLAMP_TO_EDGE |\n"
7149 "CLK_FILTER_LINEAR;\n"
7150 "int dx = get_global_id(0);\n"
7151 "int dy = get_global_id(1);\n"
7152 "float sx = ((dx+0.5f) * ifx), sy = ((dy+0.5f) * ify);\n"
// Note: the image read and the conversion run for every work-item; only the
// store at the end is guarded by the dx/dy bounds check.
7153 "INTERMEDIATE_TYPE intermediate = READ_IMAGE(srcImage, sampler, (float2)(sx, sy));\n"
// For depth <= 4 the scaled value is rounded before conversion; otherwise it
// is converted as-is.
7154 "#if depth <= 4\n"
7155 "T uval = convertToDT(round(intermediate * RESULT_SCALE));\n"
7156 "#else\n"
7157 "T uval = convertToDT(intermediate * RESULT_SCALE);\n"
7158 "#endif\n"
7159 "if(dx < dstcols && dy < dstrows)\n"
7160 "{\n"
7161 "storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*TSIZE));\n"
7162 "}\n"
7163 "}\n"
// INTER_LINEAR_INTEGER variant: kernel resizeLN using integer weights and
// source offsets that were precomputed into `buffer`.
// One work-item per destination pixel (dx, dy).
7164 "#elif defined INTER_LINEAR_INTEGER\n"
7165 "__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7166 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7167 "__global const uchar * buffer)\n"
7168 "{\n"
7169 "int dx = get_global_id(0);\n"
7170 "int dy = get_global_id(1);\n"
7171 "if (dx < dst_cols && dy < dst_rows)\n"
7172 "{\n"
// Layout of buffer as indexed here: dst_cols ints (xofs), then dst_rows ints
// (yofs), then shorts — two per destination column (ialpha) followed by two
// per destination row (ibeta).
7173 "__global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols;\n"
7174 "__global const short * ialpha = (__global const short *)(yofs + dst_rows);\n"
7175 "__global const short * ibeta = ialpha + ((dst_cols + dy) << 1);\n"
7176 "ialpha += dx << 1;\n"
// The two source rows are clamped to [0, src_rows - 1]; sx0 is used exactly
// as stored in xofs, and the right-hand neighbour is read at sx0 + 1
// (src_index + TSIZE) without a bounds check in this kernel.
7177 "int sx0 = xofs[dx], sy0 = clamp(yofs[dy], 0, src_rows - 1),\n"
7178 "sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);\n"
7179 "short a0 = ialpha[0], a1 = ialpha[1];\n"
7180 "short b0 = ibeta[0], b1 = ibeta[1];\n"
7181 "int src_index0 = mad24(sy0, src_step, mad24(sx0, TSIZE, src_offset)),\n"
7182 "src_index1 = mad24(sy1, src_step, mad24(sx0, TSIZE, src_offset));\n"
7183 "WT data0 = convertToWT(loadpix(srcptr + src_index0));\n"
7184 "WT data1 = convertToWT(loadpix(srcptr + src_index0 + TSIZE));\n"
7185 "WT data2 = convertToWT(loadpix(srcptr + src_index1));\n"
7186 "WT data3 = convertToWT(loadpix(srcptr + src_index1 + TSIZE));\n"
// Horizontal blend (>> 4), vertical blend (>> 16), then a final rounding
// shift: (val + 2) >> 2.
7187 "WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +\n"
7188 "( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);\n"
7189 "storepix(convertToDT((val + 2) >> 2),\n"
7190 "dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));\n"
7191 "}\n"
7192 "}\n"
// INTER_LINEAR variant: kernel resizeLN computing the source position and
// weights on the fly from the scale factors ifx / ify.
// One work-item per destination pixel (dx, dy).
7193 "#elif defined INTER_LINEAR\n"
7194 "__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7195 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7196 "float ifx, float ify)\n"
7197 "{\n"
7198 "int dx = get_global_id(0);\n"
7199 "int dy = get_global_id(1);\n"
7200 "if (dx < dst_cols && dy < dst_rows)\n"
7201 "{\n"
// Source position with a half-pixel centre offset; (x, y) is its floor and
// (u, v) the fractional remainder.
7202 "float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);\n"
7203 "int x = floor(sx), y = floor(sy);\n"
7204 "float u = sx - x, v = sy - y;\n"
// Clamp to the image; a clamped coordinate also gets a zero fractional weight.
7205 "if ( x<0 ) x=0,u=0;\n"
7206 "if ( x>=src_cols ) x=src_cols-1,u=0;\n"
7207 "if ( y<0 ) y=0,v=0;\n"
7208 "if ( y>=src_rows ) y=src_rows-1,v=0;\n"
// Neighbouring row/column, capped at the last valid index by INC.
7209 "int y_ = INC(y, src_rows);\n"
7210 "int x_ = INC(x, src_cols);\n"
// depth <= 4: weights are scaled by INTER_RESIZE_COEF_SCALE and rounded to
// integers; the weighted sum is rounded and shifted right by CAST_BITS.
7211 "#if depth <= 4\n"
7212 "u = u * INTER_RESIZE_COEF_SCALE;\n"
7213 "v = v * INTER_RESIZE_COEF_SCALE;\n"
7214 "int U = rint(u);\n"
7215 "int V = rint(v);\n"
7216 "int U1 = rint(INTER_RESIZE_COEF_SCALE - u);\n"
7217 "int V1 = rint(INTER_RESIZE_COEF_SCALE - v);\n"
7218 "WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));\n"
7219 "WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));\n"
7220 "WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));\n"
7221 "WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));\n"
7222 "WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +\n"
7223 "mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);\n"
7224 "T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);\n"
// Other depths: plain floating-point bilinear blend.
7225 "#else\n"
7226 "float u1 = 1.f - u;\n"
7227 "float v1 = 1.f - v;\n"
7228 "WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));\n"
7229 "WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));\n"
7230 "WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));\n"
7231 "WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));\n"
7232 "T uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;\n"
7233 "#endif\n"
7234 "storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));\n"
7235 "}\n"
7236 "}\n"
// INTER_NEAREST variant: kernel resizeNN copies, for each destination pixel
// (dx, dy), the source pixel at (dx * ifx, dy * ify) truncated toward zero and
// capped at the last column/row.
7237 "#elif defined INTER_NEAREST\n"
7238 "__kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7239 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7240 "float ifx, float ify)\n"
7241 "{\n"
7242 "int dx = get_global_id(0);\n"
7243 "int dy = get_global_id(1);\n"
7244 "if (dx < dst_cols && dy < dst_rows)\n"
7245 "{\n"
7246 "float s1 = dx * ifx;\n"
7247 "float s2 = dy * ify;\n"
// Only the upper bound is enforced here (min with the last index).
7248 "int sx = min(convert_int_rtz(s1), src_cols - 1);\n"
7249 "int sy = min(convert_int_rtz(s2), src_rows - 1);\n"
7250 "storepix(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))),\n"
7251 "dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));\n"
7252 "}\n"
7253 "}\n"
// INTER_AREA variant, INTER_AREA_FAST sub-case: kernel resizeAREA_FAST sums an
// XSCALE x YSCALE block of source pixels for each destination pixel and
// multiplies the sum by SCALE. XSCALE, YSCALE and SCALE are macros that are not
// defined in this program text.
7254 "#elif defined INTER_AREA\n"
7255 "#ifdef INTER_AREA_FAST\n"
7256 "__kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
7257 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
7258 "{\n"
7259 "int dx = get_global_id(0);\n"
7260 "int dy = get_global_id(1);\n"
7261 "if (dx < dst_cols && dy < dst_rows)\n"
7262 "{\n"
7263 "int dst_index = mad24(dy, dst_step, dst_offset);\n"
// Top-left corner of the source block for this destination pixel.
7264 "int sx = XSCALE * dx;\n"
7265 "int sy = YSCALE * dy;\n"
7266 "WTV sum = (WTV)(0);\n"
// Rows and columns past the image edge are clamped to the last row/column,
// so edge pixels are accumulated repeatedly in that case.
7267 "#pragma unroll\n"
7268 "for (int py = 0; py < YSCALE; ++py)\n"
7269 "{\n"
7270 "int y = min(sy + py, src_rows - 1);\n"
7271 "int src_index = mad24(y, src_step, src_offset);\n"
7272 "#pragma unroll\n"
7273 "for (int px = 0; px < XSCALE; ++px)\n"
7274 "{\n"
7275 "int x = min(sx + px, src_cols - 1);\n"
7276 "sum += convertToWTV(loadpix(src + src_index + x*TSIZE));\n"
7277 "}\n"
7278 "}\n"
7279 "storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));\n"
7280 "}\n"
7281 "}\n"
// INTER_AREA variant, general case: kernel resizeAREA computes each
// destination pixel as a weighted sum over a rectangle of source pixels, using
// precomputed offset, map and weight tables. ifx / ify are accepted but not
// read in the kernel body.
7282 "#else\n"
7283 "__kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
7284 "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7285 "float ifx, float ify, __global const int * ofs_tab,\n"
7286 "__global const int * map_tab, __global const float * alpha_tab)\n"
7287 "{\n"
7288 "int dx = get_global_id(0);\n"
7289 "int dy = get_global_id(1);\n"
7290 "if (dx < dst_cols && dy < dst_rows)\n"
7291 "{\n"
7292 "int dst_index = mad24(dy, dst_step, dst_offset);\n"
// Table layout as indexed here: the y parts of map_tab and alpha_tab start
// (src_cols << 1) entries after the x parts; the y part of ofs_tab starts
// dst_cols + 1 entries after the x part.
7293 "__global const int * xmap_tab = map_tab;\n"
7294 "__global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));\n"
7295 "__global const float * xalpha_tab = alpha_tab;\n"
7296 "__global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));\n"
7297 "__global const int * xofs_tab = ofs_tab;\n"
7298 "__global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);\n"
// [xk0, xk1) and [yk0, yk1) are this pixel's index ranges into the map/alpha
// tables; the first and last map entries give the source rectangle
// [sx0, sx1] x [sy0, sy1].
7299 "int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];\n"
7300 "int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];\n"
7301 "int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];\n"
7302 "int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];\n"
7303 "WTV sum = (WTV)(0), buf;\n"
7304 "int src_index = mad24(sy0, src_step, src_offset);\n"
// Separable accumulation: each source row is weighted horizontally by
// xalpha_tab into buf, then buf is weighted by that row's yalpha_tab entry.
7305 "for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)\n"
7306 "{\n"
7307 "WTV beta = (WTV)(yalpha_tab[yk]);\n"
7308 "buf = (WTV)(0);\n"
7309 "for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)\n"
7310 "{\n"
7311 "WTV alpha = (WTV)(xalpha_tab[xk]);\n"
7312 "buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;\n"
7313 "}\n"
7314 "sum += buf * beta;\n"
7315 "}\n"
7316 "storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));\n"
7317 "}\n"
7318 "}\n"
// End of the resize program text: the first #endif closes the INTER_AREA_FAST
// conditional, the second closes the USE_SAMPLER / INTER_* variant chain.
7319 "#endif\n"
7320 "#endif\n"
// Second initializer field of the resize ProgramEntry: a 32-hex-digit string.
// NOTE(review): presumably a digest of the program text above — confirm against the generator.
7321 , "3e1ea3c21fc70a7a9166d5cc66b7ff80"};
// ProgramSource object constructed from the resize entry's programStr member.
7322 ProgramSource resize_oclsrc(resize.programStr);
// ProgramEntry for the OpenCL program named "threshold". The adjacent string
// literals are concatenated by the compiler into one program text containing a
// single kernel, followed by a 32-hex-digit string.
7323 const struct ProgramEntry threshold={"threshold",
// Enable a double-precision extension when DOUBLE_SUPPORT is defined.
7324 "#ifdef DOUBLE_SUPPORT\n"
7325 "#ifdef cl_amd_fp64\n"
7326 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
7327 "#elif defined (cl_khr_fp64)\n"
7328 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
7329 "#endif\n"
7330 "#endif\n"
// Kernel threshold: each work-item handles element column gx and up to
// STRIDE_SIZE consecutive rows starting at get_global_id(1) * STRIDE_SIZE.
// T, T1 and STRIDE_SIZE are not defined in this program text.
7331 "__kernel void threshold(__global const uchar * srcptr, int src_step, int src_offset,\n"
7332 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,\n"
7333 "T1 thresh, T1 max_val, T1 min_val)\n"
7334 "{\n"
7335 "int gx = get_global_id(0);\n"
7336 "int gy = get_global_id(1) * STRIDE_SIZE;\n"
7337 "if (gx < cols)\n"
7338 "{\n"
7339 "int src_index = mad24(gy, src_step, mad24(gx, (int)sizeof(T), src_offset));\n"
7340 "int dst_index = mad24(gy, dst_step, mad24(gx, (int)sizeof(T), dst_offset));\n"
// gy and both indices advance only inside the gy < rows guard, so once gy
// reaches rows the remaining iterations do nothing.
7341 "#pragma unroll\n"
7342 "for (int i = 0; i < STRIDE_SIZE; i++)\n"
7343 "{\n"
7344 "if (gy < rows)\n"
7345 "{\n"
7346 "T sdata = *(__global const T *)(srcptr + src_index);\n"
7347 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
// Exactly one mode is compiled in, chosen by which THRESH_* macro is defined.
// There is no #else branch: if none is defined, nothing is stored.
// THRESH_TRUNC clamps to [min_val, thresh]; it is the only mode that reads
// min_val.
7348 "#ifdef THRESH_BINARY\n"
7349 "dst[0] = sdata > (thresh) ? (T)(max_val) : (T)(0);\n"
7350 "#elif defined THRESH_BINARY_INV\n"
7351 "dst[0] = sdata > (thresh) ? (T)(0) : (T)(max_val);\n"
7352 "#elif defined THRESH_TRUNC\n"
7353 "dst[0] = clamp(sdata, (T)min_val, (T)(thresh));\n"
7354 "#elif defined THRESH_TOZERO\n"
7355 "dst[0] = sdata > (thresh) ? sdata : (T)(0);\n"
7356 "#elif defined THRESH_TOZERO_INV\n"
7357 "dst[0] = sdata > (thresh) ? (T)(0) : sdata;\n"
7358 "#endif\n"
7359 "gy++;\n"
7360 "src_index += src_step;\n"
7361 "dst_index += dst_step;\n"
7362 "}\n"
7363 "}\n"
7364 "}\n"
7365 "}\n"
// Second initializer field of the threshold ProgramEntry: a 32-hex-digit string.
// NOTE(review): presumably a digest of the program text above — confirm against the generator.
7366 , "f464151682565a20de380a62e09ae458"};
// ProgramSource object constructed from the threshold entry's programStr member.
7367 ProgramSource threshold_oclsrc(threshold.programStr);
// ProgramEntry for the OpenCL program named "warp_affine". The adjacent string
// literals that follow are concatenated by the compiler into one program text;
// this first part holds the definitions shared by the warpAffine variants.
7368 const struct ProgramEntry warp_affine={"warp_affine",
// CT, the element type of the transform matrix M, is double when
// DOUBLE_SUPPORT is defined and float otherwise.
7369 "#ifdef DOUBLE_SUPPORT\n"
7370 "#ifdef cl_amd_fp64\n"
7371 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
7372 "#elif defined (cl_khr_fp64)\n"
7373 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
7374 "#endif\n"
7375 "#define CT double\n"
7376 "#else\n"
7377 "#define CT float\n"
7378 "#endif\n"
// Fixed-point parameters: INTER_BITS fractional bits for the interpolation
// table index (INTER_TAB_SIZE = 32 entries), AB_BITS bits for the scaled
// coordinates, INTER_REMAP_COEF_BITS bits for integer interpolation weights.
// ROUND_DELTA is half of one unit at INTER_BITS precision, expressed at
// AB_BITS precision. noconvert expands to nothing.
7379 "#define INTER_BITS 5\n"
7380 "#define INTER_TAB_SIZE (1 << INTER_BITS)\n"
7381 "#define INTER_SCALE 1.f/INTER_TAB_SIZE\n"
7382 "#define AB_BITS max(10, (int)INTER_BITS)\n"
7383 "#define AB_SCALE (1 << AB_BITS)\n"
7384 "#define INTER_REMAP_COEF_BITS 15\n"
7385 "#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)\n"
7386 "#define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))\n"
7387 "#define noconvert\n"
// ST (type of the border-value kernel argument) defaults to T.
7388 "#ifndef ST\n"
7389 "#define ST T\n"
7390 "#endif\n"
// Pixel access helpers. For cn != 3 a pixel is a single T and `scalar` is just
// the scalar_ argument. For cn == 3 pixels are accessed with vload3/vstore3 on
// T1 and `scalar` is rebuilt from scalar_.x/.y/.z as a T (INTER_NEAREST) or a
// WT (other modes).
7391 "#if cn != 3\n"
7392 "#define loadpix(addr)  *(__global const T*)(addr)\n"
7393 "#define storepix(val, addr)  *(__global T*)(addr) = val\n"
7394 "#define scalar scalar_\n"
7395 "#define pixsize (int)sizeof(T)\n"
7396 "#else\n"
7397 "#define loadpix(addr)  vload3(0, (__global const T1*)(addr))\n"
7398 "#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))\n"
7399 "#ifdef INTER_NEAREST\n"
7400 "#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)\n"
7401 "#else\n"
7402 "#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)\n"
7403 "#endif\n"
7404 "#define pixsize ((int)sizeof(T1)*3)\n"
7405 "#endif\n"
// INTER_NEAREST variant of kernel warpAffine. Each work-item handles
// destination column dx and up to rowsPerWI rows starting at
// get_global_id(1) * rowsPerWI. M is read as six coefficients:
// source x = M[0]*dx + M[1]*dy + M[2], source y = M[3]*dx + M[4]*dy + M[5].
7406 "#ifdef INTER_NEAREST\n"
7407 "__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7408 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7409 "__constant CT * M, ST scalar_)\n"
7410 "{\n"
7411 "int dx = get_global_id(0);\n"
7412 "int dy0 = get_global_id(1) * rowsPerWI;\n"
7413 "if (dx < dst_cols)\n"
7414 "{\n"
// The dx-dependent terms are computed once in AB_SCALE fixed point;
// round_delta is half of AB_SCALE so the later >> AB_BITS rounds to nearest.
7415 "int round_delta = (AB_SCALE >> 1);\n"
7416 "int X0_ = rint(M[0] * dx * AB_SCALE);\n"
7417 "int Y0_ = rint(M[3] * dx * AB_SCALE);\n"
7418 "int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));\n"
// Row loop bounded by both rowsPerWI and dst_rows.
7419 "for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)\n"
7420 "{\n"
7421 "int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;\n"
7422 "int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;\n"
7423 "short sx = convert_short_sat(X0 >> AB_BITS);\n"
7424 "short sy = convert_short_sat(Y0 >> AB_BITS);\n"
// In-range source pixels are copied; anything else receives the border value.
7425 "if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)\n"
7426 "{\n"
7427 "int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset));\n"
7428 "storepix(loadpix(srcptr + src_index), dstptr + dst_index);\n"
7429 "}\n"
7430 "else\n"
7431 "storepix(scalar, dstptr + dst_index);\n"
7432 "}\n"
7433 "}\n"
7434 "}\n"
// INTER_LINEAR variant of kernel warpAffine, preceded by its weight table.
// coeffs holds 64 floats: 32 consecutive pairs (1 - k/32, k/32) for
// k = 0..31, indexed in the kernel by the fractional index shifted left by 1.
7435 "#elif defined INTER_LINEAR\n"
7436 "__constant float coeffs[64] =\n"
7437 "{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,\n"
7438 "0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,\n"
7439 "0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,\n"
7440 "0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,\n"
7441 "0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,\n"
7442 "0.062500f, 0.937500f, 0.031250f, 0.968750f };\n"
// Each work-item handles destination column dx and up to rowsPerWI rows
// starting at get_global_id(1) * rowsPerWI.
7443 "__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7444 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7445 "__constant CT * M, ST scalar_)\n"
7446 "{\n"
7447 "int dx = get_global_id(0);\n"
7448 "int dy0 = get_global_id(1) * rowsPerWI;\n"
7449 "if (dx < dst_cols)\n"
7450 "{\n"
// dx-dependent terms in AB_BITS fixed point, computed once per work-item.
7451 "int tmp = dx << AB_BITS;\n"
7452 "int X0_ = rint(M[0] * tmp);\n"
7453 "int Y0_ = rint(M[3] * tmp);\n"
7454 "for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)\n"
7455 "{\n"
// Reduce the coordinates to INTER_BITS fractional bits, then split them into
// an integer pixel position (sx, sy) and a table index (ax, ay).
7456 "int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;\n"
7457 "int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;\n"
7458 "X0 = X0 >> (AB_BITS - INTER_BITS);\n"
7459 "Y0 = Y0 >> (AB_BITS - INTER_BITS);\n"
7460 "short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);\n"
7461 "short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));\n"
// Path for AMD_DEVICE or depth > 4: the four neighbours default to the border
// value and are loaded individually when in range; weights are computed
// directly from ax / ay instead of being read from coeffs.
7462 "#if defined AMD_DEVICE || depth > 4\n"
7463 "WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;\n"
7464 "if (sx >= 0 && sx < src_cols)\n"
7465 "{\n"
7466 "if (sy >= 0 && sy < src_rows)\n"
7467 "v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));\n"
7468 "if (sy+1 >= 0 && sy+1 < src_rows)\n"
7469 "v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));\n"
7470 "}\n"
7471 "if (sx+1 >= 0 && sx+1 < src_cols)\n"
7472 "{\n"
7473 "if (sy >= 0 && sy < src_rows)\n"
7474 "v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));\n"
7475 "if (sy+1 >= 0 && sy+1 < src_rows)\n"
7476 "v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));\n"
7477 "}\n"
7478 "float taby = 1.f/INTER_TAB_SIZE*ay;\n"
7479 "float tabx = 1.f/INTER_TAB_SIZE*ax;\n"
7480 "int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));\n"
// depth <= 4 (reachable here only with AMD_DEVICE): integer weights scaled
// by INTER_REMAP_COEF_SCALE, with a rounding shift back by
// INTER_REMAP_COEF_BITS.
7481 "#if depth <= 4\n"
7482 "int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );\n"
7483 "int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );\n"
7484 "int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );\n"
7485 "int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );\n"
7486 "WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3)));\n"
7487 "storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);\n"
7488 "#else\n"
7489 "float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;\n"
7490 "WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));\n"
7491 "storepix(convertToT(val), dstptr + dst_index);\n"
7492 "#endif\n"
// Default path: weights come from the coeffs table; rows are accumulated one
// at a time, and out-of-range rows contribute scalar * coeffs_y[y].
7493 "#else\n"
7494 "__constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);\n"
7495 "int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;\n"
7496 "int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));\n"
7497 "WT sum = (WT)(0), xsum;\n"
7498 "#pragma unroll\n"
7499 "for (int y = 0; y < 2; y++)\n"
7500 "{\n"
7501 "src_index = mad24(y, src_step, src_index0);\n"
7502 "if (sy + y >= 0 && sy + y < src_rows)\n"
7503 "{\n"
7504 "xsum = (WT)(0);\n"
// Unchecked path, taken only when sx >= 0 and sx + 2 < src_cols; for
// depth == 0 && cn == 1 both taps are fetched with a single vload2.
7505 "if (sx >= 0 && sx + 2 < src_cols)\n"
7506 "{\n"
7507 "#if depth == 0 && cn == 1\n"
7508 "uchar2 value = vload2(0, srcptr + src_index);\n"
7509 "xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));\n"
7510 "#else\n"
7511 "#pragma unroll\n"
7512 "for (int x = 0; x < 2; x++)\n"
7513 "xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);\n"
7514 "#endif\n"
7515 "}\n"
7516 "else\n"
7517 "{\n"
// Checked path: each tap is replaced by the border value when its column is
// outside [0, src_cols).
7518 "#pragma unroll\n"
7519 "for (int x = 0; x < 2; x++)\n"
7520 "xsum = fma(sx + x >= 0 && sx + x < src_cols ?\n"
7521 "convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);\n"
7522 "}\n"
7523 "sum = fma(xsum, coeffs_y[y], sum);\n"
7524 "}\n"
7525 "else\n"
7526 "sum = fma(scalar, coeffs_y[y], sum);\n"
7527 "}\n"
7528 "storepix(convertToT(sum), dstptr + dst_index);\n"
7529 "#endif\n"
7530 "}\n"
7531 "}\n"
7532 "}\n"
// INTER_CUBIC variant, weight source. With AMD_DEVICE the four weights are
// computed at run time by interpolateCubic; otherwise they are read from the
// constant coeffs table.
7533 "#elif defined INTER_CUBIC\n"
7534 "#ifdef AMD_DEVICE\n"
// interpolateCubic(x, coeffs): writes four weights for fractional offset x
// into coeffs[0..3], using the constant A = -0.75. The fourth weight is
// defined as 1 minus the other three, so the four always sum to 1.
7535 "inline void interpolateCubic( float x, float* coeffs )\n"
7536 "{\n"
7537 "const float A = -0.75f;\n"
7538 "coeffs[0] = fma(fma(fma(A, (x + 1.f), - 5.0f*A), (x + 1.f), 8.0f*A), x + 1.f, - 4.0f*A);\n"
7539 "coeffs[1] = fma(fma(A + 2.f, x, - (A + 3.f)), x*x, 1.f);\n"
7540 "coeffs[2] = fma(fma(A + 2.f, 1.f - x, - (A + 3.f)), (1.f - x)*(1.f - x), 1.f);\n"
7541 "coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];\n"
7542 "}\n"
7543 "#else\n"
// Table of 128 floats; the cubic warpAffine kernel indexes it with the
// fractional index shifted left by 2, i.e. as 32 groups of four weights.
// NOTE(review): the values look like interpolateCubic evaluated at k/32 —
// confirm against the generator before relying on that.
7544 "__constant float coeffs[128] =\n"
7545 "{ 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,\n"
7546 "-0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,\n"
7547 "-0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,\n"
7548 "-0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,\n"
7549 "-0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,\n"
7550 "-0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,\n"
7551 "-0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,\n"
7552 "-0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,\n"
7553 "-0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,\n"
7554 "-0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,\n"
7555 "-0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };\n"
7556 "#endif\n"
// INTER_CUBIC variant of kernel warpAffine: one work-item per destination
// pixel (dx, dy) — unlike the nearest/linear variants there is no rowsPerWI
// loop. The result is a weighted sum over a 4x4 source window.
7557 "__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7558 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7559 "__constant CT * M, ST scalar_)\n"
7560 "{\n"
7561 "int dx = get_global_id(0);\n"
7562 "int dy = get_global_id(1);\n"
7563 "if (dx < dst_cols && dy < dst_rows)\n"
7564 "{\n"
// Source coordinates in AB_BITS fixed point, reduced to INTER_BITS
// fractional bits.
7565 "int tmp = (dx << AB_BITS);\n"
7566 "int X0 = rint(M[0] * tmp) + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;\n"
7567 "int Y0 = rint(M[3] * tmp) + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;\n"
7568 "X0 = X0 >> (AB_BITS - INTER_BITS);\n"
7569 "Y0 = Y0 >> (AB_BITS - INTER_BITS);\n"
// (sx, sy) is the top-left corner of the 4x4 window (integer position minus
// one); (ax, ay) is the fractional table index.
// NOTE(review): a plain (short) cast is used here, whereas the INTER_LINEAR
// kernel uses convert_short_sat — confirm the difference is intentional.
7570 "int sx = (short)(X0 >> INTER_BITS) - 1, sy = (short)(Y0 >> INTER_BITS) - 1;\n"
7571 "int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));\n"
// AMD_DEVICE path: gather the 16 window values into v (border value for
// anything out of range), then compute the weights with interpolateCubic.
7572 "#ifdef AMD_DEVICE\n"
7573 "WT v[16];\n"
7574 "#pragma unroll\n"
7575 "for (int y = 0; y < 4; y++)\n"
7576 "{\n"
7577 "if (sy+y >= 0 && sy+y < src_rows)\n"
7578 "{\n"
7579 "#pragma unroll\n"
7580 "for (int x = 0; x < 4; x++)\n"
7581 "v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ?\n"
7582 "convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;\n"
7583 "}\n"
7584 "else\n"
7585 "{\n"
7586 "#pragma unroll\n"
7587 "for (int x = 0; x < 4; x++)\n"
7588 "v[mad24(y, 4, x)] = scalar;\n"
7589 "}\n"
7590 "}\n"
7591 "float tab1y[4], tab1x[4];\n"
7592 "float ayy = INTER_SCALE * ay;\n"
7593 "float axx = INTER_SCALE * ax;\n"
7594 "interpolateCubic(ayy, tab1y);\n"
7595 "interpolateCubic(axx, tab1x);\n"
7596 "int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));\n"
7597 "WT sum = (WT)(0);\n"
// depth <= 4: the 16 products tab1y[row] * tab1x[col] are converted to
// integer weights scaled by INTER_REMAP_COEF_SCALE, and the sum is rounded
// and shifted back by INTER_REMAP_COEF_BITS. Otherwise the float products
// are accumulated directly.
7598 "#if depth <= 4\n"
7599 "int itab[16];\n"
7600 "#pragma unroll\n"
7601 "for (int i = 0; i < 16; i++)\n"
7602 "itab[i] = rint(tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE);\n"
7603 "#pragma unroll\n"
7604 "for (int i = 0; i < 16; i++)\n"
7605 "sum = mad24(v[i], itab[i], sum);\n"
7606 "storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);\n"
7607 "#else\n"
7608 "#pragma unroll\n"
7609 "for (int i = 0; i < 16; i++)\n"
7610 "sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);\n"
7611 "storepix(convertToT( sum ), dstptr + dst_index);\n"
7612 "#endif\n"
// Default path: weights come from the coeffs table (four per index); rows
// are accumulated one at a time, and out-of-range rows contribute
// scalar * coeffs_y[y].
7613 "#else\n"
7614 "__constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);\n"
7615 "int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;\n"
7616 "int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));\n"
7617 "WT sum = (WT)(0), xsum;\n"
7618 "#pragma unroll\n"
7619 "for (int y = 0; y < 4; y++)\n"
7620 "{\n"
7621 "src_index = mad24(y, src_step, src_index0);\n"
7622 "if (sy + y >= 0 && sy + y < src_rows)\n"
7623 "{\n"
7624 "xsum = (WT)(0);\n"
// Unchecked path, taken only when sx >= 0 and sx + 4 < src_cols; for
// depth == 0 && cn == 1 all four taps are fetched with a single vload4.
7625 "if (sx >= 0 && sx + 4 < src_cols)\n"
7626 "{\n"
7627 "#if depth == 0 && cn == 1\n"
7628 "uchar4 value = vload4(0, srcptr + src_index);\n"
7629 "xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));\n"
7630 "#else\n"
7631 "#pragma unroll\n"
7632 "for (int x = 0; x < 4; x++)\n"
7633 "xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);\n"
7634 "#endif\n"
7635 "}\n"
7636 "else\n"
7637 "{\n"
// Checked path: each tap is replaced by the border value when its column is
// outside [0, src_cols).
7638 "#pragma unroll\n"
7639 "for (int x = 0; x < 4; x++)\n"
7640 "xsum = fma(sx + x >= 0 && sx + x < src_cols ?\n"
7641 "convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);\n"
7642 "}\n"
7643 "sum = fma(xsum, coeffs_y[y], sum);\n"
7644 "}\n"
7645 "else\n"
7646 "sum = fma(scalar, coeffs_y[y], sum);\n"
7647 "}\n"
7648 "storepix(convertToT(sum), dstptr + dst_index);\n"
7649 "#endif\n"
7650 "}\n"
7651 "}\n"
// End of the warp_affine program text: this #endif closes the
// INTER_NEAREST / INTER_LINEAR / INTER_CUBIC variant chain.
7652 "#endif\n"
// Second initializer field of the warp_affine ProgramEntry: a 32-hex-digit string.
// NOTE(review): presumably a digest of the program text above — confirm against the generator.
7653 , "582cfe4cf8dd76973e63698796247546"};
// ProgramSource object constructed from the warp_affine entry's programStr member.
7654 ProgramSource warp_affine_oclsrc(warp_affine.programStr);
// ProgramEntry for the warp_perspective OpenCL program.
// The initializer supplies three strings: the program name, the OpenCL C source
// (adjacent string literals, one per source line), and a 32-hex-digit string.
// NOTE(review): the last string is presumably a digest of the source text; confirm
// against the ProgramEntry declaration before touching either one.
// This file is marked as auto-generated, so the kernel text should be changed in the
// generator input, not here.
//
// Names such as T, T1, WT, cn, depth, convertToT and convertToWT are used by the kernel
// text but not defined in it; presumably they arrive as build options from the host
// side (TODO confirm).
7655 const struct ProgramEntry warp_perspective={"warp_perspective",
// Enable an fp64 extension when DOUBLE_SUPPORT is defined; CT (the type of the
// matrix coefficients and of X0/Y0/W) is double in that case and float otherwise.
7656 "#ifdef DOUBLE_SUPPORT\n"
7657 "#ifdef cl_amd_fp64\n"
7658 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
7659 "#elif defined (cl_khr_fp64)\n"
7660 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
7661 "#endif\n"
7662 "#define CT double\n"
7663 "#else\n"
7664 "#define CT float\n"
7665 "#endif\n"
// Fixed-point constants: INTER_BITS fractional bits give INTER_TAB_SIZE (32) sub-pixel
// positions per axis; interpolation weights on the integer path are scaled by
// INTER_REMAP_COEF_SCALE (1 << 15). AB_BITS and AB_SCALE are defined here but not
// referenced anywhere else in this program text.
7666 "#define INTER_BITS 5\n"
7667 "#define INTER_TAB_SIZE (1 << INTER_BITS)\n"
7668 "#define INTER_SCALE 1.f / INTER_TAB_SIZE\n"
7669 "#define AB_BITS max(10, (int)INTER_BITS)\n"
7670 "#define AB_SCALE (1 << AB_BITS)\n"
7671 "#define INTER_REMAP_COEF_BITS 15\n"
7672 "#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)\n"
7673 "#define noconvert\n"
// ST is the type of the scalar_ kernel argument; it falls back to T when not supplied.
7674 "#ifndef ST\n"
7675 "#define ST T\n"
7676 "#endif\n"
// Pixel access helpers. For cn != 3 a pixel is loaded/stored directly as a T and the
// fill value is scalar_ itself. For cn == 3 pixels go through vload3/vstore3 on T1
// elements, pixsize is three T1 elements, and the fill value is rebuilt from the
// x/y/z components of scalar_ as a T (nearest) or a WT (other interpolation modes).
7677 "#if cn != 3\n"
7678 "#define loadpix(addr)  *(__global const T*)(addr)\n"
7679 "#define storepix(val, addr)  *(__global T*)(addr) = val\n"
7680 "#define scalar scalar_\n"
7681 "#define pixsize (int)sizeof(T)\n"
7682 "#else\n"
7683 "#define loadpix(addr)  vload3(0, (__global const T1*)(addr))\n"
7684 "#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))\n"
7685 "#ifdef INTER_NEAREST\n"
7686 "#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)\n"
7687 "#else\n"
7688 "#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)\n"
7689 "#endif\n"
7690 "#define pixsize ((int)sizeof(T1)*3)\n"
7691 "#endif\n"
// ---- INTER_NEAREST variant -------------------------------------------------------
// Each work-item handles the destination pixel (dx, dy) taken from global ids 0 and 1,
// and does nothing when that pixel lies outside dst_cols x dst_rows.
7692 "#ifdef INTER_NEAREST\n"
7693 "__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7694 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7695 "__constant CT * M, ST scalar_)\n"
7696 "{\n"
7697 "int dx = get_global_id(0);\n"
7698 "int dy = get_global_id(1);\n"
7699 "if (dx < dst_cols && dy < dst_rows)\n"
7700 "{\n"
// M is read as nine coefficients M[0]..M[8]: two numerators (X0, Y0) and a
// denominator (W), each a linear function of dx and dy.
7701 "CT X0 = M[0] * dx + M[1] * dy + M[2];\n"
7702 "CT Y0 = M[3] * dx + M[4] * dy + M[5];\n"
7703 "CT W = M[6] * dx + M[7] * dy + M[8];\n"
// W is replaced by its reciprocal; a zero denominator yields 0 instead of a division.
7704 "W = W != 0.0f ? 1.f / W : 0.0f;\n"
// Source coordinates are rounded and saturated to the short range.
7705 "short sx = convert_short_sat_rte(X0*W);\n"
7706 "short sy = convert_short_sat_rte(Y0*W);\n"
7707 "int dst_index = mad24(dy, dst_step, dx * pixsize + dst_offset);\n"
// Copy the source pixel when (sx, sy) is inside the source; otherwise write the
// fill value.
7708 "if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)\n"
7709 "{\n"
7710 "int src_index = mad24(sy, src_step, sx * pixsize + src_offset);\n"
7711 "storepix(loadpix(srcptr + src_index), dstptr + dst_index);\n"
7712 "}\n"
7713 "else\n"
7714 "storepix(scalar, dstptr + dst_index);\n"
7715 "}\n"
7716 "}\n"
// ---- INTER_LINEAR variant --------------------------------------------------------
// Same work-item mapping and bounds guard as above; blends a 2x2 source neighbourhood.
7717 "#elif defined INTER_LINEAR\n"
7718 "__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7719 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7720 "__constant CT * M, ST scalar_)\n"
7721 "{\n"
7722 "int dx = get_global_id(0);\n"
7723 "int dy = get_global_id(1);\n"
7724 "if (dx < dst_cols && dy < dst_rows)\n"
7725 "{\n"
7726 "CT X0 = M[0] * dx + M[1] * dy + M[2];\n"
7727 "CT Y0 = M[3] * dx + M[4] * dy + M[5];\n"
7728 "CT W = M[6] * dx + M[7] * dy + M[8];\n"
// The reciprocal is pre-multiplied by INTER_TAB_SIZE, so X and Y below are source
// coordinates in units of 1/INTER_TAB_SIZE pixel.
7729 "W = W != 0.0f ? INTER_TAB_SIZE / W : 0.0f;\n"
7730 "int X = rint(X0 * W), Y = rint(Y0 * W);\n"
// Split into the integer pixel position (sx, sy) and the fractional index (ax, ay)
// held in the low INTER_BITS bits.
7731 "short sx = convert_short_sat(X >> INTER_BITS);\n"
7732 "short sy = convert_short_sat(Y >> INTER_BITS);\n"
7733 "short ay = (short)(Y & (INTER_TAB_SIZE - 1));\n"
7734 "short ax = (short)(X & (INTER_TAB_SIZE - 1));\n"
// Fetch the four neighbours (sx,sy), (sx+1,sy), (sx,sy+1), (sx+1,sy+1); any neighbour
// outside the source is replaced by the fill value.
7735 "WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?\n"
7736 "convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;\n"
7737 "WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?\n"
7738 "convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;\n"
7739 "WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?\n"
7740 "convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;\n"
7741 "WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?\n"
7742 "convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;\n"
// Fractional offsets as floats in [0, 1).
7743 "float taby = 1.f/INTER_TAB_SIZE*ay;\n"
7744 "float tabx = 1.f/INTER_TAB_SIZE*ax;\n"
7745 "int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);\n"
// depth <= 4: weights are converted to integers scaled by INTER_REMAP_COEF_SCALE, and
// the weighted sum is rounded (half-scale added) and shifted back down.
// NOTE(review): presumably depth <= 4 selects the integer pixel depths and WT is an
// integer type there; confirm against the host-side build options.
// NOTE(review): with ax == ay == 0 the first product equals INTER_REMAP_COEF_SCALE
// (32768), which the saturating short conversion would clamp to 32767; confirm this
// is intended.
7746 "#if depth <= 4\n"
7747 "int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );\n"
7748 "int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );\n"
7749 "int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );\n"
7750 "int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );\n"
7751 "WT val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;\n"
7752 "storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);\n"
// Otherwise the four neighbours are blended directly with float weights.
7753 "#else\n"
7754 "float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;\n"
7755 "WT val = v0 * tabx2 * taby2 +  v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;\n"
7756 "storepix(convertToT(val), dstptr + dst_index);\n"
7757 "#endif\n"
7758 "}\n"
7759 "}\n"
// ---- INTER_CUBIC variant ---------------------------------------------------------
// interpolateCubic fills coeffs[0..3] with cubic weights for the fractional offset x,
// using A = -0.75; the fourth weight is computed as one minus the other three, so the
// four always sum to 1.
7760 "#elif defined INTER_CUBIC\n"
7761 "inline void interpolateCubic( float x, float* coeffs )\n"
7762 "{\n"
7763 "const float A = -0.75f;\n"
7764 "coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A;\n"
7765 "coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;\n"
7766 "coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f;\n"
7767 "coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];\n"
7768 "}\n"
// Same work-item mapping and bounds guard as the other variants; blends a 4x4 source
// neighbourhood.
7769 "__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
7770 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
7771 "__constant CT * M, ST scalar_)\n"
7772 "{\n"
7773 "int dx = get_global_id(0);\n"
7774 "int dy = get_global_id(1);\n"
7775 "if (dx < dst_cols && dy < dst_rows)\n"
7776 "{\n"
7777 "CT X0 = M[0] * dx + M[1] * dy + M[2];\n"
7778 "CT Y0 = M[3] * dx + M[4] * dy + M[5];\n"
7779 "CT W = M[6] * dx + M[7] * dy + M[8];\n"
7780 "W = W != 0.0f ? INTER_TAB_SIZE / W : 0.0f;\n"
7781 "int X = rint(X0 * W), Y = rint(Y0 * W);\n"
// The integer position is moved back by one so the 4x4 window starts one pixel
// before (X >> INTER_BITS, Y >> INTER_BITS).
7782 "short sx = convert_short_sat(X >> INTER_BITS) - 1;\n"
7783 "short sy = convert_short_sat(Y >> INTER_BITS) - 1;\n"
7784 "short ay = (short)(Y & (INTER_TAB_SIZE-1));\n"
7785 "short ax = (short)(X & (INTER_TAB_SIZE-1));\n"
// Load the 4x4 window into v, row by row (index y*4 + x); samples outside the source
// are replaced by the fill value.
7786 "WT v[16];\n"
7787 "#pragma unroll\n"
7788 "for (int y = 0; y < 4; y++)\n"
7789 "#pragma unroll\n"
7790 "for (int x = 0; x < 4; x++)\n"
7791 "v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?\n"
7792 "convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;\n"
// Per-axis cubic weights for the fractional offsets ay and ax.
7793 "float tab1y[4], tab1x[4];\n"
7794 "float ayy = INTER_SCALE * ay;\n"
7795 "float axx = INTER_SCALE * ax;\n"
7796 "interpolateCubic(ayy, tab1y);\n"
7797 "interpolateCubic(axx, tab1x);\n"
7798 "int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);\n"
7799 "WT sum = (WT)(0);\n"
// depth <= 4: the 16 separable weights (row weight times column weight) are rounded
// to integers scaled by INTER_REMAP_COEF_SCALE, accumulated, then rounded and shifted
// back down.
7800 "#if depth <= 4\n"
7801 "int itab[16];\n"
7802 "#pragma unroll\n"
7803 "for (int i = 0; i < 16; i++)\n"
7804 "itab[i] = rint(tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE);\n"
7805 "#pragma unroll\n"
7806 "for (int i = 0; i < 16; i++)\n"
7807 "sum += v[i] * itab[i];\n"
7808 "storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);\n"
// Otherwise the 16 samples are accumulated directly with float weights.
7809 "#else\n"
7810 "#pragma unroll\n"
7811 "for (int i = 0; i < 16; i++)\n"
7812 "sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];\n"
7813 "storepix(convertToT( sum ), dstptr + dst_index);\n"
7814 "#endif\n"
7815 "}\n"
7816 "}\n"
7817 "#endif\n"
// Third initializer: 32-hex-digit string paired with the source text above.
7818 , "1449b5059b082c4595846a86ed5702ad"};
// ProgramSource object constructed from the kernel source text stored in
// warp_perspective.programStr (the ProgramEntry defined just above).
7819 ProgramSource warp_perspective_oclsrc(warp_perspective.programStr);
7820 }
7821 }}
7822