1 // This file is auto-generated. Do not edit!
2 
3 #include "precomp.hpp"
4 #include "opencl_kernels_core.hpp"
5 
6 namespace cv
7 {
8 namespace ocl
9 {
10 namespace core
11 {
12 
13 const struct ProgramEntry arithm={"arithm",
14 "#ifdef DOUBLE_SUPPORT\n"
15 "#ifdef cl_amd_fp64\n"
16 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
17 "#elif defined cl_khr_fp64\n"
18 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
19 "#endif\n"
20 "#endif\n"
21 "#ifdef INTEL_DEVICE\n"
22 "#pragma OPENCL FP_CONTRACT ON\n"
23 "#pragma OPENCL FP_FAST_FMAF ON\n"
24 "#pragma OPENCL FP_FAST_FMA ON\n"
25 "#endif\n"
26 "#if depth <= 5\n"
27 "#define CV_PI M_PI_F\n"
28 "#else\n"
29 "#define CV_PI M_PI\n"
30 "#endif\n"
31 "#ifndef cn\n"
32 "#define cn 1\n"
33 "#endif\n"
34 "#if cn == 1\n"
35 "#undef srcT1_C1\n"
36 "#undef srcT2_C1\n"
37 "#undef dstT_C1\n"
38 "#define srcT1_C1 srcT1\n"
39 "#define srcT2_C1 srcT2\n"
40 "#define dstT_C1 dstT\n"
41 "#endif\n"
42 "#if cn != 3\n"
43 "#define storedst(val) *(__global dstT *)(dstptr + dst_index) = val\n"
44 "#define storedst2(val) *(__global dstT *)(dstptr2 + dst_index2) = val\n"
45 "#else\n"
46 "#define storedst(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr + dst_index))\n"
47 "#define storedst2(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr2 + dst_index2))\n"
48 "#endif\n"
49 "#define noconvert\n"
50 "#ifndef workT\n"
51 "#ifndef srcT1\n"
52 "#define srcT1 dstT\n"
53 "#endif\n"
54 "#ifndef srcT1_C1\n"
55 "#define srcT1_C1 dstT_C1\n"
56 "#endif\n"
57 "#ifndef srcT2\n"
58 "#define srcT2 dstT\n"
59 "#endif\n"
60 "#ifndef srcT2_C1\n"
61 "#define srcT2_C1 dstT_C1\n"
62 "#endif\n"
63 "#define workT dstT\n"
64 "#if cn != 3\n"
65 "#define srcelem1 *(__global srcT1 *)(srcptr1 + src1_index)\n"
66 "#define srcelem2 *(__global srcT2 *)(srcptr2 + src2_index)\n"
67 "#else\n"
68 "#define srcelem1 vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index))\n"
69 "#define srcelem2 vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index))\n"
70 "#endif\n"
71 "#ifndef convertToDT\n"
72 "#define convertToDT noconvert\n"
73 "#endif\n"
74 "#else\n"
75 "#ifndef convertToWT2\n"
76 "#define convertToWT2 convertToWT1\n"
77 "#endif\n"
78 "#if cn != 3\n"
79 "#define srcelem1 convertToWT1(*(__global srcT1 *)(srcptr1 + src1_index))\n"
80 "#define srcelem2 convertToWT2(*(__global srcT2 *)(srcptr2 + src2_index))\n"
81 "#else\n"
82 "#define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index)))\n"
83 "#define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index)))\n"
84 "#endif\n"
85 "#endif\n"
86 "#ifndef workST\n"
87 "#define workST workT\n"
88 "#endif\n"
89 "#define EXTRA_PARAMS\n"
90 "#define EXTRA_INDEX\n"
91 "#define EXTRA_INDEX_ADD\n"
92 "#if defined OP_ADD\n"
93 "#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))\n"
94 "#elif defined OP_SUB\n"
95 "#define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2))\n"
96 "#elif defined OP_RSUB\n"
97 "#define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1))\n"
98 "#elif defined OP_ABSDIFF\n"
99 "#if wdepth <= 4\n"
100 "#define PROCESS_ELEM \\\n"
101 "storedst(convertToDT(convertFromU(abs_diff(srcelem1, srcelem2))))\n"
102 "#else\n"
103 "#define PROCESS_ELEM \\\n"
104 "storedst(convertToDT(fabs(srcelem1 - srcelem2)))\n"
105 "#endif\n"
106 "#elif defined OP_AND\n"
107 "#define PROCESS_ELEM storedst(srcelem1 & srcelem2)\n"
108 "#elif defined OP_OR\n"
109 "#define PROCESS_ELEM storedst(srcelem1 | srcelem2)\n"
110 "#elif defined OP_XOR\n"
111 "#define PROCESS_ELEM storedst(srcelem1 ^ srcelem2)\n"
112 "#elif defined OP_NOT\n"
113 "#define PROCESS_ELEM storedst(~srcelem1)\n"
114 "#elif defined OP_MIN\n"
115 "#define PROCESS_ELEM storedst(min(srcelem1, srcelem2))\n"
116 "#elif defined OP_MAX\n"
117 "#define PROCESS_ELEM storedst(max(srcelem1, srcelem2))\n"
118 "#elif defined OP_MUL\n"
119 "#define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2))\n"
120 "#elif defined OP_MUL_SCALE\n"
121 "#undef EXTRA_PARAMS\n"
122 "#ifdef UNARY_OP\n"
123 "#define EXTRA_PARAMS , workST srcelem2_, scaleT scale\n"
124 "#undef srcelem2\n"
125 "#define srcelem2 srcelem2_\n"
126 "#else\n"
127 "#define EXTRA_PARAMS , scaleT scale\n"
128 "#endif\n"
129 "#define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2))\n"
130 "#elif defined OP_DIV\n"
131 "#define PROCESS_ELEM \\\n"
132 "workT e2 = srcelem2, zero = (workT)(0); \\\n"
133 "storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero))\n"
134 "#elif defined OP_DIV_SCALE\n"
135 "#undef EXTRA_PARAMS\n"
136 "#ifdef UNARY_OP\n"
137 "#define EXTRA_PARAMS , workST srcelem2_, scaleT scale\n"
138 "#undef srcelem2\n"
139 "#define srcelem2 srcelem2_\n"
140 "#else\n"
141 "#define EXTRA_PARAMS , scaleT scale\n"
142 "#endif\n"
143 "#define PROCESS_ELEM \\\n"
144 "workT e2 = srcelem2, zero = (workT)(0); \\\n"
145 "storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)))\n"
146 "#elif defined OP_RDIV_SCALE\n"
147 "#undef EXTRA_PARAMS\n"
148 "#ifdef UNARY_OP\n"
149 "#define EXTRA_PARAMS , workST srcelem2_, scaleT scale\n"
150 "#undef srcelem2\n"
151 "#define srcelem2 srcelem2_\n"
152 "#else\n"
153 "#define EXTRA_PARAMS , scaleT scale\n"
154 "#endif\n"
155 "#define PROCESS_ELEM \\\n"
156 "workT e1 = srcelem1, zero = (workT)(0); \\\n"
157 "storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)))\n"
158 "#elif defined OP_RECIP_SCALE\n"
159 "#undef EXTRA_PARAMS\n"
160 "#define EXTRA_PARAMS , scaleT scale\n"
161 "#define PROCESS_ELEM \\\n"
162 "workT e1 = srcelem1, zero = (workT)(0); \\\n"
163 "storedst(convertToDT(e1 != zero ? scale / e1 : zero))\n"
164 "#elif defined OP_ADDW\n"
165 "#undef EXTRA_PARAMS\n"
166 "#define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma\n"
167 "#if wdepth <= 4\n"
168 "#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma))))\n"
169 "#else\n"
170 "#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, fma(srcelem2, beta, gamma))))\n"
171 "#endif\n"
172 "#elif defined OP_MAG\n"
173 "#define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2))\n"
174 "#elif defined OP_PHASE_RADIANS\n"
175 "#define PROCESS_ELEM \\\n"
176 "workT tmp = atan2(srcelem2, srcelem1); \\\n"
177 "if (tmp < 0) \\\n"
178 "tmp += 2 * CV_PI; \\\n"
179 "storedst(tmp)\n"
180 "#elif defined OP_PHASE_DEGREES\n"
181 "#define PROCESS_ELEM \\\n"
182 "workT tmp = degrees(atan2(srcelem2, srcelem1)); \\\n"
183 "if (tmp < 0) \\\n"
184 "tmp += 360; \\\n"
185 "storedst(tmp)\n"
186 "#elif defined OP_EXP\n"
187 "#if wdepth == 5\n"
188 "#define PROCESS_ELEM storedst(native_exp(srcelem1))\n"
189 "#else\n"
190 "#define PROCESS_ELEM storedst(exp(srcelem1))\n"
191 "#endif\n"
192 "#elif defined OP_POW\n"
193 "#define PROCESS_ELEM storedst(pow(srcelem1, srcelem2))\n"
194 "#elif defined OP_POWN\n"
195 "#undef workT\n"
196 "#define workT int\n"
197 "#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2))\n"
198 "#elif defined OP_SQRT\n"
199 "#if depth <= 5\n"
200 "#define PROCESS_ELEM storedst(native_sqrt(srcelem1))\n"
201 "#else\n"
202 "#define PROCESS_ELEM storedst(sqrt(srcelem1))\n"
203 "#endif\n"
204 "#elif defined OP_LOG\n"
205 "#define PROCESS_ELEM \\\n"
206 "storedst(log(fabs(srcelem1)))\n"
207 "#elif defined OP_CMP\n"
208 "#define srcT2 srcT1\n"
209 "#ifndef convertToWT1\n"
210 "#define convertToWT1\n"
211 "#endif\n"
212 "#define PROCESS_ELEM \\\n"
213 "storedst(srcelem1 CMP_OPERATOR srcelem2 ? (dstT)(255) : (dstT)(0))\n"
214 "#elif defined OP_CONVERT_SCALE_ABS\n"
215 "#undef EXTRA_PARAMS\n"
216 "#define EXTRA_PARAMS , workT1 alpha, workT1 beta\n"
217 "#if wdepth <= 4\n"
218 "#define PROCESS_ELEM \\\n"
219 "workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \\\n"
220 "storedst(convertToDT(abs(value)))\n"
221 "#else\n"
222 "#define PROCESS_ELEM \\\n"
223 "workT value = fma(srcelem1, (workT)(alpha), (workT)(beta)); \\\n"
224 "storedst(convertToDT(fabs(value)))\n"
225 "#endif\n"
226 "#elif defined OP_SCALE_ADD\n"
227 "#undef EXTRA_PARAMS\n"
228 "#define EXTRA_PARAMS , workT1 alpha\n"
229 "#if wdepth <= 4\n"
230 "#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2)))\n"
231 "#else\n"
232 "#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, (workT)(alpha), srcelem2)))\n"
233 "#endif\n"
234 "#elif defined OP_CTP_AD || defined OP_CTP_AR\n"
235 "#if depth <= 5\n"
236 "#define CV_EPSILON FLT_EPSILON\n"
237 "#else\n"
238 "#define CV_EPSILON DBL_EPSILON\n"
239 "#endif\n"
240 "#ifdef OP_CTP_AD\n"
241 "#define TO_DEGREE cartToPolar = degrees(cartToPolar);\n"
242 "#elif defined OP_CTP_AR\n"
243 "#define TO_DEGREE\n"
244 "#endif\n"
245 "#define PROCESS_ELEM \\\n"
246 "dstT x = srcelem1, y = srcelem2; \\\n"
247 "dstT x2 = x * x, y2 = y * y; \\\n"
248 "dstT magnitude = sqrt(x2 + y2); \\\n"
249 "dstT tmp = y >= 0 ? 0 : CV_PI * 2; \\\n"
250 "tmp = x < 0 ? CV_PI : tmp; \\\n"
251 "dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \\\n"
252 "dstT cartToPolar = y2 <= x2 ? x * y / mad((dstT)(0.28f), y2, x2 + CV_EPSILON) + tmp : (tmp1 - x * y / mad((dstT)(0.28f), x2, y2 + CV_EPSILON)); \\\n"
253 "TO_DEGREE \\\n"
254 "storedst(magnitude); \\\n"
255 "storedst2(cartToPolar)\n"
256 "#elif defined OP_PTC_AD || defined OP_PTC_AR\n"
257 "#ifdef OP_PTC_AD\n"
258 "#define FROM_DEGREE y = radians(y)\n"
259 "#else\n"
260 "#define FROM_DEGREE\n"
261 "#endif\n"
262 "#define PROCESS_ELEM \\\n"
263 "dstT x = srcelem1, y = srcelem2, cosval; \\\n"
264 "FROM_DEGREE; \\\n"
265 "storedst2(sincos(y, &cosval) * x); \\\n"
266 "storedst(cosval * x);\n"
267 "#elif defined OP_PATCH_NANS\n"
268 "#undef EXTRA_PARAMS\n"
269 "#define EXTRA_PARAMS , dstT val\n"
270 "#define PROCESS_ELEM \\\n"
271 "if (isnan(srcelem1)) \\\n"
272 "storedst(val)\n"
273 "#else\n"
274 "#error \"unknown op type\"\n"
275 "#endif\n"
276 "#if defined OP_CTP_AD || defined OP_CTP_AR || defined OP_PTC_AD || defined OP_PTC_AR\n"
277 "#undef EXTRA_PARAMS\n"
278 "#define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2\n"
279 "#undef EXTRA_INDEX\n"
280 "#define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))\n"
281 "#undef EXTRA_INDEX_ADD\n"
282 "#define EXTRA_INDEX_ADD dst_index2 += dststep2\n"
283 "#endif\n"
284 "#if defined UNARY_OP || defined MASK_UNARY_OP\n"
285 "#if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \\\n"
286 "defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \\\n"
287 "defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \\\n"
288 "defined OP_MUL || defined OP_DIV || defined OP_POWN || defined OP_POWR || defined OP_ROOTN\n"
289 "#undef EXTRA_PARAMS\n"
290 "#define EXTRA_PARAMS , workST srcelem2_\n"
291 "#undef srcelem2\n"
292 "#define srcelem2 srcelem2_\n"
293 "#endif\n"
294 "#if cn == 3\n"
295 "#undef srcelem2\n"
296 "#define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z)\n"
297 "#endif\n"
298 "#endif\n"
299 "#if defined BINARY_OP\n"
300 "__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,\n"
301 "__global const uchar * srcptr2, int srcstep2, int srcoffset2,\n"
302 "__global uchar * dstptr, int dststep, int dstoffset,\n"
303 "int rows, int cols EXTRA_PARAMS )\n"
304 "{\n"
305 "int x = get_global_id(0);\n"
306 "int y0 = get_global_id(1) * rowsPerWI;\n"
307 "if (x < cols)\n"
308 "{\n"
309 "int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));\n"
310 "#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))\n"
311 "int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));\n"
312 "#endif\n"
313 "int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));\n"
314 "EXTRA_INDEX;\n"
315 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)\n"
316 "{\n"
317 "PROCESS_ELEM;\n"
318 "#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))\n"
319 "src2_index += srcstep2;\n"
320 "#endif\n"
321 "EXTRA_INDEX_ADD;\n"
322 "}\n"
323 "}\n"
324 "}\n"
325 "#elif defined MASK_BINARY_OP\n"
326 "__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,\n"
327 "__global const uchar * srcptr2, int srcstep2, int srcoffset2,\n"
328 "__global const uchar * mask, int maskstep, int maskoffset,\n"
329 "__global uchar * dstptr, int dststep, int dstoffset,\n"
330 "int rows, int cols EXTRA_PARAMS )\n"
331 "{\n"
332 "int x = get_global_id(0);\n"
333 "int y0 = get_global_id(1) * rowsPerWI;\n"
334 "if (x < cols)\n"
335 "{\n"
336 "int mask_index = mad24(y0, maskstep, x + maskoffset);\n"
337 "int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));\n"
338 "int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));\n"
339 "int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));\n"
340 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2,\n"
341 "mask_index += maskstep, dst_index += dststep)\n"
342 "if (mask[mask_index])\n"
343 "{\n"
344 "PROCESS_ELEM;\n"
345 "}\n"
346 "}\n"
347 "}\n"
348 "#elif defined UNARY_OP\n"
349 "__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,\n"
350 "__global uchar * dstptr, int dststep, int dstoffset,\n"
351 "int rows, int cols EXTRA_PARAMS )\n"
352 "{\n"
353 "int x = get_global_id(0);\n"
354 "int y0 = get_global_id(1) * rowsPerWI;\n"
355 "if (x < cols)\n"
356 "{\n"
357 "int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));\n"
358 "int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));\n"
359 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)\n"
360 "{\n"
361 "PROCESS_ELEM;\n"
362 "}\n"
363 "}\n"
364 "}\n"
365 "#elif defined MASK_UNARY_OP\n"
366 "__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,\n"
367 "__global const uchar * mask, int maskstep, int maskoffset,\n"
368 "__global uchar * dstptr, int dststep, int dstoffset,\n"
369 "int rows, int cols EXTRA_PARAMS )\n"
370 "{\n"
371 "int x = get_global_id(0);\n"
372 "int y0 = get_global_id(1) * rowsPerWI;\n"
373 "if (x < cols)\n"
374 "{\n"
375 "int mask_index = mad24(y0, maskstep, x + maskoffset);\n"
376 "int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));\n"
377 "int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));\n"
378 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep)\n"
379 "if (mask[mask_index])\n"
380 "{\n"
381 "PROCESS_ELEM;\n"
382 "}\n"
383 "}\n"
384 "}\n"
385 "#else\n"
386 "#error \"Unknown operation type\"\n"
387 "#endif\n"
388 , "ed0d3325149acdde26581a72052260a5"};
389 ProgramSource arithm_oclsrc(arithm.programStr);
390 const struct ProgramEntry convert={"convert",
391 "#ifdef DOUBLE_SUPPORT\n"
392 "#ifdef cl_amd_fp64\n"
393 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
394 "#elif defined (cl_khr_fp64)\n"
395 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
396 "#endif\n"
397 "#endif\n"
398 "#define noconvert\n"
399 "__kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,\n"
400 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
401 "WT alpha, WT beta, int rowsPerWI)\n"
402 "{\n"
403 "int x = get_global_id(0);\n"
404 "int y0 = get_global_id(1) * rowsPerWI;\n"
405 "if (x < dst_cols)\n"
406 "{\n"
407 "int src_index = mad24(y0, src_step, mad24(x, (int)sizeof(srcT), src_offset));\n"
408 "int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));\n"
409 "for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index += dst_step)\n"
410 "{\n"
411 "__global const srcT * src = (__global const srcT *)(srcptr + src_index);\n"
412 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n"
413 "dst[0] = convertToDT(fma(convertToWT(src[0]), alpha, beta));\n"
414 "}\n"
415 "}\n"
416 "}\n"
417 , "04c460dd0be049cd132d4c4611c93a5b"};
418 ProgramSource convert_oclsrc(convert.programStr);
419 const struct ProgramEntry copymakeborder={"copymakeborder",
420 "#ifdef DOUBLE_SUPPORT\n"
421 "#ifdef cl_amd_fp64\n"
422 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
423 "#elif defined (cl_khr_fp64)\n"
424 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
425 "#endif\n"
426 "#endif\n"
427 "#if cn != 3\n"
428 "#define loadpix(addr)  *(__global const T*)(addr)\n"
429 "#define storepix(val, addr)  *(__global T*)(addr) = val\n"
430 "#define TSIZE ((int)sizeof(T))\n"
431 "#define convertScalar(a) (a)\n"
432 "#else\n"
433 "#define loadpix(addr)  vload3(0, (__global const T1*)(addr))\n"
434 "#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))\n"
435 "#define TSIZE ((int)sizeof(T1)*3)\n"
436 "#define convertScalar(a) (T)(a.x, a.y, a.z)\n"
437 "#endif\n"
438 "#ifdef BORDER_CONSTANT\n"
439 "#define EXTRAPOLATE(x, cols) \\\n"
440 ";\n"
441 "#elif defined BORDER_REPLICATE\n"
442 "#define EXTRAPOLATE(x, cols) \\\n"
443 "x = clamp(x, 0, cols - 1);\n"
444 "#elif defined BORDER_WRAP\n"
445 "#define EXTRAPOLATE(x, cols) \\\n"
446 "{ \\\n"
447 "if (x < 0) \\\n"
448 "x -= ((x - cols + 1) / cols) * cols; \\\n"
449 "if (x >= cols) \\\n"
450 "x %= cols; \\\n"
451 "}\n"
452 "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n"
453 "#ifdef BORDER_REFLECT\n"
454 "#define DELTA int delta = 0\n"
455 "#else\n"
456 "#define DELTA int delta = 1\n"
457 "#endif\n"
458 "#define EXTRAPOLATE(x, cols) \\\n"
459 "{ \\\n"
460 "DELTA; \\\n"
461 "if (cols == 1) \\\n"
462 "x = 0; \\\n"
463 "else \\\n"
464 "do \\\n"
465 "{ \\\n"
466 "if( x < 0 ) \\\n"
467 "x = -x - 1 + delta; \\\n"
468 "else \\\n"
469 "x = cols - 1 - (x - cols) - delta; \\\n"
470 "} \\\n"
471 "while (x >= cols || x < 0); \\\n"
472 "}\n"
473 "#else\n"
474 "#error \"No extrapolation method\"\n"
475 "#endif\n"
476 "#define NEED_EXTRAPOLATION(x, cols) (x >= cols || x < 0)\n"
477 "__kernel void copyMakeBorder(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
478 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
479 "int top, int left, ST nVal)\n"
480 "{\n"
481 "int x = get_global_id(0);\n"
482 "int y0 = get_global_id(1) * rowsPerWI;\n"
483 "#ifdef BORDER_CONSTANT\n"
484 "T scalar = convertScalar(nVal);\n"
485 "#endif\n"
486 "if (x < dst_cols)\n"
487 "{\n"
488 "int src_x = x - left, src_y;\n"
489 "int dst_index = mad24(y0, dst_step, mad24(x, (int)TSIZE, dst_offset));\n"
490 "if (NEED_EXTRAPOLATION(src_x, src_cols))\n"
491 "{\n"
492 "#ifdef BORDER_CONSTANT\n"
493 "for (int y = y0, y1 = min(y0 + rowsPerWI, dst_rows); y < y1; ++y, dst_index += dst_step)\n"
494 "storepix(scalar, dstptr + dst_index);\n"
495 "return;\n"
496 "#endif\n"
497 "EXTRAPOLATE(src_x, src_cols)\n"
498 "}\n"
499 "src_x = mad24(src_x, TSIZE, src_offset);\n"
500 "for (int y = y0, y1 = min(y0 + rowsPerWI, dst_rows); y < y1; ++y, dst_index += dst_step)\n"
501 "{\n"
502 "src_y = y - top;\n"
503 "if (NEED_EXTRAPOLATION(src_y, src_rows))\n"
504 "{\n"
505 "EXTRAPOLATE(src_y, src_rows)\n"
506 "#ifdef BORDER_CONSTANT\n"
507 "storepix(scalar, dstptr + dst_index);\n"
508 "continue;\n"
509 "#endif\n"
510 "}\n"
511 "int src_index = mad24(src_y, src_step, src_x);\n"
512 "storepix(loadpix(srcptr + src_index), dstptr + dst_index);\n"
513 "}\n"
514 "}\n"
515 "}\n"
516 , "64f03714b8763ec6c2ac2f4b2ad0cf5d"};
517 ProgramSource copymakeborder_oclsrc(copymakeborder.programStr);
518 const struct ProgramEntry copyset={"copyset",
519 "#ifdef COPY_TO_MASK\n"
520 "#define DEFINE_DATA \\\n"
521 "int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T1) * scn, src_offset)); \\\n"
522 "int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(T1) * scn, dst_offset)); \\\n"
523 "\\\n"
524 "__global const T1 * src = (__global const T1 *)(srcptr + src_index); \\\n"
525 "__global T1 * dst = (__global T1 *)(dstptr + dst_index)\n"
526 "__kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_offset,\n"
527 "__global const uchar * mask, int mask_step, int mask_offset,\n"
528 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
529 "int dst_rows, int dst_cols)\n"
530 "{\n"
531 "int x = get_global_id(0);\n"
532 "int y = get_global_id(1);\n"
533 "if (x < dst_cols && y < dst_rows)\n"
534 "{\n"
535 "mask += mad24(y, mask_step, mad24(x, mcn, mask_offset));\n"
536 "#if mcn == 1\n"
537 "if (mask[0])\n"
538 "{\n"
539 "DEFINE_DATA;\n"
540 "#pragma unroll\n"
541 "for (int c = 0; c < scn; ++c)\n"
542 "dst[c] = src[c];\n"
543 "}\n"
544 "#ifdef HAVE_DST_UNINIT\n"
545 "else\n"
546 "{\n"
547 "DEFINE_DATA;\n"
548 "#pragma unroll\n"
549 "for (int c = 0; c < scn; ++c)\n"
550 "dst[c] = (T1)(0);\n"
551 "}\n"
552 "#endif\n"
553 "#elif scn == mcn\n"
554 "DEFINE_DATA;\n"
555 "#pragma unroll\n"
556 "for (int c = 0; c < scn; ++c)\n"
557 "if (mask[c])\n"
558 "dst[c] = src[c];\n"
559 "#ifdef HAVE_DST_UNINIT\n"
560 "else\n"
561 "dst[c] = (T1)(0);\n"
562 "#endif\n"
563 "#else\n"
564 "#error \"(mcn == 1 || mcn == scn) should be true\"\n"
565 "#endif\n"
566 "}\n"
567 "}\n"
568 "#else\n"
569 "#ifndef dstST\n"
570 "#define dstST dstT\n"
571 "#endif\n"
572 "#if cn != 3\n"
573 "#define value value_\n"
574 "#define storedst(val) *(__global dstT *)(dstptr + dst_index) = val\n"
575 "#else\n"
576 "#define value (dstT)(value_.x, value_.y, value_.z)\n"
577 "#define storedst(val) vstore3(val, 0, (__global dstT1 *)(dstptr + dst_index))\n"
578 "#endif\n"
579 "__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,\n"
580 "__global uchar* dstptr, int dststep, int dstoffset,\n"
581 "int rows, int cols, dstST value_)\n"
582 "{\n"
583 "int x = get_global_id(0);\n"
584 "int y0 = get_global_id(1) * rowsPerWI;\n"
585 "if (x < cols)\n"
586 "{\n"
587 "int mask_index = mad24(y0, maskstep, x + maskoffset);\n"
588 "int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT1) * cn, dstoffset));\n"
589 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y)\n"
590 "{\n"
591 "if( mask[mask_index] )\n"
592 "storedst(value);\n"
593 "mask_index += maskstep;\n"
594 "dst_index += dststep;\n"
595 "}\n"
596 "}\n"
597 "}\n"
598 "__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,\n"
599 "int rows, int cols, dstST value_)\n"
600 "{\n"
601 "int x = get_global_id(0);\n"
602 "int y0 = get_global_id(1) * rowsPerWI;\n"
603 "if (x < cols)\n"
604 "{\n"
605 "int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT1) * cn, dstoffset));\n"
606 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, dst_index += dststep)\n"
607 "storedst(value);\n"
608 "}\n"
609 "}\n"
610 "#endif\n"
611 , "f8f028f1776dc5c98bf03411d3b72318"};
612 ProgramSource copyset_oclsrc(copyset.programStr);
613 const struct ProgramEntry fft={"fft",
614 "#define SQRT_2 0.707106781188f\n"
615 "#define sin_120 0.866025403784f\n"
616 "#define fft5_2  0.559016994374f\n"
617 "#define fft5_3 -0.951056516295f\n"
618 "#define fft5_4 -1.538841768587f\n"
619 "#define fft5_5  0.363271264002f\n"
620 "#ifdef DOUBLE_SUPPORT\n"
621 "#ifdef cl_amd_fp64\n"
622 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
623 "#elif defined (cl_khr_fp64)\n"
624 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
625 "#endif\n"
626 "#endif\n"
627 "__attribute__((always_inline))\n"
628 "CT mul_complex(CT a, CT b) {\n"
629 "return (CT)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));\n"
630 "}\n"
631 "__attribute__((always_inline))\n"
632 "CT twiddle(CT a) {\n"
633 "return (CT)(a.y, -a.x);\n"
634 "}\n"
635 "__attribute__((always_inline))\n"
636 "void butterfly2(CT a0, CT a1, __local CT* smem, __global const CT* twiddles,\n"
637 "const int x, const int block_size)\n"
638 "{\n"
639 "const int k = x & (block_size - 1);\n"
640 "a1 = mul_complex(twiddles[k], a1);\n"
641 "const int dst_ind = (x << 1) - k;\n"
642 "smem[dst_ind] = a0 + a1;\n"
643 "smem[dst_ind+block_size] = a0 - a1;\n"
644 "}\n"
645 "__attribute__((always_inline))\n"
646 "void butterfly4(CT a0, CT a1, CT a2, CT a3, __local CT* smem, __global const CT* twiddles,\n"
647 "const int x, const int block_size)\n"
648 "{\n"
649 "const int k = x & (block_size - 1);\n"
650 "a1 = mul_complex(twiddles[k], a1);\n"
651 "a2 = mul_complex(twiddles[k + block_size], a2);\n"
652 "a3 = mul_complex(twiddles[k + 2*block_size], a3);\n"
653 "const int dst_ind = ((x - k) << 2) + k;\n"
654 "CT b0 = a0 + a2;\n"
655 "a2 = a0 - a2;\n"
656 "CT b1 = a1 + a3;\n"
657 "a3 = twiddle(a1 - a3);\n"
658 "smem[dst_ind]                = b0 + b1;\n"
659 "smem[dst_ind + block_size]   = a2 + a3;\n"
660 "smem[dst_ind + 2*block_size] = b0 - b1;\n"
661 "smem[dst_ind + 3*block_size] = a2 - a3;\n"
662 "}\n"
663 "__attribute__((always_inline))\n"
664 "void butterfly3(CT a0, CT a1, CT a2, __local CT* smem, __global const CT* twiddles,\n"
665 "const int x, const int block_size)\n"
666 "{\n"
667 "const int k = x % block_size;\n"
668 "a1 = mul_complex(twiddles[k], a1);\n"
669 "a2 = mul_complex(twiddles[k+block_size], a2);\n"
670 "const int dst_ind = ((x - k) * 3) + k;\n"
671 "CT b1 = a1 + a2;\n"
672 "a2 = twiddle(sin_120*(a1 - a2));\n"
673 "CT b0 = a0 - (CT)(0.5f)*b1;\n"
674 "smem[dst_ind] = a0 + b1;\n"
675 "smem[dst_ind + block_size] = b0 + a2;\n"
676 "smem[dst_ind + 2*block_size] = b0 - a2;\n"
677 "}\n"
678 "__attribute__((always_inline))\n"
679 "void butterfly5(CT a0, CT a1, CT a2, CT a3, CT a4, __local CT* smem, __global const CT* twiddles,\n"
680 "const int x, const int block_size)\n"
681 "{\n"
682 "const int k = x % block_size;\n"
683 "a1 = mul_complex(twiddles[k], a1);\n"
684 "a2 = mul_complex(twiddles[k + block_size], a2);\n"
685 "a3 = mul_complex(twiddles[k+2*block_size], a3);\n"
686 "a4 = mul_complex(twiddles[k+3*block_size], a4);\n"
687 "const int dst_ind = ((x - k) * 5) + k;\n"
688 "__local CT* dst = smem + dst_ind;\n"
689 "CT b0, b1, b5;\n"
690 "b1 = a1 + a4;\n"
691 "a1 -= a4;\n"
692 "a4 = a3 + a2;\n"
693 "a3 -= a2;\n"
694 "a2 = b1 + a4;\n"
695 "b0 = a0 - (CT)0.25f * a2;\n"
696 "b1 = fft5_2 * (b1 - a4);\n"
697 "a4 = fft5_3 * (CT)(-a1.y - a3.y, a1.x + a3.x);\n"
698 "b5 = (CT)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);\n"
699 "a4.x += fft5_4 * a3.y;\n"
700 "a4.y -= fft5_4 * a3.x;\n"
701 "a1 = b0 + b1;\n"
702 "b0 -= b1;\n"
703 "dst[0] = a0 + a2;\n"
704 "dst[block_size] = a1 + a4;\n"
705 "dst[2 * block_size] = b0 + b5;\n"
706 "dst[3 * block_size] = b0 - b5;\n"
707 "dst[4 * block_size] = a1 - a4;\n"
708 "}\n"
709 "__attribute__((always_inline))\n"
710 "void fft_radix2(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n"
711 "{\n"
712 "CT a0, a1;\n"
713 "if (x < t)\n"
714 "{\n"
715 "a0 = smem[x];\n"
716 "a1 = smem[x+t];\n"
717 "}\n"
718 "barrier(CLK_LOCAL_MEM_FENCE);\n"
719 "if (x < t)\n"
720 "butterfly2(a0, a1, smem, twiddles, x, block_size);\n"
721 "barrier(CLK_LOCAL_MEM_FENCE);\n"
722 "}\n"
723 "__attribute__((always_inline))\n"
724 "void fft_radix2_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
725 "{\n"
726 "const int x2 = x1 + t/2;\n"
727 "CT a0, a1, a2, a3;\n"
728 "if (x1 < t/2)\n"
729 "{\n"
730 "a0 = smem[x1]; a1 = smem[x1+t];\n"
731 "a2 = smem[x2]; a3 = smem[x2+t];\n"
732 "}\n"
733 "barrier(CLK_LOCAL_MEM_FENCE);\n"
734 "if (x1 < t/2)\n"
735 "{\n"
736 "butterfly2(a0, a1, smem, twiddles, x1, block_size);\n"
737 "butterfly2(a2, a3, smem, twiddles, x2, block_size);\n"
738 "}\n"
739 "barrier(CLK_LOCAL_MEM_FENCE);\n"
740 "}\n"
741 "__attribute__((always_inline))\n"
742 "void fft_radix2_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
743 "{\n"
744 "const int x2 = x1 + t/3;\n"
745 "const int x3 = x1 + 2*t/3;\n"
746 "CT a0, a1, a2, a3, a4, a5;\n"
747 "if (x1 < t/3)\n"
748 "{\n"
749 "a0 = smem[x1]; a1 = smem[x1+t];\n"
750 "a2 = smem[x2]; a3 = smem[x2+t];\n"
751 "a4 = smem[x3]; a5 = smem[x3+t];\n"
752 "}\n"
753 "barrier(CLK_LOCAL_MEM_FENCE);\n"
754 "if (x1 < t/3)\n"
755 "{\n"
756 "butterfly2(a0, a1, smem, twiddles, x1, block_size);\n"
757 "butterfly2(a2, a3, smem, twiddles, x2, block_size);\n"
758 "butterfly2(a4, a5, smem, twiddles, x3, block_size);\n"
759 "}\n"
760 "barrier(CLK_LOCAL_MEM_FENCE);\n"
761 "}\n"
762 "__attribute__((always_inline))\n"
763 "void fft_radix2_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
764 "{\n"
765 "const int thread_block = t/4;\n"
766 "const int x2 = x1 + thread_block;\n"
767 "const int x3 = x1 + 2*thread_block;\n"
768 "const int x4 = x1 + 3*thread_block;\n"
769 "CT a0, a1, a2, a3, a4, a5, a6, a7;\n"
770 "if (x1 < t/4)\n"
771 "{\n"
772 "a0 = smem[x1]; a1 = smem[x1+t];\n"
773 "a2 = smem[x2]; a3 = smem[x2+t];\n"
774 "a4 = smem[x3]; a5 = smem[x3+t];\n"
775 "a6 = smem[x4]; a7 = smem[x4+t];\n"
776 "}\n"
777 "barrier(CLK_LOCAL_MEM_FENCE);\n"
778 "if (x1 < t/4)\n"
779 "{\n"
780 "butterfly2(a0, a1, smem, twiddles, x1, block_size);\n"
781 "butterfly2(a2, a3, smem, twiddles, x2, block_size);\n"
782 "butterfly2(a4, a5, smem, twiddles, x3, block_size);\n"
783 "butterfly2(a6, a7, smem, twiddles, x4, block_size);\n"
784 "}\n"
785 "barrier(CLK_LOCAL_MEM_FENCE);\n"
786 "}\n"
787 "__attribute__((always_inline))\n"
788 "void fft_radix2_B5(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
789 "{\n"
790 "const int thread_block = t/5;\n"
791 "const int x2 = x1 + thread_block;\n"
792 "const int x3 = x1 + 2*thread_block;\n"
793 "const int x4 = x1 + 3*thread_block;\n"
794 "const int x5 = x1 + 4*thread_block;\n"
795 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;\n"
796 "if (x1 < t/5)\n"
797 "{\n"
798 "a0 = smem[x1]; a1 = smem[x1+t];\n"
799 "a2 = smem[x2]; a3 = smem[x2+t];\n"
800 "a4 = smem[x3]; a5 = smem[x3+t];\n"
801 "a6 = smem[x4]; a7 = smem[x4+t];\n"
802 "a8 = smem[x5]; a9 = smem[x5+t];\n"
803 "}\n"
804 "barrier(CLK_LOCAL_MEM_FENCE);\n"
805 "if (x1 < t/5)\n"
806 "{\n"
807 "butterfly2(a0, a1, smem, twiddles, x1, block_size);\n"
808 "butterfly2(a2, a3, smem, twiddles, x2, block_size);\n"
809 "butterfly2(a4, a5, smem, twiddles, x3, block_size);\n"
810 "butterfly2(a6, a7, smem, twiddles, x4, block_size);\n"
811 "butterfly2(a8, a9, smem, twiddles, x5, block_size);\n"
812 "}\n"
813 "barrier(CLK_LOCAL_MEM_FENCE);\n"
814 "}\n"
815 "__attribute__((always_inline))\n"
816 "void fft_radix4(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n"
817 "{\n"
818 "CT a0, a1, a2, a3;\n"
819 "if (x < t)\n"
820 "{\n"
821 "a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];\n"
822 "}\n"
823 "barrier(CLK_LOCAL_MEM_FENCE);\n"
824 "if (x < t)\n"
825 "butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size);\n"
826 "barrier(CLK_LOCAL_MEM_FENCE);\n"
827 "}\n"
828 "__attribute__((always_inline))\n"
829 "void fft_radix4_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
830 "{\n"
831 "const int x2 = x1 + t/2;\n"
832 "CT a0, a1, a2, a3, a4, a5, a6, a7;\n"
833 "if (x1 < t/2)\n"
834 "{\n"
835 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];\n"
836 "a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];\n"
837 "}\n"
838 "barrier(CLK_LOCAL_MEM_FENCE);\n"
839 "if (x1 < t/2)\n"
840 "{\n"
841 "butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);\n"
842 "butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);\n"
843 "}\n"
844 "barrier(CLK_LOCAL_MEM_FENCE);\n"
845 "}\n"
846 "__attribute__((always_inline))\n"
847 "void fft_radix4_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
848 "{\n"
849 "const int x2 = x1 + t/3;\n"
850 "const int x3 = x2 + t/3;\n"
851 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;\n"
852 "if (x1 < t/3)\n"
853 "{\n"
854 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];\n"
855 "a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];\n"
856 "a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];\n"
857 "}\n"
858 "barrier(CLK_LOCAL_MEM_FENCE);\n"
859 "if (x1 < t/3)\n"
860 "{\n"
861 "butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);\n"
862 "butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);\n"
863 "butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size);\n"
864 "}\n"
865 "barrier(CLK_LOCAL_MEM_FENCE);\n"
866 "}\n"
867 "__attribute__((always_inline))\n"
868 "void fft_radix8(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n"
869 "{\n"
870 "const int k = x % block_size;\n"
871 "CT a0, a1, a2, a3, a4, a5, a6, a7;\n"
872 "if (x < t)\n"
873 "{\n"
874 "int tw_ind = block_size / 8;\n"
875 "a0 = smem[x];\n"
876 "a1 = mul_complex(twiddles[k], smem[x + t]);\n"
877 "a2 = mul_complex(twiddles[k + block_size],smem[x+2*t]);\n"
878 "a3 = mul_complex(twiddles[k+2*block_size],smem[x+3*t]);\n"
879 "a4 = mul_complex(twiddles[k+3*block_size],smem[x+4*t]);\n"
880 "a5 = mul_complex(twiddles[k+4*block_size],smem[x+5*t]);\n"
881 "a6 = mul_complex(twiddles[k+5*block_size],smem[x+6*t]);\n"
882 "a7 = mul_complex(twiddles[k+6*block_size],smem[x+7*t]);\n"
883 "CT b0, b1, b6, b7;\n"
884 "b0 = a0 + a4;\n"
885 "a4 = a0 - a4;\n"
886 "b1 = a1 + a5;\n"
887 "a5 = a1 - a5;\n"
888 "a5 = (CT)(SQRT_2) * (CT)(a5.x + a5.y, -a5.x + a5.y);\n"
889 "b6 = twiddle(a2 - a6);\n"
890 "a2 = a2 + a6;\n"
891 "b7 = a3 - a7;\n"
892 "b7 = (CT)(SQRT_2) * (CT)(-b7.x + b7.y, -b7.x - b7.y);\n"
893 "a3 = a3 + a7;\n"
894 "a0 = b0 + a2;\n"
895 "a2 = b0 - a2;\n"
896 "a1 = b1 + a3;\n"
897 "a3 = twiddle(b1 - a3);\n"
898 "a6 = a4 - b6;\n"
899 "a4 = a4 + b6;\n"
900 "a7 = twiddle(a5 - b7);\n"
901 "a5 = a5 + b7;\n"
902 "}\n"
903 "barrier(CLK_LOCAL_MEM_FENCE);\n"
904 "if (x < t)\n"
905 "{\n"
906 "const int dst_ind = ((x - k) << 3) + k;\n"
907 "__local CT* dst = smem + dst_ind;\n"
908 "dst[0] = a0 + a1;\n"
909 "dst[block_size] = a4 + a5;\n"
910 "dst[2 * block_size] = a2 + a3;\n"
911 "dst[3 * block_size] = a6 + a7;\n"
912 "dst[4 * block_size] = a0 - a1;\n"
913 "dst[5 * block_size] = a4 - a5;\n"
914 "dst[6 * block_size] = a2 - a3;\n"
915 "dst[7 * block_size] = a6 - a7;\n"
916 "}\n"
917 "barrier(CLK_LOCAL_MEM_FENCE);\n"
918 "}\n"
919 "__attribute__((always_inline))\n"
920 "void fft_radix3(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n"
921 "{\n"
922 "CT a0, a1, a2;\n"
923 "if (x < t)\n"
924 "{\n"
925 "a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t];\n"
926 "}\n"
927 "barrier(CLK_LOCAL_MEM_FENCE);\n"
928 "if (x < t)\n"
929 "butterfly3(a0, a1, a2, smem, twiddles, x, block_size);\n"
930 "barrier(CLK_LOCAL_MEM_FENCE);\n"
931 "}\n"
932 "__attribute__((always_inline))\n"
933 "void fft_radix3_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
934 "{\n"
935 "const int x2 = x1 + t/2;\n"
936 "CT a0, a1, a2, a3, a4, a5;\n"
937 "if (x1 < t/2)\n"
938 "{\n"
939 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];\n"
940 "a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];\n"
941 "}\n"
942 "barrier(CLK_LOCAL_MEM_FENCE);\n"
943 "if (x1 < t/2)\n"
944 "{\n"
945 "butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);\n"
946 "butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);\n"
947 "}\n"
948 "barrier(CLK_LOCAL_MEM_FENCE);\n"
949 "}\n"
950 "__attribute__((always_inline))\n"
951 "void fft_radix3_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
952 "{\n"
953 "const int x2 = x1 + t/3;\n"
954 "const int x3 = x2 + t/3;\n"
955 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8;\n"
956 "if (x1 < t/3)\n"
957 "{\n"
958 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];\n"
959 "a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];\n"
960 "a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];\n"
961 "}\n"
962 "barrier(CLK_LOCAL_MEM_FENCE);\n"
963 "if (x1 < t/3)\n"
964 "{\n"
965 "butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);\n"
966 "butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);\n"
967 "butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);\n"
968 "}\n"
969 "barrier(CLK_LOCAL_MEM_FENCE);\n"
970 "}\n"
971 "__attribute__((always_inline))\n"
972 "void fft_radix3_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
973 "{\n"
974 "const int thread_block = t/4;\n"
975 "const int x2 = x1 + thread_block;\n"
976 "const int x3 = x1 + 2*thread_block;\n"
977 "const int x4 = x1 + 3*thread_block;\n"
978 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;\n"
979 "if (x1 < t/4)\n"
980 "{\n"
981 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];\n"
982 "a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];\n"
983 "a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];\n"
984 "a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t];\n"
985 "}\n"
986 "barrier(CLK_LOCAL_MEM_FENCE);\n"
987 "if (x1 < t/4)\n"
988 "{\n"
989 "butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);\n"
990 "butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);\n"
991 "butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);\n"
992 "butterfly3(a9, a10, a11, smem, twiddles, x4, block_size);\n"
993 "}\n"
994 "barrier(CLK_LOCAL_MEM_FENCE);\n"
995 "}\n"
996 "__attribute__((always_inline))\n"
997 "void fft_radix5(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n"
998 "{\n"
999 "const int k = x % block_size;\n"
1000 "CT a0, a1, a2, a3, a4;\n"
1001 "if (x < t)\n"
1002 "{\n"
1003 "a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];\n"
1004 "}\n"
1005 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1006 "if (x < t)\n"
1007 "butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size);\n"
1008 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1009 "}\n"
1010 "__attribute__((always_inline))\n"
1011 "void fft_radix5_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n"
1012 "{\n"
1013 "const int x2 = x1+t/2;\n"
1014 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;\n"
1015 "if (x1 < t/2)\n"
1016 "{\n"
1017 "a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];\n"
1018 "a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];\n"
1019 "}\n"
1020 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1021 "if (x1 < t/2)\n"
1022 "{\n"
1023 "butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size);\n"
1024 "butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size);\n"
1025 "}\n"
1026 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1027 "}\n"
1028 "#ifdef DFT_SCALE\n"
1029 "#define SCALE_VAL(x, scale) x*scale\n"
1030 "#else\n"
1031 "#define SCALE_VAL(x, scale) x\n"
1032 "#endif\n"
1033 "__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
1034 "__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
1035 "__global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)\n"
1036 "{\n"
1037 "const int x = get_global_id(0);\n"
1038 "const int y = get_group_id(1);\n"
1039 "const int block_size = LOCAL_SIZE/kercn;\n"
1040 "if (y < nz)\n"
1041 "{\n"
1042 "__local CT smem[LOCAL_SIZE];\n"
1043 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n"
1044 "const int ind = x;\n"
1045 "#ifdef IS_1D\n"
1046 "FT scale = (FT) 1/dst_cols;\n"
1047 "#else\n"
1048 "FT scale = (FT) 1/(dst_cols*dst_rows);\n"
1049 "#endif\n"
1050 "#ifdef COMPLEX_INPUT\n"
1051 "__global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)));\n"
1052 "#pragma unroll\n"
1053 "for (int i=0; i<kercn; i++)\n"
1054 "smem[x+i*block_size] = src[i*block_size];\n"
1055 "#else\n"
1056 "__global const FT* src = (__global const FT*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(FT), src_offset)));\n"
1057 "#pragma unroll\n"
1058 "for (int i=0; i<kercn; i++)\n"
1059 "smem[x+i*block_size] = (CT)(src[i*block_size], 0.f);\n"
1060 "#endif\n"
1061 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1062 "RADIX_PROCESS;\n"
1063 "#ifdef COMPLEX_OUTPUT\n"
1064 "#ifdef NO_CONJUGATE\n"
1065 "const int cols = dst_cols/2 + 1;\n"
1066 "#else\n"
1067 "const int cols = dst_cols;\n"
1068 "#endif\n"
1069 "__global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n"
1070 "#pragma unroll\n"
1071 "for (int i=x; i<cols; i+=block_size)\n"
1072 "dst[i] = SCALE_VAL(smem[i], scale);\n"
1073 "#ifdef REAL_INPUT\n"
1074 "#ifdef COMPLEX_OUTPUT\n"
1075 "#ifdef IS_1D\n"
1076 "for(int i=x+1; i < (dst_cols+1)/2; i+=block_size)\n"
1077 "{\n"
1078 "dst[dst_cols-i] = (CT)(SCALE_VAL(smem[i].x, scale), SCALE_VAL(-smem[i].y, scale));\n"
1079 "}\n"
1080 "#endif\n"
1081 "#endif\n"
1082 "#endif\n"
1083 "#else\n"
1084 "__local FT* smem_1cn = (__local FT*) smem;\n"
1085 "__global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n"
1086 "for (int i=x; i<dst_cols-1; i+=block_size)\n"
1087 "dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale);\n"
1088 "if (x == 0)\n"
1089 "dst[0] = SCALE_VAL(smem_1cn[0], scale);\n"
1090 "#endif\n"
1091 "}\n"
1092 "else\n"
1093 "{\n"
1094 "#ifdef COMPLEX_OUTPUT\n"
1095 "__global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n"
1096 "#else\n"
1097 "__global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n"
1098 "#endif\n"
1099 "#pragma unroll\n"
1100 "for (int i=x; i<dst_cols; i+=block_size)\n"
1101 "dst[i] = 0.f;\n"
1102 "}\n"
1103 "}\n"
1104 "__kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
1105 "__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
1106 "__global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)\n"
1107 "{\n"
1108 "const int x = get_group_id(0);\n"
1109 "const int y = get_global_id(1);\n"
1110 "if (x < nz)\n"
1111 "{\n"
1112 "__local CT smem[LOCAL_SIZE];\n"
1113 "__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset));\n"
1114 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n"
1115 "const int ind = y;\n"
1116 "const int block_size = LOCAL_SIZE/kercn;\n"
1117 "FT scale = 1.f/(dst_rows*dst_cols);\n"
1118 "#pragma unroll\n"
1119 "for (int i=0; i<kercn; i++)\n"
1120 "smem[y+i*block_size] = *((__global const CT*)(src + i*block_size*src_step));\n"
1121 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1122 "RADIX_PROCESS;\n"
1123 "#ifdef COMPLEX_OUTPUT\n"
1124 "__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));\n"
1125 "#pragma unroll\n"
1126 "for (int i=0; i<kercn; i++)\n"
1127 "*((__global CT*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale);\n"
1128 "#else\n"
1129 "if (x == 0)\n"
1130 "{\n"
1131 "__local FT* smem_1cn = (__local FT*) smem;\n"
1132 "__global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);\n"
1133 "for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)\n"
1134 "*((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale);\n"
1135 "if (y == 0)\n"
1136 "*((__global FT*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale);\n"
1137 "}\n"
1138 "else if (x == (dst_cols+1)/2)\n"
1139 "{\n"
1140 "__local FT* smem_1cn = (__local FT*) smem;\n"
1141 "__global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), mad24(y+1, dst_step, dst_offset));\n"
1142 "for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)\n"
1143 "*((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale);\n"
1144 "if (y == 0)\n"
1145 "*((__global FT*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), dst_offset))) = SCALE_VAL(smem_1cn[0], scale);\n"
1146 "}\n"
1147 "else\n"
1148 "{\n"
1149 "__global uchar* dst = dst_ptr + mad24(x, (int)sizeof(FT)*2, mad24(y, dst_step, dst_offset - (int)sizeof(FT)));\n"
1150 "#pragma unroll\n"
1151 "for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)\n"
1152 "vstore2(SCALE_VAL(smem[i], scale), 0, (__global FT*) dst);\n"
1153 "}\n"
1154 "#endif\n"
1155 "}\n"
1156 "}\n"
1157 "__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
1158 "__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
1159 "__global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)\n"
1160 "{\n"
1161 "const int x = get_global_id(0);\n"
1162 "const int y = get_group_id(1);\n"
1163 "const int block_size = LOCAL_SIZE/kercn;\n"
1164 "#ifdef IS_1D\n"
1165 "const FT scale = (FT) 1/dst_cols;\n"
1166 "#else\n"
1167 "const FT scale = (FT) 1/(dst_cols*dst_rows);\n"
1168 "#endif\n"
1169 "if (y < nz)\n"
1170 "{\n"
1171 "__local CT smem[LOCAL_SIZE];\n"
1172 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n"
1173 "const int ind = x;\n"
1174 "#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)\n"
1175 "__global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)));\n"
1176 "#pragma unroll\n"
1177 "for (int i=0; i<kercn; i++)\n"
1178 "{\n"
1179 "smem[x+i*block_size].x =  src[i*block_size].x;\n"
1180 "smem[x+i*block_size].y = -src[i*block_size].y;\n"
1181 "}\n"
1182 "#else\n"
1183 "#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)\n"
1184 "__global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(FT), src_offset)));\n"
1185 "#pragma unroll\n"
1186 "for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)\n"
1187 "{\n"
1188 "smem[i+1].x = src[i].x;\n"
1189 "smem[i+1].y = -src[i].y;\n"
1190 "smem[LOCAL_SIZE-i-1] = src[i];\n"
1191 "}\n"
1192 "#else\n"
1193 "#pragma unroll\n"
1194 "for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)\n"
1195 "{\n"
1196 "CT src = vload2(0, (__global const FT*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(FT), src_offset))));\n"
1197 "smem[i+1].x = src.x;\n"
1198 "smem[i+1].y = -src.y;\n"
1199 "smem[LOCAL_SIZE-i-1] = src;\n"
1200 "}\n"
1201 "#endif\n"
1202 "if (x==0)\n"
1203 "{\n"
1204 "smem[0].x = *(__global const FT*)(src_ptr + mad24(y, src_step, src_offset));\n"
1205 "smem[0].y = 0.f;\n"
1206 "if(LOCAL_SIZE % 2 ==0)\n"
1207 "{\n"
1208 "#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)\n"
1209 "smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;\n"
1210 "#else\n"
1211 "smem[LOCAL_SIZE/2].x = *(__global const FT*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(FT), src_offset)));\n"
1212 "#endif\n"
1213 "smem[LOCAL_SIZE/2].y = 0.f;\n"
1214 "}\n"
1215 "}\n"
1216 "#endif\n"
1217 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1218 "RADIX_PROCESS;\n"
1219 "#ifdef COMPLEX_OUTPUT\n"
1220 "__global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)));\n"
1221 "#pragma unroll\n"
1222 "for (int i=0; i<kercn; i++)\n"
1223 "{\n"
1224 "dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale);\n"
1225 "dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale);\n"
1226 "}\n"
1227 "#else\n"
1228 "__global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(FT)), dst_offset)));\n"
1229 "#pragma unroll\n"
1230 "for (int i=0; i<kercn; i++)\n"
1231 "{\n"
1232 "dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale);\n"
1233 "}\n"
1234 "#endif\n"
1235 "}\n"
1236 "else\n"
1237 "{\n"
1238 "#ifdef COMPLEX_OUTPUT\n"
1239 "__global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n"
1240 "#else\n"
1241 "__global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n"
1242 "#endif\n"
1243 "#pragma unroll\n"
1244 "for (int i=x; i<dst_cols; i+=block_size)\n"
1245 "dst[i] = 0.f;\n"
1246 "}\n"
1247 "}\n"
1248 "__kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
1249 "__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
1250 "__global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)\n"
1251 "{\n"
1252 "const int x = get_group_id(0);\n"
1253 "const int y = get_global_id(1);\n"
1254 "#ifdef COMPLEX_INPUT\n"
1255 "if (x < nz)\n"
1256 "{\n"
1257 "__local CT smem[LOCAL_SIZE];\n"
1258 "__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset));\n"
1259 "__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));\n"
1260 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n"
1261 "const int ind = y;\n"
1262 "const int block_size = LOCAL_SIZE/kercn;\n"
1263 "#pragma unroll\n"
1264 "for (int i=0; i<kercn; i++)\n"
1265 "{\n"
1266 "CT temp = *((__global const CT*)(src + i*block_size*src_step));\n"
1267 "smem[y+i*block_size].x =  temp.x;\n"
1268 "smem[y+i*block_size].y =  -temp.y;\n"
1269 "}\n"
1270 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1271 "RADIX_PROCESS;\n"
1272 "#pragma unroll\n"
1273 "for (int i=0; i<kercn; i++)\n"
1274 "{\n"
1275 "__global CT* res = (__global CT*)(dst + i*block_size*dst_step);\n"
1276 "res[0].x = smem[y + i*block_size].x;\n"
1277 "res[0].y = -smem[y + i*block_size].y;\n"
1278 "}\n"
1279 "}\n"
1280 "#else\n"
1281 "if (x < nz)\n"
1282 "{\n"
1283 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n"
1284 "const int ind = y;\n"
1285 "const int block_size = LOCAL_SIZE/kercn;\n"
1286 "__local CT smem[LOCAL_SIZE];\n"
1287 "#ifdef EVEN\n"
1288 "if (x!=0 && (x!=(nz-1)))\n"
1289 "#else\n"
1290 "if (x!=0)\n"
1291 "#endif\n"
1292 "{\n"
1293 "__global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(FT), src_offset));\n"
1294 "#pragma unroll\n"
1295 "for (int i=0; i<kercn; i++)\n"
1296 "{\n"
1297 "CT temp = vload2(0, (__global const FT*)(src + i*block_size*src_step));\n"
1298 "smem[y+i*block_size].x = temp.x;\n"
1299 "smem[y+i*block_size].y = -temp.y;\n"
1300 "}\n"
1301 "}\n"
1302 "else\n"
1303 "{\n"
1304 "int ind = x==0 ? 0: 2*x-1;\n"
1305 "__global const FT* src = (__global const FT*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(FT), src_offset)));\n"
1306 "int step = src_step/(int)sizeof(FT);\n"
1307 "#pragma unroll\n"
1308 "for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size)\n"
1309 "{\n"
1310 "smem[i+1].x = src[2*i*step];\n"
1311 "smem[i+1].y = -src[(2*i+1)*step];\n"
1312 "smem[LOCAL_SIZE-i-1].x = src[2*i*step];;\n"
1313 "smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step];\n"
1314 "}\n"
1315 "if (y==0)\n"
1316 "{\n"
1317 "smem[0].x = *(__global const FT*)(src_ptr + mad24(ind, (int)sizeof(FT), src_offset));\n"
1318 "smem[0].y = 0.f;\n"
1319 "if(LOCAL_SIZE % 2 ==0)\n"
1320 "{\n"
1321 "smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step];\n"
1322 "smem[LOCAL_SIZE/2].y = 0.f;\n"
1323 "}\n"
1324 "}\n"
1325 "}\n"
1326 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1327 "RADIX_PROCESS;\n"
1328 "__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));\n"
1329 "#pragma unroll\n"
1330 "for (int i=0; i<kercn; i++)\n"
1331 "{\n"
1332 "__global CT* res = (__global CT*)(dst + i*block_size*dst_step);\n"
1333 "res[0].x =  smem[y + i*block_size].x;\n"
1334 "res[0].y = -smem[y + i*block_size].y;\n"
1335 "}\n"
1336 "}\n"
1337 "#endif\n"
1338 "}\n"
1339 , "3d61276324844b5841beb34e245f270a"};
1340 ProgramSource fft_oclsrc(fft.programStr);
1341 const struct ProgramEntry flip={"flip",
1342 "#if kercn != 3\n"
1343 "#define loadpix(addr) *(__global const T *)(addr)\n"
1344 "#define storepix(val, addr)  *(__global T *)(addr) = val\n"
1345 "#define TSIZE (int)sizeof(T)\n"
1346 "#else\n"
1347 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
1348 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
1349 "#define TSIZE ((int)sizeof(T1)*3)\n"
1350 "#endif\n"
1351 "__kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int src_offset,\n"
1352 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
1353 "int rows, int cols, int thread_rows, int thread_cols)\n"
1354 "{\n"
1355 "int x = get_global_id(0);\n"
1356 "int y0 = get_global_id(1) * PIX_PER_WI_Y;\n"
1357 "if (x < cols)\n"
1358 "{\n"
1359 "int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));\n"
1360 "int src_index1 = mad24(rows - y0 - 1, src_step, mad24(x, TSIZE, src_offset));\n"
1361 "int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));\n"
1362 "int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(x, TSIZE, dst_offset));\n"
1363 "#pragma unroll\n"
1364 "for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)\n"
1365 "{\n"
1366 "T src0 = loadpix(srcptr + src_index0);\n"
1367 "T src1 = loadpix(srcptr + src_index1);\n"
1368 "storepix(src1, dstptr + dst_index0);\n"
1369 "storepix(src0, dstptr + dst_index1);\n"
1370 "src_index0 += src_step;\n"
1371 "src_index1 -= src_step;\n"
1372 "dst_index0 += dst_step;\n"
1373 "dst_index1 -= dst_step;\n"
1374 "}\n"
1375 "}\n"
1376 "}\n"
1377 "__kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, int src_offset,\n"
1378 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
1379 "int rows, int cols, int thread_rows, int thread_cols)\n"
1380 "{\n"
1381 "int x = get_global_id(0);\n"
1382 "int y0 = get_global_id(1)*PIX_PER_WI_Y;\n"
1383 "if (x < cols)\n"
1384 "{\n"
1385 "int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));\n"
1386 "int src_index1 = mad24(rows - y0 - 1, src_step, mad24(cols - x - 1, TSIZE, src_offset));\n"
1387 "int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));\n"
1388 "int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));\n"
1389 "#pragma unroll\n"
1390 "for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)\n"
1391 "{\n"
1392 "T src0 = loadpix(srcptr + src_index0);\n"
1393 "T src1 = loadpix(srcptr + src_index1);\n"
1394 "#if kercn == 2\n"
1395 "#if cn == 1\n"
1396 "src0 = src0.s10;\n"
1397 "src1 = src1.s10;\n"
1398 "#endif\n"
1399 "#elif kercn == 4\n"
1400 "#if cn == 1\n"
1401 "src0 = src0.s3210;\n"
1402 "src1 = src1.s3210;\n"
1403 "#elif cn == 2\n"
1404 "src0 = src0.s2301;\n"
1405 "src1 = src1.s2301;\n"
1406 "#endif\n"
1407 "#endif\n"
1408 "storepix(src1, dstptr + dst_index0);\n"
1409 "storepix(src0, dstptr + dst_index1);\n"
1410 "src_index0 += src_step;\n"
1411 "src_index1 -= src_step;\n"
1412 "dst_index0 += dst_step;\n"
1413 "dst_index1 -= dst_step;\n"
1414 "}\n"
1415 "}\n"
1416 "}\n"
1417 "__kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int src_offset,\n"
1418 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
1419 "int rows, int cols, int thread_rows, int thread_cols)\n"
1420 "{\n"
1421 "int x = get_global_id(0);\n"
1422 "int y0 = get_global_id(1)*PIX_PER_WI_Y;\n"
1423 "if (x < thread_cols)\n"
1424 "{\n"
1425 "int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));\n"
1426 "int src_index1 = mad24(y0, src_step, mad24(cols - x - 1, TSIZE, src_offset));\n"
1427 "int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));\n"
1428 "int dst_index1 = mad24(y0, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));\n"
1429 "#pragma unroll\n"
1430 "for (int y = y0, y1 = min(rows, y0 + PIX_PER_WI_Y); y < y1; ++y)\n"
1431 "{\n"
1432 "T src0 = loadpix(srcptr + src_index0);\n"
1433 "T src1 = loadpix(srcptr + src_index1);\n"
1434 "#if kercn == 2\n"
1435 "#if cn == 1\n"
1436 "src0 = src0.s10;\n"
1437 "src1 = src1.s10;\n"
1438 "#endif\n"
1439 "#elif kercn == 4\n"
1440 "#if cn == 1\n"
1441 "src0 = src0.s3210;\n"
1442 "src1 = src1.s3210;\n"
1443 "#elif cn == 2\n"
1444 "src0 = src0.s2301;\n"
1445 "src1 = src1.s2301;\n"
1446 "#endif\n"
1447 "#endif\n"
1448 "storepix(src1, dstptr + dst_index0);\n"
1449 "storepix(src0, dstptr + dst_index1);\n"
1450 "src_index0 += src_step;\n"
1451 "src_index1 += src_step;\n"
1452 "dst_index0 += dst_step;\n"
1453 "dst_index1 += dst_step;\n"
1454 "}\n"
1455 "}\n"
1456 "}\n"
1457 , "296714e8641b2d8359104d11b332b14b"};
1458 ProgramSource flip_oclsrc(flip.programStr);
1459 const struct ProgramEntry gemm={"gemm",
1460 "#ifdef DOUBLE_SUPPORT\n"
1461 "#ifdef cl_amd_fp64\n"
1462 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
1463 "#elif defined (cl_khr_fp64)\n"
1464 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
1465 "#endif\n"
1466 "#endif\n"
1467 "#define TSIZE  (int)sizeof(T)\n"
1468 "#define WTSIZE (int)sizeof(WT)\n"
1469 "#define IND_A mad24(y, A_step, A_offset)\n"
1470 "#define IND_B mad24(x, WTSIZE, B_offset)\n"
1471 "#define STEP_B B_step / WTSIZE\n"
1472 "#define LOCAL_SIZE_ODD (LOCAL_SIZE + 1)\n"
1473 "#if cn==2\n"
1474 "#if kercn==2\n"
1475 "#define MUL(a, b)\\\n"
1476 "{\\\n"
1477 "sum.x += fma(a.x, b.x, - a.y * b.y);\\\n"
1478 "sum.y += fma(a.x, b.y, a.y * b.x);\\\n"
1479 "}\n"
1480 "#else\n"
1481 "#define MUL(a, b)\\\n"
1482 "{\\\n"
1483 "sum.x += fma(a.x, b.x, - a.y * b.y);\\\n"
1484 "sum.y += fma(a.x, b.y, a.y * b.x);\\\n"
1485 "sum.z += fma(a.x, b.z, - a.y * b.w);\\\n"
1486 "sum.w += fma(a.x, b.w, a.y * b.z);\\\n"
1487 "}\n"
1488 "#endif\n"
1489 "#else\n"
1490 "#define MUL(a, b) sum = fma(a, b, sum);\n"
1491 "#endif\n"
1492 "__kernel void gemm(__global const uchar * A_ptr, int A_step, int A_offset,\n"
1493 "__global const uchar * B_ptr, int B_step, int B_offset,\n"
1494 "__global uchar * D_ptr, int D_step, int D_offset, int D_rows, int D_cols,\n"
1495 "int n, T1 alpha, T1 beta)\n"
1496 "{\n"
1497 "int x = get_global_id(0);\n"
1498 "int y = get_global_id(1);\n"
1499 "int lidx = get_local_id(0);\n"
1500 "int lidy = get_local_id(1);\n"
1501 "__global const T* A = (__global const T*)(A_ptr + IND_A);\n"
1502 "__global const WT* B = (__global const WT*)(B_ptr + IND_B);\n"
1503 "WT sum = (WT)(0);\n"
1504 "#if LOCAL_SIZE == 1\n"
1505 "if (x < D_cols && y < D_rows)\n"
1506 "{\n"
1507 "for (int i = 0; i < n; ++i)\n"
1508 "MUL(A[i], B[i*STEP_B]);\n"
1509 "#else\n"
1510 "__local T  a_local[LOCAL_SIZE_ODD*LOCAL_SIZE];\n"
1511 "__local WT b_local[LOCAL_SIZE_ODD*LOCAL_SIZE];\n"
1512 "int reps;\n"
1513 "#if NO_MULT\n"
1514 "reps = (n + LOCAL_SIZE-1)/LOCAL_SIZE;\n"
1515 "#else\n"
1516 "reps = n/LOCAL_SIZE;\n"
1517 "#endif\n"
1518 "for (int p = 0; p < reps; ++p)\n"
1519 "{\n"
1520 "if (p * LOCAL_SIZE + lidx < n && y < D_rows)\n"
1521 "a_local[mad24(lidy, LOCAL_SIZE_ODD, lidx)] = A[mad24(p, LOCAL_SIZE, lidx)];\n"
1522 "if (p * LOCAL_SIZE + lidy < n && x < D_cols)\n"
1523 "b_local[mad24(lidy, LOCAL_SIZE_ODD, lidx)] = B[mad24(p, LOCAL_SIZE, lidy)*STEP_B];\n"
1524 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1525 "if (x < D_cols && y < D_rows)\n"
1526 "{\n"
1527 "#if NO_MULT\n"
1528 "int ie = min(LOCAL_SIZE, n - p * LOCAL_SIZE);\n"
1529 "for (int i = 0; i < ie; ++i)\n"
1530 "#else\n"
1531 "for (int i = 0; i < LOCAL_SIZE; ++i)\n"
1532 "#endif\n"
1533 "MUL(a_local[mad24(lidy, LOCAL_SIZE_ODD, i)], b_local[mad24(i, LOCAL_SIZE_ODD, lidx)]);\n"
1534 "}\n"
1535 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1536 "}\n"
1537 "if (x < D_cols && y < D_rows)\n"
1538 "{\n"
1539 "#endif\n"
1540 "__global WT* D = (__global WT*)(D_ptr + mad24(y, D_step, mad24(x, WTSIZE, D_offset)));\n"
1541 "#if HAVE_C\n"
1542 "D[0] = mad(alpha, sum, D[0]*beta);\n"
1543 "#else\n"
1544 "D[0] = alpha * sum;\n"
1545 "#endif\n"
1546 "}\n"
1547 "}\n"
1548 , "0a79f557db56fcdce22ea905e7fc899f"};
1549 ProgramSource gemm_oclsrc(gemm.programStr);
1550 const struct ProgramEntry inrange={"inrange",
1551 "#ifdef DOUBLE_SUPPORT\n"
1552 "#ifdef cl_amd_fp64\n"
1553 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
1554 "#elif defined (cl_khr_fp64)\n"
1555 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
1556 "#endif\n"
1557 "#endif\n"
1558 "__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,\n"
1559 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
1560 "#ifdef HAVE_SCALAR\n"
1561 "__global const srcT1 * src2, __global const srcT1 * src3,\n"
1562 "#else\n"
1563 "__global const uchar * src2ptr, int src2_step, int src2_offset,\n"
1564 "__global const uchar * src3ptr, int src3_step, int src3_offset,\n"
1565 "#endif\n"
1566 "int rowsPerWI)\n"
1567 "{\n"
1568 "int x = get_global_id(0);\n"
1569 "int y0 = get_global_id(1) * rowsPerWI;\n"
1570 "if (x < dst_cols)\n"
1571 "{\n"
1572 "int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(srcT1) * kercn, src1_offset));\n"
1573 "int dst_index = mad24(y0, dst_step, mad24(x, colsPerWI, dst_offset));\n"
1574 "#ifndef HAVE_SCALAR\n"
1575 "int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(srcT1) * kercn, src2_offset));\n"
1576 "int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(srcT1) * kercn, src3_offset));\n"
1577 "#endif\n"
1578 "for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)\n"
1579 "{\n"
1580 "#if kercn >= cn && kercn == 4 && depth <= 4 && !defined HAVE_SCALAR\n"
1581 "srcT src1 = *(__global const srcT *)(src1ptr + src1_index);\n"
1582 "srcT src2 = *(__global const srcT *)(src2ptr + src2_index);\n"
1583 "srcT src3 = *(__global const srcT *)(src3ptr + src3_index);\n"
1584 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n"
1585 "#if cn == 1\n"
1586 "dst[0] = src2 > src1 || src3 < src1 ? (dstT)(0) : (dstT)(255);\n"
1587 "#elif cn == 2\n"
1588 "dst[0] = (dstT)(src2.xy > src1.xy || src3.xy < src1.xy ||\n"
1589 "src2.zw > src1.zw || src3.zw < src1.zw ? (dstT)(0) : (dstT)(255);\n"
1590 "#elif cn == 4\n"
1591 "dst[0] = (dstT)(src2.x > src1.x || src3.x < src1.x ||\n"
1592 "src2.y > src1.y || src3.y < src1.y ||\n"
1593 "src2.z > src1.z || src3.z < src1.z ||\n"
1594 "src2.w > src1.w || src3.w < src1.w ? 0 : 255);\n"
1595 "#endif\n"
1596 "#else\n"
1597 "__global const srcT1 * src1 = (__global const srcT1 *)(src1ptr + src1_index);\n"
1598 "__global uchar * dst = dstptr + dst_index;\n"
1599 "#ifndef HAVE_SCALAR\n"
1600 "__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);\n"
1601 "__global const srcT1 * src3 = (__global const srcT1 *)(src3ptr + src3_index);\n"
1602 "#endif\n"
1603 "#pragma unroll\n"
1604 "for (int px = 0; px < colsPerWI; ++px, src1 += cn\n"
1605 "#ifndef HAVE_SCALAR\n"
1606 ", src2 += cn, src3 += cn\n"
1607 "#endif\n"
1608 ")\n"
1609 "{\n"
1610 "dst[px] = 255;\n"
1611 "for (int c = 0; c < cn; ++c)\n"
1612 "if (src2[c] > src1[c] || src3[c] < src1[c])\n"
1613 "{\n"
1614 "dst[px] = 0;\n"
1615 "break;\n"
1616 "}\n"
1617 "}\n"
1618 "#endif\n"
1619 "#ifndef HAVE_SCALAR\n"
1620 "src2_index += src2_step;\n"
1621 "src3_index += src3_step;\n"
1622 "#endif\n"
1623 "}\n"
1624 "}\n"
1625 "}\n"
1626 , "e7220f9dc5b30fc5558622a452890287"};
1627 ProgramSource inrange_oclsrc(inrange.programStr);
1628 const struct ProgramEntry lut={"lut",
1629 "#if lcn == 1\n"
1630 "#if dcn == 4\n"
1631 "#define LUT_OP  \\\n"
1632 "int idx = *(__global const int *)(srcptr + src_index); \\\n"
1633 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1634 "dst[0] = lut_l[idx & 0xff]; \\\n"
1635 "dst[1] = lut_l[(idx >> 8) & 0xff]; \\\n"
1636 "dst[2] = lut_l[(idx >> 16) & 0xff]; \\\n"
1637 "dst[3] = lut_l[(idx >> 24) & 0xff];\n"
1638 "#elif dcn == 3\n"
1639 "#define LUT_OP  \\\n"
1640 "uchar3 idx = vload3(0, srcptr + src_index); \\\n"
1641 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1642 "dst[0] = lut_l[idx.x]; \\\n"
1643 "dst[1] = lut_l[idx.y]; \\\n"
1644 "dst[2] = lut_l[idx.z];\n"
1645 "#elif dcn == 2\n"
1646 "#define LUT_OP \\\n"
1647 "short idx = *(__global const short *)(srcptr + src_index); \\\n"
1648 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1649 "dst[0] = lut_l[idx & 0xff]; \\\n"
1650 "dst[1] = lut_l[(idx >> 8) & 0xff];\n"
1651 "#elif dcn == 1\n"
1652 "#define LUT_OP \\\n"
1653 "uchar idx = (srcptr + src_index)[0]; \\\n"
1654 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1655 "dst[0] = lut_l[idx];\n"
1656 "#else\n"
1657 "#define LUT_OP \\\n"
1658 "__global const srcT * src = (__global const srcT *)(srcptr + src_index); \\\n"
1659 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1660 "for (int cn = 0; cn < dcn; ++cn) \\\n"
1661 "dst[cn] = lut_l[src[cn]];\n"
1662 "#endif\n"
1663 "#else\n"
1664 "#if dcn == 4\n"
1665 "#define LUT_OP \\\n"
1666 "__global const uchar4 * src_pixel = (__global const uchar4 *)(srcptr + src_index); \\\n"
1667 "int4 idx = mad24(convert_int4(src_pixel[0]), (int4)(lcn), (int4)(0, 1, 2, 3)); \\\n"
1668 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1669 "dst[0] = lut_l[idx.x]; \\\n"
1670 "dst[1] = lut_l[idx.y]; \\\n"
1671 "dst[2] = lut_l[idx.z]; \\\n"
1672 "dst[3] = lut_l[idx.w];\n"
1673 "#elif dcn == 3\n"
1674 "#define LUT_OP \\\n"
1675 "uchar3 src_pixel = vload3(0, srcptr + src_index); \\\n"
1676 "int3 idx = mad24(convert_int3(src_pixel), (int3)(lcn), (int3)(0, 1, 2)); \\\n"
1677 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1678 "dst[0] = lut_l[idx.x]; \\\n"
1679 "dst[1] = lut_l[idx.y]; \\\n"
1680 "dst[2] = lut_l[idx.z];\n"
1681 "#elif dcn == 2\n"
1682 "#define LUT_OP \\\n"
1683 "__global const uchar2 * src_pixel = (__global const uchar2 *)(srcptr + src_index); \\\n"
1684 "int2 idx = mad24(convert_int2(src_pixel[0]), lcn, (int2)(0, 1)); \\\n"
1685 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1686 "dst[0] = lut_l[idx.x]; \\\n"
1687 "dst[1] = lut_l[idx.y];\n"
1688 "#elif dcn == 1\n"
1689 "#define LUT_OP \\\n"
1690 "uchar idx = (srcptr + src_index)[0]; \\\n"
1691 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1692 "dst[0] = lut_l[idx];\n"
1693 "#else\n"
1694 "#define LUT_OP \\\n"
1695 "__global const srcT * src = (__global const srcT *)(srcptr + src_index); \\\n"
1696 "dst = (__global dstT *)(dstptr + dst_index); \\\n"
1697 "for (int cn = 0; cn < dcn; ++cn) \\\n"
1698 "dst[cn] = lut_l[mad24(src[cn], lcn, cn)];\n"
1699 "#endif\n"
1700 "#endif\n"
1701 "__kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,\n"
1702 "__global const uchar * lutptr, int lut_step, int lut_offset,\n"
1703 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)\n"
1704 "{\n"
1705 "int x = get_global_id(0);\n"
1706 "int y = get_global_id(1) << 2;\n"
1707 "__local dstT lut_l[256 * lcn];\n"
1708 "__global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);\n"
1709 "for (int i = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0)),\n"
1710 "step = get_local_size(0) * get_local_size(1); i < 256 * lcn; i += step)\n"
1711 "lut_l[i] = lut[i];\n"
1712 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1713 "if (x < cols && y < rows)\n"
1714 "{\n"
1715 "int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));\n"
1716 "int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));\n"
1717 "__global dstT * dst;\n"
1718 "LUT_OP;\n"
1719 "if (y < rows - 1)\n"
1720 "{\n"
1721 "src_index += src_step;\n"
1722 "dst_index += dst_step;\n"
1723 "LUT_OP;\n"
1724 "if (y < rows - 2)\n"
1725 "{\n"
1726 "src_index += src_step;\n"
1727 "dst_index += dst_step;\n"
1728 "LUT_OP;\n"
1729 "if (y < rows - 3)\n"
1730 "{\n"
1731 "src_index += src_step;\n"
1732 "dst_index += dst_step;\n"
1733 "LUT_OP;\n"
1734 "}\n"
1735 "}\n"
1736 "}\n"
1737 "}\n"
1738 "}\n"
1739 , "02217d060320fc126306ad16885be711"};
1740 ProgramSource lut_oclsrc(lut.programStr);
1741 const struct ProgramEntry meanstddev={"meanstddev",
1742 "#ifdef DOUBLE_SUPPORT\n"
1743 "#ifdef cl_amd_fp64\n"
1744 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
1745 "#elif defined (cl_khr_fp64)\n"
1746 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
1747 "#endif\n"
1748 "#endif\n"
1749 "#define noconvert\n"
1750 "#if cn != 3\n"
1751 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
1752 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
1753 "#define storesqpix(val, addr)  *(__global sqdstT *)(addr) = val\n"
1754 "#define srcTSIZE (int)sizeof(srcT)\n"
1755 "#define dstTSIZE (int)sizeof(dstT)\n"
1756 "#define sqdstTSIZE (int)sizeof(sqdstT)\n"
1757 "#else\n"
1758 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n"
1759 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
1760 "#define storesqpix(val, addr) vstore3(val, 0, (__global sqdstT1 *)(addr))\n"
1761 "#define srcTSIZE ((int)sizeof(srcT1)*3)\n"
1762 "#define dstTSIZE ((int)sizeof(dstT1)*3)\n"
1763 "#define sqdstTSIZE ((int)sizeof(sqdstT1)*3)\n"
1764 "#endif\n"
1765 "__kernel void meanStdDev(__global const uchar * srcptr, int src_step, int src_offset, int cols,\n"
1766 "int total, int groups, __global uchar * dstptr\n"
1767 "#ifdef HAVE_MASK\n"
1768 ", __global const uchar * mask, int mask_step, int mask_offset\n"
1769 "#endif\n"
1770 ")\n"
1771 "{\n"
1772 "int lid = get_local_id(0);\n"
1773 "int gid = get_group_id(0);\n"
1774 "int id = get_global_id(0);\n"
1775 "__local dstT localMemSum[WGS2_ALIGNED];\n"
1776 "__local sqdstT localMemSqSum[WGS2_ALIGNED];\n"
1777 "#ifdef HAVE_MASK\n"
1778 "__local int localMemNonZero[WGS2_ALIGNED];\n"
1779 "#endif\n"
1780 "dstT accSum = (dstT)(0);\n"
1781 "sqdstT accSqSum = (sqdstT)(0);\n"
1782 "#ifdef HAVE_MASK\n"
1783 "int accNonZero = 0;\n"
1784 "mask += mask_offset;\n"
1785 "#endif\n"
1786 "srcptr += src_offset;\n"
1787 "for (int grain = groups * WGS; id < total; id += grain)\n"
1788 "{\n"
1789 "#ifdef HAVE_MASK\n"
1790 "#ifdef HAVE_MASK_CONT\n"
1791 "int mask_index = id;\n"
1792 "#else\n"
1793 "int mask_index = mad24(id / cols, mask_step, id % cols);\n"
1794 "#endif\n"
1795 "if (mask[mask_index])\n"
1796 "#endif\n"
1797 "{\n"
1798 "#ifdef HAVE_SRC_CONT\n"
1799 "int src_index = id * srcTSIZE;\n"
1800 "#else\n"
1801 "int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));\n"
1802 "#endif\n"
1803 "srcT value = loadpix(srcptr + src_index);\n"
1804 "accSum += convertToDT(value);\n"
1805 "sqdstT dvalue = convertToSDT(value);\n"
1806 "accSqSum = fma(dvalue, dvalue, accSqSum);\n"
1807 "#ifdef HAVE_MASK\n"
1808 "++accNonZero;\n"
1809 "#endif\n"
1810 "}\n"
1811 "}\n"
1812 "if (lid < WGS2_ALIGNED)\n"
1813 "{\n"
1814 "localMemSum[lid] = accSum;\n"
1815 "localMemSqSum[lid] = accSqSum;\n"
1816 "#ifdef HAVE_MASK\n"
1817 "localMemNonZero[lid] = accNonZero;\n"
1818 "#endif\n"
1819 "}\n"
1820 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1821 "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n"
1822 "{\n"
1823 "localMemSum[lid - WGS2_ALIGNED] += accSum;\n"
1824 "localMemSqSum[lid - WGS2_ALIGNED] += accSqSum;\n"
1825 "#ifdef HAVE_MASK\n"
1826 "localMemNonZero[lid - WGS2_ALIGNED] += accNonZero;\n"
1827 "#endif\n"
1828 "}\n"
1829 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1830 "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n"
1831 "{\n"
1832 "if (lid < lsize)\n"
1833 "{\n"
1834 "int lid2 = lsize + lid;\n"
1835 "localMemSum[lid] += localMemSum[lid2];\n"
1836 "localMemSqSum[lid] += localMemSqSum[lid2];\n"
1837 "#ifdef HAVE_MASK\n"
1838 "localMemNonZero[lid] += localMemNonZero[lid2];\n"
1839 "#endif\n"
1840 "}\n"
1841 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1842 "}\n"
1843 "if (lid == 0)\n"
1844 "{\n"
1845 "storepix(localMemSum[0], dstptr + dstTSIZE * gid);\n"
1846 "storesqpix(localMemSqSum[0], dstptr + mad24(dstTSIZE, groups, sqdstTSIZE * gid));\n"
1847 "#ifdef HAVE_MASK\n"
1848 "*(__global int *)(dstptr + mad24(dstTSIZE + sqdstTSIZE, groups, (int)sizeof(int) * gid)) = localMemNonZero[0];\n"
1849 "#endif\n"
1850 "}\n"
1851 "}\n"
1852 , "1284edd21da32ce135cd26c0c897bd08"};
1853 ProgramSource meanstddev_oclsrc(meanstddev.programStr);
1854 const struct ProgramEntry minmaxloc={"minmaxloc",
1855 "#ifdef DOUBLE_SUPPORT\n"
1856 "#ifdef cl_amd_fp64\n"
1857 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
1858 "#elif defined (cl_khr_fp64)\n"
1859 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
1860 "#endif\n"
1861 "#endif\n"
1862 "#ifdef DEPTH_0\n"
1863 "#define MIN_VAL 0\n"
1864 "#define MAX_VAL UCHAR_MAX\n"
1865 "#elif defined DEPTH_1\n"
1866 "#define MIN_VAL SCHAR_MIN\n"
1867 "#define MAX_VAL SCHAR_MAX\n"
1868 "#elif defined DEPTH_2\n"
1869 "#define MIN_VAL 0\n"
1870 "#define MAX_VAL USHRT_MAX\n"
1871 "#elif defined DEPTH_3\n"
1872 "#define MIN_VAL SHRT_MIN\n"
1873 "#define MAX_VAL SHRT_MAX\n"
1874 "#elif defined DEPTH_4\n"
1875 "#define MIN_VAL INT_MIN\n"
1876 "#define MAX_VAL INT_MAX\n"
1877 "#elif defined DEPTH_5\n"
1878 "#define MIN_VAL (-FLT_MAX)\n"
1879 "#define MAX_VAL FLT_MAX\n"
1880 "#elif defined DEPTH_6\n"
1881 "#define MIN_VAL (-DBL_MAX)\n"
1882 "#define MAX_VAL DBL_MAX\n"
1883 "#endif\n"
1884 "#define noconvert\n"
1885 "#define INDEX_MAX UINT_MAX\n"
1886 "#if wdepth <= 4\n"
1887 "#define MIN_ABS(a) convertFromU(abs(a))\n"
1888 "#define MIN_ABS2(a, b) convertFromU(abs_diff(a, b))\n"
1889 "#define MIN(a, b) min(a, b)\n"
1890 "#define MAX(a, b) max(a, b)\n"
1891 "#else\n"
1892 "#define MIN_ABS(a) fabs(a)\n"
1893 "#define MIN_ABS2(a, b) fabs(a - b)\n"
1894 "#define MIN(a, b) fmin(a, b)\n"
1895 "#define MAX(a, b) fmax(a, b)\n"
1896 "#endif\n"
1897 "#if kercn != 3\n"
1898 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
1899 "#define srcTSIZE (int)sizeof(srcT)\n"
1900 "#else\n"
1901 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n"
1902 "#define srcTSIZE ((int)sizeof(srcT1) * 3)\n"
1903 "#endif\n"
1904 "#ifndef HAVE_MASK\n"
1905 "#undef srcTSIZE\n"
1906 "#define srcTSIZE (int)sizeof(srcT1)\n"
1907 "#endif\n"
1908 "#ifdef NEED_MINVAL\n"
1909 "#ifdef NEED_MINLOC\n"
1910 "#define CALC_MIN(p, inc) \\\n"
1911 "if (minval > temp.p) \\\n"
1912 "{ \\\n"
1913 "minval = temp.p; \\\n"
1914 "minloc = id + inc; \\\n"
1915 "}\n"
1916 "#else\n"
1917 "#define CALC_MIN(p, inc) \\\n"
1918 "minval = MIN(minval, temp.p);\n"
1919 "#endif\n"
1920 "#else\n"
1921 "#define CALC_MIN(p, inc)\n"
1922 "#endif\n"
1923 "#ifdef NEED_MAXVAL\n"
1924 "#ifdef NEED_MAXLOC\n"
1925 "#define CALC_MAX(p, inc) \\\n"
1926 "if (maxval < temp.p) \\\n"
1927 "{ \\\n"
1928 "maxval = temp.p; \\\n"
1929 "maxloc = id + inc; \\\n"
1930 "}\n"
1931 "#else\n"
1932 "#define CALC_MAX(p, inc) \\\n"
1933 "maxval = MAX(maxval, temp.p);\n"
1934 "#endif\n"
1935 "#else\n"
1936 "#define CALC_MAX(p, inc)\n"
1937 "#endif\n"
1938 "#ifdef OP_CALC2\n"
1939 "#define CALC_MAX2(p) \\\n"
1940 "maxval2 = MAX(maxval2, temp2.p);\n"
1941 "#else\n"
1942 "#define CALC_MAX2(p)\n"
1943 "#endif\n"
1944 "#define CALC_P(p, inc) \\\n"
1945 "CALC_MIN(p, inc) \\\n"
1946 "CALC_MAX(p, inc) \\\n"
1947 "CALC_MAX2(p)\n"
1948 "__kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,\n"
1949 "int total, int groupnum, __global uchar * dstptr\n"
1950 "#ifdef HAVE_MASK\n"
1951 ", __global const uchar * mask, int mask_step, int mask_offset\n"
1952 "#endif\n"
1953 "#ifdef HAVE_SRC2\n"
1954 ", __global const uchar * src2ptr, int src2_step, int src2_offset\n"
1955 "#endif\n"
1956 ")\n"
1957 "{\n"
1958 "int lid = get_local_id(0);\n"
1959 "int gid = get_group_id(0);\n"
1960 "int  id = get_global_id(0)\n"
1961 "#ifndef HAVE_MASK\n"
1962 "* kercn;\n"
1963 "#else\n"
1964 ";\n"
1965 "#endif\n"
1966 "srcptr += src_offset;\n"
1967 "#ifdef HAVE_MASK\n"
1968 "mask += mask_offset;\n"
1969 "#endif\n"
1970 "#ifdef HAVE_SRC2\n"
1971 "src2ptr += src2_offset;\n"
1972 "#endif\n"
1973 "#ifdef NEED_MINVAL\n"
1974 "__local dstT1 localmem_min[WGS2_ALIGNED];\n"
1975 "dstT1 minval = MAX_VAL;\n"
1976 "#ifdef NEED_MINLOC\n"
1977 "__local uint localmem_minloc[WGS2_ALIGNED];\n"
1978 "uint minloc = INDEX_MAX;\n"
1979 "#endif\n"
1980 "#endif\n"
1981 "#ifdef NEED_MAXVAL\n"
1982 "dstT1 maxval = MIN_VAL;\n"
1983 "__local dstT1 localmem_max[WGS2_ALIGNED];\n"
1984 "#ifdef NEED_MAXLOC\n"
1985 "__local uint localmem_maxloc[WGS2_ALIGNED];\n"
1986 "uint maxloc = INDEX_MAX;\n"
1987 "#endif\n"
1988 "#endif\n"
1989 "#ifdef OP_CALC2\n"
1990 "__local dstT1 localmem_max2[WGS2_ALIGNED];\n"
1991 "dstT1 maxval2 = MIN_VAL;\n"
1992 "#endif\n"
1993 "int src_index;\n"
1994 "#ifdef HAVE_MASK\n"
1995 "int mask_index;\n"
1996 "#endif\n"
1997 "#ifdef HAVE_SRC2\n"
1998 "int src2_index;\n"
1999 "#endif\n"
2000 "dstT temp;\n"
2001 "#ifdef HAVE_SRC2\n"
2002 "dstT temp2;\n"
2003 "#endif\n"
2004 "for (int grain = groupnum * WGS\n"
2005 "#ifndef HAVE_MASK\n"
2006 "* kercn\n"
2007 "#endif\n"
2008 "; id < total; id += grain)\n"
2009 "{\n"
2010 "#ifdef HAVE_MASK\n"
2011 "#ifdef HAVE_MASK_CONT\n"
2012 "mask_index = id;\n"
2013 "#else\n"
2014 "mask_index = mad24(id / cols, mask_step, id % cols);\n"
2015 "#endif\n"
2016 "if (mask[mask_index])\n"
2017 "#endif\n"
2018 "{\n"
2019 "#ifdef HAVE_SRC_CONT\n"
2020 "src_index = id * srcTSIZE;\n"
2021 "#else\n"
2022 "src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));\n"
2023 "#endif\n"
2024 "temp = convertToDT(loadpix(srcptr + src_index));\n"
2025 "#ifdef OP_ABS\n"
2026 "temp = MIN_ABS(temp);\n"
2027 "#endif\n"
2028 "#ifdef HAVE_SRC2\n"
2029 "#ifdef HAVE_SRC2_CONT\n"
2030 "src2_index = id * srcTSIZE;\n"
2031 "#else\n"
2032 "src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));\n"
2033 "#endif\n"
2034 "temp2 = convertToDT(loadpix(src2ptr + src2_index));\n"
2035 "temp = MIN_ABS2(temp, temp2);\n"
2036 "#ifdef OP_CALC2\n"
2037 "temp2 = MIN_ABS(temp2);\n"
2038 "#endif\n"
2039 "#endif\n"
2040 "#if kercn == 1\n"
2041 "#ifdef NEED_MINVAL\n"
2042 "#ifdef NEED_MINLOC\n"
2043 "if (minval > temp)\n"
2044 "{\n"
2045 "minval = temp;\n"
2046 "minloc = id;\n"
2047 "}\n"
2048 "#else\n"
2049 "minval = MIN(minval, temp);\n"
2050 "#endif\n"
2051 "#endif\n"
2052 "#ifdef NEED_MAXVAL\n"
2053 "#ifdef NEED_MAXLOC\n"
2054 "if (maxval < temp)\n"
2055 "{\n"
2056 "maxval = temp;\n"
2057 "maxloc = id;\n"
2058 "}\n"
2059 "#else\n"
2060 "maxval = MAX(maxval, temp);\n"
2061 "#endif\n"
2062 "#ifdef OP_CALC2\n"
2063 "maxval2 = MAX(maxval2, temp2);\n"
2064 "#endif\n"
2065 "#endif\n"
2066 "#elif kercn >= 2\n"
2067 "CALC_P(s0, 0)\n"
2068 "CALC_P(s1, 1)\n"
2069 "#if kercn >= 3\n"
2070 "CALC_P(s2, 2)\n"
2071 "#if kercn >= 4\n"
2072 "CALC_P(s3, 3)\n"
2073 "#if kercn >= 8\n"
2074 "CALC_P(s4, 4)\n"
2075 "CALC_P(s5, 5)\n"
2076 "CALC_P(s6, 6)\n"
2077 "CALC_P(s7, 7)\n"
2078 "#if kercn == 16\n"
2079 "CALC_P(s8, 8)\n"
2080 "CALC_P(s9, 9)\n"
2081 "CALC_P(sA, 10)\n"
2082 "CALC_P(sB, 11)\n"
2083 "CALC_P(sC, 12)\n"
2084 "CALC_P(sD, 13)\n"
2085 "CALC_P(sE, 14)\n"
2086 "CALC_P(sF, 15)\n"
2087 "#endif\n"
2088 "#endif\n"
2089 "#endif\n"
2090 "#endif\n"
2091 "#endif\n"
2092 "}\n"
2093 "}\n"
2094 "if (lid < WGS2_ALIGNED)\n"
2095 "{\n"
2096 "#ifdef NEED_MINVAL\n"
2097 "localmem_min[lid] = minval;\n"
2098 "#endif\n"
2099 "#ifdef NEED_MAXVAL\n"
2100 "localmem_max[lid] = maxval;\n"
2101 "#endif\n"
2102 "#ifdef NEED_MINLOC\n"
2103 "localmem_minloc[lid] = minloc;\n"
2104 "#endif\n"
2105 "#ifdef NEED_MAXLOC\n"
2106 "localmem_maxloc[lid] = maxloc;\n"
2107 "#endif\n"
2108 "#ifdef OP_CALC2\n"
2109 "localmem_max2[lid] = maxval2;\n"
2110 "#endif\n"
2111 "}\n"
2112 "barrier(CLK_LOCAL_MEM_FENCE);\n"
2113 "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n"
2114 "{\n"
2115 "int lid3 = lid - WGS2_ALIGNED;\n"
2116 "#ifdef NEED_MINVAL\n"
2117 "#ifdef NEED_MINLOC\n"
2118 "if (localmem_min[lid3] >= minval)\n"
2119 "{\n"
2120 "if (localmem_min[lid3] == minval)\n"
2121 "localmem_minloc[lid3] = min(localmem_minloc[lid3], minloc);\n"
2122 "else\n"
2123 "localmem_minloc[lid3] = minloc,\n"
2124 "localmem_min[lid3] = minval;\n"
2125 "}\n"
2126 "#else\n"
2127 "localmem_min[lid3] = MIN(localmem_min[lid3], minval);\n"
2128 "#endif\n"
2129 "#endif\n"
2130 "#ifdef NEED_MAXVAL\n"
2131 "#ifdef NEED_MAXLOC\n"
2132 "if (localmem_max[lid3] <= maxval)\n"
2133 "{\n"
2134 "if (localmem_max[lid3] == maxval)\n"
2135 "localmem_maxloc[lid3] = min(localmem_maxloc[lid3], maxloc);\n"
2136 "else\n"
2137 "localmem_maxloc[lid3] = maxloc,\n"
2138 "localmem_max[lid3] = maxval;\n"
2139 "}\n"
2140 "#else\n"
2141 "localmem_max[lid3] = MAX(localmem_max[lid3], maxval);\n"
2142 "#endif\n"
2143 "#endif\n"
2144 "#ifdef OP_CALC2\n"
2145 "localmem_max2[lid3] = MAX(localmem_max2[lid3], maxval2);\n"
2146 "#endif\n"
2147 "}\n"
2148 "barrier(CLK_LOCAL_MEM_FENCE);\n"
2149 "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n"
2150 "{\n"
2151 "if (lid < lsize)\n"
2152 "{\n"
2153 "int lid2 = lsize + lid;\n"
2154 "#ifdef NEED_MINVAL\n"
2155 "#ifdef NEED_MINLOC\n"
2156 "if (localmem_min[lid] >= localmem_min[lid2])\n"
2157 "{\n"
2158 "if (localmem_min[lid] == localmem_min[lid2])\n"
2159 "localmem_minloc[lid] = min(localmem_minloc[lid2], localmem_minloc[lid]);\n"
2160 "else\n"
2161 "localmem_minloc[lid] = localmem_minloc[lid2],\n"
2162 "localmem_min[lid] = localmem_min[lid2];\n"
2163 "}\n"
2164 "#else\n"
2165 "localmem_min[lid] = MIN(localmem_min[lid], localmem_min[lid2]);\n"
2166 "#endif\n"
2167 "#endif\n"
2168 "#ifdef NEED_MAXVAL\n"
2169 "#ifdef NEED_MAXLOC\n"
2170 "if (localmem_max[lid] <= localmem_max[lid2])\n"
2171 "{\n"
2172 "if (localmem_max[lid] == localmem_max[lid2])\n"
2173 "localmem_maxloc[lid] = min(localmem_maxloc[lid2], localmem_maxloc[lid]);\n"
2174 "else\n"
2175 "localmem_maxloc[lid] = localmem_maxloc[lid2],\n"
2176 "localmem_max[lid] = localmem_max[lid2];\n"
2177 "}\n"
2178 "#else\n"
2179 "localmem_max[lid] = MAX(localmem_max[lid], localmem_max[lid2]);\n"
2180 "#endif\n"
2181 "#endif\n"
2182 "#ifdef OP_CALC2\n"
2183 "localmem_max2[lid] = MAX(localmem_max2[lid], localmem_max2[lid2]);\n"
2184 "#endif\n"
2185 "}\n"
2186 "barrier(CLK_LOCAL_MEM_FENCE);\n"
2187 "}\n"
2188 "if (lid == 0)\n"
2189 "{\n"
2190 "int pos = 0;\n"
2191 "#ifdef NEED_MINVAL\n"
2192 "*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_min[0];\n"
2193 "pos = mad24(groupnum, (int)sizeof(dstT1), pos);\n"
2194 "#endif\n"
2195 "#ifdef NEED_MAXVAL\n"
2196 "*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max[0];\n"
2197 "pos = mad24(groupnum, (int)sizeof(dstT1), pos);\n"
2198 "#endif\n"
2199 "#ifdef NEED_MINLOC\n"
2200 "*(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_minloc[0];\n"
2201 "pos = mad24(groupnum, (int)sizeof(uint), pos);\n"
2202 "#endif\n"
2203 "#ifdef NEED_MAXLOC\n"
2204 "*(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];\n"
2205 "#ifdef OP_CALC2\n"
2206 "pos = mad24(groupnum, (int)sizeof(uint), pos);\n"
2207 "#endif\n"
2208 "#endif\n"
2209 "#ifdef OP_CALC2\n"
2210 "*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];\n"
2211 "#endif\n"
2212 "}\n"
2213 "}\n"
2214 , "c2741330ed8390675ed210edd54f5258"};
2215 ProgramSource minmaxloc_oclsrc(minmaxloc.programStr);
2216 const struct ProgramEntry mixchannels={"mixchannels",
2217 "#define DECLARE_INPUT_MAT(i) \\\n"
2218 "__global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset,\n"
2219 "#define DECLARE_OUTPUT_MAT(i) \\\n"
2220 "__global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,\n"
2221 "#define DECLARE_INDEX(i) \\\n"
2222 "int src##i##_index = mad24(src##i##_step, y0, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \\\n"
2223 "int dst##i##_index = mad24(dst##i##_step, y0, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset));\n"
2224 "#define PROCESS_ELEM(i) \\\n"
2225 "__global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \\\n"
2226 "__global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \\\n"
2227 "dst##i[0] = src##i[0]; \\\n"
2228 "src##i##_index += src##i##_step; \\\n"
2229 "dst##i##_index += dst##i##_step;\n"
2230 "__kernel void mixChannels(DECLARE_INPUT_MAT_N DECLARE_OUTPUT_MAT_N int rows, int cols, int rowsPerWI)\n"
2231 "{\n"
2232 "int x = get_global_id(0);\n"
2233 "int y0 = get_global_id(1) * rowsPerWI;\n"
2234 "if (x < cols)\n"
2235 "{\n"
2236 "DECLARE_INDEX_N\n"
2237 "for (int y = y0, y1 = min(y0 + rowsPerWI, rows); y < y1; ++y)\n"
2238 "{\n"
2239 "PROCESS_ELEM_N\n"
2240 "}\n"
2241 "}\n"
2242 "}\n"
2243 , "26a27b81c3e2524a8eb918b3a518da0a"};
2244 ProgramSource mixchannels_oclsrc(mixchannels.programStr);
2245 const struct ProgramEntry mulspectrums={"mulspectrums",
2246 "inline float2 cmulf(float2 a, float2 b)\n"
2247 "{\n"
2248 "return (float2)(mad(a.x, b.x, - a.y * b.y), mad(a.x, b.y, a.y * b.x));\n"
2249 "}\n"
2250 "inline float2 conjf(float2 a)\n"
2251 "{\n"
2252 "return (float2)(a.x, - a.y);\n"
2253 "}\n"
2254 "__kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step, int src1_offset,\n"
2255 "__global const uchar * src2ptr, int src2_step, int src2_offset,\n"
2256 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
2257 "int dst_rows, int dst_cols, int rowsPerWI)\n"
2258 "{\n"
2259 "int x = get_global_id(0);\n"
2260 "int y0 = get_global_id(1) * rowsPerWI;\n"
2261 "if (x < dst_cols)\n"
2262 "{\n"
2263 "int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(float2), src1_offset));\n"
2264 "int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(float2), src2_offset));\n"
2265 "int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(float2), dst_offset));\n"
2266 "for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y,\n"
2267 "src1_index += src1_step, src2_index += src2_step, dst_index += dst_step)\n"
2268 "{\n"
2269 "float2 src0 = *(__global const float2 *)(src1ptr + src1_index);\n"
2270 "float2 src1 = *(__global const float2 *)(src2ptr + src2_index);\n"
2271 "__global float2 * dst = (__global float2 *)(dstptr + dst_index);\n"
2272 "#ifdef CONJ\n"
2273 "float2 v = cmulf(src0, conjf(src1));\n"
2274 "#else\n"
2275 "float2 v = cmulf(src0, src1);\n"
2276 "#endif\n"
2277 "dst[0] = v;\n"
2278 "}\n"
2279 "}\n"
2280 "}\n"
2281 , "0ffb2c858f6664aa3e56efb81f025f5c"};
2282 ProgramSource mulspectrums_oclsrc(mulspectrums.programStr);
2283 const struct ProgramEntry normalize={"normalize",
2284 "#ifdef DOUBLE_SUPPORT\n"
2285 "#ifdef cl_amd_fp64\n"
2286 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
2287 "#elif defined (cl_khr_fp64)\n"
2288 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
2289 "#endif\n"
2290 "#endif\n"
2291 "#define noconvert\n"
2292 "#if cn != 3\n"
2293 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
2294 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
2295 "#define srcTSIZE (int)sizeof(srcT)\n"
2296 "#define dstTSIZE (int)sizeof(dstT)\n"
2297 "#else\n"
2298 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n"
2299 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
2300 "#define srcTSIZE ((int)sizeof(srcT1)*3)\n"
2301 "#define dstTSIZE ((int)sizeof(dstT1)*3)\n"
2302 "#endif\n"
2303 "__kernel void normalizek(__global const uchar * srcptr, int src_step, int src_offset,\n"
2304 "__global const uchar * mask, int mask_step, int mask_offset,\n"
2305 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols\n"
2306 "#ifdef HAVE_SCALE\n"
2307 ", float scale\n"
2308 "#endif\n"
2309 "#ifdef HAVE_DELTA\n"
2310 ", float delta\n"
2311 "#endif\n"
2312 ")\n"
2313 "{\n"
2314 "int x = get_global_id(0);\n"
2315 "int y0 = get_global_id(1) * rowsPerWI;\n"
2316 "if (x < dst_cols)\n"
2317 "{\n"
2318 "int src_index  = mad24(y0, src_step, mad24(x, srcTSIZE, src_offset));\n"
2319 "int mask_index = mad24(y0, mask_step, x + mask_offset);\n"
2320 "int dst_index  = mad24(y0, dst_step, mad24(x, dstTSIZE, dst_offset));\n"
2321 "for (int y = y0, y1 = min(y0 + rowsPerWI, dst_rows); y < y1;\n"
2322 "++y, src_index += src_step, dst_index += dst_step, mask_index += mask_step)\n"
2323 "{\n"
2324 "if (mask[mask_index])\n"
2325 "{\n"
2326 "workT value = convertToWT(loadpix(srcptr + src_index));\n"
2327 "#ifdef HAVE_SCALE\n"
2328 "#ifdef HAVE_DELTA\n"
2329 "value = fma(value, (workT)(scale), (workT)(delta));\n"
2330 "#else\n"
2331 "value *= (workT)(scale);\n"
2332 "#endif\n"
2333 "#else\n"
2334 "#ifdef HAVE_DELTA\n"
2335 "value += (workT)(delta);\n"
2336 "#endif\n"
2337 "#endif\n"
2338 "storepix(convertToDT(value), dstptr + dst_index);\n"
2339 "}\n"
2340 "}\n"
2341 "}\n"
2342 "}\n"
2343 , "05e23451b4bf16c50a0eba9d6c5c0012"};
2344 ProgramSource normalize_oclsrc(normalize.programStr);
2345 const struct ProgramEntry reduce={"reduce",
2346 "#ifdef DOUBLE_SUPPORT\n"
2347 "#ifdef cl_amd_fp64\n"
2348 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
2349 "#elif defined (cl_khr_fp64)\n"
2350 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
2351 "#endif\n"
2352 "#endif\n"
2353 "#if defined OP_NORM_INF_MASK\n"
2354 "#ifdef DEPTH_0\n"
2355 "#define MIN_VAL 0\n"
2356 "#define MAX_VAL 255\n"
2357 "#elif defined DEPTH_1\n"
2358 "#define MIN_VAL -128\n"
2359 "#define MAX_VAL 127\n"
2360 "#elif defined DEPTH_2\n"
2361 "#define MIN_VAL 0\n"
2362 "#define MAX_VAL 65535\n"
2363 "#elif defined DEPTH_3\n"
2364 "#define MIN_VAL -32768\n"
2365 "#define MAX_VAL 32767\n"
2366 "#elif defined DEPTH_4\n"
2367 "#define MIN_VAL INT_MIN\n"
2368 "#define MAX_VAL INT_MAX\n"
2369 "#elif defined DEPTH_5\n"
2370 "#define MIN_VAL (-FLT_MAX)\n"
2371 "#define MAX_VAL FLT_MAX\n"
2372 "#elif defined DEPTH_6\n"
2373 "#define MIN_VAL (-DBL_MAX)\n"
2374 "#define MAX_VAL DBL_MAX\n"
2375 "#endif\n"
2376 "#define dstT srcT\n"
2377 "#define dstT1 srcT1\n"
2378 "#endif\n"
2379 "#define noconvert\n"
2380 "#ifndef kercn\n"
2381 "#define kercn 1\n"
2382 "#endif\n"
2383 "#ifdef HAVE_MASK_CONT\n"
2384 "#define MASK_INDEX int mask_index = id + mask_offset;\n"
2385 "#else\n"
2386 "#define MASK_INDEX int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols))\n"
2387 "#endif\n"
2388 "#if cn != 3\n"
2389 "#define loadpix(addr) *(__global const srcT *)(addr)\n"
2390 "#define storepix(val, addr)  *(__global dstT *)(addr) = val\n"
2391 "#if kercn == 1\n"
2392 "#define srcTSIZE (int)sizeof(srcT)\n"
2393 "#else\n"
2394 "#define srcTSIZE (int)sizeof(srcT1)\n"
2395 "#endif\n"
2396 "#define dstTSIZE (int)sizeof(dstT)\n"
2397 "#else\n"
2398 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n"
2399 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
2400 "#define srcTSIZE ((int)sizeof(srcT1)*3)\n"
2401 "#define dstTSIZE ((int)sizeof(dstT1)*3)\n"
2402 "#endif\n"
2403 "#if ddepth <= 4\n"
2404 "#define SUM_ABS(a) convertFromU(abs(a))\n"
2405 "#define SUM_ABS2(a, b) convertFromU(abs_diff(a, b))\n"
2406 "#else\n"
2407 "#define SUM_ABS(a) fabs(a)\n"
2408 "#define SUM_ABS2(a, b) fabs(a - b)\n"
2409 "#endif\n"
2410 "#ifdef HAVE_MASK\n"
2411 "#ifdef HAVE_SRC2\n"
2412 "#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset\n"
2413 "#else\n"
2414 "#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset\n"
2415 "#endif\n"
2416 "#else\n"
2417 "#ifdef HAVE_SRC2\n"
2418 "#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset\n"
2419 "#else\n"
2420 "#define EXTRA_PARAMS\n"
2421 "#endif\n"
2422 "#endif\n"
2423 "#if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT\n"
2424 "#ifdef OP_DOT\n"
2425 "#if ddepth <= 4\n"
2426 "#define FUNC(a, b, c) a = mad24(b, c, a)\n"
2427 "#else\n"
2428 "#define FUNC(a, b, c) a = mad(b, c, a)\n"
2429 "#endif\n"
2430 "#elif defined OP_SUM\n"
2431 "#define FUNC(a, b) a += b\n"
2432 "#elif defined OP_SUM_ABS\n"
2433 "#define FUNC(a, b) a += SUM_ABS(b)\n"
2434 "#elif defined OP_SUM_SQR\n"
2435 "#if ddepth <= 4\n"
2436 "#define FUNC(a, b) a = mad24(b, b, a)\n"
2437 "#else\n"
2438 "#define FUNC(a, b) a = mad(b, b, a)\n"
2439 "#endif\n"
2440 "#endif\n"
2441 "#ifdef OP_CALC2\n"
2442 "#define DECLARE_LOCAL_MEM \\\n"
2443 "__local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED]\n"
2444 "#define DEFINE_ACCUMULATOR \\\n"
2445 "dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0)\n"
2446 "#else\n"
2447 "#define DECLARE_LOCAL_MEM \\\n"
2448 "__local dstT localmem[WGS2_ALIGNED]\n"
2449 "#define DEFINE_ACCUMULATOR \\\n"
2450 "dstT accumulator = (dstT)(0)\n"
2451 "#endif\n"
2452 "#ifdef HAVE_SRC2\n"
2453 "#ifdef OP_CALC2\n"
2454 "#define PROCESS_ELEMS \\\n"
2455 "dstT temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2456 "dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2457 "temp = SUM_ABS2(temp, temp2); \\\n"
2458 "temp2 = SUM_ABS(temp2); \\\n"
2459 "FUNC(accumulator2, temp2); \\\n"
2460 "FUNC(accumulator, temp)\n"
2461 "#else\n"
2462 "#define PROCESS_ELEMS \\\n"
2463 "dstT temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2464 "dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2465 "temp = SUM_ABS2(temp, temp2); \\\n"
2466 "FUNC(accumulator, temp)\n"
2467 "#endif\n"
2468 "#else\n"
2469 "#define PROCESS_ELEMS \\\n"
2470 "dstT temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2471 "FUNC(accumulator, temp)\n"
2472 "#endif\n"
2473 "#ifdef HAVE_MASK\n"
2474 "#define REDUCE_GLOBAL \\\n"
2475 "MASK_INDEX; \\\n"
2476 "if (mask[mask_index]) \\\n"
2477 "{ \\\n"
2478 "PROCESS_ELEMS; \\\n"
2479 "}\n"
2480 "#elif defined OP_DOT\n"
2481 "#ifdef HAVE_SRC2_CONT\n"
2482 "#define SRC2_INDEX int src2_index = mad24(id, srcTSIZE, src2_offset);\n"
2483 "#else\n"
2484 "#define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset))\n"
2485 "#endif\n"
2486 "#if kercn == 1\n"
2487 "#define REDUCE_GLOBAL \\\n"
2488 "SRC2_INDEX; \\\n"
2489 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2490 "FUNC(accumulator, temp, temp2)\n"
2491 "#elif kercn == 2\n"
2492 "#define REDUCE_GLOBAL \\\n"
2493 "SRC2_INDEX; \\\n"
2494 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2495 "FUNC(accumulator, temp.s0, temp2.s0); \\\n"
2496 "FUNC(accumulator, temp.s1, temp2.s1)\n"
2497 "#elif kercn == 4\n"
2498 "#define REDUCE_GLOBAL \\\n"
2499 "SRC2_INDEX; \\\n"
2500 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2501 "FUNC(accumulator, temp.s0, temp2.s0); \\\n"
2502 "FUNC(accumulator, temp.s1, temp2.s1); \\\n"
2503 "FUNC(accumulator, temp.s2, temp2.s2); \\\n"
2504 "FUNC(accumulator, temp.s3, temp2.s3)\n"
2505 "#elif kercn == 8\n"
2506 "#define REDUCE_GLOBAL \\\n"
2507 "SRC2_INDEX; \\\n"
2508 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2509 "FUNC(accumulator, temp.s0, temp2.s0); \\\n"
2510 "FUNC(accumulator, temp.s1, temp2.s1); \\\n"
2511 "FUNC(accumulator, temp.s2, temp2.s2); \\\n"
2512 "FUNC(accumulator, temp.s3, temp2.s3); \\\n"
2513 "FUNC(accumulator, temp.s4, temp2.s4); \\\n"
2514 "FUNC(accumulator, temp.s5, temp2.s5); \\\n"
2515 "FUNC(accumulator, temp.s6, temp2.s6); \\\n"
2516 "FUNC(accumulator, temp.s7, temp2.s7)\n"
2517 "#elif kercn == 16\n"
2518 "#define REDUCE_GLOBAL \\\n"
2519 "SRC2_INDEX; \\\n"
2520 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2521 "FUNC(accumulator, temp.s0, temp2.s0); \\\n"
2522 "FUNC(accumulator, temp.s1, temp2.s1); \\\n"
2523 "FUNC(accumulator, temp.s2, temp2.s2); \\\n"
2524 "FUNC(accumulator, temp.s3, temp2.s3); \\\n"
2525 "FUNC(accumulator, temp.s4, temp2.s4); \\\n"
2526 "FUNC(accumulator, temp.s5, temp2.s5); \\\n"
2527 "FUNC(accumulator, temp.s6, temp2.s6); \\\n"
2528 "FUNC(accumulator, temp.s7, temp2.s7); \\\n"
2529 "FUNC(accumulator, temp.s8, temp2.s8); \\\n"
2530 "FUNC(accumulator, temp.s9, temp2.s9); \\\n"
2531 "FUNC(accumulator, temp.sA, temp2.sA); \\\n"
2532 "FUNC(accumulator, temp.sB, temp2.sB); \\\n"
2533 "FUNC(accumulator, temp.sC, temp2.sC); \\\n"
2534 "FUNC(accumulator, temp.sD, temp2.sD); \\\n"
2535 "FUNC(accumulator, temp.sE, temp2.sE); \\\n"
2536 "FUNC(accumulator, temp.sF, temp2.sF)\n"
2537 "#endif\n"
2538 "#else\n"
2539 "#ifdef HAVE_SRC2\n"
2540 "#ifdef OP_CALC2\n"
2541 "#if kercn == 1\n"
2542 "#define REDUCE_GLOBAL \\\n"
2543 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2544 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2545 "temp = SUM_ABS2(temp, temp2); \\\n"
2546 "temp2 = SUM_ABS(temp2); \\\n"
2547 "FUNC(accumulator, temp); \\\n"
2548 "FUNC(accumulator2, temp2)\n"
2549 "#elif kercn == 2\n"
2550 "#define REDUCE_GLOBAL \\\n"
2551 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2552 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2553 "temp = SUM_ABS2(temp, temp2); \\\n"
2554 "temp2 = SUM_ABS(temp2); \\\n"
2555 "FUNC(accumulator, temp.s0); \\\n"
2556 "FUNC(accumulator, temp.s1); \\\n"
2557 "FUNC(accumulator2, temp2.s0); \\\n"
2558 "FUNC(accumulator2, temp2.s1)\n"
2559 "#elif kercn == 4\n"
2560 "#define REDUCE_GLOBAL \\\n"
2561 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2562 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2563 "temp = SUM_ABS2(temp, temp2); \\\n"
2564 "temp2 = SUM_ABS(temp2); \\\n"
2565 "FUNC(accumulator, temp.s0); \\\n"
2566 "FUNC(accumulator, temp.s1); \\\n"
2567 "FUNC(accumulator, temp.s2); \\\n"
2568 "FUNC(accumulator, temp.s3); \\\n"
2569 "FUNC(accumulator2, temp2.s0); \\\n"
2570 "FUNC(accumulator2, temp2.s1); \\\n"
2571 "FUNC(accumulator2, temp2.s2); \\\n"
2572 "FUNC(accumulator2, temp2.s3)\n"
2573 "#elif kercn == 8\n"
2574 "#define REDUCE_GLOBAL \\\n"
2575 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2576 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2577 "temp = SUM_ABS2(temp, temp2); \\\n"
2578 "temp2 = SUM_ABS(temp2); \\\n"
2579 "FUNC(accumulator, temp.s0); \\\n"
2580 "FUNC(accumulator, temp.s1); \\\n"
2581 "FUNC(accumulator, temp.s2); \\\n"
2582 "FUNC(accumulator, temp.s3); \\\n"
2583 "FUNC(accumulator, temp.s4); \\\n"
2584 "FUNC(accumulator, temp.s5); \\\n"
2585 "FUNC(accumulator, temp.s6); \\\n"
2586 "FUNC(accumulator, temp.s7); \\\n"
2587 "FUNC(accumulator2, temp2.s0); \\\n"
2588 "FUNC(accumulator2, temp2.s1); \\\n"
2589 "FUNC(accumulator2, temp2.s2); \\\n"
2590 "FUNC(accumulator2, temp2.s3); \\\n"
2591 "FUNC(accumulator2, temp2.s4); \\\n"
2592 "FUNC(accumulator2, temp2.s5); \\\n"
2593 "FUNC(accumulator2, temp2.s6); \\\n"
2594 "FUNC(accumulator2, temp2.s7)\n"
2595 "#elif kercn == 16\n"
2596 "#define REDUCE_GLOBAL \\\n"
2597 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2598 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2599 "temp = SUM_ABS2(temp, temp2); \\\n"
2600 "temp2 = SUM_ABS(temp2); \\\n"
2601 "FUNC(accumulator, temp.s0); \\\n"
2602 "FUNC(accumulator, temp.s1); \\\n"
2603 "FUNC(accumulator, temp.s2); \\\n"
2604 "FUNC(accumulator, temp.s3); \\\n"
2605 "FUNC(accumulator, temp.s4); \\\n"
2606 "FUNC(accumulator, temp.s5); \\\n"
2607 "FUNC(accumulator, temp.s6); \\\n"
2608 "FUNC(accumulator, temp.s7); \\\n"
2609 "FUNC(accumulator, temp.s8); \\\n"
2610 "FUNC(accumulator, temp.s9); \\\n"
2611 "FUNC(accumulator, temp.sA); \\\n"
2612 "FUNC(accumulator, temp.sB); \\\n"
2613 "FUNC(accumulator, temp.sC); \\\n"
2614 "FUNC(accumulator, temp.sD); \\\n"
2615 "FUNC(accumulator, temp.sE); \\\n"
2616 "FUNC(accumulator, temp.sF); \\\n"
2617 "FUNC(accumulator2, temp2.s0); \\\n"
2618 "FUNC(accumulator2, temp2.s1); \\\n"
2619 "FUNC(accumulator2, temp2.s2); \\\n"
2620 "FUNC(accumulator2, temp2.s3); \\\n"
2621 "FUNC(accumulator2, temp2.s4); \\\n"
2622 "FUNC(accumulator2, temp2.s5); \\\n"
2623 "FUNC(accumulator2, temp2.s6); \\\n"
2624 "FUNC(accumulator2, temp2.s7); \\\n"
2625 "FUNC(accumulator2, temp2.s8); \\\n"
2626 "FUNC(accumulator2, temp2.s9); \\\n"
2627 "FUNC(accumulator2, temp2.sA); \\\n"
2628 "FUNC(accumulator2, temp2.sB); \\\n"
2629 "FUNC(accumulator2, temp2.sC); \\\n"
2630 "FUNC(accumulator2, temp2.sD); \\\n"
2631 "FUNC(accumulator2, temp2.sE); \\\n"
2632 "FUNC(accumulator2, temp2.sF)\n"
2633 "#endif\n"
2634 "#else\n"
2635 "#if kercn == 1\n"
2636 "#define REDUCE_GLOBAL \\\n"
2637 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2638 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2639 "temp = SUM_ABS2(temp, temp2); \\\n"
2640 "FUNC(accumulator, temp)\n"
2641 "#elif kercn == 2\n"
2642 "#define REDUCE_GLOBAL \\\n"
2643 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2644 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2645 "temp = SUM_ABS2(temp, temp2); \\\n"
2646 "FUNC(accumulator, temp.s0); \\\n"
2647 "FUNC(accumulator, temp.s1)\n"
2648 "#elif kercn == 4\n"
2649 "#define REDUCE_GLOBAL \\\n"
2650 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2651 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2652 "temp = SUM_ABS2(temp, temp2); \\\n"
2653 "FUNC(accumulator, temp.s0); \\\n"
2654 "FUNC(accumulator, temp.s1); \\\n"
2655 "FUNC(accumulator, temp.s2); \\\n"
2656 "FUNC(accumulator, temp.s3)\n"
2657 "#elif kercn == 8\n"
2658 "#define REDUCE_GLOBAL \\\n"
2659 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2660 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2661 "temp = SUM_ABS2(temp, temp2); \\\n"
2662 "FUNC(accumulator, temp.s0); \\\n"
2663 "FUNC(accumulator, temp.s1); \\\n"
2664 "FUNC(accumulator, temp.s2); \\\n"
2665 "FUNC(accumulator, temp.s3); \\\n"
2666 "FUNC(accumulator, temp.s4); \\\n"
2667 "FUNC(accumulator, temp.s5); \\\n"
2668 "FUNC(accumulator, temp.s6); \\\n"
2669 "FUNC(accumulator, temp.s7)\n"
2670 "#elif kercn == 16\n"
2671 "#define REDUCE_GLOBAL \\\n"
2672 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2673 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n"
2674 "temp = SUM_ABS2(temp, temp2); \\\n"
2675 "FUNC(accumulator, temp.s0); \\\n"
2676 "FUNC(accumulator, temp.s1); \\\n"
2677 "FUNC(accumulator, temp.s2); \\\n"
2678 "FUNC(accumulator, temp.s3); \\\n"
2679 "FUNC(accumulator, temp.s4); \\\n"
2680 "FUNC(accumulator, temp.s5); \\\n"
2681 "FUNC(accumulator, temp.s6); \\\n"
2682 "FUNC(accumulator, temp.s7); \\\n"
2683 "FUNC(accumulator, temp.s8); \\\n"
2684 "FUNC(accumulator, temp.s9); \\\n"
2685 "FUNC(accumulator, temp.sA); \\\n"
2686 "FUNC(accumulator, temp.sB); \\\n"
2687 "FUNC(accumulator, temp.sC); \\\n"
2688 "FUNC(accumulator, temp.sD); \\\n"
2689 "FUNC(accumulator, temp.sE); \\\n"
2690 "FUNC(accumulator, temp.sF)\n"
2691 "#endif\n"
2692 "#endif\n"
2693 "#else\n"
2694 "#if kercn == 1\n"
2695 "#define REDUCE_GLOBAL \\\n"
2696 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2697 "FUNC(accumulator, temp)\n"
2698 "#elif kercn == 2\n"
2699 "#define REDUCE_GLOBAL \\\n"
2700 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2701 "FUNC(accumulator, temp.s0); \\\n"
2702 "FUNC(accumulator, temp.s1)\n"
2703 "#elif kercn == 4\n"
2704 "#define REDUCE_GLOBAL \\\n"
2705 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2706 "FUNC(accumulator, temp.s0); \\\n"
2707 "FUNC(accumulator, temp.s1); \\\n"
2708 "FUNC(accumulator, temp.s2); \\\n"
2709 "FUNC(accumulator, temp.s3)\n"
2710 "#elif kercn == 8\n"
2711 "#define REDUCE_GLOBAL \\\n"
2712 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2713 "FUNC(accumulator, temp.s0); \\\n"
2714 "FUNC(accumulator, temp.s1); \\\n"
2715 "FUNC(accumulator, temp.s2); \\\n"
2716 "FUNC(accumulator, temp.s3); \\\n"
2717 "FUNC(accumulator, temp.s4); \\\n"
2718 "FUNC(accumulator, temp.s5); \\\n"
2719 "FUNC(accumulator, temp.s6); \\\n"
2720 "FUNC(accumulator, temp.s7)\n"
2721 "#elif kercn == 16\n"
2722 "#define REDUCE_GLOBAL \\\n"
2723 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n"
2724 "FUNC(accumulator, temp.s0); \\\n"
2725 "FUNC(accumulator, temp.s1); \\\n"
2726 "FUNC(accumulator, temp.s2); \\\n"
2727 "FUNC(accumulator, temp.s3); \\\n"
2728 "FUNC(accumulator, temp.s4); \\\n"
2729 "FUNC(accumulator, temp.s5); \\\n"
2730 "FUNC(accumulator, temp.s6); \\\n"
2731 "FUNC(accumulator, temp.s7); \\\n"
2732 "FUNC(accumulator, temp.s8); \\\n"
2733 "FUNC(accumulator, temp.s9); \\\n"
2734 "FUNC(accumulator, temp.sA); \\\n"
2735 "FUNC(accumulator, temp.sB); \\\n"
2736 "FUNC(accumulator, temp.sC); \\\n"
2737 "FUNC(accumulator, temp.sD); \\\n"
2738 "FUNC(accumulator, temp.sE); \\\n"
2739 "FUNC(accumulator, temp.sF)\n"
2740 "#endif\n"
2741 "#endif\n"
2742 "#endif\n"
2743 "#ifdef OP_CALC2\n"
2744 "#define SET_LOCAL_1 \\\n"
2745 "localmem[lid] = accumulator; \\\n"
2746 "localmem2[lid] = accumulator2\n"
2747 "#define REDUCE_LOCAL_1 \\\n"
2748 "localmem[lid - WGS2_ALIGNED] += accumulator; \\\n"
2749 "localmem2[lid - WGS2_ALIGNED] += accumulator2\n"
2750 "#define REDUCE_LOCAL_2 \\\n"
2751 "localmem[lid] += localmem[lid2]; \\\n"
2752 "localmem2[lid] += localmem2[lid2]\n"
2753 "#define CALC_RESULT \\\n"
2754 "storepix(localmem[0], dstptr + dstTSIZE * gid); \\\n"
2755 "storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))\n"
2756 "#else\n"
2757 "#define SET_LOCAL_1 \\\n"
2758 "localmem[lid] = accumulator\n"
2759 "#define REDUCE_LOCAL_1 \\\n"
2760 "localmem[lid - WGS2_ALIGNED] += accumulator\n"
2761 "#define REDUCE_LOCAL_2 \\\n"
2762 "localmem[lid] += localmem[lid2]\n"
2763 "#define CALC_RESULT \\\n"
2764 "storepix(localmem[0], dstptr + dstTSIZE * gid)\n"
2765 "#endif\n"
2766 "#elif defined OP_COUNT_NON_ZERO\n"
2767 "#define dstT int\n"
2768 "#define DECLARE_LOCAL_MEM \\\n"
2769 "__local dstT localmem[WGS2_ALIGNED]\n"
2770 "#define DEFINE_ACCUMULATOR \\\n"
2771 "dstT accumulator = (dstT)(0); \\\n"
2772 "srcT1 zero = (srcT1)(0), one = (srcT1)(1)\n"
2773 "#if kercn == 1\n"
2774 "#define REDUCE_GLOBAL \\\n"
2775 "accumulator += loadpix(srcptr + src_index) == zero ? zero : one\n"
2776 "#elif kercn == 2\n"
2777 "#define REDUCE_GLOBAL \\\n"
2778 "srcT value = loadpix(srcptr + src_index); \\\n"
2779 "accumulator += value.s0 == zero ? zero : one; \\\n"
2780 "accumulator += value.s1 == zero ? zero : one\n"
2781 "#elif kercn == 4\n"
2782 "#define REDUCE_GLOBAL \\\n"
2783 "srcT value = loadpix(srcptr + src_index); \\\n"
2784 "accumulator += value.s0 == zero ? zero : one; \\\n"
2785 "accumulator += value.s1 == zero ? zero : one; \\\n"
2786 "accumulator += value.s2 == zero ? zero : one; \\\n"
2787 "accumulator += value.s3 == zero ? zero : one\n"
2788 "#elif kercn == 8\n"
2789 "#define REDUCE_GLOBAL \\\n"
2790 "srcT value = loadpix(srcptr + src_index); \\\n"
2791 "accumulator += value.s0 == zero ? zero : one; \\\n"
2792 "accumulator += value.s1 == zero ? zero : one; \\\n"
2793 "accumulator += value.s2 == zero ? zero : one; \\\n"
2794 "accumulator += value.s3 == zero ? zero : one; \\\n"
2795 "accumulator += value.s4 == zero ? zero : one; \\\n"
2796 "accumulator += value.s5 == zero ? zero : one; \\\n"
2797 "accumulator += value.s6 == zero ? zero : one; \\\n"
2798 "accumulator += value.s7 == zero ? zero : one\n"
2799 "#elif kercn == 16\n"
2800 "#define REDUCE_GLOBAL \\\n"
2801 "srcT value = loadpix(srcptr + src_index); \\\n"
2802 "accumulator += value.s0 == zero ? zero : one; \\\n"
2803 "accumulator += value.s1 == zero ? zero : one; \\\n"
2804 "accumulator += value.s2 == zero ? zero : one; \\\n"
2805 "accumulator += value.s3 == zero ? zero : one; \\\n"
2806 "accumulator += value.s4 == zero ? zero : one; \\\n"
2807 "accumulator += value.s5 == zero ? zero : one; \\\n"
2808 "accumulator += value.s6 == zero ? zero : one; \\\n"
2809 "accumulator += value.s7 == zero ? zero : one; \\\n"
2810 "accumulator += value.s8 == zero ? zero : one; \\\n"
2811 "accumulator += value.s9 == zero ? zero : one; \\\n"
2812 "accumulator += value.sA == zero ? zero : one; \\\n"
2813 "accumulator += value.sB == zero ? zero : one; \\\n"
2814 "accumulator += value.sC == zero ? zero : one; \\\n"
2815 "accumulator += value.sD == zero ? zero : one; \\\n"
2816 "accumulator += value.sE == zero ? zero : one; \\\n"
2817 "accumulator += value.sF == zero ? zero : one\n"
2818 "#endif\n"
2819 "#define SET_LOCAL_1 \\\n"
2820 "localmem[lid] = accumulator\n"
2821 "#define REDUCE_LOCAL_1 \\\n"
2822 "localmem[lid - WGS2_ALIGNED] += accumulator\n"
2823 "#define REDUCE_LOCAL_2 \\\n"
2824 "localmem[lid] += localmem[lid2]\n"
2825 "#define CALC_RESULT \\\n"
2826 "storepix(localmem[0], dstptr + dstTSIZE * gid)\n"
2827 "#else\n"
2828 "#error \"No operation\"\n"
2829 "#endif\n"
2830 "#ifdef OP_DOT\n"
2831 "#undef EXTRA_PARAMS\n"
2832 "#define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset\n"
2833 "#endif\n"
2834 "__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int cols,\n"
2835 "int total, int groupnum, __global uchar * dstptr EXTRA_PARAMS)\n"
2836 "{\n"
2837 "int lid = get_local_id(0);\n"
2838 "int gid = get_group_id(0);\n"
2839 "int  id = get_global_id(0) * kercn;\n"
2840 "srcptr += src_offset;\n"
2841 "#ifdef HAVE_SRC2\n"
2842 "src2ptr += src2_offset;\n"
2843 "#endif\n"
2844 "DECLARE_LOCAL_MEM;\n"
2845 "DEFINE_ACCUMULATOR;\n"
2846 "for (int grain = groupnum * WGS * kercn; id < total; id += grain)\n"
2847 "{\n"
2848 "#ifdef HAVE_SRC_CONT\n"
2849 "int src_index = id * srcTSIZE;\n"
2850 "#else\n"
2851 "int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));\n"
2852 "#endif\n"
2853 "#ifdef HAVE_SRC2\n"
2854 "#ifdef HAVE_SRC2_CONT\n"
2855 "int src2_index = id * srcTSIZE;\n"
2856 "#else\n"
2857 "int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));\n"
2858 "#endif\n"
2859 "#endif\n"
2860 "REDUCE_GLOBAL;\n"
2861 "}\n"
2862 "if (lid < WGS2_ALIGNED)\n"
2863 "{\n"
2864 "SET_LOCAL_1;\n"
2865 "}\n"
2866 "barrier(CLK_LOCAL_MEM_FENCE);\n"
2867 "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n"
2868 "{\n"
2869 "REDUCE_LOCAL_1;\n"
2870 "}\n"
2871 "barrier(CLK_LOCAL_MEM_FENCE);\n"
2872 "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n"
2873 "{\n"
2874 "if (lid < lsize)\n"
2875 "{\n"
2876 "int lid2 = lsize + lid;\n"
2877 "REDUCE_LOCAL_2;\n"
2878 "}\n"
2879 "barrier(CLK_LOCAL_MEM_FENCE);\n"
2880 "}\n"
2881 "if (lid == 0)\n"
2882 "{\n"
2883 "CALC_RESULT;\n"
2884 "}\n"
2885 "}\n"
2886 , "2bd554448b0b0af7e1a1ddd57a55f5a6"};
2887 ProgramSource reduce_oclsrc(reduce.programStr);
2888 const struct ProgramEntry reduce2={"reduce2",
2889 "#ifdef DOUBLE_SUPPORT\n"
2890 "#ifdef cl_amd_fp64\n"
2891 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
2892 "#elif defined (cl_khr_fp64)\n"
2893 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
2894 "#endif\n"
2895 "#endif\n"
2896 "#if ddepth == 0\n"
2897 "#define MIN_VAL 0\n"
2898 "#define MAX_VAL 255\n"
2899 "#elif ddepth == 1\n"
2900 "#define MIN_VAL -128\n"
2901 "#define MAX_VAL 127\n"
2902 "#elif ddepth == 2\n"
2903 "#define MIN_VAL 0\n"
2904 "#define MAX_VAL 65535\n"
2905 "#elif ddepth == 3\n"
2906 "#define MIN_VAL -32768\n"
2907 "#define MAX_VAL 32767\n"
2908 "#elif ddepth == 4\n"
2909 "#define MIN_VAL INT_MIN\n"
2910 "#define MAX_VAL INT_MAX\n"
2911 "#elif ddepth == 5\n"
2912 "#define MIN_VAL (-FLT_MAX)\n"
2913 "#define MAX_VAL FLT_MAX\n"
2914 "#elif ddepth == 6\n"
2915 "#define MIN_VAL (-DBL_MAX)\n"
2916 "#define MAX_VAL DBL_MAX\n"
2917 "#else\n"
2918 "#error \"Unsupported depth\"\n"
2919 "#endif\n"
2920 "#define noconvert\n"
2921 "#if defined OCL_CV_REDUCE_SUM || defined OCL_CV_REDUCE_AVG\n"
2922 "#define INIT_VALUE 0\n"
2923 "#define PROCESS_ELEM(acc, value) acc += value\n"
2924 "#elif defined OCL_CV_REDUCE_MAX\n"
2925 "#define INIT_VALUE MIN_VAL\n"
2926 "#define PROCESS_ELEM(acc, value) acc = max(value, acc)\n"
2927 "#elif defined OCL_CV_REDUCE_MIN\n"
2928 "#define INIT_VALUE MAX_VAL\n"
2929 "#define PROCESS_ELEM(acc, value) acc = min(value, acc)\n"
2930 "#else\n"
2931 "#error \"No operation is specified\"\n"
2932 "#endif\n"
2933 "#ifdef OP_REDUCE_PRE\n"
2934 "__kernel void reduce_horz_opt(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,\n"
2935 "__global uchar * dstptr, int dst_step, int dst_offset\n"
2936 "#ifdef OCL_CV_REDUCE_AVG\n"
2937 ", float fscale\n"
2938 "#endif\n"
2939 ")\n"
2940 "{\n"
2941 "__local bufT lsmem[TILE_HEIGHT][BUF_COLS][cn];\n"
2942 "int x = get_global_id(0);\n"
2943 "int y = get_global_id(1);\n"
2944 "int liy = get_local_id(1);\n"
2945 "if ((x < BUF_COLS) && (y < rows))\n"
2946 "{\n"
2947 "int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset));\n"
2948 "__global const srcT * src = (__global const srcT *)(srcptr + src_index);\n"
2949 "bufT tmp[cn];\n"
2950 "#pragma unroll\n"
2951 "for (int c = 0; c < cn; ++c)\n"
2952 "tmp[c] = INIT_VALUE;\n"
2953 "int src_step_mul = BUF_COLS * cn;\n"
2954 "for (int idx = x; idx < cols; idx += BUF_COLS, src += src_step_mul)\n"
2955 "{\n"
2956 "#pragma unroll\n"
2957 "for (int c = 0; c < cn; ++c)\n"
2958 "{\n"
2959 "bufT value = convertToBufT(src[c]);\n"
2960 "PROCESS_ELEM(tmp[c], value);\n"
2961 "}\n"
2962 "}\n"
2963 "#pragma unroll\n"
2964 "for (int c = 0; c < cn; ++c)\n"
2965 "lsmem[liy][x][c] = tmp[c];\n"
2966 "}\n"
2967 "barrier(CLK_LOCAL_MEM_FENCE);\n"
2968 "if ((x < BUF_COLS / 2) && (y < rows))\n"
2969 "{\n"
2970 "#pragma unroll\n"
2971 "for (int c = 0; c < cn; ++c)\n"
2972 "{\n"
2973 "PROCESS_ELEM(lsmem[liy][x][c], lsmem[liy][x +  BUF_COLS / 2][c]);\n"
2974 "}\n"
2975 "}\n"
2976 "barrier(CLK_LOCAL_MEM_FENCE);\n"
2977 "if ((x == 0) && (y < rows))\n"
2978 "{\n"
2979 "int dst_index = mad24(y, dst_step, dst_offset);\n"
2980 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n"
2981 "bufT tmp[cn];\n"
2982 "#pragma unroll\n"
2983 "for (int c = 0; c < cn; ++c)\n"
2984 "tmp[c] = INIT_VALUE;\n"
2985 "#pragma unroll\n"
2986 "for (int xin = 0; xin < BUF_COLS / 2; xin ++)\n"
2987 "{\n"
2988 "#pragma unroll\n"
2989 "for (int c = 0; c < cn; ++c)\n"
2990 "{\n"
2991 "PROCESS_ELEM(tmp[c], lsmem[liy][xin][c]);\n"
2992 "}\n"
2993 "}\n"
2994 "#pragma unroll\n"
2995 "for (int c = 0; c < cn; ++c)\n"
2996 "#ifdef OCL_CV_REDUCE_AVG\n"
2997 "dst[c] = convertToDT(convertToWT(tmp[c]) * fscale);\n"
2998 "#else\n"
2999 "dst[c] = convertToDT(tmp[c]);\n"
3000 "#endif\n"
3001 "}\n"
3002 "}\n"
3003 "#else\n"
3004 "__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,\n"
3005 "__global uchar * dstptr, int dst_step, int dst_offset\n"
3006 "#ifdef OCL_CV_REDUCE_AVG\n"
3007 ", float fscale\n"
3008 "#endif\n"
3009 ")\n"
3010 "{\n"
3011 "#if dim == 0\n"
3012 "int x = get_global_id(0);\n"
3013 "if (x < cols)\n"
3014 "{\n"
3015 "int src_index = mad24(x, (int)sizeof(srcT) * cn, src_offset);\n"
3016 "int dst_index = mad24(x, (int)sizeof(dstT0) * cn, dst_offset);\n"
3017 "__global dstT0 * dst = (__global dstT0 *)(dstptr + dst_index);\n"
3018 "dstT tmp[cn];\n"
3019 "#pragma unroll\n"
3020 "for (int c = 0; c < cn; ++c)\n"
3021 "tmp[c] = INIT_VALUE;\n"
3022 "for (int y = 0; y < rows; ++y, src_index += src_step)\n"
3023 "{\n"
3024 "__global const srcT * src = (__global const srcT *)(srcptr + src_index);\n"
3025 "#pragma unroll\n"
3026 "for (int c = 0; c < cn; ++c)\n"
3027 "{\n"
3028 "dstT value = convertToDT(src[c]);\n"
3029 "PROCESS_ELEM(tmp[c], value);\n"
3030 "}\n"
3031 "}\n"
3032 "#pragma unroll\n"
3033 "for (int c = 0; c < cn; ++c)\n"
3034 "#ifdef OCL_CV_REDUCE_AVG\n"
3035 "dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale);\n"
3036 "#else\n"
3037 "dst[c] = convertToDT0(tmp[c]);\n"
3038 "#endif\n"
3039 "}\n"
3040 "#elif dim == 1\n"
3041 "int y = get_global_id(0);\n"
3042 "if (y < rows)\n"
3043 "{\n"
3044 "int src_index = mad24(y, src_step, src_offset);\n"
3045 "int dst_index = mad24(y, dst_step, dst_offset);\n"
3046 "__global const srcT * src = (__global const srcT *)(srcptr + src_index);\n"
3047 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n"
3048 "dstT tmp[cn];\n"
3049 "#pragma unroll\n"
3050 "for (int c = 0; c < cn; ++c)\n"
3051 "tmp[c] = INIT_VALUE;\n"
3052 "for (int x = 0; x < cols; ++x, src += cn)\n"
3053 "{\n"
3054 "#pragma unroll\n"
3055 "for (int c = 0; c < cn; ++c)\n"
3056 "{\n"
3057 "dstT value = convertToDT(src[c]);\n"
3058 "PROCESS_ELEM(tmp[c], value);\n"
3059 "}\n"
3060 "}\n"
3061 "#pragma unroll\n"
3062 "for (int c = 0; c < cn; ++c)\n"
3063 "#ifdef OCL_CV_REDUCE_AVG\n"
3064 "dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale);\n"
3065 "#else\n"
3066 "dst[c] = convertToDT0(tmp[c]);\n"
3067 "#endif\n"
3068 "}\n"
3069 "#else\n"
3070 "#error \"Dims must be either 0 or 1\"\n"
3071 "#endif\n"
3072 "}\n"
3073 "#endif\n"
3074 , "675811294a4da68880f2ace25764d371"};
3075 ProgramSource reduce2_oclsrc(reduce2.programStr);
3076 const struct ProgramEntry repeat={"repeat",
3077 "#if cn != 3\n"
3078 "#define loadpix(addr) *(__global const T *)(addr)\n"
3079 "#define storepix(val, addr)  *(__global T *)(addr) = val\n"
3080 "#define TSIZE (int)sizeof(T)\n"
3081 "#else\n"
3082 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
3083 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
3084 "#define TSIZE ((int)sizeof(T1)*3)\n"
3085 "#endif\n"
3086 "__kernel void repeat(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
3087 "__global uchar * dstptr, int dst_step, int dst_offset)\n"
3088 "{\n"
3089 "int x = get_global_id(0);\n"
3090 "int y0 = get_global_id(1) * rowsPerWI;\n"
3091 "if (x < src_cols)\n"
3092 "{\n"
3093 "int src_index = mad24(y0, src_step, mad24(x, (int)sizeof(T), src_offset));\n"
3094 "int dst_index0 = mad24(y0, dst_step, mad24(x, (int)sizeof(T), dst_offset));\n"
3095 "for (int y = y0, y1 = min(src_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index0 += dst_step)\n"
3096 "{\n"
3097 "T srcelem = loadpix(srcptr + src_index);\n"
3098 "#pragma unroll\n"
3099 "for (int ey = 0; ey < ny; ++ey)\n"
3100 "{\n"
3101 "int dst_index = mad24(ey * src_rows, dst_step, dst_index0);\n"
3102 "#pragma unroll\n"
3103 "for (int ex = 0; ex < nx; ++ex)\n"
3104 "{\n"
3105 "storepix(srcelem, dstptr + dst_index);\n"
3106 "dst_index = mad24(src_cols, (int)sizeof(T), dst_index);\n"
3107 "}\n"
3108 "}\n"
3109 "}\n"
3110 "}\n"
3111 "}\n"
3112 , "d7a6b479ac9abf39f50a2d86c0b50863"};
3113 ProgramSource repeat_oclsrc(repeat.programStr);
3114 const struct ProgramEntry set_identity={"set_identity",
3115 "#if kercn != 3\n"
3116 "#define storepix(val, addr)  *(__global T *)(addr) = val\n"
3117 "#define TSIZE (int)sizeof(T)\n"
3118 "#define scalar scalar_\n"
3119 "#else\n"
3120 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
3121 "#define TSIZE ((int)sizeof(T1)*3)\n"
3122 "#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)\n"
3123 "#endif\n"
3124 "__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,\n"
3125 "ST scalar_)\n"
3126 "{\n"
3127 "int x = get_global_id(0);\n"
3128 "int y0 = get_global_id(1) * rowsPerWI;\n"
3129 "if (x < cols)\n"
3130 "{\n"
3131 "int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));\n"
3132 "#if kercn == cn\n"
3133 "#pragma unroll\n"
3134 "for (int y = y0, i = 0, y1 = min(rows, y0 + rowsPerWI); i < rowsPerWI; ++y, ++i, src_index += src_step)\n"
3135 "if (y < y1)\n"
3136 "storepix(x == y ? scalar : (T)(0), srcptr + src_index);\n"
3137 "#elif kercn == 4 && cn == 1\n"
3138 "if (y0 < rows)\n"
3139 "{\n"
3140 "storepix(x == y0 >> 2 ? (T)(scalar, 0, 0, 0) : (T)(0), srcptr + src_index);\n"
3141 "if (++y0 < rows)\n"
3142 "{\n"
3143 "src_index += src_step;\n"
3144 "storepix(x == y0 >> 2 ? (T)(0, scalar, 0, 0) : (T)(0), srcptr + src_index);\n"
3145 "if (++y0 < rows)\n"
3146 "{\n"
3147 "src_index += src_step;\n"
3148 "storepix(x == y0 >> 2 ? (T)(0, 0, scalar, 0) : (T)(0), srcptr + src_index);\n"
3149 "if (++y0 < rows)\n"
3150 "{\n"
3151 "src_index += src_step;\n"
3152 "storepix(x == y0 >> 2 ? (T)(0, 0, 0, scalar) : (T)(0), srcptr + src_index);\n"
3153 "}\n"
3154 "}\n"
3155 "}\n"
3156 "}\n"
3157 "#else\n"
3158 "#error \"Incorrect combination of cn && kercn\"\n"
3159 "#endif\n"
3160 "}\n"
3161 "}\n"
3162 , "75020e8c1da6cf8aece6bd5cc5b9ed4f"};
3163 ProgramSource set_identity_oclsrc(set_identity.programStr);
3164 const struct ProgramEntry split_merge={"split_merge",
3165 "#ifdef OP_MERGE\n"
3166 "#define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,\n"
3167 "#define DECLARE_INDEX(index) int src##index##_index = mad24(src##index##_step, y0, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset));\n"
3168 "#define PROCESS_ELEM(index) \\\n"
3169 "__global const T * src##index = (__global const T *)(src##index##ptr + src##index##_index); \\\n"
3170 "dst[index] = src##index[0]; \\\n"
3171 "src##index##_index += src##index##_step;\n"
3172 "__kernel void merge(DECLARE_SRC_PARAMS_N\n"
3173 "__global uchar * dstptr, int dst_step, int dst_offset,\n"
3174 "int rows, int cols, int rowsPerWI)\n"
3175 "{\n"
3176 "int x = get_global_id(0);\n"
3177 "int y0 = get_global_id(1) * rowsPerWI;\n"
3178 "if (x < cols)\n"
3179 "{\n"
3180 "DECLARE_INDEX_N\n"
3181 "int dst_index = mad24(dst_step, y0, mad24(x, (int)sizeof(T) * cn, dst_offset));\n"
3182 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, dst_index += dst_step)\n"
3183 "{\n"
3184 "__global T * dst = (__global T *)(dstptr + dst_index);\n"
3185 "PROCESS_ELEMS_N\n"
3186 "}\n"
3187 "}\n"
3188 "}\n"
3189 "#elif defined OP_SPLIT\n"
3190 "#define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset\n"
3191 "#define DECLARE_INDEX(index) int dst##index##_index = mad24(y0, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset));\n"
3192 "#define PROCESS_ELEM(index) \\\n"
3193 "__global T * dst##index = (__global T *)(dst##index##ptr + dst##index##_index); \\\n"
3194 "dst##index[0] = src[index]; \\\n"
3195 "dst##index##_index += dst##index##_step;\n"
3196 "__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS, int rowsPerWI)\n"
3197 "{\n"
3198 "int x = get_global_id(0);\n"
3199 "int y0 = get_global_id(1) * rowsPerWI;\n"
3200 "if (x < cols)\n"
3201 "{\n"
3202 "DECLARE_INDEX_N\n"
3203 "int src_index = mad24(y0, src_step, mad24(x, cn * (int)sizeof(T), src_offset));\n"
3204 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)\n"
3205 "{\n"
3206 "__global const T * src = (__global const T *)(srcptr + src_index);\n"
3207 "PROCESS_ELEMS_N\n"
3208 "}\n"
3209 "}\n"
3210 "}\n"
3211 "#else\n"
3212 "#error \"No operation\"\n"
3213 "#endif\n"
3214 , "11e06966b3c2f2081fd02cf70337b495"};
3215 ProgramSource split_merge_oclsrc(split_merge.programStr);
3216 const struct ProgramEntry transpose={"transpose",
3217 "#if cn != 3\n"
3218 "#define loadpix(addr) *(__global const T *)(addr)\n"
3219 "#define storepix(val, addr)  *(__global T *)(addr) = val\n"
3220 "#define TSIZE (int)sizeof(T)\n"
3221 "#else\n"
3222 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
3223 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
3224 "#define TSIZE ((int)sizeof(T1)*3)\n"
3225 "#endif\n"
3226 "#ifndef INPLACE\n"
3227 "#define LDS_STEP      (TILE_DIM + 1)\n"
3228 "__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
3229 "__global uchar * dstptr, int dst_step, int dst_offset)\n"
3230 "{\n"
3231 "int gp_x = get_group_id(0),   gp_y = get_group_id(1);\n"
3232 "int gs_x = get_num_groups(0), gs_y = get_num_groups(1);\n"
3233 "int groupId_x, groupId_y;\n"
3234 "if (src_rows == src_cols)\n"
3235 "{\n"
3236 "groupId_y = gp_x;\n"
3237 "groupId_x = (gp_x + gp_y) % gs_x;\n"
3238 "}\n"
3239 "else\n"
3240 "{\n"
3241 "int bid = mad24(gs_x, gp_y, gp_x);\n"
3242 "groupId_y =  bid % gs_y;\n"
3243 "groupId_x = ((bid / gs_y) + groupId_y) % gs_x;\n"
3244 "}\n"
3245 "int lx = get_local_id(0);\n"
3246 "int ly = get_local_id(1);\n"
3247 "int x = mad24(groupId_x, TILE_DIM, lx);\n"
3248 "int y = mad24(groupId_y, TILE_DIM, ly);\n"
3249 "int x_index = mad24(groupId_y, TILE_DIM, lx);\n"
3250 "int y_index = mad24(groupId_x, TILE_DIM, ly);\n"
3251 "__local T tile[TILE_DIM * LDS_STEP];\n"
3252 "if (x < src_cols && y < src_rows)\n"
3253 "{\n"
3254 "int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset));\n"
3255 "#pragma unroll\n"
3256 "for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)\n"
3257 "if (y + i < src_rows)\n"
3258 "{\n"
3259 "tile[mad24(ly + i, LDS_STEP, lx)] = loadpix(srcptr + index_src);\n"
3260 "index_src = mad24(BLOCK_ROWS, src_step, index_src);\n"
3261 "}\n"
3262 "}\n"
3263 "barrier(CLK_LOCAL_MEM_FENCE);\n"
3264 "if (x_index < src_rows && y_index < src_cols)\n"
3265 "{\n"
3266 "int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset));\n"
3267 "#pragma unroll\n"
3268 "for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)\n"
3269 "if ((y_index + i) < src_cols)\n"
3270 "{\n"
3271 "storepix(tile[mad24(lx, LDS_STEP, ly + i)], dstptr + index_dst);\n"
3272 "index_dst = mad24(BLOCK_ROWS, dst_step, index_dst);\n"
3273 "}\n"
3274 "}\n"
3275 "}\n"
3276 "#else\n"
3277 "__kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows)\n"
3278 "{\n"
3279 "int x = get_global_id(0);\n"
3280 "int y = get_global_id(1) * rowsPerWI;\n"
3281 "if (x < y + rowsPerWI)\n"
3282 "{\n"
3283 "int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));\n"
3284 "int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset));\n"
3285 "T tmp;\n"
3286 "#pragma unroll\n"
3287 "for (int i = 0; i < rowsPerWI; ++i, ++y, src_index += src_step, dst_index += TSIZE)\n"
3288 "if (y < src_rows && x < y)\n"
3289 "{\n"
3290 "__global uchar * src = srcptr + src_index;\n"
3291 "__global uchar * dst = srcptr + dst_index;\n"
3292 "tmp = loadpix(dst);\n"
3293 "storepix(loadpix(src), dst);\n"
3294 "storepix(tmp, src);\n"
3295 "}\n"
3296 "}\n"
3297 "}\n"
3298 "#endif\n"
3299 , "f938bc7c686ae7bca004809df19ef032"};
3300 ProgramSource transpose_oclsrc(transpose.programStr);
3301 }
3302 }}
3303