1 // This file is auto-generated. Do not edit! 2 3 #include "precomp.hpp" 4 #include "opencl_kernels_core.hpp" 5 6 namespace cv 7 { 8 namespace ocl 9 { 10 namespace core 11 { 12 13 const struct ProgramEntry arithm={"arithm", 14 "#ifdef DOUBLE_SUPPORT\n" 15 "#ifdef cl_amd_fp64\n" 16 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 17 "#elif defined cl_khr_fp64\n" 18 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 19 "#endif\n" 20 "#endif\n" 21 "#ifdef INTEL_DEVICE\n" 22 "#pragma OPENCL FP_CONTRACT ON\n" 23 "#pragma OPENCL FP_FAST_FMAF ON\n" 24 "#pragma OPENCL FP_FAST_FMA ON\n" 25 "#endif\n" 26 "#if depth <= 5\n" 27 "#define CV_PI M_PI_F\n" 28 "#else\n" 29 "#define CV_PI M_PI\n" 30 "#endif\n" 31 "#ifndef cn\n" 32 "#define cn 1\n" 33 "#endif\n" 34 "#if cn == 1\n" 35 "#undef srcT1_C1\n" 36 "#undef srcT2_C1\n" 37 "#undef dstT_C1\n" 38 "#define srcT1_C1 srcT1\n" 39 "#define srcT2_C1 srcT2\n" 40 "#define dstT_C1 dstT\n" 41 "#endif\n" 42 "#if cn != 3\n" 43 "#define storedst(val) *(__global dstT *)(dstptr + dst_index) = val\n" 44 "#define storedst2(val) *(__global dstT *)(dstptr2 + dst_index2) = val\n" 45 "#else\n" 46 "#define storedst(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr + dst_index))\n" 47 "#define storedst2(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr2 + dst_index2))\n" 48 "#endif\n" 49 "#define noconvert\n" 50 "#ifndef workT\n" 51 "#ifndef srcT1\n" 52 "#define srcT1 dstT\n" 53 "#endif\n" 54 "#ifndef srcT1_C1\n" 55 "#define srcT1_C1 dstT_C1\n" 56 "#endif\n" 57 "#ifndef srcT2\n" 58 "#define srcT2 dstT\n" 59 "#endif\n" 60 "#ifndef srcT2_C1\n" 61 "#define srcT2_C1 dstT_C1\n" 62 "#endif\n" 63 "#define workT dstT\n" 64 "#if cn != 3\n" 65 "#define srcelem1 *(__global srcT1 *)(srcptr1 + src1_index)\n" 66 "#define srcelem2 *(__global srcT2 *)(srcptr2 + src2_index)\n" 67 "#else\n" 68 "#define srcelem1 vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index))\n" 69 "#define srcelem2 vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index))\n" 70 "#endif\n" 71 "#ifndef convertToDT\n" 72 "#define convertToDT noconvert\n" 73 "#endif\n" 74 "#else\n" 75 "#ifndef convertToWT2\n" 76 "#define convertToWT2 convertToWT1\n" 77 "#endif\n" 78 "#if cn != 3\n" 79 "#define srcelem1 convertToWT1(*(__global srcT1 *)(srcptr1 + src1_index))\n" 80 "#define srcelem2 convertToWT2(*(__global srcT2 *)(srcptr2 + src2_index))\n" 81 "#else\n" 82 "#define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index)))\n" 83 "#define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index)))\n" 84 "#endif\n" 85 "#endif\n" 86 "#ifndef workST\n" 87 "#define workST workT\n" 88 "#endif\n" 89 "#define EXTRA_PARAMS\n" 90 "#define EXTRA_INDEX\n" 91 "#define EXTRA_INDEX_ADD\n" 92 "#if defined OP_ADD\n" 93 "#define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))\n" 94 "#elif defined OP_SUB\n" 95 "#define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2))\n" 96 "#elif defined OP_RSUB\n" 97 "#define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1))\n" 98 "#elif defined OP_ABSDIFF\n" 99 "#if wdepth <= 4\n" 100 "#define PROCESS_ELEM \\\n" 101 "storedst(convertToDT(convertFromU(abs_diff(srcelem1, srcelem2))))\n" 102 "#else\n" 103 "#define PROCESS_ELEM \\\n" 104 "storedst(convertToDT(fabs(srcelem1 - srcelem2)))\n" 105 "#endif\n" 106 "#elif defined OP_AND\n" 107 "#define PROCESS_ELEM storedst(srcelem1 & srcelem2)\n" 108 "#elif defined OP_OR\n" 109 "#define PROCESS_ELEM storedst(srcelem1 | srcelem2)\n" 110 "#elif defined OP_XOR\n" 111 "#define PROCESS_ELEM storedst(srcelem1 ^ srcelem2)\n" 112 "#elif defined OP_NOT\n" 113 "#define PROCESS_ELEM storedst(~srcelem1)\n" 114 "#elif defined OP_MIN\n" 115 "#define PROCESS_ELEM storedst(min(srcelem1, srcelem2))\n" 116 "#elif defined OP_MAX\n" 117 "#define PROCESS_ELEM storedst(max(srcelem1, srcelem2))\n" 118 "#elif defined OP_MUL\n" 119 "#define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2))\n" 120 "#elif defined OP_MUL_SCALE\n" 121 "#undef EXTRA_PARAMS\n" 122 "#ifdef UNARY_OP\n" 123 "#define EXTRA_PARAMS , workST srcelem2_, scaleT scale\n" 124 "#undef srcelem2\n" 125 "#define srcelem2 srcelem2_\n" 126 "#else\n" 127 "#define EXTRA_PARAMS , scaleT scale\n" 128 "#endif\n" 129 "#define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2))\n" 130 "#elif defined OP_DIV\n" 131 "#define PROCESS_ELEM \\\n" 132 "workT e2 = srcelem2, zero = (workT)(0); \\\n" 133 "storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero))\n" 134 "#elif defined OP_DIV_SCALE\n" 135 "#undef EXTRA_PARAMS\n" 136 "#ifdef UNARY_OP\n" 137 "#define EXTRA_PARAMS , workST srcelem2_, scaleT scale\n" 138 "#undef srcelem2\n" 139 "#define srcelem2 srcelem2_\n" 140 "#else\n" 141 "#define EXTRA_PARAMS , scaleT scale\n" 142 "#endif\n" 143 "#define PROCESS_ELEM \\\n" 144 "workT e2 = srcelem2, zero = (workT)(0); \\\n" 145 "storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)))\n" 146 "#elif defined OP_RDIV_SCALE\n" 147 "#undef EXTRA_PARAMS\n" 148 "#ifdef UNARY_OP\n" 149 "#define EXTRA_PARAMS , workST srcelem2_, scaleT scale\n" 150 "#undef srcelem2\n" 151 "#define srcelem2 srcelem2_\n" 152 "#else\n" 153 "#define EXTRA_PARAMS , scaleT scale\n" 154 "#endif\n" 155 "#define PROCESS_ELEM \\\n" 156 "workT e1 = srcelem1, zero = (workT)(0); \\\n" 157 "storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)))\n" 158 "#elif defined OP_RECIP_SCALE\n" 159 "#undef EXTRA_PARAMS\n" 160 "#define EXTRA_PARAMS , scaleT scale\n" 161 "#define PROCESS_ELEM \\\n" 162 "workT e1 = srcelem1, zero = (workT)(0); \\\n" 163 "storedst(convertToDT(e1 != zero ? scale / e1 : zero))\n" 164 "#elif defined OP_ADDW\n" 165 "#undef EXTRA_PARAMS\n" 166 "#define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma\n" 167 "#if wdepth <= 4\n" 168 "#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma))))\n" 169 "#else\n" 170 "#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, fma(srcelem2, beta, gamma))))\n" 171 "#endif\n" 172 "#elif defined OP_MAG\n" 173 "#define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2))\n" 174 "#elif defined OP_PHASE_RADIANS\n" 175 "#define PROCESS_ELEM \\\n" 176 "workT tmp = atan2(srcelem2, srcelem1); \\\n" 177 "if (tmp < 0) \\\n" 178 "tmp += 2 * CV_PI; \\\n" 179 "storedst(tmp)\n" 180 "#elif defined OP_PHASE_DEGREES\n" 181 "#define PROCESS_ELEM \\\n" 182 "workT tmp = degrees(atan2(srcelem2, srcelem1)); \\\n" 183 "if (tmp < 0) \\\n" 184 "tmp += 360; \\\n" 185 "storedst(tmp)\n" 186 "#elif defined OP_EXP\n" 187 "#if wdepth == 5\n" 188 "#define PROCESS_ELEM storedst(native_exp(srcelem1))\n" 189 "#else\n" 190 "#define PROCESS_ELEM storedst(exp(srcelem1))\n" 191 "#endif\n" 192 "#elif defined OP_POW\n" 193 "#define PROCESS_ELEM storedst(pow(srcelem1, srcelem2))\n" 194 "#elif defined OP_POWN\n" 195 "#undef workT\n" 196 "#define workT int\n" 197 "#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2))\n" 198 "#elif defined OP_SQRT\n" 199 "#if depth <= 5\n" 200 "#define PROCESS_ELEM storedst(native_sqrt(srcelem1))\n" 201 "#else\n" 202 "#define PROCESS_ELEM storedst(sqrt(srcelem1))\n" 203 "#endif\n" 204 "#elif defined OP_LOG\n" 205 "#define PROCESS_ELEM \\\n" 206 "storedst(log(fabs(srcelem1)))\n" 207 "#elif defined OP_CMP\n" 208 "#define srcT2 srcT1\n" 209 "#ifndef convertToWT1\n" 210 "#define convertToWT1\n" 211 "#endif\n" 212 "#define PROCESS_ELEM \\\n" 213 "storedst(srcelem1 CMP_OPERATOR srcelem2 ? (dstT)(255) : (dstT)(0))\n" 214 "#elif defined OP_CONVERT_SCALE_ABS\n" 215 "#undef EXTRA_PARAMS\n" 216 "#define EXTRA_PARAMS , workT1 alpha, workT1 beta\n" 217 "#if wdepth <= 4\n" 218 "#define PROCESS_ELEM \\\n" 219 "workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \\\n" 220 "storedst(convertToDT(abs(value)))\n" 221 "#else\n" 222 "#define PROCESS_ELEM \\\n" 223 "workT value = fma(srcelem1, (workT)(alpha), (workT)(beta)); \\\n" 224 "storedst(convertToDT(fabs(value)))\n" 225 "#endif\n" 226 "#elif defined OP_SCALE_ADD\n" 227 "#undef EXTRA_PARAMS\n" 228 "#define EXTRA_PARAMS , workT1 alpha\n" 229 "#if wdepth <= 4\n" 230 "#define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2)))\n" 231 "#else\n" 232 "#define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, (workT)(alpha), srcelem2)))\n" 233 "#endif\n" 234 "#elif defined OP_CTP_AD || defined OP_CTP_AR\n" 235 "#if depth <= 5\n" 236 "#define CV_EPSILON FLT_EPSILON\n" 237 "#else\n" 238 "#define CV_EPSILON DBL_EPSILON\n" 239 "#endif\n" 240 "#ifdef OP_CTP_AD\n" 241 "#define TO_DEGREE cartToPolar = degrees(cartToPolar);\n" 242 "#elif defined OP_CTP_AR\n" 243 "#define TO_DEGREE\n" 244 "#endif\n" 245 "#define PROCESS_ELEM \\\n" 246 "dstT x = srcelem1, y = srcelem2; \\\n" 247 "dstT x2 = x * x, y2 = y * y; \\\n" 248 "dstT magnitude = sqrt(x2 + y2); \\\n" 249 "dstT tmp = y >= 0 ? 0 : CV_PI * 2; \\\n" 250 "tmp = x < 0 ? CV_PI : tmp; \\\n" 251 "dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \\\n" 252 "dstT cartToPolar = y2 <= x2 ? x * y / mad((dstT)(0.28f), y2, x2 + CV_EPSILON) + tmp : (tmp1 - x * y / mad((dstT)(0.28f), x2, y2 + CV_EPSILON)); \\\n" 253 "TO_DEGREE \\\n" 254 "storedst(magnitude); \\\n" 255 "storedst2(cartToPolar)\n" 256 "#elif defined OP_PTC_AD || defined OP_PTC_AR\n" 257 "#ifdef OP_PTC_AD\n" 258 "#define FROM_DEGREE y = radians(y)\n" 259 "#else\n" 260 "#define FROM_DEGREE\n" 261 "#endif\n" 262 "#define PROCESS_ELEM \\\n" 263 "dstT x = srcelem1, y = srcelem2, cosval; \\\n" 264 "FROM_DEGREE; \\\n" 265 "storedst2(sincos(y, &cosval) * x); \\\n" 266 "storedst(cosval * x);\n" 267 "#elif defined OP_PATCH_NANS\n" 268 "#undef EXTRA_PARAMS\n" 269 "#define EXTRA_PARAMS , dstT val\n" 270 "#define PROCESS_ELEM \\\n" 271 "if (isnan(srcelem1)) \\\n" 272 "storedst(val)\n" 273 "#else\n" 274 "#error \"unknown op type\"\n" 275 "#endif\n" 276 "#if defined OP_CTP_AD || defined OP_CTP_AR || defined OP_PTC_AD || defined OP_PTC_AR\n" 277 "#undef EXTRA_PARAMS\n" 278 "#define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2\n" 279 "#undef EXTRA_INDEX\n" 280 "#define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))\n" 281 "#undef EXTRA_INDEX_ADD\n" 282 "#define EXTRA_INDEX_ADD dst_index2 += dststep2\n" 283 "#endif\n" 284 "#if defined UNARY_OP || defined MASK_UNARY_OP\n" 285 "#if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \\\n" 286 "defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \\\n" 287 "defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \\\n" 288 "defined OP_MUL || defined OP_DIV || defined OP_POWN || defined OP_POWR || defined OP_ROOTN\n" 289 "#undef EXTRA_PARAMS\n" 290 "#define EXTRA_PARAMS , workST srcelem2_\n" 291 "#undef srcelem2\n" 292 "#define srcelem2 srcelem2_\n" 293 "#endif\n" 294 "#if cn == 3\n" 295 "#undef srcelem2\n" 296 "#define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z)\n" 297 "#endif\n" 298 "#endif\n" 299 "#if defined BINARY_OP\n" 300 "__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,\n" 301 "__global const uchar * srcptr2, int srcstep2, int srcoffset2,\n" 302 "__global uchar * dstptr, int dststep, int dstoffset,\n" 303 "int rows, int cols EXTRA_PARAMS )\n" 304 "{\n" 305 "int x = get_global_id(0);\n" 306 "int y0 = get_global_id(1) * rowsPerWI;\n" 307 "if (x < cols)\n" 308 "{\n" 309 "int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));\n" 310 "#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))\n" 311 "int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));\n" 312 "#endif\n" 313 "int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));\n" 314 "EXTRA_INDEX;\n" 315 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)\n" 316 "{\n" 317 "PROCESS_ELEM;\n" 318 "#if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))\n" 319 "src2_index += srcstep2;\n" 320 "#endif\n" 321 "EXTRA_INDEX_ADD;\n" 322 "}\n" 323 "}\n" 324 "}\n" 325 "#elif defined MASK_BINARY_OP\n" 326 "__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,\n" 327 "__global const uchar * srcptr2, int srcstep2, int srcoffset2,\n" 328 "__global const uchar * mask, int maskstep, int maskoffset,\n" 329 "__global uchar * dstptr, int dststep, int dstoffset,\n" 330 "int rows, int cols EXTRA_PARAMS )\n" 331 "{\n" 332 "int x = get_global_id(0);\n" 333 "int y0 = get_global_id(1) * rowsPerWI;\n" 334 "if (x < cols)\n" 335 "{\n" 336 "int mask_index = mad24(y0, maskstep, x + maskoffset);\n" 337 "int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));\n" 338 "int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));\n" 339 "int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));\n" 340 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2,\n" 341 "mask_index += maskstep, dst_index += dststep)\n" 342 "if (mask[mask_index])\n" 343 "{\n" 344 "PROCESS_ELEM;\n" 345 "}\n" 346 "}\n" 347 "}\n" 348 "#elif defined UNARY_OP\n" 349 "__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,\n" 350 "__global uchar * dstptr, int dststep, int dstoffset,\n" 351 "int rows, int cols EXTRA_PARAMS )\n" 352 "{\n" 353 "int x = get_global_id(0);\n" 354 "int y0 = get_global_id(1) * rowsPerWI;\n" 355 "if (x < cols)\n" 356 "{\n" 357 "int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));\n" 358 "int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));\n" 359 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)\n" 360 "{\n" 361 "PROCESS_ELEM;\n" 362 "}\n" 363 "}\n" 364 "}\n" 365 "#elif defined MASK_UNARY_OP\n" 366 "__kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,\n" 367 "__global const uchar * mask, int maskstep, int maskoffset,\n" 368 "__global uchar * dstptr, int dststep, int dstoffset,\n" 369 "int rows, int cols EXTRA_PARAMS )\n" 370 "{\n" 371 "int x = get_global_id(0);\n" 372 "int y0 = get_global_id(1) * rowsPerWI;\n" 373 "if (x < cols)\n" 374 "{\n" 375 "int mask_index = mad24(y0, maskstep, x + maskoffset);\n" 376 "int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));\n" 377 "int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));\n" 378 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep)\n" 379 "if (mask[mask_index])\n" 380 "{\n" 381 "PROCESS_ELEM;\n" 382 "}\n" 383 "}\n" 384 "}\n" 385 "#else\n" 386 "#error \"Unknown operation type\"\n" 387 "#endif\n" 388 , "ed0d3325149acdde26581a72052260a5"}; 389 ProgramSource arithm_oclsrc(arithm.programStr); 390 const struct ProgramEntry convert={"convert", 391 "#ifdef DOUBLE_SUPPORT\n" 392 "#ifdef cl_amd_fp64\n" 393 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 394 "#elif defined (cl_khr_fp64)\n" 395 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 396 "#endif\n" 397 "#endif\n" 398 "#define noconvert\n" 399 "__kernel void convertTo(__global const uchar * srcptr, int src_step, int src_offset,\n" 400 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" 401 "WT alpha, WT beta, int rowsPerWI)\n" 402 "{\n" 403 "int x = get_global_id(0);\n" 404 "int y0 = get_global_id(1) * rowsPerWI;\n" 405 "if (x < dst_cols)\n" 406 "{\n" 407 "int src_index = mad24(y0, src_step, mad24(x, (int)sizeof(srcT), src_offset));\n" 408 "int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));\n" 409 "for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index += dst_step)\n" 410 "{\n" 411 "__global const srcT * src = (__global const srcT *)(srcptr + src_index);\n" 412 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n" 413 "dst[0] = convertToDT(fma(convertToWT(src[0]), alpha, beta));\n" 414 "}\n" 415 "}\n" 416 "}\n" 417 , "04c460dd0be049cd132d4c4611c93a5b"}; 418 ProgramSource convert_oclsrc(convert.programStr); 419 const struct ProgramEntry copymakeborder={"copymakeborder", 420 "#ifdef DOUBLE_SUPPORT\n" 421 "#ifdef cl_amd_fp64\n" 422 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 423 "#elif defined (cl_khr_fp64)\n" 424 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 425 "#endif\n" 426 "#endif\n" 427 "#if cn != 3\n" 428 "#define loadpix(addr) *(__global const T*)(addr)\n" 429 "#define storepix(val, addr) *(__global T*)(addr) = val\n" 430 "#define TSIZE ((int)sizeof(T))\n" 431 "#define convertScalar(a) (a)\n" 432 "#else\n" 433 "#define loadpix(addr) vload3(0, (__global const T1*)(addr))\n" 434 "#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))\n" 435 "#define TSIZE ((int)sizeof(T1)*3)\n" 436 "#define convertScalar(a) (T)(a.x, a.y, a.z)\n" 437 "#endif\n" 438 "#ifdef BORDER_CONSTANT\n" 439 "#define EXTRAPOLATE(x, cols) \\\n" 440 ";\n" 441 "#elif defined BORDER_REPLICATE\n" 442 "#define EXTRAPOLATE(x, cols) \\\n" 443 "x = clamp(x, 0, cols - 1);\n" 444 "#elif defined BORDER_WRAP\n" 445 "#define EXTRAPOLATE(x, cols) \\\n" 446 "{ \\\n" 447 "if (x < 0) \\\n" 448 "x -= ((x - cols + 1) / cols) * cols; \\\n" 449 "if (x >= cols) \\\n" 450 "x %= cols; \\\n" 451 "}\n" 452 "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n" 453 "#ifdef BORDER_REFLECT\n" 454 "#define DELTA int delta = 0\n" 455 "#else\n" 456 "#define DELTA int delta = 1\n" 457 "#endif\n" 458 "#define EXTRAPOLATE(x, cols) \\\n" 459 "{ \\\n" 460 "DELTA; \\\n" 461 "if (cols == 1) \\\n" 462 "x = 0; \\\n" 463 "else \\\n" 464 "do \\\n" 465 "{ \\\n" 466 "if( x < 0 ) \\\n" 467 "x = -x - 1 + delta; \\\n" 468 "else \\\n" 469 "x = cols - 1 - (x - cols) - delta; \\\n" 470 "} \\\n" 471 "while (x >= cols || x < 0); \\\n" 472 "}\n" 473 "#else\n" 474 "#error \"No extrapolation method\"\n" 475 "#endif\n" 476 "#define NEED_EXTRAPOLATION(x, cols) (x >= cols || x < 0)\n" 477 "__kernel void copyMakeBorder(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" 478 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" 479 "int top, int left, ST nVal)\n" 480 "{\n" 481 "int x = get_global_id(0);\n" 482 "int y0 = get_global_id(1) * rowsPerWI;\n" 483 "#ifdef BORDER_CONSTANT\n" 484 "T scalar = convertScalar(nVal);\n" 485 "#endif\n" 486 "if (x < dst_cols)\n" 487 "{\n" 488 "int src_x = x - left, src_y;\n" 489 "int dst_index = mad24(y0, dst_step, mad24(x, (int)TSIZE, dst_offset));\n" 490 "if (NEED_EXTRAPOLATION(src_x, src_cols))\n" 491 "{\n" 492 "#ifdef BORDER_CONSTANT\n" 493 "for (int y = y0, y1 = min(y0 + rowsPerWI, dst_rows); y < y1; ++y, dst_index += dst_step)\n" 494 "storepix(scalar, dstptr + dst_index);\n" 495 "return;\n" 496 "#endif\n" 497 "EXTRAPOLATE(src_x, src_cols)\n" 498 "}\n" 499 "src_x = mad24(src_x, TSIZE, src_offset);\n" 500 "for (int y = y0, y1 = min(y0 + rowsPerWI, dst_rows); y < y1; ++y, dst_index += dst_step)\n" 501 "{\n" 502 "src_y = y - top;\n" 503 "if (NEED_EXTRAPOLATION(src_y, src_rows))\n" 504 "{\n" 505 "EXTRAPOLATE(src_y, src_rows)\n" 506 "#ifdef BORDER_CONSTANT\n" 507 "storepix(scalar, dstptr + dst_index);\n" 508 "continue;\n" 509 "#endif\n" 510 "}\n" 511 "int src_index = mad24(src_y, src_step, src_x);\n" 512 "storepix(loadpix(srcptr + src_index), dstptr + dst_index);\n" 513 "}\n" 514 "}\n" 515 "}\n" 516 , "64f03714b8763ec6c2ac2f4b2ad0cf5d"}; 517 ProgramSource copymakeborder_oclsrc(copymakeborder.programStr); 518 const struct ProgramEntry copyset={"copyset", 519 "#ifdef COPY_TO_MASK\n" 520 "#define DEFINE_DATA \\\n" 521 "int src_index = mad24(y, src_step, mad24(x, (int)sizeof(T1) * scn, src_offset)); \\\n" 522 "int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(T1) * scn, dst_offset)); \\\n" 523 "\\\n" 524 "__global const T1 * src = (__global const T1 *)(srcptr + src_index); \\\n" 525 "__global T1 * dst = (__global T1 *)(dstptr + dst_index)\n" 526 "__kernel void copyToMask(__global const uchar * srcptr, int src_step, int src_offset,\n" 527 "__global const uchar * mask, int mask_step, int mask_offset,\n" 528 "__global uchar * dstptr, int dst_step, int dst_offset,\n" 529 "int dst_rows, int dst_cols)\n" 530 "{\n" 531 "int x = get_global_id(0);\n" 532 "int y = get_global_id(1);\n" 533 "if (x < dst_cols && y < dst_rows)\n" 534 "{\n" 535 "mask += mad24(y, mask_step, mad24(x, mcn, mask_offset));\n" 536 "#if mcn == 1\n" 537 "if (mask[0])\n" 538 "{\n" 539 "DEFINE_DATA;\n" 540 "#pragma unroll\n" 541 "for (int c = 0; c < scn; ++c)\n" 542 "dst[c] = src[c];\n" 543 "}\n" 544 "#ifdef HAVE_DST_UNINIT\n" 545 "else\n" 546 "{\n" 547 "DEFINE_DATA;\n" 548 "#pragma unroll\n" 549 "for (int c = 0; c < scn; ++c)\n" 550 "dst[c] = (T1)(0);\n" 551 "}\n" 552 "#endif\n" 553 "#elif scn == mcn\n" 554 "DEFINE_DATA;\n" 555 "#pragma unroll\n" 556 "for (int c = 0; c < scn; ++c)\n" 557 "if (mask[c])\n" 558 "dst[c] = src[c];\n" 559 "#ifdef HAVE_DST_UNINIT\n" 560 "else\n" 561 "dst[c] = (T1)(0);\n" 562 "#endif\n" 563 "#else\n" 564 "#error \"(mcn == 1 || mcn == scn) should be true\"\n" 565 "#endif\n" 566 "}\n" 567 "}\n" 568 "#else\n" 569 "#ifndef dstST\n" 570 "#define dstST dstT\n" 571 "#endif\n" 572 "#if cn != 3\n" 573 "#define value value_\n" 574 "#define storedst(val) *(__global dstT *)(dstptr + dst_index) = val\n" 575 "#else\n" 576 "#define value (dstT)(value_.x, value_.y, value_.z)\n" 577 "#define storedst(val) vstore3(val, 0, (__global dstT1 *)(dstptr + dst_index))\n" 578 "#endif\n" 579 "__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,\n" 580 "__global uchar* dstptr, int dststep, int dstoffset,\n" 581 "int rows, int cols, dstST value_)\n" 582 "{\n" 583 "int x = get_global_id(0);\n" 584 "int y0 = get_global_id(1) * rowsPerWI;\n" 585 "if (x < cols)\n" 586 "{\n" 587 "int mask_index = mad24(y0, maskstep, x + maskoffset);\n" 588 "int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT1) * cn, dstoffset));\n" 589 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y)\n" 590 "{\n" 591 "if( mask[mask_index] )\n" 592 "storedst(value);\n" 593 "mask_index += maskstep;\n" 594 "dst_index += dststep;\n" 595 "}\n" 596 "}\n" 597 "}\n" 598 "__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,\n" 599 "int rows, int cols, dstST value_)\n" 600 "{\n" 601 "int x = get_global_id(0);\n" 602 "int y0 = get_global_id(1) * rowsPerWI;\n" 603 "if (x < cols)\n" 604 "{\n" 605 "int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT1) * cn, dstoffset));\n" 606 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, dst_index += dststep)\n" 607 "storedst(value);\n" 608 "}\n" 609 "}\n" 610 "#endif\n" 611 , "f8f028f1776dc5c98bf03411d3b72318"}; 612 ProgramSource copyset_oclsrc(copyset.programStr); 613 const struct ProgramEntry fft={"fft", 614 "#define SQRT_2 0.707106781188f\n" 615 "#define sin_120 0.866025403784f\n" 616 "#define fft5_2 0.559016994374f\n" 617 "#define fft5_3 -0.951056516295f\n" 618 "#define fft5_4 -1.538841768587f\n" 619 "#define fft5_5 0.363271264002f\n" 620 "#ifdef DOUBLE_SUPPORT\n" 621 "#ifdef cl_amd_fp64\n" 622 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 623 "#elif defined (cl_khr_fp64)\n" 624 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 625 "#endif\n" 626 "#endif\n" 627 "__attribute__((always_inline))\n" 628 "CT mul_complex(CT a, CT b) {\n" 629 "return (CT)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));\n" 630 "}\n" 631 "__attribute__((always_inline))\n" 632 "CT twiddle(CT a) {\n" 633 "return (CT)(a.y, -a.x);\n" 634 "}\n" 635 "__attribute__((always_inline))\n" 636 "void butterfly2(CT a0, CT a1, __local CT* smem, __global const CT* twiddles,\n" 637 "const int x, const int block_size)\n" 638 "{\n" 639 "const int k = x & (block_size - 1);\n" 640 "a1 = mul_complex(twiddles[k], a1);\n" 641 "const int dst_ind = (x << 1) - k;\n" 642 "smem[dst_ind] = a0 + a1;\n" 643 "smem[dst_ind+block_size] = a0 - a1;\n" 644 "}\n" 645 "__attribute__((always_inline))\n" 646 "void butterfly4(CT a0, CT a1, CT a2, CT a3, __local CT* smem, __global const CT* twiddles,\n" 647 "const int x, const int block_size)\n" 648 "{\n" 649 "const int k = x & (block_size - 1);\n" 650 "a1 = mul_complex(twiddles[k], a1);\n" 651 "a2 = mul_complex(twiddles[k + block_size], a2);\n" 652 "a3 = mul_complex(twiddles[k + 2*block_size], a3);\n" 653 "const int dst_ind = ((x - k) << 2) + k;\n" 654 "CT b0 = a0 + a2;\n" 655 "a2 = a0 - a2;\n" 656 "CT b1 = a1 + a3;\n" 657 "a3 = twiddle(a1 - a3);\n" 658 "smem[dst_ind] = b0 + b1;\n" 659 "smem[dst_ind + block_size] = a2 + a3;\n" 660 "smem[dst_ind + 2*block_size] = b0 - b1;\n" 661 "smem[dst_ind + 3*block_size] = a2 - a3;\n" 662 "}\n" 663 "__attribute__((always_inline))\n" 664 "void butterfly3(CT a0, CT a1, CT a2, __local CT* smem, __global const CT* twiddles,\n" 665 "const int x, const int block_size)\n" 666 "{\n" 667 "const int k = x % block_size;\n" 668 "a1 = mul_complex(twiddles[k], a1);\n" 669 "a2 = mul_complex(twiddles[k+block_size], a2);\n" 670 "const int dst_ind = ((x - k) * 3) + k;\n" 671 "CT b1 = a1 + a2;\n" 672 "a2 = twiddle(sin_120*(a1 - a2));\n" 673 "CT b0 = a0 - (CT)(0.5f)*b1;\n" 674 "smem[dst_ind] = a0 + b1;\n" 675 "smem[dst_ind + block_size] = b0 + a2;\n" 676 "smem[dst_ind + 2*block_size] = b0 - a2;\n" 677 "}\n" 678 "__attribute__((always_inline))\n" 679 "void butterfly5(CT a0, CT a1, CT a2, CT a3, CT a4, __local CT* smem, __global const CT* twiddles,\n" 680 "const int x, const int block_size)\n" 681 "{\n" 682 "const int k = x % block_size;\n" 683 "a1 = mul_complex(twiddles[k], a1);\n" 684 "a2 = mul_complex(twiddles[k + block_size], a2);\n" 685 "a3 = mul_complex(twiddles[k+2*block_size], a3);\n" 686 "a4 = mul_complex(twiddles[k+3*block_size], a4);\n" 687 "const int dst_ind = ((x - k) * 5) + k;\n" 688 "__local CT* dst = smem + dst_ind;\n" 689 "CT b0, b1, b5;\n" 690 "b1 = a1 + a4;\n" 691 "a1 -= a4;\n" 692 "a4 = a3 + a2;\n" 693 "a3 -= a2;\n" 694 "a2 = b1 + a4;\n" 695 "b0 = a0 - (CT)0.25f * a2;\n" 696 "b1 = fft5_2 * (b1 - a4);\n" 697 "a4 = fft5_3 * (CT)(-a1.y - a3.y, a1.x + a3.x);\n" 698 "b5 = (CT)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);\n" 699 "a4.x += fft5_4 * a3.y;\n" 700 "a4.y -= fft5_4 * a3.x;\n" 701 "a1 = b0 + b1;\n" 702 "b0 -= b1;\n" 703 "dst[0] = a0 + a2;\n" 704 "dst[block_size] = a1 + a4;\n" 705 "dst[2 * block_size] = b0 + b5;\n" 706 "dst[3 * block_size] = b0 - b5;\n" 707 "dst[4 * block_size] = a1 - a4;\n" 708 "}\n" 709 "__attribute__((always_inline))\n" 710 "void fft_radix2(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n" 711 "{\n" 712 "CT a0, a1;\n" 713 "if (x < t)\n" 714 "{\n" 715 "a0 = smem[x];\n" 716 "a1 = smem[x+t];\n" 717 "}\n" 718 "barrier(CLK_LOCAL_MEM_FENCE);\n" 719 "if (x < t)\n" 720 "butterfly2(a0, a1, smem, twiddles, x, block_size);\n" 721 "barrier(CLK_LOCAL_MEM_FENCE);\n" 722 "}\n" 723 "__attribute__((always_inline))\n" 724 "void fft_radix2_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 725 "{\n" 726 "const int x2 = x1 + t/2;\n" 727 "CT a0, a1, a2, a3;\n" 728 "if (x1 < t/2)\n" 729 "{\n" 730 "a0 = smem[x1]; a1 = smem[x1+t];\n" 731 "a2 = smem[x2]; a3 = smem[x2+t];\n" 732 "}\n" 733 "barrier(CLK_LOCAL_MEM_FENCE);\n" 734 "if (x1 < t/2)\n" 735 "{\n" 736 "butterfly2(a0, a1, smem, twiddles, x1, block_size);\n" 737 "butterfly2(a2, a3, smem, twiddles, x2, block_size);\n" 738 "}\n" 739 "barrier(CLK_LOCAL_MEM_FENCE);\n" 740 "}\n" 741 "__attribute__((always_inline))\n" 742 "void fft_radix2_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 743 "{\n" 744 "const int x2 = x1 + t/3;\n" 745 "const int x3 = x1 + 2*t/3;\n" 746 "CT a0, a1, a2, a3, a4, a5;\n" 747 "if (x1 < t/3)\n" 748 "{\n" 749 "a0 = smem[x1]; a1 = smem[x1+t];\n" 750 "a2 = smem[x2]; a3 = smem[x2+t];\n" 751 "a4 = smem[x3]; a5 = smem[x3+t];\n" 752 "}\n" 753 "barrier(CLK_LOCAL_MEM_FENCE);\n" 754 "if (x1 < t/3)\n" 755 "{\n" 756 "butterfly2(a0, a1, smem, twiddles, x1, block_size);\n" 757 "butterfly2(a2, a3, smem, twiddles, x2, block_size);\n" 758 "butterfly2(a4, a5, smem, twiddles, x3, block_size);\n" 759 "}\n" 760 "barrier(CLK_LOCAL_MEM_FENCE);\n" 761 "}\n" 762 "__attribute__((always_inline))\n" 763 "void fft_radix2_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 764 "{\n" 765 "const int thread_block = t/4;\n" 766 "const int x2 = x1 + thread_block;\n" 767 "const int x3 = x1 + 2*thread_block;\n" 768 "const int x4 = x1 + 3*thread_block;\n" 769 "CT a0, a1, a2, a3, a4, a5, a6, a7;\n" 770 "if (x1 < t/4)\n" 771 "{\n" 772 "a0 = smem[x1]; a1 = smem[x1+t];\n" 773 "a2 = smem[x2]; a3 = smem[x2+t];\n" 774 "a4 = smem[x3]; a5 = smem[x3+t];\n" 775 "a6 = smem[x4]; a7 = smem[x4+t];\n" 776 "}\n" 777 "barrier(CLK_LOCAL_MEM_FENCE);\n" 778 "if (x1 < t/4)\n" 779 "{\n" 780 "butterfly2(a0, a1, smem, twiddles, x1, block_size);\n" 781 "butterfly2(a2, a3, smem, twiddles, x2, block_size);\n" 782 "butterfly2(a4, a5, smem, twiddles, x3, block_size);\n" 783 "butterfly2(a6, a7, smem, twiddles, x4, block_size);\n" 784 "}\n" 785 "barrier(CLK_LOCAL_MEM_FENCE);\n" 786 "}\n" 787 "__attribute__((always_inline))\n" 788 "void fft_radix2_B5(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 789 "{\n" 790 "const int thread_block = t/5;\n" 791 "const int x2 = x1 + thread_block;\n" 792 "const int x3 = x1 + 2*thread_block;\n" 793 "const int x4 = x1 + 3*thread_block;\n" 794 "const int x5 = x1 + 4*thread_block;\n" 795 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;\n" 796 "if (x1 < t/5)\n" 797 "{\n" 798 "a0 = smem[x1]; a1 = smem[x1+t];\n" 799 "a2 = smem[x2]; a3 = smem[x2+t];\n" 800 "a4 = smem[x3]; a5 = smem[x3+t];\n" 801 "a6 = smem[x4]; a7 = smem[x4+t];\n" 802 "a8 = smem[x5]; a9 = smem[x5+t];\n" 803 "}\n" 804 "barrier(CLK_LOCAL_MEM_FENCE);\n" 805 "if (x1 < t/5)\n" 806 "{\n" 807 "butterfly2(a0, a1, smem, twiddles, x1, block_size);\n" 808 "butterfly2(a2, a3, smem, twiddles, x2, block_size);\n" 809 "butterfly2(a4, a5, smem, twiddles, x3, block_size);\n" 810 "butterfly2(a6, a7, smem, twiddles, x4, block_size);\n" 811 "butterfly2(a8, a9, smem, twiddles, x5, block_size);\n" 812 "}\n" 813 "barrier(CLK_LOCAL_MEM_FENCE);\n" 814 "}\n" 815 "__attribute__((always_inline))\n" 816 "void fft_radix4(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n" 817 "{\n" 818 "CT a0, a1, a2, a3;\n" 819 "if (x < t)\n" 820 "{\n" 821 "a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];\n" 822 "}\n" 823 "barrier(CLK_LOCAL_MEM_FENCE);\n" 824 "if (x < t)\n" 825 "butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size);\n" 826 "barrier(CLK_LOCAL_MEM_FENCE);\n" 827 "}\n" 828 "__attribute__((always_inline))\n" 829 "void fft_radix4_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 830 "{\n" 831 "const int x2 = x1 + t/2;\n" 832 "CT a0, a1, a2, a3, a4, a5, a6, a7;\n" 833 "if (x1 < t/2)\n" 834 "{\n" 835 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];\n" 836 "a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];\n" 837 "}\n" 838 "barrier(CLK_LOCAL_MEM_FENCE);\n" 839 "if (x1 < t/2)\n" 840 "{\n" 841 "butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);\n" 842 "butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);\n" 843 "}\n" 844 "barrier(CLK_LOCAL_MEM_FENCE);\n" 845 "}\n" 846 "__attribute__((always_inline))\n" 847 "void fft_radix4_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 848 "{\n" 849 "const int x2 = x1 + t/3;\n" 850 "const int x3 = x2 + t/3;\n" 851 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;\n" 852 "if (x1 < t/3)\n" 853 "{\n" 854 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];\n" 855 "a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];\n" 856 "a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];\n" 857 "}\n" 858 "barrier(CLK_LOCAL_MEM_FENCE);\n" 859 "if (x1 < t/3)\n" 860 "{\n" 861 "butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);\n" 862 "butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);\n" 863 "butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size);\n" 864 "}\n" 865 "barrier(CLK_LOCAL_MEM_FENCE);\n" 866 "}\n" 867 "__attribute__((always_inline))\n" 868 "void fft_radix8(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n" 869 "{\n" 870 "const int k = x % block_size;\n" 871 "CT a0, a1, a2, a3, a4, a5, a6, a7;\n" 872 "if (x < t)\n" 873 "{\n" 874 "int tw_ind = block_size / 8;\n" 875 "a0 = smem[x];\n" 876 "a1 = mul_complex(twiddles[k], smem[x + t]);\n" 877 "a2 = mul_complex(twiddles[k + block_size],smem[x+2*t]);\n" 878 "a3 = mul_complex(twiddles[k+2*block_size],smem[x+3*t]);\n" 879 "a4 = mul_complex(twiddles[k+3*block_size],smem[x+4*t]);\n" 880 "a5 = mul_complex(twiddles[k+4*block_size],smem[x+5*t]);\n" 881 "a6 = mul_complex(twiddles[k+5*block_size],smem[x+6*t]);\n" 882 "a7 = mul_complex(twiddles[k+6*block_size],smem[x+7*t]);\n" 883 "CT b0, b1, b6, b7;\n" 884 "b0 = a0 + a4;\n" 885 "a4 = a0 - a4;\n" 886 "b1 = a1 + a5;\n" 887 "a5 = a1 - a5;\n" 888 "a5 = (CT)(SQRT_2) * (CT)(a5.x + a5.y, -a5.x + a5.y);\n" 889 "b6 = twiddle(a2 - a6);\n" 890 "a2 = a2 + a6;\n" 891 "b7 = a3 - a7;\n" 892 "b7 = (CT)(SQRT_2) * (CT)(-b7.x + b7.y, -b7.x - b7.y);\n" 893 "a3 = a3 + a7;\n" 894 "a0 = b0 + a2;\n" 895 "a2 = b0 - a2;\n" 896 "a1 = b1 + a3;\n" 897 "a3 = twiddle(b1 - a3);\n" 898 "a6 = a4 - b6;\n" 899 "a4 = a4 + b6;\n" 900 "a7 = twiddle(a5 - b7);\n" 901 "a5 = a5 + b7;\n" 902 "}\n" 903 "barrier(CLK_LOCAL_MEM_FENCE);\n" 904 "if (x < t)\n" 905 "{\n" 906 "const int dst_ind = ((x - k) << 3) + k;\n" 907 "__local CT* dst = smem + dst_ind;\n" 908 "dst[0] = a0 + a1;\n" 909 "dst[block_size] = a4 + a5;\n" 910 "dst[2 * block_size] = a2 + a3;\n" 911 "dst[3 * block_size] = a6 + a7;\n" 912 "dst[4 * block_size] = a0 - a1;\n" 913 "dst[5 * block_size] = a4 - a5;\n" 914 "dst[6 * block_size] = a2 - a3;\n" 915 "dst[7 * block_size] = a6 - a7;\n" 916 "}\n" 917 "barrier(CLK_LOCAL_MEM_FENCE);\n" 918 "}\n" 919 "__attribute__((always_inline))\n" 920 "void fft_radix3(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n" 921 "{\n" 922 "CT a0, a1, a2;\n" 923 "if (x < t)\n" 924 "{\n" 925 "a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t];\n" 926 "}\n" 927 "barrier(CLK_LOCAL_MEM_FENCE);\n" 928 "if (x < t)\n" 929 "butterfly3(a0, a1, a2, smem, twiddles, x, block_size);\n" 930 "barrier(CLK_LOCAL_MEM_FENCE);\n" 931 "}\n" 932 "__attribute__((always_inline))\n" 933 "void fft_radix3_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 934 "{\n" 935 "const int x2 = x1 + t/2;\n" 936 "CT a0, a1, a2, a3, a4, a5;\n" 937 "if (x1 < t/2)\n" 938 "{\n" 939 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];\n" 940 "a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];\n" 941 "}\n" 942 "barrier(CLK_LOCAL_MEM_FENCE);\n" 943 "if (x1 < t/2)\n" 944 "{\n" 945 "butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);\n" 946 "butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);\n" 947 "}\n" 948 "barrier(CLK_LOCAL_MEM_FENCE);\n" 949 "}\n" 950 "__attribute__((always_inline))\n" 951 "void fft_radix3_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 952 "{\n" 953 "const int x2 = x1 + t/3;\n" 954 "const int x3 = x2 + t/3;\n" 955 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8;\n" 956 "if (x1 < t/3)\n" 957 "{\n" 958 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];\n" 959 "a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];\n" 960 "a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];\n" 961 "}\n" 962 "barrier(CLK_LOCAL_MEM_FENCE);\n" 963 "if (x1 < t/3)\n" 964 "{\n" 965 "butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);\n" 966 "butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);\n" 967 "butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);\n" 968 "}\n" 969 "barrier(CLK_LOCAL_MEM_FENCE);\n" 970 "}\n" 971 "__attribute__((always_inline))\n" 972 "void fft_radix3_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 973 "{\n" 974 "const int thread_block = t/4;\n" 975 "const int x2 = x1 + thread_block;\n" 976 "const int x3 = x1 + 2*thread_block;\n" 977 "const int x4 = x1 + 3*thread_block;\n" 978 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;\n" 979 "if (x1 < t/4)\n" 980 "{\n" 981 "a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];\n" 982 "a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];\n" 983 "a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];\n" 984 "a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t];\n" 985 "}\n" 986 "barrier(CLK_LOCAL_MEM_FENCE);\n" 987 "if (x1 < t/4)\n" 988 "{\n" 989 "butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);\n" 990 "butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);\n" 991 "butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);\n" 992 "butterfly3(a9, a10, a11, smem, twiddles, x4, block_size);\n" 993 "}\n" 994 "barrier(CLK_LOCAL_MEM_FENCE);\n" 995 "}\n" 996 "__attribute__((always_inline))\n" 997 "void fft_radix5(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)\n" 998 "{\n" 999 "const int k = x % block_size;\n" 1000 "CT a0, a1, a2, a3, a4;\n" 1001 "if (x < t)\n" 1002 "{\n" 1003 "a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];\n" 1004 "}\n" 1005 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1006 "if (x < t)\n" 1007 "butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size);\n" 1008 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1009 "}\n" 1010 "__attribute__((always_inline))\n" 1011 "void fft_radix5_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)\n" 1012 "{\n" 1013 "const int x2 = x1+t/2;\n" 1014 "CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;\n" 1015 "if (x1 < t/2)\n" 1016 "{\n" 1017 "a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];\n" 1018 "a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];\n" 1019 "}\n" 1020 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1021 "if (x1 < t/2)\n" 1022 "{\n" 1023 "butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size);\n" 1024 "butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size);\n" 1025 "}\n" 1026 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1027 "}\n" 1028 "#ifdef DFT_SCALE\n" 1029 "#define SCALE_VAL(x, scale) x*scale\n" 1030 "#else\n" 1031 "#define SCALE_VAL(x, scale) x\n" 1032 "#endif\n" 1033 "__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n" 1034 "__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" 1035 "__global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)\n" 1036 "{\n" 1037 "const int x = get_global_id(0);\n" 1038 "const int y = get_group_id(1);\n" 1039 "const int block_size = LOCAL_SIZE/kercn;\n" 1040 "if (y < nz)\n" 1041 "{\n" 1042 "__local CT smem[LOCAL_SIZE];\n" 1043 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n" 1044 "const int ind = x;\n" 1045 "#ifdef IS_1D\n" 1046 "FT scale = (FT) 1/dst_cols;\n" 1047 "#else\n" 1048 "FT scale = (FT) 1/(dst_cols*dst_rows);\n" 1049 "#endif\n" 1050 "#ifdef COMPLEX_INPUT\n" 1051 "__global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)));\n" 1052 "#pragma unroll\n" 1053 "for (int i=0; i<kercn; i++)\n" 1054 "smem[x+i*block_size] = src[i*block_size];\n" 1055 "#else\n" 1056 "__global const FT* src = (__global const FT*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(FT), src_offset)));\n" 1057 "#pragma unroll\n" 1058 "for (int i=0; i<kercn; i++)\n" 1059 "smem[x+i*block_size] = (CT)(src[i*block_size], 0.f);\n" 1060 "#endif\n" 1061 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1062 "RADIX_PROCESS;\n" 1063 "#ifdef COMPLEX_OUTPUT\n" 1064 "#ifdef NO_CONJUGATE\n" 1065 "const int cols = dst_cols/2 + 1;\n" 1066 "#else\n" 1067 "const int cols = dst_cols;\n" 1068 "#endif\n" 1069 "__global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n" 1070 "#pragma unroll\n" 1071 "for (int i=x; i<cols; i+=block_size)\n" 1072 "dst[i] = SCALE_VAL(smem[i], scale);\n" 1073 "#ifdef REAL_INPUT\n" 1074 "#ifdef COMPLEX_OUTPUT\n" 1075 "#ifdef IS_1D\n" 1076 "for(int i=x+1; i < (dst_cols+1)/2; i+=block_size)\n" 1077 "{\n" 1078 "dst[dst_cols-i] = (CT)(SCALE_VAL(smem[i].x, scale), SCALE_VAL(-smem[i].y, scale));\n" 1079 "}\n" 1080 "#endif\n" 1081 "#endif\n" 1082 "#endif\n" 1083 "#else\n" 1084 "__local FT* smem_1cn = (__local FT*) smem;\n" 1085 "__global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n" 1086 "for (int i=x; i<dst_cols-1; i+=block_size)\n" 1087 "dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale);\n" 1088 "if (x == 0)\n" 1089 "dst[0] = SCALE_VAL(smem_1cn[0], scale);\n" 1090 "#endif\n" 1091 "}\n" 1092 "else\n" 1093 "{\n" 1094 "#ifdef COMPLEX_OUTPUT\n" 1095 "__global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n" 1096 "#else\n" 1097 "__global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n" 1098 "#endif\n" 1099 "#pragma unroll\n" 1100 "for (int i=x; i<dst_cols; i+=block_size)\n" 1101 "dst[i] = 0.f;\n" 1102 "}\n" 1103 "}\n" 1104 "__kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n" 1105 "__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" 1106 "__global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)\n" 1107 "{\n" 1108 "const int x = get_group_id(0);\n" 1109 "const int y = get_global_id(1);\n" 1110 "if (x < nz)\n" 1111 "{\n" 1112 "__local CT smem[LOCAL_SIZE];\n" 1113 "__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset));\n" 1114 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n" 1115 "const int ind = y;\n" 1116 "const int block_size = LOCAL_SIZE/kercn;\n" 1117 "FT scale = 1.f/(dst_rows*dst_cols);\n" 1118 "#pragma unroll\n" 1119 "for (int i=0; i<kercn; i++)\n" 1120 "smem[y+i*block_size] = *((__global const CT*)(src + i*block_size*src_step));\n" 1121 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1122 "RADIX_PROCESS;\n" 1123 "#ifdef COMPLEX_OUTPUT\n" 1124 "__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));\n" 1125 "#pragma unroll\n" 1126 "for (int i=0; i<kercn; i++)\n" 1127 "*((__global CT*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale);\n" 1128 "#else\n" 1129 "if (x == 0)\n" 1130 "{\n" 1131 "__local FT* smem_1cn = (__local FT*) smem;\n" 1132 "__global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);\n" 1133 "for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)\n" 1134 "*((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale);\n" 1135 "if (y == 0)\n" 1136 "*((__global FT*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale);\n" 1137 "}\n" 1138 "else if (x == (dst_cols+1)/2)\n" 1139 "{\n" 1140 "__local FT* smem_1cn = (__local FT*) smem;\n" 1141 "__global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), mad24(y+1, dst_step, dst_offset));\n" 1142 "for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)\n" 1143 "*((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale);\n" 1144 "if (y == 0)\n" 1145 "*((__global FT*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), dst_offset))) = SCALE_VAL(smem_1cn[0], scale);\n" 1146 "}\n" 1147 "else\n" 1148 "{\n" 1149 "__global uchar* dst = dst_ptr + mad24(x, (int)sizeof(FT)*2, mad24(y, dst_step, dst_offset - (int)sizeof(FT)));\n" 1150 "#pragma unroll\n" 1151 "for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)\n" 1152 "vstore2(SCALE_VAL(smem[i], scale), 0, (__global FT*) dst);\n" 1153 "}\n" 1154 "#endif\n" 1155 "}\n" 1156 "}\n" 1157 "__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n" 1158 "__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" 1159 "__global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)\n" 1160 "{\n" 1161 "const int x = get_global_id(0);\n" 1162 "const int y = get_group_id(1);\n" 1163 "const int block_size = LOCAL_SIZE/kercn;\n" 1164 "#ifdef IS_1D\n" 1165 "const FT scale = (FT) 1/dst_cols;\n" 1166 "#else\n" 1167 "const FT scale = (FT) 1/(dst_cols*dst_rows);\n" 1168 "#endif\n" 1169 "if (y < nz)\n" 1170 "{\n" 1171 "__local CT smem[LOCAL_SIZE];\n" 1172 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n" 1173 "const int ind = x;\n" 1174 "#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)\n" 1175 "__global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)));\n" 1176 "#pragma unroll\n" 1177 "for (int i=0; i<kercn; i++)\n" 1178 "{\n" 1179 "smem[x+i*block_size].x = src[i*block_size].x;\n" 1180 "smem[x+i*block_size].y = -src[i*block_size].y;\n" 1181 "}\n" 1182 "#else\n" 1183 "#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)\n" 1184 "__global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(FT), src_offset)));\n" 1185 "#pragma unroll\n" 1186 "for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)\n" 1187 "{\n" 1188 "smem[i+1].x = src[i].x;\n" 1189 "smem[i+1].y = -src[i].y;\n" 1190 "smem[LOCAL_SIZE-i-1] = src[i];\n" 1191 "}\n" 1192 "#else\n" 1193 "#pragma unroll\n" 1194 "for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)\n" 1195 "{\n" 1196 "CT src = vload2(0, (__global const FT*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(FT), src_offset))));\n" 1197 "smem[i+1].x = src.x;\n" 1198 "smem[i+1].y = -src.y;\n" 1199 "smem[LOCAL_SIZE-i-1] = src;\n" 1200 "}\n" 1201 "#endif\n" 1202 "if (x==0)\n" 1203 "{\n" 1204 "smem[0].x = *(__global const FT*)(src_ptr + mad24(y, src_step, src_offset));\n" 1205 "smem[0].y = 0.f;\n" 1206 "if(LOCAL_SIZE % 2 ==0)\n" 1207 "{\n" 1208 "#if !defined(REAL_INPUT) && defined(NO_CONJUGATE)\n" 1209 "smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;\n" 1210 "#else\n" 1211 "smem[LOCAL_SIZE/2].x = *(__global const FT*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(FT), src_offset)));\n" 1212 "#endif\n" 1213 "smem[LOCAL_SIZE/2].y = 0.f;\n" 1214 "}\n" 1215 "}\n" 1216 "#endif\n" 1217 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1218 "RADIX_PROCESS;\n" 1219 "#ifdef COMPLEX_OUTPUT\n" 1220 "__global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)));\n" 1221 "#pragma unroll\n" 1222 "for (int i=0; i<kercn; i++)\n" 1223 "{\n" 1224 "dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale);\n" 1225 "dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale);\n" 1226 "}\n" 1227 "#else\n" 1228 "__global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(FT)), dst_offset)));\n" 1229 "#pragma unroll\n" 1230 "for (int i=0; i<kercn; i++)\n" 1231 "{\n" 1232 "dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale);\n" 1233 "}\n" 1234 "#endif\n" 1235 "}\n" 1236 "else\n" 1237 "{\n" 1238 "#ifdef COMPLEX_OUTPUT\n" 1239 "__global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n" 1240 "#else\n" 1241 "__global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));\n" 1242 "#endif\n" 1243 "#pragma unroll\n" 1244 "for (int i=x; i<dst_cols; i+=block_size)\n" 1245 "dst[i] = 0.f;\n" 1246 "}\n" 1247 "}\n" 1248 "__kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n" 1249 "__global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" 1250 "__global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)\n" 1251 "{\n" 1252 "const int x = get_group_id(0);\n" 1253 "const int y = get_global_id(1);\n" 1254 "#ifdef COMPLEX_INPUT\n" 1255 "if (x < nz)\n" 1256 "{\n" 1257 "__local CT smem[LOCAL_SIZE];\n" 1258 "__global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset));\n" 1259 "__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));\n" 1260 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n" 1261 "const int ind = y;\n" 1262 "const int block_size = LOCAL_SIZE/kercn;\n" 1263 "#pragma unroll\n" 1264 "for (int i=0; i<kercn; i++)\n" 1265 "{\n" 1266 "CT temp = *((__global const CT*)(src + i*block_size*src_step));\n" 1267 "smem[y+i*block_size].x = temp.x;\n" 1268 "smem[y+i*block_size].y = -temp.y;\n" 1269 "}\n" 1270 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1271 "RADIX_PROCESS;\n" 1272 "#pragma unroll\n" 1273 "for (int i=0; i<kercn; i++)\n" 1274 "{\n" 1275 "__global CT* res = (__global CT*)(dst + i*block_size*dst_step);\n" 1276 "res[0].x = smem[y + i*block_size].x;\n" 1277 "res[0].y = -smem[y + i*block_size].y;\n" 1278 "}\n" 1279 "}\n" 1280 "#else\n" 1281 "if (x < nz)\n" 1282 "{\n" 1283 "__global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);\n" 1284 "const int ind = y;\n" 1285 "const int block_size = LOCAL_SIZE/kercn;\n" 1286 "__local CT smem[LOCAL_SIZE];\n" 1287 "#ifdef EVEN\n" 1288 "if (x!=0 && (x!=(nz-1)))\n" 1289 "#else\n" 1290 "if (x!=0)\n" 1291 "#endif\n" 1292 "{\n" 1293 "__global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(FT), src_offset));\n" 1294 "#pragma unroll\n" 1295 "for (int i=0; i<kercn; i++)\n" 1296 "{\n" 1297 "CT temp = vload2(0, (__global const FT*)(src + i*block_size*src_step));\n" 1298 "smem[y+i*block_size].x = temp.x;\n" 1299 "smem[y+i*block_size].y = -temp.y;\n" 1300 "}\n" 1301 "}\n" 1302 "else\n" 1303 "{\n" 1304 "int ind = x==0 ? 0: 2*x-1;\n" 1305 "__global const FT* src = (__global const FT*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(FT), src_offset)));\n" 1306 "int step = src_step/(int)sizeof(FT);\n" 1307 "#pragma unroll\n" 1308 "for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size)\n" 1309 "{\n" 1310 "smem[i+1].x = src[2*i*step];\n" 1311 "smem[i+1].y = -src[(2*i+1)*step];\n" 1312 "smem[LOCAL_SIZE-i-1].x = src[2*i*step];;\n" 1313 "smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step];\n" 1314 "}\n" 1315 "if (y==0)\n" 1316 "{\n" 1317 "smem[0].x = *(__global const FT*)(src_ptr + mad24(ind, (int)sizeof(FT), src_offset));\n" 1318 "smem[0].y = 0.f;\n" 1319 "if(LOCAL_SIZE % 2 ==0)\n" 1320 "{\n" 1321 "smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step];\n" 1322 "smem[LOCAL_SIZE/2].y = 0.f;\n" 1323 "}\n" 1324 "}\n" 1325 "}\n" 1326 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1327 "RADIX_PROCESS;\n" 1328 "__global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));\n" 1329 "#pragma unroll\n" 1330 "for (int i=0; i<kercn; i++)\n" 1331 "{\n" 1332 "__global CT* res = (__global CT*)(dst + i*block_size*dst_step);\n" 1333 "res[0].x = smem[y + i*block_size].x;\n" 1334 "res[0].y = -smem[y + i*block_size].y;\n" 1335 "}\n" 1336 "}\n" 1337 "#endif\n" 1338 "}\n" 1339 , "3d61276324844b5841beb34e245f270a"}; 1340 ProgramSource fft_oclsrc(fft.programStr); 1341 const struct ProgramEntry flip={"flip", 1342 "#if kercn != 3\n" 1343 "#define loadpix(addr) *(__global const T *)(addr)\n" 1344 "#define storepix(val, addr) *(__global T *)(addr) = val\n" 1345 "#define TSIZE (int)sizeof(T)\n" 1346 "#else\n" 1347 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n" 1348 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n" 1349 "#define TSIZE ((int)sizeof(T1)*3)\n" 1350 "#endif\n" 1351 "__kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int src_offset,\n" 1352 "__global uchar * dstptr, int dst_step, int dst_offset,\n" 1353 "int rows, int cols, int thread_rows, int thread_cols)\n" 1354 "{\n" 1355 "int x = get_global_id(0);\n" 1356 "int y0 = get_global_id(1) * PIX_PER_WI_Y;\n" 1357 "if (x < cols)\n" 1358 "{\n" 1359 "int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));\n" 1360 "int src_index1 = mad24(rows - y0 - 1, src_step, mad24(x, TSIZE, src_offset));\n" 1361 "int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));\n" 1362 "int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(x, TSIZE, dst_offset));\n" 1363 "#pragma unroll\n" 1364 "for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)\n" 1365 "{\n" 1366 "T src0 = loadpix(srcptr + src_index0);\n" 1367 "T src1 = loadpix(srcptr + src_index1);\n" 1368 "storepix(src1, dstptr + dst_index0);\n" 1369 "storepix(src0, dstptr + dst_index1);\n" 1370 "src_index0 += src_step;\n" 1371 "src_index1 -= src_step;\n" 1372 "dst_index0 += dst_step;\n" 1373 "dst_index1 -= dst_step;\n" 1374 "}\n" 1375 "}\n" 1376 "}\n" 1377 "__kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step, int src_offset,\n" 1378 "__global uchar * dstptr, int dst_step, int dst_offset,\n" 1379 "int rows, int cols, int thread_rows, int thread_cols)\n" 1380 "{\n" 1381 "int x = get_global_id(0);\n" 1382 "int y0 = get_global_id(1)*PIX_PER_WI_Y;\n" 1383 "if (x < cols)\n" 1384 "{\n" 1385 "int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));\n" 1386 "int src_index1 = mad24(rows - y0 - 1, src_step, mad24(cols - x - 1, TSIZE, src_offset));\n" 1387 "int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));\n" 1388 "int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));\n" 1389 "#pragma unroll\n" 1390 "for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)\n" 1391 "{\n" 1392 "T src0 = loadpix(srcptr + src_index0);\n" 1393 "T src1 = loadpix(srcptr + src_index1);\n" 1394 "#if kercn == 2\n" 1395 "#if cn == 1\n" 1396 "src0 = src0.s10;\n" 1397 "src1 = src1.s10;\n" 1398 "#endif\n" 1399 "#elif kercn == 4\n" 1400 "#if cn == 1\n" 1401 "src0 = src0.s3210;\n" 1402 "src1 = src1.s3210;\n" 1403 "#elif cn == 2\n" 1404 "src0 = src0.s2301;\n" 1405 "src1 = src1.s2301;\n" 1406 "#endif\n" 1407 "#endif\n" 1408 "storepix(src1, dstptr + dst_index0);\n" 1409 "storepix(src0, dstptr + dst_index1);\n" 1410 "src_index0 += src_step;\n" 1411 "src_index1 -= src_step;\n" 1412 "dst_index0 += dst_step;\n" 1413 "dst_index1 -= dst_step;\n" 1414 "}\n" 1415 "}\n" 1416 "}\n" 1417 "__kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int src_offset,\n" 1418 "__global uchar * dstptr, int dst_step, int dst_offset,\n" 1419 "int rows, int cols, int thread_rows, int thread_cols)\n" 1420 "{\n" 1421 "int x = get_global_id(0);\n" 1422 "int y0 = get_global_id(1)*PIX_PER_WI_Y;\n" 1423 "if (x < thread_cols)\n" 1424 "{\n" 1425 "int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));\n" 1426 "int src_index1 = mad24(y0, src_step, mad24(cols - x - 1, TSIZE, src_offset));\n" 1427 "int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));\n" 1428 "int dst_index1 = mad24(y0, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));\n" 1429 "#pragma unroll\n" 1430 "for (int y = y0, y1 = min(rows, y0 + PIX_PER_WI_Y); y < y1; ++y)\n" 1431 "{\n" 1432 "T src0 = loadpix(srcptr + src_index0);\n" 1433 "T src1 = loadpix(srcptr + src_index1);\n" 1434 "#if kercn == 2\n" 1435 "#if cn == 1\n" 1436 "src0 = src0.s10;\n" 1437 "src1 = src1.s10;\n" 1438 "#endif\n" 1439 "#elif kercn == 4\n" 1440 "#if cn == 1\n" 1441 "src0 = src0.s3210;\n" 1442 "src1 = src1.s3210;\n" 1443 "#elif cn == 2\n" 1444 "src0 = src0.s2301;\n" 1445 "src1 = src1.s2301;\n" 1446 "#endif\n" 1447 "#endif\n" 1448 "storepix(src1, dstptr + dst_index0);\n" 1449 "storepix(src0, dstptr + dst_index1);\n" 1450 "src_index0 += src_step;\n" 1451 "src_index1 += src_step;\n" 1452 "dst_index0 += dst_step;\n" 1453 "dst_index1 += dst_step;\n" 1454 "}\n" 1455 "}\n" 1456 "}\n" 1457 , "296714e8641b2d8359104d11b332b14b"}; 1458 ProgramSource flip_oclsrc(flip.programStr); 1459 const struct ProgramEntry gemm={"gemm", 1460 "#ifdef DOUBLE_SUPPORT\n" 1461 "#ifdef cl_amd_fp64\n" 1462 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 1463 "#elif defined (cl_khr_fp64)\n" 1464 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 1465 "#endif\n" 1466 "#endif\n" 1467 "#define TSIZE (int)sizeof(T)\n" 1468 "#define WTSIZE (int)sizeof(WT)\n" 1469 "#define IND_A mad24(y, A_step, A_offset)\n" 1470 "#define IND_B mad24(x, WTSIZE, B_offset)\n" 1471 "#define STEP_B B_step / WTSIZE\n" 1472 "#define LOCAL_SIZE_ODD (LOCAL_SIZE + 1)\n" 1473 "#if cn==2\n" 1474 "#if kercn==2\n" 1475 "#define MUL(a, b)\\\n" 1476 "{\\\n" 1477 "sum.x += fma(a.x, b.x, - a.y * b.y);\\\n" 1478 "sum.y += fma(a.x, b.y, a.y * b.x);\\\n" 1479 "}\n" 1480 "#else\n" 1481 "#define MUL(a, b)\\\n" 1482 "{\\\n" 1483 "sum.x += fma(a.x, b.x, - a.y * b.y);\\\n" 1484 "sum.y += fma(a.x, b.y, a.y * b.x);\\\n" 1485 "sum.z += fma(a.x, b.z, - a.y * b.w);\\\n" 1486 "sum.w += fma(a.x, b.w, a.y * b.z);\\\n" 1487 "}\n" 1488 "#endif\n" 1489 "#else\n" 1490 "#define MUL(a, b) sum = fma(a, b, sum);\n" 1491 "#endif\n" 1492 "__kernel void gemm(__global const uchar * A_ptr, int A_step, int A_offset,\n" 1493 "__global const uchar * B_ptr, int B_step, int B_offset,\n" 1494 "__global uchar * D_ptr, int D_step, int D_offset, int D_rows, int D_cols,\n" 1495 "int n, T1 alpha, T1 beta)\n" 1496 "{\n" 1497 "int x = get_global_id(0);\n" 1498 "int y = get_global_id(1);\n" 1499 "int lidx = get_local_id(0);\n" 1500 "int lidy = get_local_id(1);\n" 1501 "__global const T* A = (__global const T*)(A_ptr + IND_A);\n" 1502 "__global const WT* B = (__global const WT*)(B_ptr + IND_B);\n" 1503 "WT sum = (WT)(0);\n" 1504 "#if LOCAL_SIZE == 1\n" 1505 "if (x < D_cols && y < D_rows)\n" 1506 "{\n" 1507 "for (int i = 0; i < n; ++i)\n" 1508 "MUL(A[i], B[i*STEP_B]);\n" 1509 "#else\n" 1510 "__local T a_local[LOCAL_SIZE_ODD*LOCAL_SIZE];\n" 1511 "__local WT b_local[LOCAL_SIZE_ODD*LOCAL_SIZE];\n" 1512 "int reps;\n" 1513 "#if NO_MULT\n" 1514 "reps = (n + LOCAL_SIZE-1)/LOCAL_SIZE;\n" 1515 "#else\n" 1516 "reps = n/LOCAL_SIZE;\n" 1517 "#endif\n" 1518 "for (int p = 0; p < reps; ++p)\n" 1519 "{\n" 1520 "if (p * LOCAL_SIZE + lidx < n && y < D_rows)\n" 1521 "a_local[mad24(lidy, LOCAL_SIZE_ODD, lidx)] = A[mad24(p, LOCAL_SIZE, lidx)];\n" 1522 "if (p * LOCAL_SIZE + lidy < n && x < D_cols)\n" 1523 "b_local[mad24(lidy, LOCAL_SIZE_ODD, lidx)] = B[mad24(p, LOCAL_SIZE, lidy)*STEP_B];\n" 1524 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1525 "if (x < D_cols && y < D_rows)\n" 1526 "{\n" 1527 "#if NO_MULT\n" 1528 "int ie = min(LOCAL_SIZE, n - p * LOCAL_SIZE);\n" 1529 "for (int i = 0; i < ie; ++i)\n" 1530 "#else\n" 1531 "for (int i = 0; i < LOCAL_SIZE; ++i)\n" 1532 "#endif\n" 1533 "MUL(a_local[mad24(lidy, LOCAL_SIZE_ODD, i)], b_local[mad24(i, LOCAL_SIZE_ODD, lidx)]);\n" 1534 "}\n" 1535 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1536 "}\n" 1537 "if (x < D_cols && y < D_rows)\n" 1538 "{\n" 1539 "#endif\n" 1540 "__global WT* D = (__global WT*)(D_ptr + mad24(y, D_step, mad24(x, WTSIZE, D_offset)));\n" 1541 "#if HAVE_C\n" 1542 "D[0] = mad(alpha, sum, D[0]*beta);\n" 1543 "#else\n" 1544 "D[0] = alpha * sum;\n" 1545 "#endif\n" 1546 "}\n" 1547 "}\n" 1548 , "0a79f557db56fcdce22ea905e7fc899f"}; 1549 ProgramSource gemm_oclsrc(gemm.programStr); 1550 const struct ProgramEntry inrange={"inrange", 1551 "#ifdef DOUBLE_SUPPORT\n" 1552 "#ifdef cl_amd_fp64\n" 1553 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 1554 "#elif defined (cl_khr_fp64)\n" 1555 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 1556 "#endif\n" 1557 "#endif\n" 1558 "__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,\n" 1559 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" 1560 "#ifdef HAVE_SCALAR\n" 1561 "__global const srcT1 * src2, __global const srcT1 * src3,\n" 1562 "#else\n" 1563 "__global const uchar * src2ptr, int src2_step, int src2_offset,\n" 1564 "__global const uchar * src3ptr, int src3_step, int src3_offset,\n" 1565 "#endif\n" 1566 "int rowsPerWI)\n" 1567 "{\n" 1568 "int x = get_global_id(0);\n" 1569 "int y0 = get_global_id(1) * rowsPerWI;\n" 1570 "if (x < dst_cols)\n" 1571 "{\n" 1572 "int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(srcT1) * kercn, src1_offset));\n" 1573 "int dst_index = mad24(y0, dst_step, mad24(x, colsPerWI, dst_offset));\n" 1574 "#ifndef HAVE_SCALAR\n" 1575 "int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(srcT1) * kercn, src2_offset));\n" 1576 "int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(srcT1) * kercn, src3_offset));\n" 1577 "#endif\n" 1578 "for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)\n" 1579 "{\n" 1580 "#if kercn >= cn && kercn == 4 && depth <= 4 && !defined HAVE_SCALAR\n" 1581 "srcT src1 = *(__global const srcT *)(src1ptr + src1_index);\n" 1582 "srcT src2 = *(__global const srcT *)(src2ptr + src2_index);\n" 1583 "srcT src3 = *(__global const srcT *)(src3ptr + src3_index);\n" 1584 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n" 1585 "#if cn == 1\n" 1586 "dst[0] = src2 > src1 || src3 < src1 ? (dstT)(0) : (dstT)(255);\n" 1587 "#elif cn == 2\n" 1588 "dst[0] = (dstT)(src2.xy > src1.xy || src3.xy < src1.xy ||\n" 1589 "src2.zw > src1.zw || src3.zw < src1.zw ? (dstT)(0) : (dstT)(255);\n" 1590 "#elif cn == 4\n" 1591 "dst[0] = (dstT)(src2.x > src1.x || src3.x < src1.x ||\n" 1592 "src2.y > src1.y || src3.y < src1.y ||\n" 1593 "src2.z > src1.z || src3.z < src1.z ||\n" 1594 "src2.w > src1.w || src3.w < src1.w ? 0 : 255);\n" 1595 "#endif\n" 1596 "#else\n" 1597 "__global const srcT1 * src1 = (__global const srcT1 *)(src1ptr + src1_index);\n" 1598 "__global uchar * dst = dstptr + dst_index;\n" 1599 "#ifndef HAVE_SCALAR\n" 1600 "__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);\n" 1601 "__global const srcT1 * src3 = (__global const srcT1 *)(src3ptr + src3_index);\n" 1602 "#endif\n" 1603 "#pragma unroll\n" 1604 "for (int px = 0; px < colsPerWI; ++px, src1 += cn\n" 1605 "#ifndef HAVE_SCALAR\n" 1606 ", src2 += cn, src3 += cn\n" 1607 "#endif\n" 1608 ")\n" 1609 "{\n" 1610 "dst[px] = 255;\n" 1611 "for (int c = 0; c < cn; ++c)\n" 1612 "if (src2[c] > src1[c] || src3[c] < src1[c])\n" 1613 "{\n" 1614 "dst[px] = 0;\n" 1615 "break;\n" 1616 "}\n" 1617 "}\n" 1618 "#endif\n" 1619 "#ifndef HAVE_SCALAR\n" 1620 "src2_index += src2_step;\n" 1621 "src3_index += src3_step;\n" 1622 "#endif\n" 1623 "}\n" 1624 "}\n" 1625 "}\n" 1626 , "e7220f9dc5b30fc5558622a452890287"}; 1627 ProgramSource inrange_oclsrc(inrange.programStr); 1628 const struct ProgramEntry lut={"lut", 1629 "#if lcn == 1\n" 1630 "#if dcn == 4\n" 1631 "#define LUT_OP \\\n" 1632 "int idx = *(__global const int *)(srcptr + src_index); \\\n" 1633 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1634 "dst[0] = lut_l[idx & 0xff]; \\\n" 1635 "dst[1] = lut_l[(idx >> 8) & 0xff]; \\\n" 1636 "dst[2] = lut_l[(idx >> 16) & 0xff]; \\\n" 1637 "dst[3] = lut_l[(idx >> 24) & 0xff];\n" 1638 "#elif dcn == 3\n" 1639 "#define LUT_OP \\\n" 1640 "uchar3 idx = vload3(0, srcptr + src_index); \\\n" 1641 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1642 "dst[0] = lut_l[idx.x]; \\\n" 1643 "dst[1] = lut_l[idx.y]; \\\n" 1644 "dst[2] = lut_l[idx.z];\n" 1645 "#elif dcn == 2\n" 1646 "#define LUT_OP \\\n" 1647 "short idx = *(__global const short *)(srcptr + src_index); \\\n" 1648 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1649 "dst[0] = lut_l[idx & 0xff]; \\\n" 1650 "dst[1] = lut_l[(idx >> 8) & 0xff];\n" 1651 "#elif dcn == 1\n" 1652 "#define LUT_OP \\\n" 1653 "uchar idx = (srcptr + src_index)[0]; \\\n" 1654 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1655 "dst[0] = lut_l[idx];\n" 1656 "#else\n" 1657 "#define LUT_OP \\\n" 1658 "__global const srcT * src = (__global const srcT *)(srcptr + src_index); \\\n" 1659 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1660 "for (int cn = 0; cn < dcn; ++cn) \\\n" 1661 "dst[cn] = lut_l[src[cn]];\n" 1662 "#endif\n" 1663 "#else\n" 1664 "#if dcn == 4\n" 1665 "#define LUT_OP \\\n" 1666 "__global const uchar4 * src_pixel = (__global const uchar4 *)(srcptr + src_index); \\\n" 1667 "int4 idx = mad24(convert_int4(src_pixel[0]), (int4)(lcn), (int4)(0, 1, 2, 3)); \\\n" 1668 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1669 "dst[0] = lut_l[idx.x]; \\\n" 1670 "dst[1] = lut_l[idx.y]; \\\n" 1671 "dst[2] = lut_l[idx.z]; \\\n" 1672 "dst[3] = lut_l[idx.w];\n" 1673 "#elif dcn == 3\n" 1674 "#define LUT_OP \\\n" 1675 "uchar3 src_pixel = vload3(0, srcptr + src_index); \\\n" 1676 "int3 idx = mad24(convert_int3(src_pixel), (int3)(lcn), (int3)(0, 1, 2)); \\\n" 1677 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1678 "dst[0] = lut_l[idx.x]; \\\n" 1679 "dst[1] = lut_l[idx.y]; \\\n" 1680 "dst[2] = lut_l[idx.z];\n" 1681 "#elif dcn == 2\n" 1682 "#define LUT_OP \\\n" 1683 "__global const uchar2 * src_pixel = (__global const uchar2 *)(srcptr + src_index); \\\n" 1684 "int2 idx = mad24(convert_int2(src_pixel[0]), lcn, (int2)(0, 1)); \\\n" 1685 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1686 "dst[0] = lut_l[idx.x]; \\\n" 1687 "dst[1] = lut_l[idx.y];\n" 1688 "#elif dcn == 1\n" 1689 "#define LUT_OP \\\n" 1690 "uchar idx = (srcptr + src_index)[0]; \\\n" 1691 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1692 "dst[0] = lut_l[idx];\n" 1693 "#else\n" 1694 "#define LUT_OP \\\n" 1695 "__global const srcT * src = (__global const srcT *)(srcptr + src_index); \\\n" 1696 "dst = (__global dstT *)(dstptr + dst_index); \\\n" 1697 "for (int cn = 0; cn < dcn; ++cn) \\\n" 1698 "dst[cn] = lut_l[mad24(src[cn], lcn, cn)];\n" 1699 "#endif\n" 1700 "#endif\n" 1701 "__kernel void LUT(__global const uchar * srcptr, int src_step, int src_offset,\n" 1702 "__global const uchar * lutptr, int lut_step, int lut_offset,\n" 1703 "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols)\n" 1704 "{\n" 1705 "int x = get_global_id(0);\n" 1706 "int y = get_global_id(1) << 2;\n" 1707 "__local dstT lut_l[256 * lcn];\n" 1708 "__global const dstT * lut = (__global const dstT *)(lutptr + lut_offset);\n" 1709 "for (int i = mad24((int)get_local_id(1), (int)get_local_size(0), (int)get_local_id(0)),\n" 1710 "step = get_local_size(0) * get_local_size(1); i < 256 * lcn; i += step)\n" 1711 "lut_l[i] = lut[i];\n" 1712 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1713 "if (x < cols && y < rows)\n" 1714 "{\n" 1715 "int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * dcn, src_offset));\n" 1716 "int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT) * dcn, dst_offset));\n" 1717 "__global dstT * dst;\n" 1718 "LUT_OP;\n" 1719 "if (y < rows - 1)\n" 1720 "{\n" 1721 "src_index += src_step;\n" 1722 "dst_index += dst_step;\n" 1723 "LUT_OP;\n" 1724 "if (y < rows - 2)\n" 1725 "{\n" 1726 "src_index += src_step;\n" 1727 "dst_index += dst_step;\n" 1728 "LUT_OP;\n" 1729 "if (y < rows - 3)\n" 1730 "{\n" 1731 "src_index += src_step;\n" 1732 "dst_index += dst_step;\n" 1733 "LUT_OP;\n" 1734 "}\n" 1735 "}\n" 1736 "}\n" 1737 "}\n" 1738 "}\n" 1739 , "02217d060320fc126306ad16885be711"}; 1740 ProgramSource lut_oclsrc(lut.programStr); 1741 const struct ProgramEntry meanstddev={"meanstddev", 1742 "#ifdef DOUBLE_SUPPORT\n" 1743 "#ifdef cl_amd_fp64\n" 1744 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 1745 "#elif defined (cl_khr_fp64)\n" 1746 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 1747 "#endif\n" 1748 "#endif\n" 1749 "#define noconvert\n" 1750 "#if cn != 3\n" 1751 "#define loadpix(addr) *(__global const srcT *)(addr)\n" 1752 "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" 1753 "#define storesqpix(val, addr) *(__global sqdstT *)(addr) = val\n" 1754 "#define srcTSIZE (int)sizeof(srcT)\n" 1755 "#define dstTSIZE (int)sizeof(dstT)\n" 1756 "#define sqdstTSIZE (int)sizeof(sqdstT)\n" 1757 "#else\n" 1758 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" 1759 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" 1760 "#define storesqpix(val, addr) vstore3(val, 0, (__global sqdstT1 *)(addr))\n" 1761 "#define srcTSIZE ((int)sizeof(srcT1)*3)\n" 1762 "#define dstTSIZE ((int)sizeof(dstT1)*3)\n" 1763 "#define sqdstTSIZE ((int)sizeof(sqdstT1)*3)\n" 1764 "#endif\n" 1765 "__kernel void meanStdDev(__global const uchar * srcptr, int src_step, int src_offset, int cols,\n" 1766 "int total, int groups, __global uchar * dstptr\n" 1767 "#ifdef HAVE_MASK\n" 1768 ", __global const uchar * mask, int mask_step, int mask_offset\n" 1769 "#endif\n" 1770 ")\n" 1771 "{\n" 1772 "int lid = get_local_id(0);\n" 1773 "int gid = get_group_id(0);\n" 1774 "int id = get_global_id(0);\n" 1775 "__local dstT localMemSum[WGS2_ALIGNED];\n" 1776 "__local sqdstT localMemSqSum[WGS2_ALIGNED];\n" 1777 "#ifdef HAVE_MASK\n" 1778 "__local int localMemNonZero[WGS2_ALIGNED];\n" 1779 "#endif\n" 1780 "dstT accSum = (dstT)(0);\n" 1781 "sqdstT accSqSum = (sqdstT)(0);\n" 1782 "#ifdef HAVE_MASK\n" 1783 "int accNonZero = 0;\n" 1784 "mask += mask_offset;\n" 1785 "#endif\n" 1786 "srcptr += src_offset;\n" 1787 "for (int grain = groups * WGS; id < total; id += grain)\n" 1788 "{\n" 1789 "#ifdef HAVE_MASK\n" 1790 "#ifdef HAVE_MASK_CONT\n" 1791 "int mask_index = id;\n" 1792 "#else\n" 1793 "int mask_index = mad24(id / cols, mask_step, id % cols);\n" 1794 "#endif\n" 1795 "if (mask[mask_index])\n" 1796 "#endif\n" 1797 "{\n" 1798 "#ifdef HAVE_SRC_CONT\n" 1799 "int src_index = id * srcTSIZE;\n" 1800 "#else\n" 1801 "int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));\n" 1802 "#endif\n" 1803 "srcT value = loadpix(srcptr + src_index);\n" 1804 "accSum += convertToDT(value);\n" 1805 "sqdstT dvalue = convertToSDT(value);\n" 1806 "accSqSum = fma(dvalue, dvalue, accSqSum);\n" 1807 "#ifdef HAVE_MASK\n" 1808 "++accNonZero;\n" 1809 "#endif\n" 1810 "}\n" 1811 "}\n" 1812 "if (lid < WGS2_ALIGNED)\n" 1813 "{\n" 1814 "localMemSum[lid] = accSum;\n" 1815 "localMemSqSum[lid] = accSqSum;\n" 1816 "#ifdef HAVE_MASK\n" 1817 "localMemNonZero[lid] = accNonZero;\n" 1818 "#endif\n" 1819 "}\n" 1820 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1821 "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n" 1822 "{\n" 1823 "localMemSum[lid - WGS2_ALIGNED] += accSum;\n" 1824 "localMemSqSum[lid - WGS2_ALIGNED] += accSqSum;\n" 1825 "#ifdef HAVE_MASK\n" 1826 "localMemNonZero[lid - WGS2_ALIGNED] += accNonZero;\n" 1827 "#endif\n" 1828 "}\n" 1829 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1830 "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n" 1831 "{\n" 1832 "if (lid < lsize)\n" 1833 "{\n" 1834 "int lid2 = lsize + lid;\n" 1835 "localMemSum[lid] += localMemSum[lid2];\n" 1836 "localMemSqSum[lid] += localMemSqSum[lid2];\n" 1837 "#ifdef HAVE_MASK\n" 1838 "localMemNonZero[lid] += localMemNonZero[lid2];\n" 1839 "#endif\n" 1840 "}\n" 1841 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1842 "}\n" 1843 "if (lid == 0)\n" 1844 "{\n" 1845 "storepix(localMemSum[0], dstptr + dstTSIZE * gid);\n" 1846 "storesqpix(localMemSqSum[0], dstptr + mad24(dstTSIZE, groups, sqdstTSIZE * gid));\n" 1847 "#ifdef HAVE_MASK\n" 1848 "*(__global int *)(dstptr + mad24(dstTSIZE + sqdstTSIZE, groups, (int)sizeof(int) * gid)) = localMemNonZero[0];\n" 1849 "#endif\n" 1850 "}\n" 1851 "}\n" 1852 , "1284edd21da32ce135cd26c0c897bd08"}; 1853 ProgramSource meanstddev_oclsrc(meanstddev.programStr); 1854 const struct ProgramEntry minmaxloc={"minmaxloc", 1855 "#ifdef DOUBLE_SUPPORT\n" 1856 "#ifdef cl_amd_fp64\n" 1857 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 1858 "#elif defined (cl_khr_fp64)\n" 1859 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 1860 "#endif\n" 1861 "#endif\n" 1862 "#ifdef DEPTH_0\n" 1863 "#define MIN_VAL 0\n" 1864 "#define MAX_VAL UCHAR_MAX\n" 1865 "#elif defined DEPTH_1\n" 1866 "#define MIN_VAL SCHAR_MIN\n" 1867 "#define MAX_VAL SCHAR_MAX\n" 1868 "#elif defined DEPTH_2\n" 1869 "#define MIN_VAL 0\n" 1870 "#define MAX_VAL USHRT_MAX\n" 1871 "#elif defined DEPTH_3\n" 1872 "#define MIN_VAL SHRT_MIN\n" 1873 "#define MAX_VAL SHRT_MAX\n" 1874 "#elif defined DEPTH_4\n" 1875 "#define MIN_VAL INT_MIN\n" 1876 "#define MAX_VAL INT_MAX\n" 1877 "#elif defined DEPTH_5\n" 1878 "#define MIN_VAL (-FLT_MAX)\n" 1879 "#define MAX_VAL FLT_MAX\n" 1880 "#elif defined DEPTH_6\n" 1881 "#define MIN_VAL (-DBL_MAX)\n" 1882 "#define MAX_VAL DBL_MAX\n" 1883 "#endif\n" 1884 "#define noconvert\n" 1885 "#define INDEX_MAX UINT_MAX\n" 1886 "#if wdepth <= 4\n" 1887 "#define MIN_ABS(a) convertFromU(abs(a))\n" 1888 "#define MIN_ABS2(a, b) convertFromU(abs_diff(a, b))\n" 1889 "#define MIN(a, b) min(a, b)\n" 1890 "#define MAX(a, b) max(a, b)\n" 1891 "#else\n" 1892 "#define MIN_ABS(a) fabs(a)\n" 1893 "#define MIN_ABS2(a, b) fabs(a - b)\n" 1894 "#define MIN(a, b) fmin(a, b)\n" 1895 "#define MAX(a, b) fmax(a, b)\n" 1896 "#endif\n" 1897 "#if kercn != 3\n" 1898 "#define loadpix(addr) *(__global const srcT *)(addr)\n" 1899 "#define srcTSIZE (int)sizeof(srcT)\n" 1900 "#else\n" 1901 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" 1902 "#define srcTSIZE ((int)sizeof(srcT1) * 3)\n" 1903 "#endif\n" 1904 "#ifndef HAVE_MASK\n" 1905 "#undef srcTSIZE\n" 1906 "#define srcTSIZE (int)sizeof(srcT1)\n" 1907 "#endif\n" 1908 "#ifdef NEED_MINVAL\n" 1909 "#ifdef NEED_MINLOC\n" 1910 "#define CALC_MIN(p, inc) \\\n" 1911 "if (minval > temp.p) \\\n" 1912 "{ \\\n" 1913 "minval = temp.p; \\\n" 1914 "minloc = id + inc; \\\n" 1915 "}\n" 1916 "#else\n" 1917 "#define CALC_MIN(p, inc) \\\n" 1918 "minval = MIN(minval, temp.p);\n" 1919 "#endif\n" 1920 "#else\n" 1921 "#define CALC_MIN(p, inc)\n" 1922 "#endif\n" 1923 "#ifdef NEED_MAXVAL\n" 1924 "#ifdef NEED_MAXLOC\n" 1925 "#define CALC_MAX(p, inc) \\\n" 1926 "if (maxval < temp.p) \\\n" 1927 "{ \\\n" 1928 "maxval = temp.p; \\\n" 1929 "maxloc = id + inc; \\\n" 1930 "}\n" 1931 "#else\n" 1932 "#define CALC_MAX(p, inc) \\\n" 1933 "maxval = MAX(maxval, temp.p);\n" 1934 "#endif\n" 1935 "#else\n" 1936 "#define CALC_MAX(p, inc)\n" 1937 "#endif\n" 1938 "#ifdef OP_CALC2\n" 1939 "#define CALC_MAX2(p) \\\n" 1940 "maxval2 = MAX(maxval2, temp2.p);\n" 1941 "#else\n" 1942 "#define CALC_MAX2(p)\n" 1943 "#endif\n" 1944 "#define CALC_P(p, inc) \\\n" 1945 "CALC_MIN(p, inc) \\\n" 1946 "CALC_MAX(p, inc) \\\n" 1947 "CALC_MAX2(p)\n" 1948 "__kernel void minmaxloc(__global const uchar * srcptr, int src_step, int src_offset, int cols,\n" 1949 "int total, int groupnum, __global uchar * dstptr\n" 1950 "#ifdef HAVE_MASK\n" 1951 ", __global const uchar * mask, int mask_step, int mask_offset\n" 1952 "#endif\n" 1953 "#ifdef HAVE_SRC2\n" 1954 ", __global const uchar * src2ptr, int src2_step, int src2_offset\n" 1955 "#endif\n" 1956 ")\n" 1957 "{\n" 1958 "int lid = get_local_id(0);\n" 1959 "int gid = get_group_id(0);\n" 1960 "int id = get_global_id(0)\n" 1961 "#ifndef HAVE_MASK\n" 1962 "* kercn;\n" 1963 "#else\n" 1964 ";\n" 1965 "#endif\n" 1966 "srcptr += src_offset;\n" 1967 "#ifdef HAVE_MASK\n" 1968 "mask += mask_offset;\n" 1969 "#endif\n" 1970 "#ifdef HAVE_SRC2\n" 1971 "src2ptr += src2_offset;\n" 1972 "#endif\n" 1973 "#ifdef NEED_MINVAL\n" 1974 "__local dstT1 localmem_min[WGS2_ALIGNED];\n" 1975 "dstT1 minval = MAX_VAL;\n" 1976 "#ifdef NEED_MINLOC\n" 1977 "__local uint localmem_minloc[WGS2_ALIGNED];\n" 1978 "uint minloc = INDEX_MAX;\n" 1979 "#endif\n" 1980 "#endif\n" 1981 "#ifdef NEED_MAXVAL\n" 1982 "dstT1 maxval = MIN_VAL;\n" 1983 "__local dstT1 localmem_max[WGS2_ALIGNED];\n" 1984 "#ifdef NEED_MAXLOC\n" 1985 "__local uint localmem_maxloc[WGS2_ALIGNED];\n" 1986 "uint maxloc = INDEX_MAX;\n" 1987 "#endif\n" 1988 "#endif\n" 1989 "#ifdef OP_CALC2\n" 1990 "__local dstT1 localmem_max2[WGS2_ALIGNED];\n" 1991 "dstT1 maxval2 = MIN_VAL;\n" 1992 "#endif\n" 1993 "int src_index;\n" 1994 "#ifdef HAVE_MASK\n" 1995 "int mask_index;\n" 1996 "#endif\n" 1997 "#ifdef HAVE_SRC2\n" 1998 "int src2_index;\n" 1999 "#endif\n" 2000 "dstT temp;\n" 2001 "#ifdef HAVE_SRC2\n" 2002 "dstT temp2;\n" 2003 "#endif\n" 2004 "for (int grain = groupnum * WGS\n" 2005 "#ifndef HAVE_MASK\n" 2006 "* kercn\n" 2007 "#endif\n" 2008 "; id < total; id += grain)\n" 2009 "{\n" 2010 "#ifdef HAVE_MASK\n" 2011 "#ifdef HAVE_MASK_CONT\n" 2012 "mask_index = id;\n" 2013 "#else\n" 2014 "mask_index = mad24(id / cols, mask_step, id % cols);\n" 2015 "#endif\n" 2016 "if (mask[mask_index])\n" 2017 "#endif\n" 2018 "{\n" 2019 "#ifdef HAVE_SRC_CONT\n" 2020 "src_index = id * srcTSIZE;\n" 2021 "#else\n" 2022 "src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));\n" 2023 "#endif\n" 2024 "temp = convertToDT(loadpix(srcptr + src_index));\n" 2025 "#ifdef OP_ABS\n" 2026 "temp = MIN_ABS(temp);\n" 2027 "#endif\n" 2028 "#ifdef HAVE_SRC2\n" 2029 "#ifdef HAVE_SRC2_CONT\n" 2030 "src2_index = id * srcTSIZE;\n" 2031 "#else\n" 2032 "src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));\n" 2033 "#endif\n" 2034 "temp2 = convertToDT(loadpix(src2ptr + src2_index));\n" 2035 "temp = MIN_ABS2(temp, temp2);\n" 2036 "#ifdef OP_CALC2\n" 2037 "temp2 = MIN_ABS(temp2);\n" 2038 "#endif\n" 2039 "#endif\n" 2040 "#if kercn == 1\n" 2041 "#ifdef NEED_MINVAL\n" 2042 "#ifdef NEED_MINLOC\n" 2043 "if (minval > temp)\n" 2044 "{\n" 2045 "minval = temp;\n" 2046 "minloc = id;\n" 2047 "}\n" 2048 "#else\n" 2049 "minval = MIN(minval, temp);\n" 2050 "#endif\n" 2051 "#endif\n" 2052 "#ifdef NEED_MAXVAL\n" 2053 "#ifdef NEED_MAXLOC\n" 2054 "if (maxval < temp)\n" 2055 "{\n" 2056 "maxval = temp;\n" 2057 "maxloc = id;\n" 2058 "}\n" 2059 "#else\n" 2060 "maxval = MAX(maxval, temp);\n" 2061 "#endif\n" 2062 "#ifdef OP_CALC2\n" 2063 "maxval2 = MAX(maxval2, temp2);\n" 2064 "#endif\n" 2065 "#endif\n" 2066 "#elif kercn >= 2\n" 2067 "CALC_P(s0, 0)\n" 2068 "CALC_P(s1, 1)\n" 2069 "#if kercn >= 3\n" 2070 "CALC_P(s2, 2)\n" 2071 "#if kercn >= 4\n" 2072 "CALC_P(s3, 3)\n" 2073 "#if kercn >= 8\n" 2074 "CALC_P(s4, 4)\n" 2075 "CALC_P(s5, 5)\n" 2076 "CALC_P(s6, 6)\n" 2077 "CALC_P(s7, 7)\n" 2078 "#if kercn == 16\n" 2079 "CALC_P(s8, 8)\n" 2080 "CALC_P(s9, 9)\n" 2081 "CALC_P(sA, 10)\n" 2082 "CALC_P(sB, 11)\n" 2083 "CALC_P(sC, 12)\n" 2084 "CALC_P(sD, 13)\n" 2085 "CALC_P(sE, 14)\n" 2086 "CALC_P(sF, 15)\n" 2087 "#endif\n" 2088 "#endif\n" 2089 "#endif\n" 2090 "#endif\n" 2091 "#endif\n" 2092 "}\n" 2093 "}\n" 2094 "if (lid < WGS2_ALIGNED)\n" 2095 "{\n" 2096 "#ifdef NEED_MINVAL\n" 2097 "localmem_min[lid] = minval;\n" 2098 "#endif\n" 2099 "#ifdef NEED_MAXVAL\n" 2100 "localmem_max[lid] = maxval;\n" 2101 "#endif\n" 2102 "#ifdef NEED_MINLOC\n" 2103 "localmem_minloc[lid] = minloc;\n" 2104 "#endif\n" 2105 "#ifdef NEED_MAXLOC\n" 2106 "localmem_maxloc[lid] = maxloc;\n" 2107 "#endif\n" 2108 "#ifdef OP_CALC2\n" 2109 "localmem_max2[lid] = maxval2;\n" 2110 "#endif\n" 2111 "}\n" 2112 "barrier(CLK_LOCAL_MEM_FENCE);\n" 2113 "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n" 2114 "{\n" 2115 "int lid3 = lid - WGS2_ALIGNED;\n" 2116 "#ifdef NEED_MINVAL\n" 2117 "#ifdef NEED_MINLOC\n" 2118 "if (localmem_min[lid3] >= minval)\n" 2119 "{\n" 2120 "if (localmem_min[lid3] == minval)\n" 2121 "localmem_minloc[lid3] = min(localmem_minloc[lid3], minloc);\n" 2122 "else\n" 2123 "localmem_minloc[lid3] = minloc,\n" 2124 "localmem_min[lid3] = minval;\n" 2125 "}\n" 2126 "#else\n" 2127 "localmem_min[lid3] = MIN(localmem_min[lid3], minval);\n" 2128 "#endif\n" 2129 "#endif\n" 2130 "#ifdef NEED_MAXVAL\n" 2131 "#ifdef NEED_MAXLOC\n" 2132 "if (localmem_max[lid3] <= maxval)\n" 2133 "{\n" 2134 "if (localmem_max[lid3] == maxval)\n" 2135 "localmem_maxloc[lid3] = min(localmem_maxloc[lid3], maxloc);\n" 2136 "else\n" 2137 "localmem_maxloc[lid3] = maxloc,\n" 2138 "localmem_max[lid3] = maxval;\n" 2139 "}\n" 2140 "#else\n" 2141 "localmem_max[lid3] = MAX(localmem_max[lid3], maxval);\n" 2142 "#endif\n" 2143 "#endif\n" 2144 "#ifdef OP_CALC2\n" 2145 "localmem_max2[lid3] = MAX(localmem_max2[lid3], maxval2);\n" 2146 "#endif\n" 2147 "}\n" 2148 "barrier(CLK_LOCAL_MEM_FENCE);\n" 2149 "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n" 2150 "{\n" 2151 "if (lid < lsize)\n" 2152 "{\n" 2153 "int lid2 = lsize + lid;\n" 2154 "#ifdef NEED_MINVAL\n" 2155 "#ifdef NEED_MINLOC\n" 2156 "if (localmem_min[lid] >= localmem_min[lid2])\n" 2157 "{\n" 2158 "if (localmem_min[lid] == localmem_min[lid2])\n" 2159 "localmem_minloc[lid] = min(localmem_minloc[lid2], localmem_minloc[lid]);\n" 2160 "else\n" 2161 "localmem_minloc[lid] = localmem_minloc[lid2],\n" 2162 "localmem_min[lid] = localmem_min[lid2];\n" 2163 "}\n" 2164 "#else\n" 2165 "localmem_min[lid] = MIN(localmem_min[lid], localmem_min[lid2]);\n" 2166 "#endif\n" 2167 "#endif\n" 2168 "#ifdef NEED_MAXVAL\n" 2169 "#ifdef NEED_MAXLOC\n" 2170 "if (localmem_max[lid] <= localmem_max[lid2])\n" 2171 "{\n" 2172 "if (localmem_max[lid] == localmem_max[lid2])\n" 2173 "localmem_maxloc[lid] = min(localmem_maxloc[lid2], localmem_maxloc[lid]);\n" 2174 "else\n" 2175 "localmem_maxloc[lid] = localmem_maxloc[lid2],\n" 2176 "localmem_max[lid] = localmem_max[lid2];\n" 2177 "}\n" 2178 "#else\n" 2179 "localmem_max[lid] = MAX(localmem_max[lid], localmem_max[lid2]);\n" 2180 "#endif\n" 2181 "#endif\n" 2182 "#ifdef OP_CALC2\n" 2183 "localmem_max2[lid] = MAX(localmem_max2[lid], localmem_max2[lid2]);\n" 2184 "#endif\n" 2185 "}\n" 2186 "barrier(CLK_LOCAL_MEM_FENCE);\n" 2187 "}\n" 2188 "if (lid == 0)\n" 2189 "{\n" 2190 "int pos = 0;\n" 2191 "#ifdef NEED_MINVAL\n" 2192 "*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_min[0];\n" 2193 "pos = mad24(groupnum, (int)sizeof(dstT1), pos);\n" 2194 "#endif\n" 2195 "#ifdef NEED_MAXVAL\n" 2196 "*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max[0];\n" 2197 "pos = mad24(groupnum, (int)sizeof(dstT1), pos);\n" 2198 "#endif\n" 2199 "#ifdef NEED_MINLOC\n" 2200 "*(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_minloc[0];\n" 2201 "pos = mad24(groupnum, (int)sizeof(uint), pos);\n" 2202 "#endif\n" 2203 "#ifdef NEED_MAXLOC\n" 2204 "*(__global uint *)(dstptr + mad24(gid, (int)sizeof(uint), pos)) = localmem_maxloc[0];\n" 2205 "#ifdef OP_CALC2\n" 2206 "pos = mad24(groupnum, (int)sizeof(uint), pos);\n" 2207 "#endif\n" 2208 "#endif\n" 2209 "#ifdef OP_CALC2\n" 2210 "*(__global dstT1 *)(dstptr + mad24(gid, (int)sizeof(dstT1), pos)) = localmem_max2[0];\n" 2211 "#endif\n" 2212 "}\n" 2213 "}\n" 2214 , "c2741330ed8390675ed210edd54f5258"}; 2215 ProgramSource minmaxloc_oclsrc(minmaxloc.programStr); 2216 const struct ProgramEntry mixchannels={"mixchannels", 2217 "#define DECLARE_INPUT_MAT(i) \\\n" 2218 "__global const uchar * src##i##ptr, int src##i##_step, int src##i##_offset,\n" 2219 "#define DECLARE_OUTPUT_MAT(i) \\\n" 2220 "__global uchar * dst##i##ptr, int dst##i##_step, int dst##i##_offset,\n" 2221 "#define DECLARE_INDEX(i) \\\n" 2222 "int src##i##_index = mad24(src##i##_step, y0, mad24(x, (int)sizeof(T) * scn##i, src##i##_offset)); \\\n" 2223 "int dst##i##_index = mad24(dst##i##_step, y0, mad24(x, (int)sizeof(T) * dcn##i, dst##i##_offset));\n" 2224 "#define PROCESS_ELEM(i) \\\n" 2225 "__global const T * src##i = (__global const T *)(src##i##ptr + src##i##_index); \\\n" 2226 "__global T * dst##i = (__global T *)(dst##i##ptr + dst##i##_index); \\\n" 2227 "dst##i[0] = src##i[0]; \\\n" 2228 "src##i##_index += src##i##_step; \\\n" 2229 "dst##i##_index += dst##i##_step;\n" 2230 "__kernel void mixChannels(DECLARE_INPUT_MAT_N DECLARE_OUTPUT_MAT_N int rows, int cols, int rowsPerWI)\n" 2231 "{\n" 2232 "int x = get_global_id(0);\n" 2233 "int y0 = get_global_id(1) * rowsPerWI;\n" 2234 "if (x < cols)\n" 2235 "{\n" 2236 "DECLARE_INDEX_N\n" 2237 "for (int y = y0, y1 = min(y0 + rowsPerWI, rows); y < y1; ++y)\n" 2238 "{\n" 2239 "PROCESS_ELEM_N\n" 2240 "}\n" 2241 "}\n" 2242 "}\n" 2243 , "26a27b81c3e2524a8eb918b3a518da0a"}; 2244 ProgramSource mixchannels_oclsrc(mixchannels.programStr); 2245 const struct ProgramEntry mulspectrums={"mulspectrums", 2246 "inline float2 cmulf(float2 a, float2 b)\n" 2247 "{\n" 2248 "return (float2)(mad(a.x, b.x, - a.y * b.y), mad(a.x, b.y, a.y * b.x));\n" 2249 "}\n" 2250 "inline float2 conjf(float2 a)\n" 2251 "{\n" 2252 "return (float2)(a.x, - a.y);\n" 2253 "}\n" 2254 "__kernel void mulAndScaleSpectrums(__global const uchar * src1ptr, int src1_step, int src1_offset,\n" 2255 "__global const uchar * src2ptr, int src2_step, int src2_offset,\n" 2256 "__global uchar * dstptr, int dst_step, int dst_offset,\n" 2257 "int dst_rows, int dst_cols, int rowsPerWI)\n" 2258 "{\n" 2259 "int x = get_global_id(0);\n" 2260 "int y0 = get_global_id(1) * rowsPerWI;\n" 2261 "if (x < dst_cols)\n" 2262 "{\n" 2263 "int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(float2), src1_offset));\n" 2264 "int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(float2), src2_offset));\n" 2265 "int dst_index = mad24(y0, dst_step, mad24(x, (int)sizeof(float2), dst_offset));\n" 2266 "for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y,\n" 2267 "src1_index += src1_step, src2_index += src2_step, dst_index += dst_step)\n" 2268 "{\n" 2269 "float2 src0 = *(__global const float2 *)(src1ptr + src1_index);\n" 2270 "float2 src1 = *(__global const float2 *)(src2ptr + src2_index);\n" 2271 "__global float2 * dst = (__global float2 *)(dstptr + dst_index);\n" 2272 "#ifdef CONJ\n" 2273 "float2 v = cmulf(src0, conjf(src1));\n" 2274 "#else\n" 2275 "float2 v = cmulf(src0, src1);\n" 2276 "#endif\n" 2277 "dst[0] = v;\n" 2278 "}\n" 2279 "}\n" 2280 "}\n" 2281 , "0ffb2c858f6664aa3e56efb81f025f5c"}; 2282 ProgramSource mulspectrums_oclsrc(mulspectrums.programStr); 2283 const struct ProgramEntry normalize={"normalize", 2284 "#ifdef DOUBLE_SUPPORT\n" 2285 "#ifdef cl_amd_fp64\n" 2286 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 2287 "#elif defined (cl_khr_fp64)\n" 2288 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 2289 "#endif\n" 2290 "#endif\n" 2291 "#define noconvert\n" 2292 "#if cn != 3\n" 2293 "#define loadpix(addr) *(__global const srcT *)(addr)\n" 2294 "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" 2295 "#define srcTSIZE (int)sizeof(srcT)\n" 2296 "#define dstTSIZE (int)sizeof(dstT)\n" 2297 "#else\n" 2298 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" 2299 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" 2300 "#define srcTSIZE ((int)sizeof(srcT1)*3)\n" 2301 "#define dstTSIZE ((int)sizeof(dstT1)*3)\n" 2302 "#endif\n" 2303 "__kernel void normalizek(__global const uchar * srcptr, int src_step, int src_offset,\n" 2304 "__global const uchar * mask, int mask_step, int mask_offset,\n" 2305 "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols\n" 2306 "#ifdef HAVE_SCALE\n" 2307 ", float scale\n" 2308 "#endif\n" 2309 "#ifdef HAVE_DELTA\n" 2310 ", float delta\n" 2311 "#endif\n" 2312 ")\n" 2313 "{\n" 2314 "int x = get_global_id(0);\n" 2315 "int y0 = get_global_id(1) * rowsPerWI;\n" 2316 "if (x < dst_cols)\n" 2317 "{\n" 2318 "int src_index = mad24(y0, src_step, mad24(x, srcTSIZE, src_offset));\n" 2319 "int mask_index = mad24(y0, mask_step, x + mask_offset);\n" 2320 "int dst_index = mad24(y0, dst_step, mad24(x, dstTSIZE, dst_offset));\n" 2321 "for (int y = y0, y1 = min(y0 + rowsPerWI, dst_rows); y < y1;\n" 2322 "++y, src_index += src_step, dst_index += dst_step, mask_index += mask_step)\n" 2323 "{\n" 2324 "if (mask[mask_index])\n" 2325 "{\n" 2326 "workT value = convertToWT(loadpix(srcptr + src_index));\n" 2327 "#ifdef HAVE_SCALE\n" 2328 "#ifdef HAVE_DELTA\n" 2329 "value = fma(value, (workT)(scale), (workT)(delta));\n" 2330 "#else\n" 2331 "value *= (workT)(scale);\n" 2332 "#endif\n" 2333 "#else\n" 2334 "#ifdef HAVE_DELTA\n" 2335 "value += (workT)(delta);\n" 2336 "#endif\n" 2337 "#endif\n" 2338 "storepix(convertToDT(value), dstptr + dst_index);\n" 2339 "}\n" 2340 "}\n" 2341 "}\n" 2342 "}\n" 2343 , "05e23451b4bf16c50a0eba9d6c5c0012"}; 2344 ProgramSource normalize_oclsrc(normalize.programStr); 2345 const struct ProgramEntry reduce={"reduce", 2346 "#ifdef DOUBLE_SUPPORT\n" 2347 "#ifdef cl_amd_fp64\n" 2348 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 2349 "#elif defined (cl_khr_fp64)\n" 2350 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 2351 "#endif\n" 2352 "#endif\n" 2353 "#if defined OP_NORM_INF_MASK\n" 2354 "#ifdef DEPTH_0\n" 2355 "#define MIN_VAL 0\n" 2356 "#define MAX_VAL 255\n" 2357 "#elif defined DEPTH_1\n" 2358 "#define MIN_VAL -128\n" 2359 "#define MAX_VAL 127\n" 2360 "#elif defined DEPTH_2\n" 2361 "#define MIN_VAL 0\n" 2362 "#define MAX_VAL 65535\n" 2363 "#elif defined DEPTH_3\n" 2364 "#define MIN_VAL -32768\n" 2365 "#define MAX_VAL 32767\n" 2366 "#elif defined DEPTH_4\n" 2367 "#define MIN_VAL INT_MIN\n" 2368 "#define MAX_VAL INT_MAX\n" 2369 "#elif defined DEPTH_5\n" 2370 "#define MIN_VAL (-FLT_MAX)\n" 2371 "#define MAX_VAL FLT_MAX\n" 2372 "#elif defined DEPTH_6\n" 2373 "#define MIN_VAL (-DBL_MAX)\n" 2374 "#define MAX_VAL DBL_MAX\n" 2375 "#endif\n" 2376 "#define dstT srcT\n" 2377 "#define dstT1 srcT1\n" 2378 "#endif\n" 2379 "#define noconvert\n" 2380 "#ifndef kercn\n" 2381 "#define kercn 1\n" 2382 "#endif\n" 2383 "#ifdef HAVE_MASK_CONT\n" 2384 "#define MASK_INDEX int mask_index = id + mask_offset;\n" 2385 "#else\n" 2386 "#define MASK_INDEX int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols))\n" 2387 "#endif\n" 2388 "#if cn != 3\n" 2389 "#define loadpix(addr) *(__global const srcT *)(addr)\n" 2390 "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" 2391 "#if kercn == 1\n" 2392 "#define srcTSIZE (int)sizeof(srcT)\n" 2393 "#else\n" 2394 "#define srcTSIZE (int)sizeof(srcT1)\n" 2395 "#endif\n" 2396 "#define dstTSIZE (int)sizeof(dstT)\n" 2397 "#else\n" 2398 "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" 2399 "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" 2400 "#define srcTSIZE ((int)sizeof(srcT1)*3)\n" 2401 "#define dstTSIZE ((int)sizeof(dstT1)*3)\n" 2402 "#endif\n" 2403 "#if ddepth <= 4\n" 2404 "#define SUM_ABS(a) convertFromU(abs(a))\n" 2405 "#define SUM_ABS2(a, b) convertFromU(abs_diff(a, b))\n" 2406 "#else\n" 2407 "#define SUM_ABS(a) fabs(a)\n" 2408 "#define SUM_ABS2(a, b) fabs(a - b)\n" 2409 "#endif\n" 2410 "#ifdef HAVE_MASK\n" 2411 "#ifdef HAVE_SRC2\n" 2412 "#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset\n" 2413 "#else\n" 2414 "#define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset\n" 2415 "#endif\n" 2416 "#else\n" 2417 "#ifdef HAVE_SRC2\n" 2418 "#define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset\n" 2419 "#else\n" 2420 "#define EXTRA_PARAMS\n" 2421 "#endif\n" 2422 "#endif\n" 2423 "#if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT\n" 2424 "#ifdef OP_DOT\n" 2425 "#if ddepth <= 4\n" 2426 "#define FUNC(a, b, c) a = mad24(b, c, a)\n" 2427 "#else\n" 2428 "#define FUNC(a, b, c) a = mad(b, c, a)\n" 2429 "#endif\n" 2430 "#elif defined OP_SUM\n" 2431 "#define FUNC(a, b) a += b\n" 2432 "#elif defined OP_SUM_ABS\n" 2433 "#define FUNC(a, b) a += SUM_ABS(b)\n" 2434 "#elif defined OP_SUM_SQR\n" 2435 "#if ddepth <= 4\n" 2436 "#define FUNC(a, b) a = mad24(b, b, a)\n" 2437 "#else\n" 2438 "#define FUNC(a, b) a = mad(b, b, a)\n" 2439 "#endif\n" 2440 "#endif\n" 2441 "#ifdef OP_CALC2\n" 2442 "#define DECLARE_LOCAL_MEM \\\n" 2443 "__local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED]\n" 2444 "#define DEFINE_ACCUMULATOR \\\n" 2445 "dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0)\n" 2446 "#else\n" 2447 "#define DECLARE_LOCAL_MEM \\\n" 2448 "__local dstT localmem[WGS2_ALIGNED]\n" 2449 "#define DEFINE_ACCUMULATOR \\\n" 2450 "dstT accumulator = (dstT)(0)\n" 2451 "#endif\n" 2452 "#ifdef HAVE_SRC2\n" 2453 "#ifdef OP_CALC2\n" 2454 "#define PROCESS_ELEMS \\\n" 2455 "dstT temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2456 "dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2457 "temp = SUM_ABS2(temp, temp2); \\\n" 2458 "temp2 = SUM_ABS(temp2); \\\n" 2459 "FUNC(accumulator2, temp2); \\\n" 2460 "FUNC(accumulator, temp)\n" 2461 "#else\n" 2462 "#define PROCESS_ELEMS \\\n" 2463 "dstT temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2464 "dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2465 "temp = SUM_ABS2(temp, temp2); \\\n" 2466 "FUNC(accumulator, temp)\n" 2467 "#endif\n" 2468 "#else\n" 2469 "#define PROCESS_ELEMS \\\n" 2470 "dstT temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2471 "FUNC(accumulator, temp)\n" 2472 "#endif\n" 2473 "#ifdef HAVE_MASK\n" 2474 "#define REDUCE_GLOBAL \\\n" 2475 "MASK_INDEX; \\\n" 2476 "if (mask[mask_index]) \\\n" 2477 "{ \\\n" 2478 "PROCESS_ELEMS; \\\n" 2479 "}\n" 2480 "#elif defined OP_DOT\n" 2481 "#ifdef HAVE_SRC2_CONT\n" 2482 "#define SRC2_INDEX int src2_index = mad24(id, srcTSIZE, src2_offset);\n" 2483 "#else\n" 2484 "#define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset))\n" 2485 "#endif\n" 2486 "#if kercn == 1\n" 2487 "#define REDUCE_GLOBAL \\\n" 2488 "SRC2_INDEX; \\\n" 2489 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2490 "FUNC(accumulator, temp, temp2)\n" 2491 "#elif kercn == 2\n" 2492 "#define REDUCE_GLOBAL \\\n" 2493 "SRC2_INDEX; \\\n" 2494 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2495 "FUNC(accumulator, temp.s0, temp2.s0); \\\n" 2496 "FUNC(accumulator, temp.s1, temp2.s1)\n" 2497 "#elif kercn == 4\n" 2498 "#define REDUCE_GLOBAL \\\n" 2499 "SRC2_INDEX; \\\n" 2500 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2501 "FUNC(accumulator, temp.s0, temp2.s0); \\\n" 2502 "FUNC(accumulator, temp.s1, temp2.s1); \\\n" 2503 "FUNC(accumulator, temp.s2, temp2.s2); \\\n" 2504 "FUNC(accumulator, temp.s3, temp2.s3)\n" 2505 "#elif kercn == 8\n" 2506 "#define REDUCE_GLOBAL \\\n" 2507 "SRC2_INDEX; \\\n" 2508 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2509 "FUNC(accumulator, temp.s0, temp2.s0); \\\n" 2510 "FUNC(accumulator, temp.s1, temp2.s1); \\\n" 2511 "FUNC(accumulator, temp.s2, temp2.s2); \\\n" 2512 "FUNC(accumulator, temp.s3, temp2.s3); \\\n" 2513 "FUNC(accumulator, temp.s4, temp2.s4); \\\n" 2514 "FUNC(accumulator, temp.s5, temp2.s5); \\\n" 2515 "FUNC(accumulator, temp.s6, temp2.s6); \\\n" 2516 "FUNC(accumulator, temp.s7, temp2.s7)\n" 2517 "#elif kercn == 16\n" 2518 "#define REDUCE_GLOBAL \\\n" 2519 "SRC2_INDEX; \\\n" 2520 "dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2521 "FUNC(accumulator, temp.s0, temp2.s0); \\\n" 2522 "FUNC(accumulator, temp.s1, temp2.s1); \\\n" 2523 "FUNC(accumulator, temp.s2, temp2.s2); \\\n" 2524 "FUNC(accumulator, temp.s3, temp2.s3); \\\n" 2525 "FUNC(accumulator, temp.s4, temp2.s4); \\\n" 2526 "FUNC(accumulator, temp.s5, temp2.s5); \\\n" 2527 "FUNC(accumulator, temp.s6, temp2.s6); \\\n" 2528 "FUNC(accumulator, temp.s7, temp2.s7); \\\n" 2529 "FUNC(accumulator, temp.s8, temp2.s8); \\\n" 2530 "FUNC(accumulator, temp.s9, temp2.s9); \\\n" 2531 "FUNC(accumulator, temp.sA, temp2.sA); \\\n" 2532 "FUNC(accumulator, temp.sB, temp2.sB); \\\n" 2533 "FUNC(accumulator, temp.sC, temp2.sC); \\\n" 2534 "FUNC(accumulator, temp.sD, temp2.sD); \\\n" 2535 "FUNC(accumulator, temp.sE, temp2.sE); \\\n" 2536 "FUNC(accumulator, temp.sF, temp2.sF)\n" 2537 "#endif\n" 2538 "#else\n" 2539 "#ifdef HAVE_SRC2\n" 2540 "#ifdef OP_CALC2\n" 2541 "#if kercn == 1\n" 2542 "#define REDUCE_GLOBAL \\\n" 2543 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2544 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2545 "temp = SUM_ABS2(temp, temp2); \\\n" 2546 "temp2 = SUM_ABS(temp2); \\\n" 2547 "FUNC(accumulator, temp); \\\n" 2548 "FUNC(accumulator2, temp2)\n" 2549 "#elif kercn == 2\n" 2550 "#define REDUCE_GLOBAL \\\n" 2551 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2552 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2553 "temp = SUM_ABS2(temp, temp2); \\\n" 2554 "temp2 = SUM_ABS(temp2); \\\n" 2555 "FUNC(accumulator, temp.s0); \\\n" 2556 "FUNC(accumulator, temp.s1); \\\n" 2557 "FUNC(accumulator2, temp2.s0); \\\n" 2558 "FUNC(accumulator2, temp2.s1)\n" 2559 "#elif kercn == 4\n" 2560 "#define REDUCE_GLOBAL \\\n" 2561 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2562 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2563 "temp = SUM_ABS2(temp, temp2); \\\n" 2564 "temp2 = SUM_ABS(temp2); \\\n" 2565 "FUNC(accumulator, temp.s0); \\\n" 2566 "FUNC(accumulator, temp.s1); \\\n" 2567 "FUNC(accumulator, temp.s2); \\\n" 2568 "FUNC(accumulator, temp.s3); \\\n" 2569 "FUNC(accumulator2, temp2.s0); \\\n" 2570 "FUNC(accumulator2, temp2.s1); \\\n" 2571 "FUNC(accumulator2, temp2.s2); \\\n" 2572 "FUNC(accumulator2, temp2.s3)\n" 2573 "#elif kercn == 8\n" 2574 "#define REDUCE_GLOBAL \\\n" 2575 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2576 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2577 "temp = SUM_ABS2(temp, temp2); \\\n" 2578 "temp2 = SUM_ABS(temp2); \\\n" 2579 "FUNC(accumulator, temp.s0); \\\n" 2580 "FUNC(accumulator, temp.s1); \\\n" 2581 "FUNC(accumulator, temp.s2); \\\n" 2582 "FUNC(accumulator, temp.s3); \\\n" 2583 "FUNC(accumulator, temp.s4); \\\n" 2584 "FUNC(accumulator, temp.s5); \\\n" 2585 "FUNC(accumulator, temp.s6); \\\n" 2586 "FUNC(accumulator, temp.s7); \\\n" 2587 "FUNC(accumulator2, temp2.s0); \\\n" 2588 "FUNC(accumulator2, temp2.s1); \\\n" 2589 "FUNC(accumulator2, temp2.s2); \\\n" 2590 "FUNC(accumulator2, temp2.s3); \\\n" 2591 "FUNC(accumulator2, temp2.s4); \\\n" 2592 "FUNC(accumulator2, temp2.s5); \\\n" 2593 "FUNC(accumulator2, temp2.s6); \\\n" 2594 "FUNC(accumulator2, temp2.s7)\n" 2595 "#elif kercn == 16\n" 2596 "#define REDUCE_GLOBAL \\\n" 2597 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2598 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2599 "temp = SUM_ABS2(temp, temp2); \\\n" 2600 "temp2 = SUM_ABS(temp2); \\\n" 2601 "FUNC(accumulator, temp.s0); \\\n" 2602 "FUNC(accumulator, temp.s1); \\\n" 2603 "FUNC(accumulator, temp.s2); \\\n" 2604 "FUNC(accumulator, temp.s3); \\\n" 2605 "FUNC(accumulator, temp.s4); \\\n" 2606 "FUNC(accumulator, temp.s5); \\\n" 2607 "FUNC(accumulator, temp.s6); \\\n" 2608 "FUNC(accumulator, temp.s7); \\\n" 2609 "FUNC(accumulator, temp.s8); \\\n" 2610 "FUNC(accumulator, temp.s9); \\\n" 2611 "FUNC(accumulator, temp.sA); \\\n" 2612 "FUNC(accumulator, temp.sB); \\\n" 2613 "FUNC(accumulator, temp.sC); \\\n" 2614 "FUNC(accumulator, temp.sD); \\\n" 2615 "FUNC(accumulator, temp.sE); \\\n" 2616 "FUNC(accumulator, temp.sF); \\\n" 2617 "FUNC(accumulator2, temp2.s0); \\\n" 2618 "FUNC(accumulator2, temp2.s1); \\\n" 2619 "FUNC(accumulator2, temp2.s2); \\\n" 2620 "FUNC(accumulator2, temp2.s3); \\\n" 2621 "FUNC(accumulator2, temp2.s4); \\\n" 2622 "FUNC(accumulator2, temp2.s5); \\\n" 2623 "FUNC(accumulator2, temp2.s6); \\\n" 2624 "FUNC(accumulator2, temp2.s7); \\\n" 2625 "FUNC(accumulator2, temp2.s8); \\\n" 2626 "FUNC(accumulator2, temp2.s9); \\\n" 2627 "FUNC(accumulator2, temp2.sA); \\\n" 2628 "FUNC(accumulator2, temp2.sB); \\\n" 2629 "FUNC(accumulator2, temp2.sC); \\\n" 2630 "FUNC(accumulator2, temp2.sD); \\\n" 2631 "FUNC(accumulator2, temp2.sE); \\\n" 2632 "FUNC(accumulator2, temp2.sF)\n" 2633 "#endif\n" 2634 "#else\n" 2635 "#if kercn == 1\n" 2636 "#define REDUCE_GLOBAL \\\n" 2637 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2638 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2639 "temp = SUM_ABS2(temp, temp2); \\\n" 2640 "FUNC(accumulator, temp)\n" 2641 "#elif kercn == 2\n" 2642 "#define REDUCE_GLOBAL \\\n" 2643 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2644 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2645 "temp = SUM_ABS2(temp, temp2); \\\n" 2646 "FUNC(accumulator, temp.s0); \\\n" 2647 "FUNC(accumulator, temp.s1)\n" 2648 "#elif kercn == 4\n" 2649 "#define REDUCE_GLOBAL \\\n" 2650 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2651 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2652 "temp = SUM_ABS2(temp, temp2); \\\n" 2653 "FUNC(accumulator, temp.s0); \\\n" 2654 "FUNC(accumulator, temp.s1); \\\n" 2655 "FUNC(accumulator, temp.s2); \\\n" 2656 "FUNC(accumulator, temp.s3)\n" 2657 "#elif kercn == 8\n" 2658 "#define REDUCE_GLOBAL \\\n" 2659 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2660 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2661 "temp = SUM_ABS2(temp, temp2); \\\n" 2662 "FUNC(accumulator, temp.s0); \\\n" 2663 "FUNC(accumulator, temp.s1); \\\n" 2664 "FUNC(accumulator, temp.s2); \\\n" 2665 "FUNC(accumulator, temp.s3); \\\n" 2666 "FUNC(accumulator, temp.s4); \\\n" 2667 "FUNC(accumulator, temp.s5); \\\n" 2668 "FUNC(accumulator, temp.s6); \\\n" 2669 "FUNC(accumulator, temp.s7)\n" 2670 "#elif kercn == 16\n" 2671 "#define REDUCE_GLOBAL \\\n" 2672 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2673 "dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \\\n" 2674 "temp = SUM_ABS2(temp, temp2); \\\n" 2675 "FUNC(accumulator, temp.s0); \\\n" 2676 "FUNC(accumulator, temp.s1); \\\n" 2677 "FUNC(accumulator, temp.s2); \\\n" 2678 "FUNC(accumulator, temp.s3); \\\n" 2679 "FUNC(accumulator, temp.s4); \\\n" 2680 "FUNC(accumulator, temp.s5); \\\n" 2681 "FUNC(accumulator, temp.s6); \\\n" 2682 "FUNC(accumulator, temp.s7); \\\n" 2683 "FUNC(accumulator, temp.s8); \\\n" 2684 "FUNC(accumulator, temp.s9); \\\n" 2685 "FUNC(accumulator, temp.sA); \\\n" 2686 "FUNC(accumulator, temp.sB); \\\n" 2687 "FUNC(accumulator, temp.sC); \\\n" 2688 "FUNC(accumulator, temp.sD); \\\n" 2689 "FUNC(accumulator, temp.sE); \\\n" 2690 "FUNC(accumulator, temp.sF)\n" 2691 "#endif\n" 2692 "#endif\n" 2693 "#else\n" 2694 "#if kercn == 1\n" 2695 "#define REDUCE_GLOBAL \\\n" 2696 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2697 "FUNC(accumulator, temp)\n" 2698 "#elif kercn == 2\n" 2699 "#define REDUCE_GLOBAL \\\n" 2700 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2701 "FUNC(accumulator, temp.s0); \\\n" 2702 "FUNC(accumulator, temp.s1)\n" 2703 "#elif kercn == 4\n" 2704 "#define REDUCE_GLOBAL \\\n" 2705 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2706 "FUNC(accumulator, temp.s0); \\\n" 2707 "FUNC(accumulator, temp.s1); \\\n" 2708 "FUNC(accumulator, temp.s2); \\\n" 2709 "FUNC(accumulator, temp.s3)\n" 2710 "#elif kercn == 8\n" 2711 "#define REDUCE_GLOBAL \\\n" 2712 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2713 "FUNC(accumulator, temp.s0); \\\n" 2714 "FUNC(accumulator, temp.s1); \\\n" 2715 "FUNC(accumulator, temp.s2); \\\n" 2716 "FUNC(accumulator, temp.s3); \\\n" 2717 "FUNC(accumulator, temp.s4); \\\n" 2718 "FUNC(accumulator, temp.s5); \\\n" 2719 "FUNC(accumulator, temp.s6); \\\n" 2720 "FUNC(accumulator, temp.s7)\n" 2721 "#elif kercn == 16\n" 2722 "#define REDUCE_GLOBAL \\\n" 2723 "dstTK temp = convertToDT(loadpix(srcptr + src_index)); \\\n" 2724 "FUNC(accumulator, temp.s0); \\\n" 2725 "FUNC(accumulator, temp.s1); \\\n" 2726 "FUNC(accumulator, temp.s2); \\\n" 2727 "FUNC(accumulator, temp.s3); \\\n" 2728 "FUNC(accumulator, temp.s4); \\\n" 2729 "FUNC(accumulator, temp.s5); \\\n" 2730 "FUNC(accumulator, temp.s6); \\\n" 2731 "FUNC(accumulator, temp.s7); \\\n" 2732 "FUNC(accumulator, temp.s8); \\\n" 2733 "FUNC(accumulator, temp.s9); \\\n" 2734 "FUNC(accumulator, temp.sA); \\\n" 2735 "FUNC(accumulator, temp.sB); \\\n" 2736 "FUNC(accumulator, temp.sC); \\\n" 2737 "FUNC(accumulator, temp.sD); \\\n" 2738 "FUNC(accumulator, temp.sE); \\\n" 2739 "FUNC(accumulator, temp.sF)\n" 2740 "#endif\n" 2741 "#endif\n" 2742 "#endif\n" 2743 "#ifdef OP_CALC2\n" 2744 "#define SET_LOCAL_1 \\\n" 2745 "localmem[lid] = accumulator; \\\n" 2746 "localmem2[lid] = accumulator2\n" 2747 "#define REDUCE_LOCAL_1 \\\n" 2748 "localmem[lid - WGS2_ALIGNED] += accumulator; \\\n" 2749 "localmem2[lid - WGS2_ALIGNED] += accumulator2\n" 2750 "#define REDUCE_LOCAL_2 \\\n" 2751 "localmem[lid] += localmem[lid2]; \\\n" 2752 "localmem2[lid] += localmem2[lid2]\n" 2753 "#define CALC_RESULT \\\n" 2754 "storepix(localmem[0], dstptr + dstTSIZE * gid); \\\n" 2755 "storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))\n" 2756 "#else\n" 2757 "#define SET_LOCAL_1 \\\n" 2758 "localmem[lid] = accumulator\n" 2759 "#define REDUCE_LOCAL_1 \\\n" 2760 "localmem[lid - WGS2_ALIGNED] += accumulator\n" 2761 "#define REDUCE_LOCAL_2 \\\n" 2762 "localmem[lid] += localmem[lid2]\n" 2763 "#define CALC_RESULT \\\n" 2764 "storepix(localmem[0], dstptr + dstTSIZE * gid)\n" 2765 "#endif\n" 2766 "#elif defined OP_COUNT_NON_ZERO\n" 2767 "#define dstT int\n" 2768 "#define DECLARE_LOCAL_MEM \\\n" 2769 "__local dstT localmem[WGS2_ALIGNED]\n" 2770 "#define DEFINE_ACCUMULATOR \\\n" 2771 "dstT accumulator = (dstT)(0); \\\n" 2772 "srcT1 zero = (srcT1)(0), one = (srcT1)(1)\n" 2773 "#if kercn == 1\n" 2774 "#define REDUCE_GLOBAL \\\n" 2775 "accumulator += loadpix(srcptr + src_index) == zero ? zero : one\n" 2776 "#elif kercn == 2\n" 2777 "#define REDUCE_GLOBAL \\\n" 2778 "srcT value = loadpix(srcptr + src_index); \\\n" 2779 "accumulator += value.s0 == zero ? zero : one; \\\n" 2780 "accumulator += value.s1 == zero ? zero : one\n" 2781 "#elif kercn == 4\n" 2782 "#define REDUCE_GLOBAL \\\n" 2783 "srcT value = loadpix(srcptr + src_index); \\\n" 2784 "accumulator += value.s0 == zero ? zero : one; \\\n" 2785 "accumulator += value.s1 == zero ? zero : one; \\\n" 2786 "accumulator += value.s2 == zero ? zero : one; \\\n" 2787 "accumulator += value.s3 == zero ? zero : one\n" 2788 "#elif kercn == 8\n" 2789 "#define REDUCE_GLOBAL \\\n" 2790 "srcT value = loadpix(srcptr + src_index); \\\n" 2791 "accumulator += value.s0 == zero ? zero : one; \\\n" 2792 "accumulator += value.s1 == zero ? zero : one; \\\n" 2793 "accumulator += value.s2 == zero ? zero : one; \\\n" 2794 "accumulator += value.s3 == zero ? zero : one; \\\n" 2795 "accumulator += value.s4 == zero ? zero : one; \\\n" 2796 "accumulator += value.s5 == zero ? zero : one; \\\n" 2797 "accumulator += value.s6 == zero ? zero : one; \\\n" 2798 "accumulator += value.s7 == zero ? zero : one\n" 2799 "#elif kercn == 16\n" 2800 "#define REDUCE_GLOBAL \\\n" 2801 "srcT value = loadpix(srcptr + src_index); \\\n" 2802 "accumulator += value.s0 == zero ? zero : one; \\\n" 2803 "accumulator += value.s1 == zero ? zero : one; \\\n" 2804 "accumulator += value.s2 == zero ? zero : one; \\\n" 2805 "accumulator += value.s3 == zero ? zero : one; \\\n" 2806 "accumulator += value.s4 == zero ? zero : one; \\\n" 2807 "accumulator += value.s5 == zero ? zero : one; \\\n" 2808 "accumulator += value.s6 == zero ? zero : one; \\\n" 2809 "accumulator += value.s7 == zero ? zero : one; \\\n" 2810 "accumulator += value.s8 == zero ? zero : one; \\\n" 2811 "accumulator += value.s9 == zero ? zero : one; \\\n" 2812 "accumulator += value.sA == zero ? zero : one; \\\n" 2813 "accumulator += value.sB == zero ? zero : one; \\\n" 2814 "accumulator += value.sC == zero ? zero : one; \\\n" 2815 "accumulator += value.sD == zero ? zero : one; \\\n" 2816 "accumulator += value.sE == zero ? zero : one; \\\n" 2817 "accumulator += value.sF == zero ? zero : one\n" 2818 "#endif\n" 2819 "#define SET_LOCAL_1 \\\n" 2820 "localmem[lid] = accumulator\n" 2821 "#define REDUCE_LOCAL_1 \\\n" 2822 "localmem[lid - WGS2_ALIGNED] += accumulator\n" 2823 "#define REDUCE_LOCAL_2 \\\n" 2824 "localmem[lid] += localmem[lid2]\n" 2825 "#define CALC_RESULT \\\n" 2826 "storepix(localmem[0], dstptr + dstTSIZE * gid)\n" 2827 "#else\n" 2828 "#error \"No operation\"\n" 2829 "#endif\n" 2830 "#ifdef OP_DOT\n" 2831 "#undef EXTRA_PARAMS\n" 2832 "#define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset\n" 2833 "#endif\n" 2834 "__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int cols,\n" 2835 "int total, int groupnum, __global uchar * dstptr EXTRA_PARAMS)\n" 2836 "{\n" 2837 "int lid = get_local_id(0);\n" 2838 "int gid = get_group_id(0);\n" 2839 "int id = get_global_id(0) * kercn;\n" 2840 "srcptr += src_offset;\n" 2841 "#ifdef HAVE_SRC2\n" 2842 "src2ptr += src2_offset;\n" 2843 "#endif\n" 2844 "DECLARE_LOCAL_MEM;\n" 2845 "DEFINE_ACCUMULATOR;\n" 2846 "for (int grain = groupnum * WGS * kercn; id < total; id += grain)\n" 2847 "{\n" 2848 "#ifdef HAVE_SRC_CONT\n" 2849 "int src_index = id * srcTSIZE;\n" 2850 "#else\n" 2851 "int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));\n" 2852 "#endif\n" 2853 "#ifdef HAVE_SRC2\n" 2854 "#ifdef HAVE_SRC2_CONT\n" 2855 "int src2_index = id * srcTSIZE;\n" 2856 "#else\n" 2857 "int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));\n" 2858 "#endif\n" 2859 "#endif\n" 2860 "REDUCE_GLOBAL;\n" 2861 "}\n" 2862 "if (lid < WGS2_ALIGNED)\n" 2863 "{\n" 2864 "SET_LOCAL_1;\n" 2865 "}\n" 2866 "barrier(CLK_LOCAL_MEM_FENCE);\n" 2867 "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n" 2868 "{\n" 2869 "REDUCE_LOCAL_1;\n" 2870 "}\n" 2871 "barrier(CLK_LOCAL_MEM_FENCE);\n" 2872 "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n" 2873 "{\n" 2874 "if (lid < lsize)\n" 2875 "{\n" 2876 "int lid2 = lsize + lid;\n" 2877 "REDUCE_LOCAL_2;\n" 2878 "}\n" 2879 "barrier(CLK_LOCAL_MEM_FENCE);\n" 2880 "}\n" 2881 "if (lid == 0)\n" 2882 "{\n" 2883 "CALC_RESULT;\n" 2884 "}\n" 2885 "}\n" 2886 , "2bd554448b0b0af7e1a1ddd57a55f5a6"}; 2887 ProgramSource reduce_oclsrc(reduce.programStr); 2888 const struct ProgramEntry reduce2={"reduce2", 2889 "#ifdef DOUBLE_SUPPORT\n" 2890 "#ifdef cl_amd_fp64\n" 2891 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 2892 "#elif defined (cl_khr_fp64)\n" 2893 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 2894 "#endif\n" 2895 "#endif\n" 2896 "#if ddepth == 0\n" 2897 "#define MIN_VAL 0\n" 2898 "#define MAX_VAL 255\n" 2899 "#elif ddepth == 1\n" 2900 "#define MIN_VAL -128\n" 2901 "#define MAX_VAL 127\n" 2902 "#elif ddepth == 2\n" 2903 "#define MIN_VAL 0\n" 2904 "#define MAX_VAL 65535\n" 2905 "#elif ddepth == 3\n" 2906 "#define MIN_VAL -32768\n" 2907 "#define MAX_VAL 32767\n" 2908 "#elif ddepth == 4\n" 2909 "#define MIN_VAL INT_MIN\n" 2910 "#define MAX_VAL INT_MAX\n" 2911 "#elif ddepth == 5\n" 2912 "#define MIN_VAL (-FLT_MAX)\n" 2913 "#define MAX_VAL FLT_MAX\n" 2914 "#elif ddepth == 6\n" 2915 "#define MIN_VAL (-DBL_MAX)\n" 2916 "#define MAX_VAL DBL_MAX\n" 2917 "#else\n" 2918 "#error \"Unsupported depth\"\n" 2919 "#endif\n" 2920 "#define noconvert\n" 2921 "#if defined OCL_CV_REDUCE_SUM || defined OCL_CV_REDUCE_AVG\n" 2922 "#define INIT_VALUE 0\n" 2923 "#define PROCESS_ELEM(acc, value) acc += value\n" 2924 "#elif defined OCL_CV_REDUCE_MAX\n" 2925 "#define INIT_VALUE MIN_VAL\n" 2926 "#define PROCESS_ELEM(acc, value) acc = max(value, acc)\n" 2927 "#elif defined OCL_CV_REDUCE_MIN\n" 2928 "#define INIT_VALUE MAX_VAL\n" 2929 "#define PROCESS_ELEM(acc, value) acc = min(value, acc)\n" 2930 "#else\n" 2931 "#error \"No operation is specified\"\n" 2932 "#endif\n" 2933 "#ifdef OP_REDUCE_PRE\n" 2934 "__kernel void reduce_horz_opt(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,\n" 2935 "__global uchar * dstptr, int dst_step, int dst_offset\n" 2936 "#ifdef OCL_CV_REDUCE_AVG\n" 2937 ", float fscale\n" 2938 "#endif\n" 2939 ")\n" 2940 "{\n" 2941 "__local bufT lsmem[TILE_HEIGHT][BUF_COLS][cn];\n" 2942 "int x = get_global_id(0);\n" 2943 "int y = get_global_id(1);\n" 2944 "int liy = get_local_id(1);\n" 2945 "if ((x < BUF_COLS) && (y < rows))\n" 2946 "{\n" 2947 "int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset));\n" 2948 "__global const srcT * src = (__global const srcT *)(srcptr + src_index);\n" 2949 "bufT tmp[cn];\n" 2950 "#pragma unroll\n" 2951 "for (int c = 0; c < cn; ++c)\n" 2952 "tmp[c] = INIT_VALUE;\n" 2953 "int src_step_mul = BUF_COLS * cn;\n" 2954 "for (int idx = x; idx < cols; idx += BUF_COLS, src += src_step_mul)\n" 2955 "{\n" 2956 "#pragma unroll\n" 2957 "for (int c = 0; c < cn; ++c)\n" 2958 "{\n" 2959 "bufT value = convertToBufT(src[c]);\n" 2960 "PROCESS_ELEM(tmp[c], value);\n" 2961 "}\n" 2962 "}\n" 2963 "#pragma unroll\n" 2964 "for (int c = 0; c < cn; ++c)\n" 2965 "lsmem[liy][x][c] = tmp[c];\n" 2966 "}\n" 2967 "barrier(CLK_LOCAL_MEM_FENCE);\n" 2968 "if ((x < BUF_COLS / 2) && (y < rows))\n" 2969 "{\n" 2970 "#pragma unroll\n" 2971 "for (int c = 0; c < cn; ++c)\n" 2972 "{\n" 2973 "PROCESS_ELEM(lsmem[liy][x][c], lsmem[liy][x + BUF_COLS / 2][c]);\n" 2974 "}\n" 2975 "}\n" 2976 "barrier(CLK_LOCAL_MEM_FENCE);\n" 2977 "if ((x == 0) && (y < rows))\n" 2978 "{\n" 2979 "int dst_index = mad24(y, dst_step, dst_offset);\n" 2980 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n" 2981 "bufT tmp[cn];\n" 2982 "#pragma unroll\n" 2983 "for (int c = 0; c < cn; ++c)\n" 2984 "tmp[c] = INIT_VALUE;\n" 2985 "#pragma unroll\n" 2986 "for (int xin = 0; xin < BUF_COLS / 2; xin ++)\n" 2987 "{\n" 2988 "#pragma unroll\n" 2989 "for (int c = 0; c < cn; ++c)\n" 2990 "{\n" 2991 "PROCESS_ELEM(tmp[c], lsmem[liy][xin][c]);\n" 2992 "}\n" 2993 "}\n" 2994 "#pragma unroll\n" 2995 "for (int c = 0; c < cn; ++c)\n" 2996 "#ifdef OCL_CV_REDUCE_AVG\n" 2997 "dst[c] = convertToDT(convertToWT(tmp[c]) * fscale);\n" 2998 "#else\n" 2999 "dst[c] = convertToDT(tmp[c]);\n" 3000 "#endif\n" 3001 "}\n" 3002 "}\n" 3003 "#else\n" 3004 "__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,\n" 3005 "__global uchar * dstptr, int dst_step, int dst_offset\n" 3006 "#ifdef OCL_CV_REDUCE_AVG\n" 3007 ", float fscale\n" 3008 "#endif\n" 3009 ")\n" 3010 "{\n" 3011 "#if dim == 0\n" 3012 "int x = get_global_id(0);\n" 3013 "if (x < cols)\n" 3014 "{\n" 3015 "int src_index = mad24(x, (int)sizeof(srcT) * cn, src_offset);\n" 3016 "int dst_index = mad24(x, (int)sizeof(dstT0) * cn, dst_offset);\n" 3017 "__global dstT0 * dst = (__global dstT0 *)(dstptr + dst_index);\n" 3018 "dstT tmp[cn];\n" 3019 "#pragma unroll\n" 3020 "for (int c = 0; c < cn; ++c)\n" 3021 "tmp[c] = INIT_VALUE;\n" 3022 "for (int y = 0; y < rows; ++y, src_index += src_step)\n" 3023 "{\n" 3024 "__global const srcT * src = (__global const srcT *)(srcptr + src_index);\n" 3025 "#pragma unroll\n" 3026 "for (int c = 0; c < cn; ++c)\n" 3027 "{\n" 3028 "dstT value = convertToDT(src[c]);\n" 3029 "PROCESS_ELEM(tmp[c], value);\n" 3030 "}\n" 3031 "}\n" 3032 "#pragma unroll\n" 3033 "for (int c = 0; c < cn; ++c)\n" 3034 "#ifdef OCL_CV_REDUCE_AVG\n" 3035 "dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale);\n" 3036 "#else\n" 3037 "dst[c] = convertToDT0(tmp[c]);\n" 3038 "#endif\n" 3039 "}\n" 3040 "#elif dim == 1\n" 3041 "int y = get_global_id(0);\n" 3042 "if (y < rows)\n" 3043 "{\n" 3044 "int src_index = mad24(y, src_step, src_offset);\n" 3045 "int dst_index = mad24(y, dst_step, dst_offset);\n" 3046 "__global const srcT * src = (__global const srcT *)(srcptr + src_index);\n" 3047 "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n" 3048 "dstT tmp[cn];\n" 3049 "#pragma unroll\n" 3050 "for (int c = 0; c < cn; ++c)\n" 3051 "tmp[c] = INIT_VALUE;\n" 3052 "for (int x = 0; x < cols; ++x, src += cn)\n" 3053 "{\n" 3054 "#pragma unroll\n" 3055 "for (int c = 0; c < cn; ++c)\n" 3056 "{\n" 3057 "dstT value = convertToDT(src[c]);\n" 3058 "PROCESS_ELEM(tmp[c], value);\n" 3059 "}\n" 3060 "}\n" 3061 "#pragma unroll\n" 3062 "for (int c = 0; c < cn; ++c)\n" 3063 "#ifdef OCL_CV_REDUCE_AVG\n" 3064 "dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale);\n" 3065 "#else\n" 3066 "dst[c] = convertToDT0(tmp[c]);\n" 3067 "#endif\n" 3068 "}\n" 3069 "#else\n" 3070 "#error \"Dims must be either 0 or 1\"\n" 3071 "#endif\n" 3072 "}\n" 3073 "#endif\n" 3074 , "675811294a4da68880f2ace25764d371"}; 3075 ProgramSource reduce2_oclsrc(reduce2.programStr); 3076 const struct ProgramEntry repeat={"repeat", 3077 "#if cn != 3\n" 3078 "#define loadpix(addr) *(__global const T *)(addr)\n" 3079 "#define storepix(val, addr) *(__global T *)(addr) = val\n" 3080 "#define TSIZE (int)sizeof(T)\n" 3081 "#else\n" 3082 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n" 3083 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n" 3084 "#define TSIZE ((int)sizeof(T1)*3)\n" 3085 "#endif\n" 3086 "__kernel void repeat(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" 3087 "__global uchar * dstptr, int dst_step, int dst_offset)\n" 3088 "{\n" 3089 "int x = get_global_id(0);\n" 3090 "int y0 = get_global_id(1) * rowsPerWI;\n" 3091 "if (x < src_cols)\n" 3092 "{\n" 3093 "int src_index = mad24(y0, src_step, mad24(x, (int)sizeof(T), src_offset));\n" 3094 "int dst_index0 = mad24(y0, dst_step, mad24(x, (int)sizeof(T), dst_offset));\n" 3095 "for (int y = y0, y1 = min(src_rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step, dst_index0 += dst_step)\n" 3096 "{\n" 3097 "T srcelem = loadpix(srcptr + src_index);\n" 3098 "#pragma unroll\n" 3099 "for (int ey = 0; ey < ny; ++ey)\n" 3100 "{\n" 3101 "int dst_index = mad24(ey * src_rows, dst_step, dst_index0);\n" 3102 "#pragma unroll\n" 3103 "for (int ex = 0; ex < nx; ++ex)\n" 3104 "{\n" 3105 "storepix(srcelem, dstptr + dst_index);\n" 3106 "dst_index = mad24(src_cols, (int)sizeof(T), dst_index);\n" 3107 "}\n" 3108 "}\n" 3109 "}\n" 3110 "}\n" 3111 "}\n" 3112 , "d7a6b479ac9abf39f50a2d86c0b50863"}; 3113 ProgramSource repeat_oclsrc(repeat.programStr); 3114 const struct ProgramEntry set_identity={"set_identity", 3115 "#if kercn != 3\n" 3116 "#define storepix(val, addr) *(__global T *)(addr) = val\n" 3117 "#define TSIZE (int)sizeof(T)\n" 3118 "#define scalar scalar_\n" 3119 "#else\n" 3120 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n" 3121 "#define TSIZE ((int)sizeof(T1)*3)\n" 3122 "#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)\n" 3123 "#endif\n" 3124 "__kernel void setIdentity(__global uchar * srcptr, int src_step, int src_offset, int rows, int cols,\n" 3125 "ST scalar_)\n" 3126 "{\n" 3127 "int x = get_global_id(0);\n" 3128 "int y0 = get_global_id(1) * rowsPerWI;\n" 3129 "if (x < cols)\n" 3130 "{\n" 3131 "int src_index = mad24(y0, src_step, mad24(x, TSIZE, src_offset));\n" 3132 "#if kercn == cn\n" 3133 "#pragma unroll\n" 3134 "for (int y = y0, i = 0, y1 = min(rows, y0 + rowsPerWI); i < rowsPerWI; ++y, ++i, src_index += src_step)\n" 3135 "if (y < y1)\n" 3136 "storepix(x == y ? scalar : (T)(0), srcptr + src_index);\n" 3137 "#elif kercn == 4 && cn == 1\n" 3138 "if (y0 < rows)\n" 3139 "{\n" 3140 "storepix(x == y0 >> 2 ? (T)(scalar, 0, 0, 0) : (T)(0), srcptr + src_index);\n" 3141 "if (++y0 < rows)\n" 3142 "{\n" 3143 "src_index += src_step;\n" 3144 "storepix(x == y0 >> 2 ? (T)(0, scalar, 0, 0) : (T)(0), srcptr + src_index);\n" 3145 "if (++y0 < rows)\n" 3146 "{\n" 3147 "src_index += src_step;\n" 3148 "storepix(x == y0 >> 2 ? (T)(0, 0, scalar, 0) : (T)(0), srcptr + src_index);\n" 3149 "if (++y0 < rows)\n" 3150 "{\n" 3151 "src_index += src_step;\n" 3152 "storepix(x == y0 >> 2 ? (T)(0, 0, 0, scalar) : (T)(0), srcptr + src_index);\n" 3153 "}\n" 3154 "}\n" 3155 "}\n" 3156 "}\n" 3157 "#else\n" 3158 "#error \"Incorrect combination of cn && kercn\"\n" 3159 "#endif\n" 3160 "}\n" 3161 "}\n" 3162 , "75020e8c1da6cf8aece6bd5cc5b9ed4f"}; 3163 ProgramSource set_identity_oclsrc(set_identity.programStr); 3164 const struct ProgramEntry split_merge={"split_merge", 3165 "#ifdef OP_MERGE\n" 3166 "#define DECLARE_SRC_PARAM(index) __global const uchar * src##index##ptr, int src##index##_step, int src##index##_offset,\n" 3167 "#define DECLARE_INDEX(index) int src##index##_index = mad24(src##index##_step, y0, mad24(x, (int)sizeof(T) * scn##index, src##index##_offset));\n" 3168 "#define PROCESS_ELEM(index) \\\n" 3169 "__global const T * src##index = (__global const T *)(src##index##ptr + src##index##_index); \\\n" 3170 "dst[index] = src##index[0]; \\\n" 3171 "src##index##_index += src##index##_step;\n" 3172 "__kernel void merge(DECLARE_SRC_PARAMS_N\n" 3173 "__global uchar * dstptr, int dst_step, int dst_offset,\n" 3174 "int rows, int cols, int rowsPerWI)\n" 3175 "{\n" 3176 "int x = get_global_id(0);\n" 3177 "int y0 = get_global_id(1) * rowsPerWI;\n" 3178 "if (x < cols)\n" 3179 "{\n" 3180 "DECLARE_INDEX_N\n" 3181 "int dst_index = mad24(dst_step, y0, mad24(x, (int)sizeof(T) * cn, dst_offset));\n" 3182 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, dst_index += dst_step)\n" 3183 "{\n" 3184 "__global T * dst = (__global T *)(dstptr + dst_index);\n" 3185 "PROCESS_ELEMS_N\n" 3186 "}\n" 3187 "}\n" 3188 "}\n" 3189 "#elif defined OP_SPLIT\n" 3190 "#define DECLARE_DST_PARAM(index) , __global uchar * dst##index##ptr, int dst##index##_step, int dst##index##_offset\n" 3191 "#define DECLARE_INDEX(index) int dst##index##_index = mad24(y0, dst##index##_step, mad24(x, (int)sizeof(T), dst##index##_offset));\n" 3192 "#define PROCESS_ELEM(index) \\\n" 3193 "__global T * dst##index = (__global T *)(dst##index##ptr + dst##index##_index); \\\n" 3194 "dst##index[0] = src[index]; \\\n" 3195 "dst##index##_index += dst##index##_step;\n" 3196 "__kernel void split(__global uchar* srcptr, int src_step, int src_offset, int rows, int cols DECLARE_DST_PARAMS, int rowsPerWI)\n" 3197 "{\n" 3198 "int x = get_global_id(0);\n" 3199 "int y0 = get_global_id(1) * rowsPerWI;\n" 3200 "if (x < cols)\n" 3201 "{\n" 3202 "DECLARE_INDEX_N\n" 3203 "int src_index = mad24(y0, src_step, mad24(x, cn * (int)sizeof(T), src_offset));\n" 3204 "for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += src_step)\n" 3205 "{\n" 3206 "__global const T * src = (__global const T *)(srcptr + src_index);\n" 3207 "PROCESS_ELEMS_N\n" 3208 "}\n" 3209 "}\n" 3210 "}\n" 3211 "#else\n" 3212 "#error \"No operation\"\n" 3213 "#endif\n" 3214 , "11e06966b3c2f2081fd02cf70337b495"}; 3215 ProgramSource split_merge_oclsrc(split_merge.programStr); 3216 const struct ProgramEntry transpose={"transpose", 3217 "#if cn != 3\n" 3218 "#define loadpix(addr) *(__global const T *)(addr)\n" 3219 "#define storepix(val, addr) *(__global T *)(addr) = val\n" 3220 "#define TSIZE (int)sizeof(T)\n" 3221 "#else\n" 3222 "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n" 3223 "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n" 3224 "#define TSIZE ((int)sizeof(T1)*3)\n" 3225 "#endif\n" 3226 "#ifndef INPLACE\n" 3227 "#define LDS_STEP (TILE_DIM + 1)\n" 3228 "__kernel void transpose(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" 3229 "__global uchar * dstptr, int dst_step, int dst_offset)\n" 3230 "{\n" 3231 "int gp_x = get_group_id(0), gp_y = get_group_id(1);\n" 3232 "int gs_x = get_num_groups(0), gs_y = get_num_groups(1);\n" 3233 "int groupId_x, groupId_y;\n" 3234 "if (src_rows == src_cols)\n" 3235 "{\n" 3236 "groupId_y = gp_x;\n" 3237 "groupId_x = (gp_x + gp_y) % gs_x;\n" 3238 "}\n" 3239 "else\n" 3240 "{\n" 3241 "int bid = mad24(gs_x, gp_y, gp_x);\n" 3242 "groupId_y = bid % gs_y;\n" 3243 "groupId_x = ((bid / gs_y) + groupId_y) % gs_x;\n" 3244 "}\n" 3245 "int lx = get_local_id(0);\n" 3246 "int ly = get_local_id(1);\n" 3247 "int x = mad24(groupId_x, TILE_DIM, lx);\n" 3248 "int y = mad24(groupId_y, TILE_DIM, ly);\n" 3249 "int x_index = mad24(groupId_y, TILE_DIM, lx);\n" 3250 "int y_index = mad24(groupId_x, TILE_DIM, ly);\n" 3251 "__local T tile[TILE_DIM * LDS_STEP];\n" 3252 "if (x < src_cols && y < src_rows)\n" 3253 "{\n" 3254 "int index_src = mad24(y, src_step, mad24(x, TSIZE, src_offset));\n" 3255 "#pragma unroll\n" 3256 "for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)\n" 3257 "if (y + i < src_rows)\n" 3258 "{\n" 3259 "tile[mad24(ly + i, LDS_STEP, lx)] = loadpix(srcptr + index_src);\n" 3260 "index_src = mad24(BLOCK_ROWS, src_step, index_src);\n" 3261 "}\n" 3262 "}\n" 3263 "barrier(CLK_LOCAL_MEM_FENCE);\n" 3264 "if (x_index < src_rows && y_index < src_cols)\n" 3265 "{\n" 3266 "int index_dst = mad24(y_index, dst_step, mad24(x_index, TSIZE, dst_offset));\n" 3267 "#pragma unroll\n" 3268 "for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)\n" 3269 "if ((y_index + i) < src_cols)\n" 3270 "{\n" 3271 "storepix(tile[mad24(lx, LDS_STEP, ly + i)], dstptr + index_dst);\n" 3272 "index_dst = mad24(BLOCK_ROWS, dst_step, index_dst);\n" 3273 "}\n" 3274 "}\n" 3275 "}\n" 3276 "#else\n" 3277 "__kernel void transpose_inplace(__global uchar * srcptr, int src_step, int src_offset, int src_rows)\n" 3278 "{\n" 3279 "int x = get_global_id(0);\n" 3280 "int y = get_global_id(1) * rowsPerWI;\n" 3281 "if (x < y + rowsPerWI)\n" 3282 "{\n" 3283 "int src_index = mad24(y, src_step, mad24(x, TSIZE, src_offset));\n" 3284 "int dst_index = mad24(x, src_step, mad24(y, TSIZE, src_offset));\n" 3285 "T tmp;\n" 3286 "#pragma unroll\n" 3287 "for (int i = 0; i < rowsPerWI; ++i, ++y, src_index += src_step, dst_index += TSIZE)\n" 3288 "if (y < src_rows && x < y)\n" 3289 "{\n" 3290 "__global uchar * src = srcptr + src_index;\n" 3291 "__global uchar * dst = srcptr + dst_index;\n" 3292 "tmp = loadpix(dst);\n" 3293 "storepix(loadpix(src), dst);\n" 3294 "storepix(tmp, src);\n" 3295 "}\n" 3296 "}\n" 3297 "}\n" 3298 "#endif\n" 3299 , "f938bc7c686ae7bca004809df19ef032"}; 3300 ProgramSource transpose_oclsrc(transpose.programStr); 3301 } 3302 }} 3303