// This file is auto-generated. Do not edit! #include "precomp.hpp" #include "opencl_kernels_imgproc.hpp" namespace cv { namespace ocl { namespace imgproc { const struct ProgramEntry accumulate={"accumulate", "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" "#define SRC_TSIZE cn * (int)sizeof(srcT1)\n" "#define DST_TSIZE cn * (int)sizeof(dstT1)\n" "#define noconvert\n" "__kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,\n" "#ifdef ACCUMULATE_PRODUCT\n" "__global const uchar * src2ptr, int src2_step, int src2_offset,\n" "#endif\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols\n" "#ifdef ACCUMULATE_WEIGHTED\n" ", dstT1 alpha\n" "#endif\n" "#ifdef HAVE_MASK\n" ", __global const uchar * mask, int mask_step, int mask_offset\n" "#endif\n" ")\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * rowsPerWI;\n" "if (x < dst_cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, SRC_TSIZE, src_offset));\n" "#ifdef HAVE_MASK\n" "int mask_index = mad24(y, mask_step, mask_offset + x);\n" "mask += mask_index;\n" "#endif\n" "#ifdef ACCUMULATE_PRODUCT\n" "int src2_index = mad24(y, src2_step, mad24(x, SRC_TSIZE, src2_offset));\n" "#endif\n" "int dst_index = mad24(y, dst_step, mad24(x, DST_TSIZE, dst_offset));\n" "#pragma unroll\n" "for (int i = 0; i < rowsPerWI; ++i)\n" "if (y < dst_rows)\n" "{\n" "__global const srcT1 * src = (__global const srcT1 *)(srcptr + src_index);\n" "#ifdef ACCUMULATE_PRODUCT\n" "__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);\n" "#endif\n" "__global dstT1 * dst = (__global dstT1 *)(dstptr + dst_index);\n" "#ifdef HAVE_MASK\n" "if (mask[0])\n" "#endif\n" "#pragma unroll\n" "for (int c = 0; c < cn; ++c)\n" "{\n" "#ifdef ACCUMULATE\n" "dst[c] += convertToDT(src[c]);\n" "#elif defined 
ACCUMULATE_SQUARE\n" "dstT1 val = convertToDT(src[c]);\n" "dst[c] = fma(val, val, dst[c]);\n" "#elif defined ACCUMULATE_PRODUCT\n" "dst[c] = fma(convertToDT(src[c]), convertToDT(src2[c]), dst[c]);\n" "#elif defined ACCUMULATE_WEIGHTED\n" "dst[c] = fma(1 - alpha, dst[c], src[c] * alpha);\n" "#else\n" "#error \"Unknown accumulation type\"\n" "#endif\n" "}\n" "src_index += src_step;\n" "#ifdef ACCUMULATE_PRODUCT\n" "src2_index += src2_step;\n" "#endif\n" "#ifdef HAVE_MASK\n" "mask += mask_step;\n" "#endif\n" "dst_index += dst_step;\n" "++y;\n" "}\n" "}\n" "}\n" , "5f2c2d40f721d738ad2b8ef755376c6f"}; ProgramSource accumulate_oclsrc(accumulate.programStr); const struct ProgramEntry bilateral={"bilateral", "#if cn != 3\n" "#define loadpix(addr) *(__global const uchar_t *)(addr)\n" "#define storepix(val, addr) *(__global uchar_t *)(addr) = val\n" "#define TSIZE cn\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const uchar *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global uchar *)(addr))\n" "#define TSIZE 3\n" "#endif\n" "#if cn == 1\n" "#define SUM(a) a\n" "#elif cn == 2\n" "#define SUM(a) a.x + a.y\n" "#elif cn == 3\n" "#define SUM(a) a.x + a.y + a.z\n" "#elif cn == 4\n" "#define SUM(a) a.x + a.y + a.z + a.w\n" "#else\n" "#error \"cn should be <= 4\"\n" "#endif\n" "__kernel void bilateral(__global const uchar * src, int src_step, int src_offset,\n" "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__constant float * space_weight, __constant int * space_ofs)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if (y < dst_rows && x < dst_cols)\n" "{\n" "int src_index = mad24(y + radius, src_step, mad24(x + radius, TSIZE, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n" "float_t sum = (float_t)(0.0f);\n" "float wsum = 0.0f;\n" "#ifdef INTEL_DEVICE\n" "float_t val0 = convert_float_t(loadpix(src + src_index));\n" "#else\n" "int_t val0 = 
convert_int_t(loadpix(src + src_index));\n" "#endif\n" "#pragma unroll\n" "for (int k = 0; k < maxk; k++ )\n" "{\n" "#ifdef INTEL_DEVICE\n" "float_t val = convert_float_t(loadpix(src + src_index + space_ofs[k]));\n" "float diff = SUM(fabs(val - val0));\n" "#else\n" "int_t val = convert_int_t(loadpix(src + src_index + space_ofs[k]));\n" "int diff = SUM(abs(val - val0));\n" "#endif\n" "float w = space_weight[k] * native_exp((float)(diff * diff * gauss_color_coeff));\n" "sum += convert_float_t(val) * (float_t)(w);\n" "wsum += w;\n" "}\n" "storepix(convert_uchar_t(sum / (float_t)(wsum)), dst + dst_index);\n" "}\n" "}\n" "#ifdef INTEL_DEVICE\n" "#if cn == 1\n" "__kernel void bilateral_float4(__global const uchar * src, int src_step, int src_offset,\n" "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__constant float * space_weight, __constant int * space_ofs)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if (y < dst_rows && x < dst_cols / 4 )\n" "{\n" "int src_index = ((y + radius) * src_step) + x * 4 + (radius + src_offset);\n" "int dst_index = (y * dst_step) + x * 4 + dst_offset ;\n" "float4 sum = 0.f, wsum = 0.f;\n" "float4 val0 = convert_float4(vload4(0, src + src_index));\n" "#pragma unroll\n" "for (int k = 0; k < maxk; k++ )\n" "{\n" "float4 val = convert_float4(vload4(0, src + src_index + space_ofs[k]));\n" "float4 w = space_weight[k] * native_exp((val - val0) * (val - val0) * gauss_color_coeff);\n" "sum += val * w;\n" "wsum += w;\n" "}\n" "sum = sum / wsum + .5f;\n" "vstore4(convert_uchar4_rtz(sum), 0, dst + dst_index);\n" "}\n" "}\n" "#endif\n" "#endif\n" , "1cc12569fdb93cbfa05bb215d3d42e64"}; ProgramSource bilateral_oclsrc(bilateral.programStr); const struct ProgramEntry blend_linear={"blend_linear", "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" 
"#define noconvert\n" "__kernel void blendLinear(__global const uchar * src1ptr, int src1_step, int src1_offset,\n" "__global const uchar * src2ptr, int src2_step, int src2_offset,\n" "__global const uchar * weight1, int weight1_step, int weight1_offset,\n" "__global const uchar * weight2, int weight2_step, int weight2_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if (x < dst_cols && y < dst_rows)\n" "{\n" "int src1_index = mad24(y, src1_step, src1_offset + x * cn * (int)sizeof(T));\n" "int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(T));\n" "int weight1_index = mad24(y, weight1_step, weight1_offset + x * (int)sizeof(float));\n" "int weight2_index = mad24(y, weight2_step, weight2_offset + x * (int)sizeof(float));\n" "int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(T));\n" "float w1 = *(__global const float *)(weight1 + weight1_index),\n" "w2 = *(__global const float *)(weight2 + weight2_index);\n" "float den = w1 + w2 + 1e-5f;\n" "__global const T * src1 = (__global const T *)(src1ptr + src1_index);\n" "__global const T * src2 = (__global const T *)(src2ptr + src2_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "#pragma unroll\n" "for (int i = 0; i < cn; ++i)\n" "{\n" "float num = w1 * convert_float(src1[i]) + w2 * convert_float(src2[i]);\n" "dst[i] = convertToT(num / den);\n" "}\n" "}\n" "}\n" , "76072b51c3ede4951ee0200aa33297dc"}; ProgramSource blend_linear_oclsrc(blend_linear.programStr); const struct ProgramEntry boxFilter={"boxFilter", "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" "#if cn != 3\n" "#define loadpix(addr) *(__global const ST *)(addr)\n" "#define storepix(val, addr) *(__global DT *)(addr) = val\n" "#define SRCSIZE 
(int)sizeof(ST)\n" "#define DSTSIZE (int)sizeof(DT)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const ST1 *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global DT1 *)(addr))\n" "#define SRCSIZE (int)sizeof(ST1)*cn\n" "#define DSTSIZE (int)sizeof(DT1)*cn\n" "#endif\n" "#ifdef BORDER_CONSTANT\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n" "{ \\\n" "x = max(min(x, maxX - 1), minX); \\\n" "y = max(min(y, maxY - 1), minY); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n" "{ \\\n" "if (x < minX) \\\n" "x -= ((x - maxX + 1) / maxX) * maxX; \\\n" "if (x >= maxX) \\\n" "x %= maxX; \\\n" "if (y < minY) \\\n" "y -= ((y - maxY + 1) / maxY) * maxY; \\\n" "if (y >= maxY) \\\n" "y %= maxY; \\\n" "}\n" "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n" "#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \\\n" "{ \\\n" "if (maxX - minX == 1) \\\n" "x = minX; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if (x < minX) \\\n" "x = minX - (x - minX) - 1 + delta; \\\n" "else \\\n" "x = maxX - 1 - (x - maxX) - delta; \\\n" "} \\\n" "while (x >= maxX || x < minX); \\\n" "\\\n" "if (maxY - minY == 1) \\\n" "y = minY; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if (y < minY) \\\n" "y = minY - (y - minY) - 1 + delta; \\\n" "else \\\n" "y = maxY - 1 - (y - maxY) - delta; \\\n" "} \\\n" "while (y >= maxY || y < minY); \\\n" "}\n" "#ifdef BORDER_REFLECT\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)\n" "#elif defined(BORDER_REFLECT_101)\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)\n" "#endif\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#define noconvert\n" "#ifdef SQR\n" "#define PROCESS_ELEM(value) (value * value)\n" "#else\n" "#define PROCESS_ELEM(value) value\n" "#endif\n" "struct RectCoords\n" "{\n" "int x1, y1, x2, y2;\n" "};\n" "inline WT 
readSrcPixel(int2 pos, __global const uchar * srcptr, int src_step, const struct RectCoords srcCoords)\n" "{\n" "#ifdef BORDER_ISOLATED\n" "if (pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)\n" "#else\n" "if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)\n" "#endif\n" "{\n" "int src_index = mad24(pos.y, src_step, pos.x * SRCSIZE);\n" "WT value = convertToWT(loadpix(srcptr + src_index));\n" "return PROCESS_ELEM(value);\n" "}\n" "else\n" "{\n" "#ifdef BORDER_CONSTANT\n" "return (WT)(0);\n" "#else\n" "int selected_col = pos.x, selected_row = pos.y;\n" "EXTRAPOLATE(selected_col, selected_row,\n" "#ifdef BORDER_ISOLATED\n" "srcCoords.x1, srcCoords.y1,\n" "#else\n" "0, 0,\n" "#endif\n" "srcCoords.x2, srcCoords.y2);\n" "int src_index = mad24(selected_row, src_step, selected_col * SRCSIZE);\n" "WT value = convertToWT(loadpix(srcptr + src_index));\n" "return PROCESS_ELEM(value);\n" "#endif\n" "}\n" "}\n" "__kernel void boxFilter(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols\n" "#ifdef NORMALIZE\n" ", float alpha\n" "#endif\n" ")\n" "{\n" "const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY };\n" "int x = get_local_id(0) + (LOCAL_SIZE_X - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;\n" "int y = get_global_id(1) * BLOCK_SIZE_Y;\n" "int local_id = get_local_id(0);\n" "WT data[KERNEL_SIZE_Y];\n" "__local WT sumOfCols[LOCAL_SIZE_X];\n" "int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);\n" "#pragma unroll\n" "for (int sy = 0; sy < KERNEL_SIZE_Y; sy++, srcPos.y++)\n" "data[sy] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);\n" "WT tmp_sum = (WT)(0);\n" "#pragma unroll\n" "for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)\n" "tmp_sum += data[sy];\n" "sumOfCols[local_id] = tmp_sum;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "int 
dst_index = mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset));\n" "__global DT * dst = (__global DT *)(dstptr + dst_index);\n" "int sy_index = 0;\n" "for (int i = 0, stepY = min(rows - y, BLOCK_SIZE_Y); i < stepY; ++i)\n" "{\n" "if (local_id >= ANCHOR_X && local_id < LOCAL_SIZE_X - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&\n" "x >= 0 && x < cols)\n" "{\n" "WT total_sum = (WT)(0);\n" "#pragma unroll\n" "for (int sx = 0; sx < KERNEL_SIZE_X; sx++)\n" "total_sum += sumOfCols[local_id + sx - ANCHOR_X];\n" "#ifdef NORMALIZE\n" "DT dstval = convertToDT((WT)(alpha) * total_sum);\n" "#else\n" "DT dstval = convertToDT(total_sum);\n" "#endif\n" "storepix(dstval, dst);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "tmp_sum = sumOfCols[local_id];\n" "tmp_sum -= data[sy_index];\n" "data[sy_index] = readSrcPixel(srcPos, srcptr, src_step, srcCoords);\n" "srcPos.y++;\n" "tmp_sum += data[sy_index];\n" "sumOfCols[local_id] = tmp_sum;\n" "sy_index = sy_index + 1 < KERNEL_SIZE_Y ? sy_index + 1 : 0;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "dst = (__global DT *)((__global uchar *)dst + dst_step);\n" "}\n" "}\n" , "d3e542270fa2ea1fc3744043dad50cb4"}; ProgramSource boxFilter_oclsrc(boxFilter.programStr); const struct ProgramEntry calc_back_project={"calc_back_project", "#define OUT_OF_RANGE -1\n" "#define ROUNDING_EPS 0.000001f\n" "#if histdims == 1\n" "__kernel void calcLUT(__global const uchar * histptr, int hist_step, int hist_offset, int hist_bins,\n" "__global int * lut, float scale, __constant float * ranges)\n" "{\n" "int x = get_global_id(0);\n" "float value = convert_float(x);\n" "if (value > ranges[1] || value < ranges[0])\n" "lut[x] = OUT_OF_RANGE;\n" "else\n" "{\n" "float lb = ranges[0], ub = ranges[1], gap = (ub - lb) / hist_bins;\n" "value -= lb;\n" "int bin = convert_int_sat_rtn(value / gap + ROUNDING_EPS);\n" "if (bin >= hist_bins)\n" "lut[x] = OUT_OF_RANGE;\n" "else\n" "{\n" "int hist_index = mad24(hist_step, bin, hist_offset);\n" "__global const float * hist = (__global const float 
*)(histptr + hist_index);\n" "lut[x] = (int)convert_uchar_sat_rte(hist[0] * scale);\n" "}\n" "}\n" "}\n" "__kernel void LUT(__global const uchar * src, int src_step, int src_offset,\n" "__constant int * lut,\n" "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if (x < dst_cols && y < dst_rows)\n" "{\n" "int src_index = mad24(y, src_step, src_offset + x * scn);\n" "int dst_index = mad24(y, dst_step, dst_offset + x);\n" "int value = lut[src[src_index]];\n" "dst[dst_index] = value == OUT_OF_RANGE ? 0 : convert_uchar(value);\n" "}\n" "}\n" "#elif histdims == 2\n" "__kernel void calcLUT(int hist_bins, __global int * lut, int lut_offset,\n" "__constant float * ranges, int roffset)\n" "{\n" "int x = get_global_id(0);\n" "float value = convert_float(x);\n" "ranges += roffset;\n" "lut += lut_offset;\n" "if (value > ranges[1] || value < ranges[0])\n" "lut[x] = OUT_OF_RANGE;\n" "else\n" "{\n" "float lb = ranges[0], ub = ranges[1], gap = (ub - lb) / hist_bins;\n" "value -= lb;\n" "int bin = convert_int_sat_rtn(value / gap + ROUNDING_EPS);\n" "lut[x] = bin >= hist_bins ? OUT_OF_RANGE : bin;\n" "}\n" "}\n" "__kernel void LUT(__global const uchar * src1, int src1_step, int src1_offset,\n" "__global const uchar * src2, int src2_step, int src2_offset,\n" "__global const uchar * histptr, int hist_step, int hist_offset,\n" "__constant int * lut, float scale,\n" "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if (x < dst_cols && y < dst_rows)\n" "{\n" "int src1_index = mad24(y, src1_step, src1_offset + x * scn1);\n" "int src2_index = mad24(y, src2_step, src2_offset + x * scn2);\n" "int dst_index = mad24(y, dst_step, dst_offset + x);\n" "int bin1 = lut[src1[src1_index]];\n" "int bin2 = lut[src2[src2_index] + 256];\n" "dst[dst_index] = bin1 == OUT_OF_RANGE || bin2 == OUT_OF_RANGE ? 
0 :\n" "convert_uchar_sat_rte(*(__global const float *)(histptr +\n" "mad24(hist_step, bin1, hist_offset + bin2 * (int)sizeof(float))) * scale);\n" "}\n" "}\n" "#else\n" "#error \"(nimages <= 2) should be true\"\n" "#endif\n" , "6bab391f796ff5b2ba3d38f23929307e"}; ProgramSource calc_back_project_oclsrc(calc_back_project.programStr); const struct ProgramEntry canny={"canny", "#define TG22 0.4142135623730950488016887242097f\n" "#define TG67 2.4142135623730950488016887242097f\n" "#ifdef WITH_SOBEL\n" "#if cn == 1\n" "#define loadpix(addr) convert_floatN(*(__global const TYPE *)(addr))\n" "#else\n" "#define loadpix(addr) convert_floatN(vload3(0, (__global const TYPE *)(addr)))\n" "#endif\n" "#define storepix(value, addr) *(__global int *)(addr) = (int)(value)\n" "__constant int prev[4][2] = {\n" "{ 0, -1 },\n" "{ -1, 1 },\n" "{ -1, 0 },\n" "{ -1, -1 }\n" "};\n" "__constant int next[4][2] = {\n" "{ 0, 1 },\n" "{ 1, -1 },\n" "{ 1, 0 },\n" "{ 1, 1 }\n" "};\n" "inline float3 sobel(int idx, __local const floatN *smem)\n" "{\n" "float3 res;\n" "floatN dx = fma(2, smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4],\n" "smem[idx + 2] - smem[idx] + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8]);\n" "floatN dy = fma(2, smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9],\n" "smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10] + smem[idx] - smem[idx + 2 * GRP_SIZEX + 8]);\n" "#ifdef L2GRAD\n" "floatN magN = fma(dx, dx, dy * dy);\n" "#else\n" "floatN magN = fabs(dx) + fabs(dy);\n" "#endif\n" "#if cn == 1\n" "res.z = magN;\n" "res.x = dx;\n" "res.y = dy;\n" "#else\n" "res.z = max(magN.x, max(magN.y, magN.z));\n" "if (res.z == magN.y)\n" "{\n" "dx.x = dx.y;\n" "dy.x = dy.y;\n" "}\n" "else if (res.z == magN.z)\n" "{\n" "dx.x = dx.z;\n" "dy.x = dy.z;\n" "}\n" "res.x = dx.x;\n" "res.y = dy.x;\n" "#endif\n" "return res;\n" "}\n" "__kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src_offset, int rows, int cols,\n" "__global uchar *map, int map_step, 
int map_offset,\n" "float low_thr, float high_thr)\n" "{\n" "__local floatN smem[(GRP_SIZEX + 4) * (GRP_SIZEY + 4)];\n" "int lidx = get_local_id(0);\n" "int lidy = get_local_id(1);\n" "int start_x = GRP_SIZEX * get_group_id(0);\n" "int start_y = GRP_SIZEY * get_group_id(1);\n" "int i = lidx + lidy * GRP_SIZEX;\n" "for (int j = i; j < (GRP_SIZEX + 4) * (GRP_SIZEY + 4); j += GRP_SIZEX * GRP_SIZEY)\n" "{\n" "int x = clamp(start_x - 2 + (j % (GRP_SIZEX + 4)), 0, cols - 1);\n" "int y = clamp(start_y - 2 + (j / (GRP_SIZEX + 4)), 0, rows - 1);\n" "smem[j] = loadpix(src + mad24(y, src_step, mad24(x, cn * (int)sizeof(TYPE), src_offset)));\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "__local float mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];\n" "lidx++;\n" "lidy++;\n" "if (i < GRP_SIZEX + 2)\n" "{\n" "int grp_sizey = min(GRP_SIZEY + 1, rows - start_y);\n" "mag[i] = (sobel(i, smem)).z;\n" "mag[i + grp_sizey * (GRP_SIZEX + 2)] = (sobel(i + grp_sizey * (GRP_SIZEX + 4), smem)).z;\n" "}\n" "if (i < GRP_SIZEY + 2)\n" "{\n" "int grp_sizex = min(GRP_SIZEX + 1, cols - start_x);\n" "mag[i * (GRP_SIZEX + 2)] = (sobel(i * (GRP_SIZEX + 4), smem)).z;\n" "mag[i * (GRP_SIZEX + 2) + grp_sizex] = (sobel(i * (GRP_SIZEX + 4) + grp_sizex, smem)).z;\n" "}\n" "int idx = lidx + lidy * (GRP_SIZEX + 4);\n" "i = lidx + lidy * (GRP_SIZEX + 2);\n" "float3 res = sobel(idx, smem);\n" "mag[i] = res.z;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "int x = (int) res.x;\n" "int y = (int) res.y;\n" "int gidx = get_global_id(0);\n" "int gidy = get_global_id(1);\n" "if (gidx >= cols || gidy >= rows)\n" "return;\n" "float mag0 = mag[i];\n" "int value = 1;\n" "if (mag0 > low_thr)\n" "{\n" "int a = (y / (float)x) * TG22;\n" "int b = (y / (float)x) * TG67;\n" "a = min((int)abs(a), 1) + 1;\n" "b = min((int)abs(b), 1);\n" "int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31);\n" "int dir = a * b + 2 * dir3;\n" "float prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];\n" "float next_mag = mag[(lidy + 
next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);\n" "if (mag0 > prev_mag && mag0 >= next_mag)\n" "{\n" "value = (mag0 > high_thr) ? 2 : 0;\n" "}\n" "}\n" "storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));\n" "}\n" "#elif defined WITHOUT_SOBEL\n" "#define loadpix(addr) (__global short *)(addr)\n" "#define storepix(val, addr) *(__global int *)(addr) = (int)(val)\n" "#ifdef L2GRAD\n" "#define dist(x, y) ((int)(x) * (x) + (int)(y) * (y))\n" "#else\n" "#define dist(x, y) (abs(x) + abs(y))\n" "#endif\n" "__constant int prev[4][2] = {\n" "{ 0, -1 },\n" "{ -1, -1 },\n" "{ -1, 0 },\n" "{ -1, 1 }\n" "};\n" "__constant int next[4][2] = {\n" "{ 0, 1 },\n" "{ 1, 1 },\n" "{ 1, 0 },\n" "{ 1, -1 }\n" "};\n" "__kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int dx_offset,\n" "__global const uchar *dyptr, int dy_step, int dy_offset,\n" "__global uchar *map, int map_step, int map_offset, int rows, int cols,\n" "int low_thr, int high_thr)\n" "{\n" "int start_x = get_group_id(0) * GRP_SIZEX;\n" "int start_y = get_group_id(1) * GRP_SIZEY;\n" "int lidx = get_local_id(0);\n" "int lidy = get_local_id(1);\n" "__local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];\n" "__local short2 sigma[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];\n" "#pragma unroll\n" "for (int i = lidx + lidy * GRP_SIZEX; i < (GRP_SIZEX + 2) * (GRP_SIZEY + 2); i += GRP_SIZEX * GRP_SIZEY)\n" "{\n" "int x = clamp(start_x - 1 + i % (GRP_SIZEX + 2), 0, cols - 1);\n" "int y = clamp(start_y - 1 + i / (GRP_SIZEX + 2), 0, rows - 1);\n" "int dx_index = mad24(y, dx_step, mad24(x, cn * (int)sizeof(short), dx_offset));\n" "int dy_index = mad24(y, dy_step, mad24(x, cn * (int)sizeof(short), dy_offset));\n" "__global short *dx = loadpix(dxptr + dx_index);\n" "__global short *dy = loadpix(dyptr + dy_index);\n" "int mag0 = dist(dx[0], dy[0]);\n" "#if cn > 1\n" "short cdx = dx[0], cdy = dy[0];\n" "#pragma unroll\n" "for (int j = 1; j < cn; ++j)\n" "{\n" "int mag1 = 
dist(dx[j], dy[j]);\n" "if (mag1 > mag0)\n" "{\n" "mag0 = mag1;\n" "cdx = dx[j];\n" "cdy = dy[j];\n" "}\n" "}\n" "dx[0] = cdx;\n" "dy[0] = cdy;\n" "#endif\n" "mag[i] = mag0;\n" "sigma[i] = (short2)(dx[0], dy[0]);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "int gidx = get_global_id(0);\n" "int gidy = get_global_id(1);\n" "if (gidx >= cols || gidy >= rows)\n" "return;\n" "lidx++;\n" "lidy++;\n" "int mag0 = mag[lidx + lidy * (GRP_SIZEX + 2)];\n" "short x = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).x;\n" "short y = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).y;\n" "int value = 1;\n" "if (mag0 > low_thr)\n" "{\n" "int a = (y / (float)x) * TG22;\n" "int b = (y / (float)x) * TG67;\n" "a = min((int)abs(a), 1) + 1;\n" "b = min((int)abs(b), 1);\n" "int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31);\n" "int dir = a * b + 2 * dir3;\n" "int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];\n" "int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);\n" "if (mag0 > prev_mag && mag0 >= next_mag)\n" "{\n" "value = (mag0 > high_thr) ? 
2 : 0;\n" "}\n" "}\n" "storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));\n" "}\n" "#undef TG22\n" "#undef CANNY_SHIFT\n" "#elif defined STAGE2\n" "#define loadpix(addr) *(__global int *)(addr)\n" "#define storepix(val, addr) *(__global int *)(addr) = (int)(val)\n" "#define LOCAL_TOTAL (LOCAL_X*LOCAL_Y)\n" "#define l_stack_size (4*LOCAL_TOTAL)\n" "#define p_stack_size 8\n" "__constant short move_dir[2][8] = {\n" "{ -1, -1, -1, 0, 0, 1, 1, 1 },\n" "{ -1, 0, 1, -1, 1, -1, 0, 1 }\n" "};\n" "__kernel void stage2_hysteresis(__global uchar *map_ptr, int map_step, int map_offset, int rows, int cols)\n" "{\n" "map_ptr += map_offset;\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI;\n" "int lid = get_local_id(0) + get_local_id(1) * LOCAL_X;\n" "__local ushort2 l_stack[l_stack_size];\n" "__local int l_counter;\n" "if (lid == 0)\n" "l_counter = 0;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (x < cols)\n" "{\n" "__global uchar* map = map_ptr + mad24(y, map_step, x * (int)sizeof(int));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "int type = loadpix(map);\n" "if (type == 2)\n" "{\n" "l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);\n" "}\n" "y++;\n" "map += map_step;\n" "}\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "ushort2 p_stack[p_stack_size];\n" "int p_counter = 0;\n" "while(l_counter != 0)\n" "{\n" "int mod = l_counter % LOCAL_TOTAL;\n" "int pix_per_thr = l_counter / LOCAL_TOTAL + ((lid < mod) ? 
1 : 0);\n" "for (int i = 0; i < pix_per_thr; ++i)\n" "{\n" "int index = atomic_dec(&l_counter) - 1;\n" "if (index < 0)\n" "continue;\n" "ushort2 pos = l_stack[ index ];\n" "#pragma unroll\n" "for (int j = 0; j < 8; ++j)\n" "{\n" "ushort posx = pos.x + move_dir[0][j];\n" "ushort posy = pos.y + move_dir[1][j];\n" "if (posx < 0 || posy < 0 || posx >= cols || posy >= rows)\n" "continue;\n" "__global uchar *addr = map_ptr + mad24(posy, map_step, posx * (int)sizeof(int));\n" "int type = loadpix(addr);\n" "if (type == 0)\n" "{\n" "p_stack[p_counter++] = (ushort2)(posx, posy);\n" "storepix(2, addr);\n" "}\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (l_counter < 0)\n" "l_counter = 0;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "while (p_counter > 0)\n" "{\n" "l_stack[ atomic_inc(&l_counter) ] = p_stack[--p_counter];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "}\n" "#elif defined GET_EDGES\n" "__kernel void getEdges(__global const uchar *mapptr, int map_step, int map_offset, int rows, int cols,\n" "__global uchar *dst, int dst_step, int dst_offset)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI;\n" "if (x < cols)\n" "{\n" "int map_index = mad24(map_step, y, mad24(x, (int)sizeof(int), map_offset));\n" "int dst_index = mad24(dst_step, y, x + dst_offset);\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const int * map = (__global const int *)(mapptr + map_index);\n" "dst[dst_index] = (uchar)(-(map[0] >> 1));\n" "y++;\n" "map_index += map_step;\n" "dst_index += dst_step;\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" , "00cd5c7db7816a3deac5680f13536a02"}; ProgramSource canny_oclsrc(canny.programStr); const struct ProgramEntry clahe={"clahe", "#ifndef WAVE_SIZE\n" "#define WAVE_SIZE 1\n" "#endif\n" "inline int calc_lut(__local int* smem, int val, int tid)\n" "{\n" "smem[tid] = val;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid == 0)\n" "for (int i = 1; i < 256; ++i)\n" "smem[i] += smem[i - 
1];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "return smem[tid];\n" "}\n" "#ifdef CPU\n" "inline void reduce(volatile __local int* smem, int val, int tid)\n" "{\n" "smem[tid] = val;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 128)\n" "smem[tid] = val += smem[tid + 128];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 64)\n" "smem[tid] = val += smem[tid + 64];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 32)\n" "smem[tid] += smem[tid + 32];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 16)\n" "smem[tid] += smem[tid + 16];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 8)\n" "smem[tid] += smem[tid + 8];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 4)\n" "smem[tid] += smem[tid + 4];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 2)\n" "smem[tid] += smem[tid + 2];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 1)\n" "smem[256] = smem[tid] + smem[tid + 1];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "#else\n" "inline void reduce(__local volatile int* smem, int val, int tid)\n" "{\n" "smem[tid] = val;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 128)\n" "smem[tid] = val += smem[tid + 128];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 64)\n" "smem[tid] = val += smem[tid + 64];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 32)\n" "{\n" "smem[tid] += smem[tid + 32];\n" "#if WAVE_SIZE < 32\n" "} barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 16)\n" "{\n" "#endif\n" "smem[tid] += smem[tid + 16];\n" "#if WAVE_SIZE < 16\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (tid < 8)\n" "{\n" "#endif\n" "smem[tid] += smem[tid + 8];\n" "smem[tid] += smem[tid + 4];\n" "smem[tid] += smem[tid + 2];\n" "smem[tid] += smem[tid + 1];\n" "}\n" "}\n" "#endif\n" "__kernel void calcLut(__global __const uchar * src, const int srcStep,\n" "const int src_offset, __global uchar * lut,\n" "const int dstStep, const int dst_offset,\n" "const int2 tileSize, const int tilesX,\n" "const int clipLimit, const float lutScale)\n" "{\n" "__local int smem[512];\n" "int tx = get_group_id(0);\n" "int 
ty = get_group_id(1);\n" "int tid = get_local_id(1) * get_local_size(0)\n" "+ get_local_id(0);\n" "smem[tid] = 0;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))\n" "{\n" "__global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);\n" "for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))\n" "{\n" "const int data = srcPtr[j];\n" "atomic_inc(&smem[data]);\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "int tHistVal = smem[tid];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (clipLimit > 0)\n" "{\n" "int clipped = 0;\n" "if (tHistVal > clipLimit)\n" "{\n" "clipped = tHistVal - clipLimit;\n" "tHistVal = clipLimit;\n" "}\n" "reduce(smem, clipped, tid);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "#ifdef CPU\n" "clipped = smem[256];\n" "#else\n" "clipped = smem[0];\n" "#endif\n" "__local int totalClipped;\n" "if (tid == 0)\n" "totalClipped = clipped;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "int redistBatch = totalClipped / 256;\n" "tHistVal += redistBatch;\n" "int residual = totalClipped - redistBatch * 256;\n" "if (tid < residual)\n" "++tHistVal;\n" "}\n" "const int lutVal = calc_lut(smem, tHistVal, tid);\n" "uint ires = (uint)convert_int_rte(lutScale * lutVal);\n" "lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =\n" "convert_uchar(clamp(ires, (uint)0, (uint)255));\n" "}\n" "__kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset,\n" "__global uchar * dst, const int dstStep, const int dst_offset,\n" "__global uchar * lut, const int lutStep, int lut_offset,\n" "const int cols, const int rows,\n" "const int2 tileSize,\n" "const int tilesX, const int tilesY)\n" "{\n" "const int x = get_global_id(0);\n" "const int y = get_global_id(1);\n" "if (x >= cols || y >= rows)\n" "return;\n" "const float tyf = (convert_float(y) / tileSize.y) - 0.5f;\n" "int ty1 = convert_int_rtn(tyf);\n" "int ty2 = ty1 + 1;\n" "const float ya = tyf - 
ty1;\n" "ty1 = max(ty1, 0);\n" "ty2 = min(ty2, tilesY - 1);\n" "const float txf = (convert_float(x) / tileSize.x) - 0.5f;\n" "int tx1 = convert_int_rtn(txf);\n" "int tx2 = tx1 + 1;\n" "const float xa = txf - tx1;\n" "tx1 = max(tx1, 0);\n" "tx2 = min(tx2, tilesX - 1);\n" "const int srcVal = src[mad24(y, srcStep, x + src_offset)];\n" "float res = 0;\n" "res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));\n" "res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));\n" "res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));\n" "res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));\n" "uint ires = (uint)convert_int_rte(res);\n" "dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));\n" "}\n" , "1240500336efb8988a25b1da384c217d"}; ProgramSource clahe_oclsrc(clahe.programStr); const struct ProgramEntry corner={"corner", "#ifdef BORDER_CONSTANT\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "x = max(min(x, maxV - 1), 0); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "if (x < 0) \\\n" "x -= ((x - maxV + 1) / maxV) * maxV; \\\n" "if (x >= maxV) \\\n" "x %= maxV; \\\n" "}\n" "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101)\n" "#define EXTRAPOLATE_(x, maxV, delta) \\\n" "{ \\\n" "if (maxV == 1) \\\n" "x = 0; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if ( x < 0 ) \\\n" "x = -x - 1 + delta; \\\n" "else \\\n" "x = maxV - 1 - (x - maxV) - delta; \\\n" "} \\\n" "while (x >= maxV || x < 0); \\\n" "}\n" "#ifdef BORDER_REFLECT\n" "#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)\n" "#else\n" "#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)\n" "#endif\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#define THREADS 256\n" "__kernel void corner(__global const float * Dx, int dx_step, int dx_offset, 
int dx_whole_rows, int dx_whole_cols,\n" "__global const float * Dy, int dy_step, int dy_offset, int dy_whole_rows, int dy_whole_cols,\n" "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float k)\n" "{\n" "int col = get_local_id(0);\n" "int gX = get_group_id(0);\n" "int gY = get_group_id(1);\n" "int gly = get_global_id(1);\n" "int dx_x_off = (dx_offset % dx_step) >> 2;\n" "int dx_y_off = dx_offset / dx_step;\n" "int dy_x_off = (dy_offset % dy_step) >> 2;\n" "int dy_y_off = dy_offset / dy_step;\n" "int dst_x_off = (dst_offset % dst_step) >> 2;\n" "int dst_y_off = dst_offset / dst_step;\n" "int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;\n" "int dx_startY = (gY << 1) - anY + dx_y_off;\n" "int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;\n" "int dy_startY = (gY << 1) - anY + dy_y_off;\n" "int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;\n" "int dst_startY = (gY << 1) + dst_y_off;\n" "float data[3][ksY+1];\n" "__local float temp[6][THREADS];\n" "#ifdef BORDER_CONSTANT\n" "for (int i=0; i < ksY+1; i++)\n" "{\n" "bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;\n" "int indexDx = mad24(dx_startY+i, dx_step>>2, dx_startX+col);\n" "float dx_s = dx_con ? Dx[indexDx] : 0.0f;\n" "bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;\n" "int indexDy = mad24(dy_startY+i, dy_step>>2, dy_startX+col);\n" "float dy_s = dy_con ? 
Dy[indexDy] : 0.0f;\n" "data[0][i] = dx_s * dx_s;\n" "data[1][i] = dx_s * dy_s;\n" "data[2][i] = dy_s * dy_s;\n" "}\n" "#else\n" "int clamped_col = min(2*dst_cols, col);\n" "for (int i=0; i < ksY+1; i++)\n" "{\n" "int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col;\n" "EXTRAPOLATE(dx_selected_row, dx_whole_rows)\n" "EXTRAPOLATE(dx_selected_col, dx_whole_cols)\n" "float dx_s = Dx[mad24(dx_selected_row, dx_step>>2, dx_selected_col)];\n" "int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col;\n" "EXTRAPOLATE(dy_selected_row, dy_whole_rows)\n" "EXTRAPOLATE(dy_selected_col, dy_whole_cols)\n" "float dy_s = Dy[mad24(dy_selected_row, dy_step>>2, dy_selected_col)];\n" "data[0][i] = dx_s * dx_s;\n" "data[1][i] = dx_s * dy_s;\n" "data[2][i] = dy_s * dy_s;\n" "}\n" "#endif\n" "float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;\n" "for (int i=1; i < ksY; i++)\n" "{\n" "sum0 += data[0][i];\n" "sum1 += data[1][i];\n" "sum2 += data[2][i];\n" "}\n" "float sum01 = sum0 + data[0][0];\n" "float sum02 = sum0 + data[0][ksY];\n" "temp[0][col] = sum01;\n" "temp[1][col] = sum02;\n" "float sum11 = sum1 + data[1][0];\n" "float sum12 = sum1 + data[1][ksY];\n" "temp[2][col] = sum11;\n" "temp[3][col] = sum12;\n" "float sum21 = sum2 + data[2][0];\n" "float sum22 = sum2 + data[2][ksY];\n" "temp[4][col] = sum21;\n" "temp[5][col] = sum22;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (col < (THREADS - (ksX - 1)))\n" "{\n" "col += anX;\n" "int posX = dst_startX - dst_x_off + col - anX;\n" "int posY = (gly << 1);\n" "int till = (ksX + 1) & 1;\n" "float tmp_sum[6] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };\n" "for (int k=0; k<6; k++)\n" "{\n" "float temp_sum = 0;\n" "for (int i=-anX; i<=anX - till; i++)\n" "temp_sum += temp[k][col+i];\n" "tmp_sum[k] = temp_sum;\n" "}\n" "#ifdef CORNER_HARRIS\n" "if (posX < dst_cols && (posY) < dst_rows)\n" "{\n" "int dst_index = mad24(dst_step, dst_startY, (int)sizeof(float) * (dst_startX + col - anX));\n" "*(__global float *)(dst + 
dst_index) =\n" "tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);\n" "}\n" "if (posX < dst_cols && (posY + 1) < dst_rows)\n" "{\n" "int dst_index = mad24(dst_step, dst_startY + 1, (int)sizeof(float) * (dst_startX + col - anX));\n" "*(__global float *)(dst + dst_index) =\n" "tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);\n" "}\n" "#elif defined CORNER_MINEIGENVAL\n" "if (posX < dst_cols && (posY) < dst_rows)\n" "{\n" "int dst_index = mad24(dst_step, dst_startY, (int)sizeof(float) * (dst_startX + col - anX));\n" "float a = tmp_sum[0] * 0.5f;\n" "float b = tmp_sum[2];\n" "float c = tmp_sum[4] * 0.5f;\n" "*(__global float *)(dst + dst_index) = (float)((a+c) - native_sqrt((a-c)*(a-c) + b*b));\n" "}\n" "if (posX < dst_cols && (posY + 1) < dst_rows)\n" "{\n" "int dst_index = mad24(dst_step, dst_startY + 1, (int)sizeof(float) * (dst_startX + col - anX));\n" "float a = tmp_sum[1] * 0.5f;\n" "float b = tmp_sum[3];\n" "float c = tmp_sum[5] * 0.5f;\n" "*(__global float *)(dst + dst_index) = (float)((a+c) - native_sqrt((a-c)*(a-c) + b*b));\n" "}\n" "#else\n" "#error \"No such corners type\"\n" "#endif\n" "}\n" "}\n" , "0b0ba9ee4305009cb2433737f7ed5bcd"}; ProgramSource corner_oclsrc(corner.programStr); const struct ProgramEntry covardata={"covardata", "#ifdef BORDER_CONSTANT\n" "#define EXTRAPOLATE(x, maxV)\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "(x) = clamp((x), 0, (maxV)-1); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "(x) = ( (x) + (maxV) ) % (maxV); \\\n" "}\n" "#elif defined BORDER_REFLECT\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "(x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \\\n" "}\n" "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "(x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) 
); \\\n" "}\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#define SRC(_x,_y) convert_float(((global SRCTYPE*)(Src+(_y)*src_step))[_x])\n" "#ifdef BORDER_CONSTANT\n" "#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))\n" "#else\n" "#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))\n" "#endif\n" "#define DSTX(_x,_y) (((global float*)(DstX+DstXOffset+(_y)*DstXPitch))[_x])\n" "#define DSTY(_x,_y) (((global float*)(DstY+DstYOffset+(_y)*DstYPitch))[_x])\n" "#define INIT_AND_READ_LOCAL_SOURCE(width, height, fill_const, kernel_border) \\\n" "int srcX = x + srcOffsetX - (kernel_border); \\\n" "int srcY = y + srcOffsetY - (kernel_border); \\\n" "int xb = srcX; \\\n" "int yb = srcY; \\\n" "\\\n" "EXTRAPOLATE(xb, (width)); \\\n" "EXTRAPOLATE(yb, (height)); \\\n" "lsmem[liy][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \\\n" "\\\n" "if(lix < ((kernel_border)*2)) \\\n" "{ \\\n" "int xb = srcX+BLK_X; \\\n" "EXTRAPOLATE(xb,(width)); \\\n" "lsmem[liy][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \\\n" "} \\\n" "if(liy< ((kernel_border)*2)) \\\n" "{ \\\n" "int yb = srcY+BLK_Y; \\\n" "EXTRAPOLATE(yb, (height)); \\\n" "lsmem[liy+BLK_Y][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \\\n" "} \\\n" "if(lix<((kernel_border)*2) && liy<((kernel_border)*2)) \\\n" "{ \\\n" "int xb = srcX+BLK_X; \\\n" "int yb = srcY+BLK_Y; \\\n" "EXTRAPOLATE(xb,(width)); \\\n" "EXTRAPOLATE(yb,(height)); \\\n" "lsmem[liy+BLK_Y][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \\\n" "}\n" "__kernel void sobel3(__global const uchar * Src, int src_step, int srcOffsetX, int srcOffsetY,\n" "__global uchar * DstX, int DstXPitch, int DstXOffset,\n" "__global uchar * DstY, int DstYPitch, int DstYOffset, int dstHeight, int dstWidth,\n" "int height, int width, float scale)\n" "{\n" "__local float lsmem[BLK_Y+2][BLK_X+2];\n" "int lix = get_local_id(0);\n" "int liy = 
get_local_id(1);\n" "int x = (int)get_global_id(0);\n" "int y = (int)get_global_id(1);\n" "INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 1)\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if( x >= dstWidth || y >=dstHeight ) return;\n" "float u1 = lsmem[liy][lix];\n" "float u2 = lsmem[liy][lix+1];\n" "float u3 = lsmem[liy][lix+2];\n" "float m1 = lsmem[liy+1][lix];\n" "float m3 = lsmem[liy+1][lix+2];\n" "float b1 = lsmem[liy+2][lix];\n" "float b2 = lsmem[liy+2][lix+1];\n" "float b3 = lsmem[liy+2][lix+2];\n" "#ifdef SCHARR\n" "DSTX(x,y) = mad(10.0f, m3 - m1, 3.0f * (u3 - u1 + b3 - b1)) * scale;\n" "DSTY(x,y) = mad(10.0f, b2 - u2, 3.0f * (b1 - u1 + b3 - u3)) * scale;\n" "#else\n" "DSTX(x,y) = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1) * scale;\n" "DSTY(x,y) = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3) * scale;\n" "#endif\n" "}\n" "__kernel void sobel5(__global const uchar * Src, int src_step, int srcOffsetX, int srcOffsetY,\n" "__global uchar * DstX, int DstXPitch, int DstXOffset,\n" "__global uchar * DstY, int DstYPitch, int DstYOffset, int dstHeight, int dstWidth,\n" "int height, int width, float scale)\n" "{\n" "__local float lsmem[BLK_Y+4][BLK_X+4];\n" "int lix = get_local_id(0);\n" "int liy = get_local_id(1);\n" "int x = (int)get_global_id(0);\n" "int y = (int)get_global_id(1);\n" "INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 2)\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if( x >= dstWidth || y >=dstHeight ) return;\n" "float t1 = lsmem[liy][lix];\n" "float t2 = lsmem[liy][lix+1];\n" "float t3 = lsmem[liy][lix+2];\n" "float t4 = lsmem[liy][lix+3];\n" "float t5 = lsmem[liy][lix+4];\n" "float u1 = lsmem[liy+1][lix];\n" "float u2 = lsmem[liy+1][lix+1];\n" "float u3 = lsmem[liy+1][lix+2];\n" "float u4 = lsmem[liy+1][lix+3];\n" "float u5 = lsmem[liy+1][lix+4];\n" "float m1 = lsmem[liy+2][lix];\n" "float m2 = lsmem[liy+2][lix+1];\n" "float m4 = lsmem[liy+2][lix+3];\n" "float m5 = lsmem[liy+2][lix+4];\n" "float l1 = lsmem[liy+3][lix];\n" "float l2 = lsmem[liy+3][lix+1];\n" "float l3 = 
lsmem[liy+3][lix+2];\n" "float l4 = lsmem[liy+3][lix+3];\n" "float l5 = lsmem[liy+3][lix+4];\n" "float b1 = lsmem[liy+4][lix];\n" "float b2 = lsmem[liy+4][lix+1];\n" "float b3 = lsmem[liy+4][lix+2];\n" "float b4 = lsmem[liy+4][lix+3];\n" "float b5 = lsmem[liy+4][lix+4];\n" "DSTX(x,y) = scale *\n" "mad(12.0f, m4 - m2,\n" "mad(6.0f, m5 - m1,\n" "mad(8.0f, u4 - u2 + l4 - l2,\n" "mad(4.0f, u5 - u1 + l5 - l1,\n" "mad(2.0f, t4 - t2 + b4 - b2, t5 - t1 + b5 - b1 )\n" ")\n" ")\n" ")\n" ");\n" "DSTY(x,y) = scale *\n" "mad(12.0f, l3 - u3,\n" "mad(6.0f, b3 - t3,\n" "mad(8.0f, l2 - u2 + l4 - u4,\n" "mad(4.0f, b2 - t2 + b4 - t4,\n" "mad(2.0f, l1 - u1 + l5 - u5, b1 - t1 + b5 - t5 )\n" ")\n" ")\n" ")\n" ");\n" "}\n" "__kernel void sobel7(__global const uchar * Src, int src_step, int srcOffsetX, int srcOffsetY,\n" "__global uchar * DstX, int DstXPitch, int DstXOffset,\n" "__global uchar * DstY, int DstYPitch, int DstYOffset, int dstHeight, int dstWidth,\n" "int height, int width, float scale)\n" "{\n" "__local float lsmem[BLK_Y+6][BLK_X+6];\n" "int lix = get_local_id(0);\n" "int liy = get_local_id(1);\n" "int x = (int)get_global_id(0);\n" "int y = (int)get_global_id(1);\n" "INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 3)\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if( x >= dstWidth || y >=dstHeight ) return;\n" "float tt1 = lsmem[liy][lix];\n" "float tt2 = lsmem[liy][lix+1];\n" "float tt3 = lsmem[liy][lix+2];\n" "float tt4 = lsmem[liy][lix+3];\n" "float tt5 = lsmem[liy][lix+4];\n" "float tt6 = lsmem[liy][lix+5];\n" "float tt7 = lsmem[liy][lix+6];\n" "float t1 = lsmem[liy+1][lix];\n" "float t2 = lsmem[liy+1][lix+1];\n" "float t3 = lsmem[liy+1][lix+2];\n" "float t4 = lsmem[liy+1][lix+3];\n" "float t5 = lsmem[liy+1][lix+4];\n" "float t6 = lsmem[liy+1][lix+5];\n" "float t7 = lsmem[liy+1][lix+6];\n" "float u1 = lsmem[liy+2][lix];\n" "float u2 = lsmem[liy+2][lix+1];\n" "float u3 = lsmem[liy+2][lix+2];\n" "float u4 = lsmem[liy+2][lix+3];\n" "float u5 = lsmem[liy+2][lix+4];\n" "float u6 = 
lsmem[liy+2][lix+5];\n" "float u7 = lsmem[liy+2][lix+6];\n" "float m1 = lsmem[liy+3][lix];\n" "float m2 = lsmem[liy+3][lix+1];\n" "float m3 = lsmem[liy+3][lix+2];\n" "float m5 = lsmem[liy+3][lix+4];\n" "float m6 = lsmem[liy+3][lix+5];\n" "float m7 = lsmem[liy+3][lix+6];\n" "float l1 = lsmem[liy+4][lix];\n" "float l2 = lsmem[liy+4][lix+1];\n" "float l3 = lsmem[liy+4][lix+2];\n" "float l4 = lsmem[liy+4][lix+3];\n" "float l5 = lsmem[liy+4][lix+4];\n" "float l6 = lsmem[liy+4][lix+5];\n" "float l7 = lsmem[liy+4][lix+6];\n" "float b1 = lsmem[liy+5][lix];\n" "float b2 = lsmem[liy+5][lix+1];\n" "float b3 = lsmem[liy+5][lix+2];\n" "float b4 = lsmem[liy+5][lix+3];\n" "float b5 = lsmem[liy+5][lix+4];\n" "float b6 = lsmem[liy+5][lix+5];\n" "float b7 = lsmem[liy+5][lix+6];\n" "float bb1 = lsmem[liy+6][lix];\n" "float bb2 = lsmem[liy+6][lix+1];\n" "float bb3 = lsmem[liy+6][lix+2];\n" "float bb4 = lsmem[liy+6][lix+3];\n" "float bb5 = lsmem[liy+6][lix+4];\n" "float bb6 = lsmem[liy+6][lix+5];\n" "float bb7 = lsmem[liy+6][lix+6];\n" "DSTX(x,y) = scale *\n" "mad(100.0f, m5 - m3,\n" "mad(80.0f, m6 - m2,\n" "mad(20.0f, m7 - m1,\n" "mad(75.0f, u5 - u3 + l5 - l3,\n" "mad(60.0f, u6 - u2 + l6 - l2,\n" "mad(15.0f, u7 - u1 + l7 - l1,\n" "mad(30.0f, t5 - t3 + b5 - b3,\n" "mad(24.0f, t6 - t2 + b6 - b2,\n" "mad(6.0f, t7 - t1 + b7 - b1,\n" "mad(5.0f, tt5 - tt3 + bb5 - bb3,\n" "mad(4.0f, tt6 - tt2 + bb6 - bb2, tt7 - tt1 + bb7 - bb1 )\n" ")\n" ")\n" ")\n" ")\n" ")\n" ")\n" ")\n" ")\n" ")\n" ");\n" "DSTY(x,y) = scale *\n" "mad(100.0f, l4 - u4,\n" "mad(80.0f, b4 - t4,\n" "mad(20.0f, bb4 - tt4,\n" "mad(75.0f, l5 - u5 + l3 - u3,\n" "mad(60.0f, b5 - t5 + b3 - t3,\n" "mad(15.0f, bb5 - tt5 + bb3 - tt3,\n" "mad(30.0f, l6 - u6 + l2 - u2,\n" "mad(24.0f, b6 - t6 + b2 - t2,\n" "mad(6.0f, bb6 - tt6 + bb2 - tt2,\n" "mad(5.0f, l7 - u7 + l1 - u1,\n" "mad(4.0f, b7 - t7 + b1 - t1, bb7 - tt7 + bb1 - tt1 )\n" ")\n" ")\n" ")\n" ")\n" ")\n" ")\n" ")\n" ")\n" ")\n" ");\n" "}\n" , "97cb1ffd4e7c1bc93caba596bf9c6e55"}; 
ProgramSource covardata_oclsrc(covardata.programStr); const struct ProgramEntry cvtcolor={"cvtcolor", "#if depth == 0\n" "#define DATA_TYPE uchar\n" "#define MAX_NUM 255\n" "#define HALF_MAX 128\n" "#define COEFF_TYPE int\n" "#define SAT_CAST(num) convert_uchar_sat(num)\n" "#define DEPTH_0\n" "#elif depth == 2\n" "#define DATA_TYPE ushort\n" "#define MAX_NUM 65535\n" "#define HALF_MAX 32768\n" "#define COEFF_TYPE int\n" "#define SAT_CAST(num) convert_ushort_sat(num)\n" "#define DEPTH_2\n" "#elif depth == 5\n" "#define DATA_TYPE float\n" "#define MAX_NUM 1.0f\n" "#define HALF_MAX 0.5f\n" "#define COEFF_TYPE float\n" "#define SAT_CAST(num) (num)\n" "#define DEPTH_5\n" "#else\n" "#error \"invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)\"\n" "#endif\n" "#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))\n" "enum\n" "{\n" "yuv_shift = 14,\n" "xyz_shift = 12,\n" "hsv_shift = 12,\n" "R2Y = 4899,\n" "G2Y = 9617,\n" "B2Y = 1868,\n" "BLOCK_SIZE = 256\n" "};\n" "#define scnbytes ((int)sizeof(DATA_TYPE)*scn)\n" "#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)\n" "#ifndef hscale\n" "#define hscale 0\n" "#endif\n" "#ifndef hrange\n" "#define hrange 0\n" "#endif\n" "#if bidx == 0\n" "#define R_COMP z\n" "#define G_COMP y\n" "#define B_COMP x\n" "#elif bidx == 2\n" "#define R_COMP x\n" "#define G_COMP y\n" "#define B_COMP z\n" "#elif bidx == 3\n" "#define R_COMP w\n" "#define G_COMP w\n" "#define B_COMP w\n" "#endif\n" "#ifndef uidx\n" "#define uidx 0\n" "#endif\n" "#ifndef yidx\n" "#define yidx 0\n" "#endif\n" "#ifndef PIX_PER_WI_X\n" "#define PIX_PER_WI_X 1\n" "#endif\n" "#define __CAT(x, y) x##y\n" "#define CAT(x, y) __CAT(x, y)\n" "#define DATA_TYPE_4 CAT(DATA_TYPE, 4)\n" "__kernel void RGB2Gray(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = 
mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n" "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n" "DATA_TYPE_4 src_pix = vload4(0, src);\n" "#ifdef DEPTH_5\n" "dst[0] = fma(src_pix.B_COMP, 0.114f, fma(src_pix.G_COMP, 0.587f, src_pix.R_COMP * 0.299f));\n" "#else\n" "dst[0] = (DATA_TYPE)CV_DESCALE(mad24(src_pix.B_COMP, B2Y, mad24(src_pix.G_COMP, G2Y, mul24(src_pix.R_COMP, R2Y))), yuv_shift);\n" "#endif\n" "++y;\n" "src_index += src_step;\n" "dst_index += dst_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n" "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n" "DATA_TYPE val = src[0];\n" "#if dcn == 3 || defined DEPTH_5\n" "dst[0] = dst[1] = dst[2] = val;\n" "#if dcn == 4\n" "dst[3] = MAX_NUM;\n" "#endif\n" "#else\n" "*(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(val, val, val, MAX_NUM);\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };\n" "__constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };\n" "__kernel void RGB2YUV(__global const uchar* srcptr, int 
src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dt_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n" "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n" "DATA_TYPE_4 src_pix = vload4(0, src);\n" "DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n" "#ifdef DEPTH_5\n" "__constant float * coeffs = c_RGB2YUVCoeffs_f;\n" "const DATA_TYPE Y = fma(b, coeffs[0], fma(g, coeffs[1], r * coeffs[2]));\n" "const DATA_TYPE U = fma(b - Y, coeffs[3], HALF_MAX);\n" "const DATA_TYPE V = fma(r - Y, coeffs[4], HALF_MAX);\n" "#else\n" "__constant int * coeffs = c_RGB2YUVCoeffs_i;\n" "const int delta = HALF_MAX * (1 << yuv_shift);\n" "const int Y = CV_DESCALE(mad24(b, coeffs[0], mad24(g, coeffs[1], mul24(r, coeffs[2]))), yuv_shift);\n" "const int U = CV_DESCALE(mad24(b - Y, coeffs[3], delta), yuv_shift);\n" "const int V = CV_DESCALE(mad24(r - Y, coeffs[4], delta), yuv_shift);\n" "#endif\n" "dst[0] = SAT_CAST( Y );\n" "dst[1] = SAT_CAST( U );\n" "dst[2] = SAT_CAST( V );\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__constant float c_YUV2RGBCoeffs_f[4] = { 2.032f, -0.395f, -0.581f, 1.140f };\n" "__constant int c_YUV2RGBCoeffs_i[4] = { 33292, -6472, -9519, 18678 };\n" "__kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dt_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, 
mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n" "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n" "DATA_TYPE_4 src_pix = vload4(0, src);\n" "DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z;\n" "#ifdef DEPTH_5\n" "__constant float * coeffs = c_YUV2RGBCoeffs_f;\n" "float r = fma(V - HALF_MAX, coeffs[3], Y);\n" "float g = fma(V - HALF_MAX, coeffs[2], fma(U - HALF_MAX, coeffs[1], Y));\n" "float b = fma(U - HALF_MAX, coeffs[0], Y);\n" "#else\n" "__constant int * coeffs = c_YUV2RGBCoeffs_i;\n" "const int r = Y + CV_DESCALE(mul24(V - HALF_MAX, coeffs[3]), yuv_shift);\n" "const int g = Y + CV_DESCALE(mad24(V - HALF_MAX, coeffs[2], mul24(U - HALF_MAX, coeffs[1])), yuv_shift);\n" "const int b = Y + CV_DESCALE(mul24(U - HALF_MAX, coeffs[0]), yuv_shift);\n" "#endif\n" "dst[bidx] = SAT_CAST( b );\n" "dst[1] = SAT_CAST( g );\n" "dst[bidx^2] = SAT_CAST( r );\n" "#if dcn == 4\n" "dst[3] = MAX_NUM;\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__constant float c_YUV2RGBCoeffs_420[5] = { 1.163999557f, 2.017999649f, -0.390999794f,\n" "-0.812999725f, 1.5959997177f };\n" "__kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dt_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols / 2)\n" "{\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows / 2 )\n" "{\n" "__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);\n" "__global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);\n" "__global uchar* dst1 = dstptr + mad24(y << 1, 
dst_step, mad24(x, dcn<<1, dt_offset));\n" "__global uchar* dst2 = dst1 + dst_step;\n" "float Y1 = ysrc[0];\n" "float Y2 = ysrc[1];\n" "float Y3 = ysrc[src_step];\n" "float Y4 = ysrc[src_step + 1];\n" "float U = ((float)usrc[uidx]) - HALF_MAX;\n" "float V = ((float)usrc[1-uidx]) - HALF_MAX;\n" "__constant float* coeffs = c_YUV2RGBCoeffs_420;\n" "float ruv = fma(coeffs[4], V, 0.5f);\n" "float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));\n" "float buv = fma(coeffs[1], U, 0.5f);\n" "Y1 = max(0.f, Y1 - 16.f) * coeffs[0];\n" "dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);\n" "dst1[1] = convert_uchar_sat(Y1 + guv);\n" "dst1[bidx] = convert_uchar_sat(Y1 + buv);\n" "#if dcn == 4\n" "dst1[3] = 255;\n" "#endif\n" "Y2 = max(0.f, Y2 - 16.f) * coeffs[0];\n" "dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);\n" "dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);\n" "dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);\n" "#if dcn == 4\n" "dst1[7] = 255;\n" "#endif\n" "Y3 = max(0.f, Y3 - 16.f) * coeffs[0];\n" "dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);\n" "dst2[1] = convert_uchar_sat(Y3 + guv);\n" "dst2[bidx] = convert_uchar_sat(Y3 + buv);\n" "#if dcn == 4\n" "dst2[3] = 255;\n" "#endif\n" "Y4 = max(0.f, Y4 - 16.f) * coeffs[0];\n" "dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);\n" "dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);\n" "dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);\n" "#if dcn == 4\n" "dst2[7] = 255;\n" "#endif\n" "}\n" "++y;\n" "}\n" "}\n" "}\n" "__kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dt_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols / 2)\n" "{\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows / 2 )\n" "{\n" "__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);\n" "__global uchar* dst1 = dstptr + mad24(y << 1, 
dst_step, x * (dcn<<1) + dt_offset);\n" "__global uchar* dst2 = dst1 + dst_step;\n" "float Y1 = ysrc[0];\n" "float Y2 = ysrc[1];\n" "float Y3 = ysrc[src_step];\n" "float Y4 = ysrc[src_step + 1];\n" "#ifdef SRC_CONT\n" "__global const uchar* uvsrc = srcptr + mad24(rows, src_step, src_offset);\n" "int u_ind = mad24(y, cols >> 1, x);\n" "float uv[2] = { ((float)uvsrc[u_ind]) - HALF_MAX, ((float)uvsrc[u_ind + ((rows * cols) >> 2)]) - HALF_MAX };\n" "#else\n" "int vsteps[2] = { cols >> 1, src_step - (cols >> 1)};\n" "__global const uchar* usrc = srcptr + mad24(rows + (y>>1), src_step, src_offset + (y%2)*(cols >> 1) + x);\n" "__global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);\n" "float uv[2] = { ((float)usrc[0]) - HALF_MAX, ((float)vsrc[0]) - HALF_MAX };\n" "#endif\n" "float U = uv[uidx];\n" "float V = uv[1-uidx];\n" "__constant float* coeffs = c_YUV2RGBCoeffs_420;\n" "float ruv = fma(coeffs[4], V, 0.5f);\n" "float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));\n" "float buv = fma(coeffs[1], U, 0.5f);\n" "Y1 = max(0.f, Y1 - 16.f) * coeffs[0];\n" "dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);\n" "dst1[1] = convert_uchar_sat(Y1 + guv);\n" "dst1[bidx] = convert_uchar_sat(Y1 + buv);\n" "#if dcn == 4\n" "dst1[3] = 255;\n" "#endif\n" "Y2 = max(0.f, Y2 - 16.f) * coeffs[0];\n" "dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);\n" "dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);\n" "dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);\n" "#if dcn == 4\n" "dst1[7] = 255;\n" "#endif\n" "Y3 = max(0.f, Y3 - 16.f) * coeffs[0];\n" "dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);\n" "dst2[1] = convert_uchar_sat(Y3 + guv);\n" "dst2[bidx] = convert_uchar_sat(Y3 + buv);\n" "#if dcn == 4\n" "dst2[3] = 255;\n" "#endif\n" "Y4 = max(0.f, Y4 - 16.f) * coeffs[0];\n" "dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);\n" "dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);\n" "dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);\n" "#if dcn == 4\n" "dst2[7] = 255;\n" 
"#endif\n" "}\n" "++y;\n" "}\n" "}\n" "}\n" "__constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,\n" "0.438999176f, -0.3679990768f, -0.0709991455f };\n" "__kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0) * PIX_PER_WI_X;\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols/2)\n" "{\n" "int src_index = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));\n" "int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);\n" "int y_rows = rows / 3 * 2;\n" "int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};\n" "__constant float* coeffs = c_RGB2YUVCoeffs_420;\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows / 3)\n" "{\n" "__global const uchar* src1 = srcptr + src_index;\n" "__global const uchar* src2 = src1 + src_step;\n" "__global uchar* ydst1 = dstptr + ydst_index;\n" "__global uchar* ydst2 = ydst1 + dst_step;\n" "__global uchar* udst = dstptr + mad24(y_rows + (y>>1), dst_step, dst_offset + (y%2)*(cols >> 1) + x);\n" "__global uchar* vdst = udst + mad24(y_rows >> 2, dst_step, y_rows % 4 ? 
vsteps[y%2] : 0);\n" "#if PIX_PER_WI_X == 2\n" "int s11 = *((__global const int*) src1);\n" "int s12 = *((__global const int*) src1 + 1);\n" "int s13 = *((__global const int*) src1 + 2);\n" "#if scn == 4\n" "int s14 = *((__global const int*) src1 + 3);\n" "#endif\n" "int s21 = *((__global const int*) src2);\n" "int s22 = *((__global const int*) src2 + 1);\n" "int s23 = *((__global const int*) src2 + 2);\n" "#if scn == 4\n" "int s24 = *((__global const int*) src2 + 3);\n" "#endif\n" "float src_pix1[scn * 4], src_pix2[scn * 4];\n" "*((float4*) src_pix1) = convert_float4(as_uchar4(s11));\n" "*((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));\n" "*((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));\n" "#if scn == 4\n" "*((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));\n" "#endif\n" "*((float4*) src_pix2) = convert_float4(as_uchar4(s21));\n" "*((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));\n" "*((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));\n" "#if scn == 4\n" "*((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));\n" "#endif\n" "uchar4 y1, y2;\n" "y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[ 2-bidx], fma(coeffs[1], src_pix1[ 1], fma(coeffs[2], src_pix1[ bidx], 16.5f))));\n" "y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[ scn+2-bidx], fma(coeffs[1], src_pix1[ scn+1], fma(coeffs[2], src_pix1[ scn+bidx], 16.5f))));\n" "y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));\n" "y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));\n" "y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[ 2-bidx], fma(coeffs[1], src_pix2[ 1], fma(coeffs[2], src_pix2[ bidx], 16.5f))));\n" "y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[ scn+2-bidx], fma(coeffs[1], src_pix2[ scn+1], fma(coeffs[2], src_pix2[ scn+bidx], 16.5f))));\n" "y2.z = 
convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));\n" "y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));\n" "*((__global int*) ydst1) = as_int(y1);\n" "*((__global int*) ydst2) = as_int(y2);\n" "float uv[4] = { fma(coeffs[3], src_pix1[ 2-bidx], fma(coeffs[4], src_pix1[ 1], fma(coeffs[5], src_pix1[ bidx], 128.5f))),\n" "fma(coeffs[5], src_pix1[ 2-bidx], fma(coeffs[6], src_pix1[ 1], fma(coeffs[7], src_pix1[ bidx], 128.5f))),\n" "fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),\n" "fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };\n" "udst[0] = convert_uchar_sat(uv[uidx] );\n" "vdst[0] = convert_uchar_sat(uv[1 - uidx]);\n" "udst[1] = convert_uchar_sat(uv[2 + uidx]);\n" "vdst[1] = convert_uchar_sat(uv[3 - uidx]);\n" "#else\n" "float4 src_pix1 = convert_float4(vload4(0, src1));\n" "float4 src_pix2 = convert_float4(vload4(0, src1+scn));\n" "float4 src_pix3 = convert_float4(vload4(0, src2));\n" "float4 src_pix4 = convert_float4(vload4(0, src2+scn));\n" "ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));\n" "ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));\n" "ydst2[0] = convert_uchar_sat(fma(coeffs[0], src_pix3.R_COMP, fma(coeffs[1], src_pix3.G_COMP, fma(coeffs[2], src_pix3.B_COMP, 16.5f))));\n" "ydst2[1] = convert_uchar_sat(fma(coeffs[0], src_pix4.R_COMP, fma(coeffs[1], src_pix4.G_COMP, fma(coeffs[2], src_pix4.B_COMP, 16.5f))));\n" "float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),\n" "fma(coeffs[5], 
src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };\n" "udst[0] = convert_uchar_sat(uv[uidx] );\n" "vdst[0] = convert_uchar_sat(uv[1-uidx]);\n" "#endif\n" "++y;\n" "src_index += 2*src_step;\n" "ydst_index += 2*dst_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols / 2)\n" "{\n" "__global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);\n" "__global uchar* dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows )\n" "{\n" "__constant float* coeffs = c_YUV2RGBCoeffs_420;\n" "#ifndef USE_OPTIMIZED_LOAD\n" "float U = ((float) src[uidx]) - HALF_MAX;\n" "float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX;\n" "float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];\n" "float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];\n" "#else\n" "int load_src = *((__global int*) src);\n" "float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};\n" "float U = vec_src[uidx] - HALF_MAX;\n" "float V = vec_src[(2 + uidx) % 4] - HALF_MAX;\n" "float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];\n" "float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];\n" "#endif\n" "float ruv = fma(coeffs[4], V, 0.5f);\n" "float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));\n" "float buv = fma(coeffs[1], U, 0.5f);\n" "dst[2 - bidx] = convert_uchar_sat(y00 + ruv);\n" "dst[1] = convert_uchar_sat(y00 + guv);\n" "dst[bidx] = convert_uchar_sat(y00 + buv);\n" "#if dcn == 4\n" "dst[3] = 255;\n" "#endif\n" "dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);\n" "dst[dcn + 1] = convert_uchar_sat(y01 + guv);\n" "dst[dcn + 
bidx] = convert_uchar_sat(y01 + buv);\n" "#if dcn == 4\n" "dst[7] = 255;\n" "#endif\n" "}\n" "++y;\n" "src += src_step;\n" "dst += dst_step;\n" "}\n" "}\n" "}\n" "__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};\n" "__constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};\n" "__kernel void RGB2YCrCb(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dt_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dt_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);\n" "__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);\n" "DATA_TYPE_4 src_pix = vload4(0, src);\n" "DATA_TYPE b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n" "#ifdef DEPTH_5\n" "__constant float * coeffs = c_RGB2YCrCbCoeffs_f;\n" "DATA_TYPE Y = fma(b, coeffs[2], fma(g, coeffs[1], r * coeffs[0]));\n" "DATA_TYPE Cr = fma(r - Y, coeffs[3], HALF_MAX);\n" "DATA_TYPE Cb = fma(b - Y, coeffs[4], HALF_MAX);\n" "#else\n" "__constant int * coeffs = c_RGB2YCrCbCoeffs_i;\n" "int delta = HALF_MAX * (1 << yuv_shift);\n" "int Y = CV_DESCALE(mad24(b, coeffs[2], mad24(g, coeffs[1], mul24(r, coeffs[0]))), yuv_shift);\n" "int Cr = CV_DESCALE(mad24(r - Y, coeffs[3], delta), yuv_shift);\n" "int Cb = CV_DESCALE(mad24(b - Y, coeffs[4], delta), yuv_shift);\n" "#endif\n" "dst[0] = SAT_CAST( Y );\n" "dst[1] = SAT_CAST( Cr );\n" "dst[2] = SAT_CAST( Cb );\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f };\n" "__constant int 
c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 };\n" "__kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_index);\n" "__global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_index);\n" "DATA_TYPE_4 src_pix = vload4(0, srcptr);\n" "DATA_TYPE yp = src_pix.x, cr = src_pix.y, cb = src_pix.z;\n" "#ifdef DEPTH_5\n" "__constant float * coeff = c_YCrCb2RGBCoeffs_f;\n" "float r = fma(coeff[0], cr - HALF_MAX, yp);\n" "float g = fma(coeff[1], cr - HALF_MAX, fma(coeff[2], cb - HALF_MAX, yp));\n" "float b = fma(coeff[3], cb - HALF_MAX, yp);\n" "#else\n" "__constant int * coeff = c_YCrCb2RGBCoeffs_i;\n" "int r = yp + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift);\n" "int g = yp + CV_DESCALE(mad24(coeff[1], cr - HALF_MAX, coeff[2] * (cb - HALF_MAX)), yuv_shift);\n" "int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift);\n" "#endif\n" "dstptr[(bidx^2)] = SAT_CAST(r);\n" "dstptr[1] = SAT_CAST(g);\n" "dstptr[bidx] = SAT_CAST(b);\n" "#if dcn == 4\n" "dstptr[3] = MAX_NUM;\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset,\n" "int rows, int cols, __constant COEFF_TYPE * coeffs)\n" "{\n" "int dx = get_global_id(0);\n" "int dy = get_global_id(1) * PIX_PER_WI_Y;\n" "if (dx < cols)\n" "{\n" "int src_index = mad24(dy, src_step, mad24(dx, scnbytes, 
src_offset));\n" "int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (dy < rows)\n" "{\n" "__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);\n" "__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);\n" "DATA_TYPE_4 src_pix = vload4(0, src);\n" "DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z;\n" "#ifdef DEPTH_5\n" "float x = fma(r, coeffs[0], fma(g, coeffs[1], b * coeffs[2]));\n" "float y = fma(r, coeffs[3], fma(g, coeffs[4], b * coeffs[5]));\n" "float z = fma(r, coeffs[6], fma(g, coeffs[7], b * coeffs[8]));\n" "#else\n" "int x = CV_DESCALE(mad24(r, coeffs[0], mad24(g, coeffs[1], b * coeffs[2])), xyz_shift);\n" "int y = CV_DESCALE(mad24(r, coeffs[3], mad24(g, coeffs[4], b * coeffs[5])), xyz_shift);\n" "int z = CV_DESCALE(mad24(r, coeffs[6], mad24(g, coeffs[7], b * coeffs[8])), xyz_shift);\n" "#endif\n" "dst[0] = SAT_CAST(x);\n" "dst[1] = SAT_CAST(y);\n" "dst[2] = SAT_CAST(z);\n" "++dy;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset,\n" "int rows, int cols, __constant COEFF_TYPE * coeffs)\n" "{\n" "int dx = get_global_id(0);\n" "int dy = get_global_id(1) * PIX_PER_WI_Y;\n" "if (dx < cols)\n" "{\n" "int src_index = mad24(dy, src_step, mad24(dx, scnbytes, src_offset));\n" "int dst_index = mad24(dy, dst_step, mad24(dx, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (dy < rows)\n" "{\n" "__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);\n" "__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);\n" "DATA_TYPE_4 src_pix = vload4(0, src);\n" "DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z;\n" "#ifdef DEPTH_5\n" "float b = fma(x, coeffs[0], 
fma(y, coeffs[1], z * coeffs[2]));\n" "float g = fma(x, coeffs[3], fma(y, coeffs[4], z * coeffs[5]));\n" "float r = fma(x, coeffs[6], fma(y, coeffs[7], z * coeffs[8]));\n" "#else\n" "int b = CV_DESCALE(mad24(x, coeffs[0], mad24(y, coeffs[1], z * coeffs[2])), xyz_shift);\n" "int g = CV_DESCALE(mad24(x, coeffs[3], mad24(y, coeffs[4], z * coeffs[5])), xyz_shift);\n" "int r = CV_DESCALE(mad24(x, coeffs[6], mad24(y, coeffs[7], z * coeffs[8])), xyz_shift);\n" "#endif\n" "DATA_TYPE dst0 = SAT_CAST(b);\n" "DATA_TYPE dst1 = SAT_CAST(g);\n" "DATA_TYPE dst2 = SAT_CAST(r);\n" "#if dcn == 3 || defined DEPTH_5\n" "dst[0] = dst0;\n" "dst[1] = dst1;\n" "dst[2] = dst2;\n" "#if dcn == 4\n" "dst[3] = MAX_NUM;\n" "#endif\n" "#else\n" "*(__global DATA_TYPE_4 *)dst = (DATA_TYPE_4)(dst0, dst1, dst2, MAX_NUM);\n" "#endif\n" "++dy;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);\n" "__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);\n" "DATA_TYPE_4 src_pix = vload4(0, src);\n" "#ifdef REVERSE\n" "dst[0] = src_pix.z;\n" "dst[1] = src_pix.y;\n" "dst[2] = src_pix.x;\n" "#else\n" "dst[0] = src_pix.x;\n" "dst[1] = src_pix.y;\n" "dst[2] = src_pix.z;\n" "#endif\n" "#if dcn == 4\n" "#if scn == 3\n" "dst[3] = MAX_NUM;\n" "#else\n" "dst[3] = src[3];\n" "#endif\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" 
"__kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "ushort t = *((__global const ushort*)(src + src_index));\n" "#if greenbits == 6\n" "dst[dst_index + bidx] = (uchar)(t << 3);\n" "dst[dst_index + 1] = (uchar)((t >> 3) & ~3);\n" "dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);\n" "#else\n" "dst[dst_index + bidx] = (uchar)(t << 3);\n" "dst[dst_index + 1] = (uchar)((t >> 2) & ~7);\n" "dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);\n" "#endif\n" "#if dcn == 4\n" "#if greenbits == 6\n" "dst[dst_index + 3] = 255;\n" "#else\n" "dst[dst_index + 3] = t & 0x8000 ? 
255 : 0;\n" "#endif\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "uchar4 src_pix = vload4(0, src + src_index);\n" "#if greenbits == 6\n" "*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));\n" "#elif scn == 3\n" "*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));\n" "#else\n" "*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|\n" "((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 
0x8000 : 0));\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, dst_offset + x);\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "int t = *((__global const ushort*)(src + src_index));\n" "#if greenbits == 6\n" "dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 3) & 0xfc, G2Y, ((t >> 8) & 0xf8) * R2Y)), yuv_shift);\n" "#else\n" "dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, B2Y, mad24((t >> 2) & 0xf8, G2Y, ((t >> 7) & 0xf8) * R2Y)), yuv_shift);\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, src_offset + x);\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "int t = src[src_index];\n" "#if greenbits == 6\n" "*((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));\n" "#else\n" "t >>= 3;\n" "*((__global ushort*)(dst + dst_index)) = (ushort)(t|(t << 5)|(t << 10));\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__constant int sector_data[][3] = { { 1, 3, 0 },\n" "{ 1, 0, 2 },\n" "{ 3, 0, 1 },\n" "{ 
0, 2, 1 },\n" "{ 0, 1, 3 },\n" "{ 2, 1, 0 } };\n" "#ifdef DEPTH_0\n" "__kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols,\n" "__constant int * sdiv_table, __constant int * hdiv_table)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "uchar4 src_pix = vload4(0, src + src_index);\n" "int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n" "int h, s, v = b;\n" "int vmin = b, diff;\n" "int vr, vg;\n" "v = max(v, g);\n" "v = max(v, r);\n" "vmin = min(vmin, g);\n" "vmin = min(vmin, r);\n" "diff = v - vmin;\n" "vr = v == r ? -1 : 0;\n" "vg = v == g ? -1 : 0;\n" "s = mad24(diff, sdiv_table[v], (1 << (hsv_shift-1))) >> hsv_shift;\n" "h = (vr & (g - b)) +\n" "(~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));\n" "h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;\n" "h += h < 0 ? 
hrange : 0;\n" "dst[dst_index] = convert_uchar_sat_rte(h);\n" "dst[dst_index + 1] = (uchar)s;\n" "dst[dst_index + 2] = (uchar)v;\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "uchar4 src_pix = vload4(0, src + src_index);\n" "float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f);\n" "float b, g, r;\n" "if (s != 0)\n" "{\n" "float tab[4];\n" "int sector;\n" "h *= hscale;\n" "if( h < 0 )\n" "do h += 6; while( h < 0 );\n" "else if( h >= 6 )\n" "do h -= 6; while( h >= 6 );\n" "sector = convert_int_sat_rtn(h);\n" "h -= sector;\n" "if( (unsigned)sector >= 6u )\n" "{\n" "sector = 0;\n" "h = 0.f;\n" "}\n" "tab[0] = v;\n" "tab[1] = v*(1.f - s);\n" "tab[2] = v*(1.f - s*h);\n" "tab[3] = v*(1.f - s*(1.f - h));\n" "b = tab[sector_data[sector][0]];\n" "g = tab[sector_data[sector][1]];\n" "r = tab[sector_data[sector][2]];\n" "}\n" "else\n" "b = g = r = v;\n" "dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);\n" "dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);\n" "dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);\n" "#if dcn == 4\n" "dst[dst_index + 3] = MAX_NUM;\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#elif defined DEPTH_5\n" "__kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = 
get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const float * src = (__global const float *)(srcptr + src_index);\n" "__global float * dst = (__global float *)(dstptr + dst_index);\n" "float4 src_pix = vload4(0, src);\n" "float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n" "float h, s, v;\n" "float vmin, diff;\n" "v = vmin = r;\n" "if( v < g ) v = g;\n" "if( v < b ) v = b;\n" "if( vmin > g ) vmin = g;\n" "if( vmin > b ) vmin = b;\n" "diff = v - vmin;\n" "s = diff/(float)(fabs(v) + FLT_EPSILON);\n" "diff = (float)(60.f/(diff + FLT_EPSILON));\n" "if( v == r )\n" "h = (g - b)*diff;\n" "else if( v == g )\n" "h = fma(b - r, diff, 120.f);\n" "else\n" "h = fma(r - g, diff, 240.f);\n" "if( h < 0 )\n" "h += 360.f;\n" "dst[0] = h*hscale;\n" "dst[1] = s;\n" "dst[2] = v;\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const float * src = (__global const float *)(srcptr + src_index);\n" "__global float * dst = (__global float *)(dstptr + dst_index);\n" "float4 src_pix = vload4(0, src);\n" "float h = src_pix.x, s = src_pix.y, v = src_pix.z;\n" "float b, g, r;\n" "if (s != 0)\n" "{\n" "float tab[4];\n" "int sector;\n" "h *= hscale;\n" "if(h < 0)\n" "do h 
+= 6; while (h < 0);\n" "else if (h >= 6)\n" "do h -= 6; while (h >= 6);\n" "sector = convert_int_sat_rtn(h);\n" "h -= sector;\n" "if ((unsigned)sector >= 6u)\n" "{\n" "sector = 0;\n" "h = 0.f;\n" "}\n" "tab[0] = v;\n" "tab[1] = v*(1.f - s);\n" "tab[2] = v*(1.f - s*h);\n" "tab[3] = v*(1.f - s*(1.f - h));\n" "b = tab[sector_data[sector][0]];\n" "g = tab[sector_data[sector][1]];\n" "r = tab[sector_data[sector][2]];\n" "}\n" "else\n" "b = g = r = v;\n" "dst[bidx] = b;\n" "dst[1] = g;\n" "dst[bidx^2] = r;\n" "#if dcn == 4\n" "dst[3] = MAX_NUM;\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" "#ifdef DEPTH_0\n" "__kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "uchar4 src_pix = vload4(0, src + src_index);\n" "float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f);\n" "float h = 0.f, s = 0.f, l;\n" "float vmin, vmax, diff;\n" "vmax = vmin = r;\n" "if (vmax < g) vmax = g;\n" "if (vmax < b) vmax = b;\n" "if (vmin > g) vmin = g;\n" "if (vmin > b) vmin = b;\n" "diff = vmax - vmin;\n" "l = (vmax + vmin)*0.5f;\n" "if (diff > FLT_EPSILON)\n" "{\n" "s = l < 0.5f ? 
diff/(vmax + vmin) : diff/(2 - vmax - vmin);\n" "diff = 60.f/diff;\n" "if( vmax == r )\n" "h = (g - b)*diff;\n" "else if( vmax == g )\n" "h = fma(b - r, diff, 120.f);\n" "else\n" "h = fma(r - g, diff, 240.f);\n" "if( h < 0.f )\n" "h += 360.f;\n" "}\n" "dst[dst_index] = convert_uchar_sat_rte(h*hscale);\n" "dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);\n" "dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "uchar4 src_pix = vload4(0, src + src_index);\n" "float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f);\n" "float b, g, r;\n" "if (s != 0)\n" "{\n" "float tab[4];\n" "float p2 = l <= 0.5f ? 
l*(1 + s) : l + s - l*s;\n" "float p1 = 2*l - p2;\n" "h *= hscale;\n" "if( h < 0 )\n" "do h += 6; while( h < 0 );\n" "else if( h >= 6 )\n" "do h -= 6; while( h >= 6 );\n" "int sector = convert_int_sat_rtn(h);\n" "h -= sector;\n" "tab[0] = p2;\n" "tab[1] = p1;\n" "tab[2] = fma(p2 - p1, 1-h, p1);\n" "tab[3] = fma(p2 - p1, h, p1);\n" "b = tab[sector_data[sector][0]];\n" "g = tab[sector_data[sector][1]];\n" "r = tab[sector_data[sector][2]];\n" "}\n" "else\n" "b = g = r = l;\n" "dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);\n" "dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);\n" "dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);\n" "#if dcn == 4\n" "dst[dst_index + 3] = MAX_NUM;\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#elif defined DEPTH_5\n" "__kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const float * src = (__global const float *)(srcptr + src_index);\n" "__global float * dst = (__global float *)(dstptr + dst_index);\n" "float4 src_pix = vload4(0, src);\n" "float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP;\n" "float h = 0.f, s = 0.f, l;\n" "float vmin, vmax, diff;\n" "vmax = vmin = r;\n" "if (vmax < g) vmax = g;\n" "if (vmax < b) vmax = b;\n" "if (vmin > g) vmin = g;\n" "if (vmin > b) vmin = b;\n" "diff = vmax - vmin;\n" "l = (vmax + vmin)*0.5f;\n" "if (diff > FLT_EPSILON)\n" "{\n" "s = l < 0.5f ? 
diff/(vmax + vmin) : diff/(2 - vmax - vmin);\n" "diff = 60.f/diff;\n" "if( vmax == r )\n" "h = (g - b)*diff;\n" "else if( vmax == g )\n" "h = fma(b - r, diff, 120.f);\n" "else\n" "h = fma(r - g, diff, 240.f);\n" "if( h < 0.f ) h += 360.f;\n" "}\n" "dst[0] = h*hscale;\n" "dst[1] = l;\n" "dst[2] = s;\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset,\n" "__global uchar* dstptr, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const float * src = (__global const float *)(srcptr + src_index);\n" "__global float * dst = (__global float *)(dstptr + dst_index);\n" "float4 src_pix = vload4(0, src);\n" "float h = src_pix.x, l = src_pix.y, s = src_pix.z;\n" "float b, g, r;\n" "if (s != 0)\n" "{\n" "float tab[4];\n" "int sector;\n" "float p2 = l <= 0.5f ? 
l*(1 + s) : l + s - l*s;\n" "float p1 = 2*l - p2;\n" "h *= hscale;\n" "if( h < 0 )\n" "do h += 6; while( h < 0 );\n" "else if( h >= 6 )\n" "do h -= 6; while( h >= 6 );\n" "sector = convert_int_sat_rtn(h);\n" "h -= sector;\n" "tab[0] = p2;\n" "tab[1] = p1;\n" "tab[2] = fma(p2 - p1, 1-h, p1);\n" "tab[3] = fma(p2 - p1, h, p1);\n" "b = tab[sector_data[sector][0]];\n" "g = tab[sector_data[sector][1]];\n" "r = tab[sector_data[sector][2]];\n" "}\n" "else\n" "b = g = r = l;\n" "dst[bidx] = b;\n" "dst[1] = g;\n" "dst[bidx^2] = r;\n" "#if dcn == 4\n" "dst[3] = MAX_NUM;\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" "#ifdef DEPTH_0\n" "__kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, src_offset + (x << 2));\n" "int dst_index = mad24(y, dst_step, dst_offset + (x << 2));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "uchar4 src_pix = *(__global const uchar4 *)(src + src_index);\n" "*(__global uchar4 *)(dst + dst_index) =\n" "(uchar4)(mad24(src_pix.x, src_pix.w, HALF_MAX) / MAX_NUM,\n" "mad24(src_pix.y, src_pix.w, HALF_MAX) / MAX_NUM,\n" "mad24(src_pix.z, src_pix.w, HALF_MAX) / MAX_NUM, src_pix.w);\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "__kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset,\n" "__global uchar* dst, int dst_step, int dst_offset,\n" "int rows, int cols)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, 4, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, 4, dst_offset));\n" "#pragma unroll\n" "for (int cy = 
0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "uchar4 src_pix = *(__global const uchar4 *)(src + src_index);\n" "uchar v3 = src_pix.w, v3_half = v3 / 2;\n" "if (v3 == 0)\n" "*(__global uchar4 *)(dst + dst_index) = (uchar4)(0, 0, 0, 0);\n" "else\n" "*(__global uchar4 *)(dst + dst_index) =\n" "(uchar4)(mad24(src_pix.x, MAX_NUM, v3_half) / v3,\n" "mad24(src_pix.y, MAX_NUM, v3_half) / v3,\n" "mad24(src_pix.z, MAX_NUM, v3_half) / v3, v3);\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" "#define lab_shift xyz_shift\n" "#define gamma_shift 3\n" "#define lab_shift2 (lab_shift + gamma_shift)\n" "#define GAMMA_TAB_SIZE 1024\n" "#define GammaTabScale (float)GAMMA_TAB_SIZE\n" "inline float splineInterpolate(float x, __global const float * tab, int n)\n" "{\n" "int ix = clamp(convert_int_sat_rtn(x), 0, n-1);\n" "x -= ix;\n" "tab += ix << 2;\n" "return fma(fma(fma(tab[3], x, tab[2]), x, tab[1]), x, tab[0]);\n" "}\n" "#ifdef DEPTH_0\n" "__kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset,\n" "__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,\n" "__global const ushort * gammaTab, __global ushort * LabCbrtTab_b,\n" "__constant int * coeffs, int Lscale, int Lshift)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const uchar* src_ptr = src + src_index;\n" "__global uchar* dst_ptr = dst + dst_index;\n" "uchar4 src_pix = vload4(0, src_ptr);\n" "int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\n" "C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\n" "C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\n" "int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = 
gammaTab[src_pix.z];\n" "int fX = LabCbrtTab_b[CV_DESCALE(mad24(R, C0, mad24(G, C1, B*C2)), lab_shift)];\n" "int fY = LabCbrtTab_b[CV_DESCALE(mad24(R, C3, mad24(G, C4, B*C5)), lab_shift)];\n" "int fZ = LabCbrtTab_b[CV_DESCALE(mad24(R, C6, mad24(G, C7, B*C8)), lab_shift)];\n" "int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );\n" "int a = CV_DESCALE( mad24(500, fX - fY, 128*(1 << lab_shift2)), lab_shift2 );\n" "int b = CV_DESCALE( mad24(200, fY - fZ, 128*(1 << lab_shift2)), lab_shift2 );\n" "dst_ptr[0] = SAT_CAST(L);\n" "dst_ptr[1] = SAT_CAST(a);\n" "dst_ptr[2] = SAT_CAST(b);\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#elif defined DEPTH_5\n" "__kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,\n" "#ifdef SRGB\n" "__global const float * gammaTab,\n" "#endif\n" "__constant float * coeffs, float _1_3, float _a)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const float * src = (__global const float *)(srcptr + src_index);\n" "__global float * dst = (__global float *)(dstptr + dst_index);\n" "float4 src_pix = vload4(0, src);\n" "float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\n" "C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\n" "C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\n" "float R = clamp(src_pix.x, 0.0f, 1.0f);\n" "float G = clamp(src_pix.y, 0.0f, 1.0f);\n" "float B = clamp(src_pix.z, 0.0f, 1.0f);\n" "#ifdef SRGB\n" "R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n" "G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n" "B = splineInterpolate(B * 
GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n" "#endif\n" "float X = fma(R, C0, fma(G, C1, B*C2));\n" "float Y = fma(R, C3, fma(G, C4, B*C5));\n" "float Z = fma(R, C6, fma(G, C7, B*C8));\n" "float FX = X > 0.008856f ? rootn(X, 3) : fma(7.787f, X, _a);\n" "float FY = Y > 0.008856f ? rootn(Y, 3) : fma(7.787f, Y, _a);\n" "float FZ = Z > 0.008856f ? rootn(Z, 3) : fma(7.787f, Z, _a);\n" "float L = Y > 0.008856f ? fma(116.f, FY, -16.f) : (903.3f * Y);\n" "float a = 500.f * (FX - FY);\n" "float b = 200.f * (FY - FZ);\n" "dst[0] = L;\n" "dst[1] = a;\n" "dst[2] = b;\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" "inline void Lab2BGR_f(const float * srcbuf, float * dstbuf,\n" "#ifdef SRGB\n" "__global const float * gammaTab,\n" "#endif\n" "__constant float * coeffs, float lThresh, float fThresh)\n" "{\n" "float li = srcbuf[0], ai = srcbuf[1], bi = srcbuf[2];\n" "float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],\n" "C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],\n" "C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];\n" "float y, fy;\n" "if (li <= lThresh)\n" "{\n" "y = li / 903.3f;\n" "fy = fma(7.787f, y, 16.0f / 116.0f);\n" "}\n" "else\n" "{\n" "fy = (li + 16.0f) / 116.0f;\n" "y = fy * fy * fy;\n" "}\n" "float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };\n" "#pragma unroll\n" "for (int j = 0; j < 2; j++)\n" "if (fxz[j] <= fThresh)\n" "fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;\n" "else\n" "fxz[j] = fxz[j] * fxz[j] * fxz[j];\n" "float x = fxz[0], z = fxz[1];\n" "float ro = clamp(fma(C0, x, fma(C1, y, C2 * z)), 0.0f, 1.0f);\n" "float go = clamp(fma(C3, x, fma(C4, y, C5 * z)), 0.0f, 1.0f);\n" "float bo = clamp(fma(C6, x, fma(C7, y, C8 * z)), 0.0f, 1.0f);\n" "#ifdef SRGB\n" "ro = splineInterpolate(ro * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n" "go = splineInterpolate(go * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n" "bo = splineInterpolate(bo * GammaTabScale, gammaTab, GAMMA_TAB_SIZE);\n" "#endif\n" "dstbuf[0] = ro, 
dstbuf[1] = go, dstbuf[2] = bo;\n" "}\n" "#ifdef DEPTH_0\n" "__kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,\n" "__global uchar * dst, int dst_step, int dst_offset, int rows, int cols,\n" "#ifdef SRGB\n" "__global const float * gammaTab,\n" "#endif\n" "__constant float * coeffs, float lThresh, float fThresh)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const uchar* src_ptr = src + src_index;\n" "__global uchar * dst_ptr = dst + dst_index;\n" "uchar4 src_pix = vload4(0, src_ptr);\n" "float srcbuf[3], dstbuf[3];\n" "srcbuf[0] = src_pix.x*(100.f/255.f);\n" "srcbuf[1] = convert_float(src_pix.y - 128);\n" "srcbuf[2] = convert_float(src_pix.z - 128);\n" "Lab2BGR_f(&srcbuf[0], &dstbuf[0],\n" "#ifdef SRGB\n" "gammaTab,\n" "#endif\n" "coeffs, lThresh, fThresh);\n" "#if dcn == 3\n" "dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);\n" "dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);\n" "dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);\n" "#else\n" "*(__global uchar4 *)dst_ptr = (uchar4)(SAT_CAST(dstbuf[0] * 255.0f),\n" "SAT_CAST(dstbuf[1] * 255.0f), SAT_CAST(dstbuf[2] * 255.0f), MAX_NUM);\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#elif defined DEPTH_5\n" "__kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,\n" "#ifdef SRGB\n" "__global const float * gammaTab,\n" "#endif\n" "__constant float * coeffs, float lThresh, float fThresh)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * PIX_PER_WI_Y;\n" "if (x < cols)\n" "{\n" "int src_index = mad24(y, src_step, mad24(x, scnbytes, 
src_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, dcnbytes, dst_offset));\n" "#pragma unroll\n" "for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)\n" "{\n" "if (y < rows)\n" "{\n" "__global const float * src = (__global const float *)(srcptr + src_index);\n" "__global float * dst = (__global float *)(dstptr + dst_index);\n" "float4 src_pix = vload4(0, src);\n" "float srcbuf[3], dstbuf[3];\n" "srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z;\n" "Lab2BGR_f(&srcbuf[0], &dstbuf[0],\n" "#ifdef SRGB\n" "gammaTab,\n" "#endif\n" "coeffs, lThresh, fThresh);\n" "dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];\n" "#if dcn == 4\n" "dst[3] = MAX_NUM;\n" "#endif\n" "++y;\n" "dst_index += dst_step;\n" "src_index += src_step;\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" "#define LAB_CBRT_TAB_SIZE 1024\n" "#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<= (maxV)) \\\n" "(x) -= ((maxV) - (minV)); \\\n" "}\n" "#elif defined BORDER_REFLECT\n" "#define EXTRAPOLATE(x, minV, maxV) \\\n" "{ \\\n" "if ((maxV) - (minV) == 1) \\\n" "(x) = (minV); \\\n" "else \\\n" "while ((x) >= (maxV) || (x) < (minV)) \\\n" "{ \\\n" "if ((x) < (minV)) \\\n" "(x) = (minV) - ((x) - (minV)) - 1; \\\n" "else \\\n" "(x) = (maxV) - 1 - ((x) - (maxV)); \\\n" "} \\\n" "}\n" "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n" "#define EXTRAPOLATE(x, minV, maxV) \\\n" "{ \\\n" "if ((maxV) - (minV) == 1) \\\n" "(x) = (minV); \\\n" "else \\\n" "while ((x) >= (maxV) || (x) < (minV)) \\\n" "{ \\\n" "if ((x) < (minV)) \\\n" "(x) = (minV) - ((x) - (minV)); \\\n" "else \\\n" "(x) = (maxV) - 1 - ((x) - (maxV)) - 1; \\\n" "} \\\n" "}\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#else\n" "#ifdef BORDER_CONSTANT\n" "#define EXTRAPOLATE(x, minV, maxV)\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(x, minV, maxV) \\\n" "{ \\\n" "(x) = clamp((x), (minV), (maxV)-1); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(x, minV, maxV) \\\n" "{ \\\n" "if ((x) < 
(minV)) \\\n" "(x) += (((minV) - (x)) / ((maxV) - (minV)) + 1) * ((maxV) - (minV)); \\\n" "if ((x) >= (maxV)) \\\n" "(x) = ((x) - (minV)) % ((maxV) - (minV)) + (minV); \\\n" "}\n" "#elif defined BORDER_REFLECT\n" "#define EXTRAPOLATE(x, minV, maxV) \\\n" "{ \\\n" "(x) = clamp((x), 2 * (minV) - (x) - 1, 2 * (maxV) - (x) - 1); \\\n" "}\n" "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n" "#define EXTRAPOLATE(x, minV, maxV) \\\n" "{ \\\n" "(x) = clamp((x), 2 * (minV) - (x), 2 * (maxV) - (x) - 2); \\\n" "}\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#endif\n" "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" "#if cn != 3\n" "#define loadpix(addr) *(__global const srcT *)(addr)\n" "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" "#define SRCSIZE (int)sizeof(srcT)\n" "#define DSTSIZE (int)sizeof(dstT)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" "#define SRCSIZE (int)sizeof(srcT1) * cn\n" "#define DSTSIZE (int)sizeof(dstT1) * cn\n" "#endif\n" "#define UPDATE_COLUMN_SUM(col) \\\n" "__constant WT1 * k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * col]; \\\n" "WT tmp_sum = 0; \\\n" "for (int sy = 0; sy < KERNEL_SIZE_Y; sy++) \\\n" "tmp_sum += data[sy] * k[sy]; \\\n" "sumOfCols[local_id] = tmp_sum; \\\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "#define UPDATE_TOTAL_SUM(col) \\\n" "int id = local_id + col - ANCHOR_X; \\\n" "if (id >= 0 && id < LOCAL_SIZE) \\\n" "total_sum += sumOfCols[id]; \\\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "#define noconvert\n" "#define DIG(a) a,\n" "__constant WT1 kernelData[] = { COEFF };\n" "__kernel void filter2D(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,\n" "__global uchar * dstptr, int dst_step, int 
dst_offset, int rows, int cols, float delta)\n" "{\n" "int local_id = get_local_id(0);\n" "int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;\n" "int y = get_global_id(1);\n" "WT data[KERNEL_SIZE_Y];\n" "__local WT sumOfCols[LOCAL_SIZE];\n" "#ifdef BORDER_ISOLATED\n" "int srcBeginX = srcOffsetX;\n" "int srcBeginY = srcOffsetY;\n" "#else\n" "int srcBeginX = 0;\n" "int srcBeginY = 0;\n" "#endif\n" "int srcX = srcOffsetX + x;\n" "int srcY = srcOffsetY + y - ANCHOR_Y;\n" "__global dstT *dst = (__global dstT *)(dstptr + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));\n" "#ifdef BORDER_CONSTANT\n" "if (srcX >= srcBeginX && srcX < srcEndX)\n" "{\n" "for (int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y; sy++, srcY++)\n" "{\n" "if (srcY >= srcBeginY && srcY < srcEndY)\n" "data[sy + sy_index] = convertToWT(loadpix(srcptr + mad24(srcY, src_step, srcX * SRCSIZE)));\n" "else\n" "data[sy + sy_index] = (WT)(0);\n" "}\n" "}\n" "else\n" "{\n" "for (int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y; sy++, srcY++)\n" "{\n" "data[sy + sy_index] = (WT)(0);\n" "}\n" "}\n" "#else\n" "EXTRAPOLATE(srcX, srcBeginX, srcEndX);\n" "for (int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y; sy++, srcY++)\n" "{\n" "int tempY = srcY;\n" "EXTRAPOLATE(tempY, srcBeginY, srcEndY);\n" "data[sy + sy_index] = convertToWT(loadpix(srcptr + mad24(tempY, src_step, srcX * SRCSIZE)));\n" "}\n" "#endif\n" "WT total_sum = 0;\n" "for (int sx = 0; sx < ANCHOR_X; sx++)\n" "{\n" "UPDATE_COLUMN_SUM(sx);\n" "UPDATE_TOTAL_SUM(sx);\n" "}\n" "__constant WT1 * k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * ANCHOR_X];\n" "for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)\n" "total_sum += data[sy] * k[sy];\n" "for (int sx = ANCHOR_X + 1; sx < KERNEL_SIZE_X; sx++)\n" "{\n" "UPDATE_COLUMN_SUM(sx);\n" "UPDATE_TOTAL_SUM(sx);\n" "}\n" "if (local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) && x >= 0 && x < cols)\n" "storepix(convertToDstT(total_sum + (WT)(delta)), dst);\n" "}\n" , 
"77e935928055f243ff9082b1879a0b2c"}; ProgramSource filter2D_oclsrc(filter2D.programStr); const struct ProgramEntry filter2DSmall={"filter2DSmall", "#ifdef BORDER_REPLICATE\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))\n" "#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))\n" "#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))\n" "#endif\n" "#ifdef BORDER_REFLECT\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))\n" "#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))\n" "#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))\n" "#endif\n" "#ifdef BORDER_REFLECT_101\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))\n" "#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))\n" "#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))\n" "#endif\n" "#ifdef BORDER_WRAP\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))\n" "#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))\n" "#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
(i)-(b_edge) : (addr))\n" "#endif\n" "#ifdef BORDER_ISOLATED\n" "#define ISOLATED_MIN(VAL) (VAL)\n" "#else\n" "#define ISOLATED_MIN(VAL) 0\n" "#endif\n" "#ifdef EXTRA_EXTRAPOLATION\n" "#ifdef BORDER_CONSTANT\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n" "{ \\\n" "x = max(min(x, maxX - 1), minX); \\\n" "y = max(min(y, maxY - 1), minY); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n" "{ \\\n" "if (x < minX) \\\n" "x -= ((x - maxX + 1) / maxX) * maxX; \\\n" "if (x >= maxX) \\\n" "x %= maxX; \\\n" "if (y < minY) \\\n" "y -= ((y - maxY + 1) / maxY) * maxY; \\\n" "if (y >= maxY) \\\n" "y %= maxY; \\\n" "}\n" "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n" "#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \\\n" "{ \\\n" "if (maxX - minX == 1) \\\n" "x = minX; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if (x < minX) \\\n" "x = minX - (x - minX) - 1 + delta; \\\n" "else \\\n" "x = maxX - 1 - (x - maxX) - delta; \\\n" "} \\\n" "while (x >= maxX || x < minX); \\\n" "\\\n" "if (maxY - minY == 1) \\\n" "y = minY; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if (y < minY) \\\n" "y = minY - (y - minY) - 1 + delta; \\\n" "else \\\n" "y = maxY - 1 - (y - maxY) - delta; \\\n" "} \\\n" "while (y >= maxY || y < minY); \\\n" "}\n" "#ifdef BORDER_REFLECT\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)\n" "#elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)\n" "#endif\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#else\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n" "{ \\\n" "int _row = y - ISOLATED_MIN(minY), _col = x - ISOLATED_MIN(minX); \\\n" "_row = ADDR_H(_row, 0, maxY - ISOLATED_MIN(minY)); \\\n" "_row = ADDR_B(_row, maxY - ISOLATED_MIN(minY), _row); \\\n" "y = _row + 
ISOLATED_MIN(minY); \\\n" "\\\n" "_col = ADDR_L(_col, 0, maxX - ISOLATED_MIN(minX)); \\\n" "_col = ADDR_R(_col, maxX - ISOLATED_MIN(minX), _col); \\\n" "x = _col + ISOLATED_MIN(minX); \\\n" "}\n" "#endif\n" "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" "#if cn != 3\n" "#define loadpix(addr) *(__global const srcT *)(addr)\n" "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" "#define SRCSIZE (int)sizeof(srcT)\n" "#define DSTSIZE (int)sizeof(dstT)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" "#define SRCSIZE (int)sizeof(srcT1) * cn\n" "#define DSTSIZE (int)sizeof(dstT1) * cn\n" "#endif\n" "#define noconvert\n" "struct RectCoords\n" "{\n" "int x1, y1, x2, y2;\n" "};\n" "#ifdef BORDER_ISOLATED\n" "inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)\n" "{\n" "return (coord.x < bounds.x1 || coord.y < bounds.y1 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2);\n" "}\n" "#else\n" "inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)\n" "{\n" "return (coord.x < 0 || coord.y < 0 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2);\n" "}\n" "#endif\n" "inline WT getBorderPixel(const struct RectCoords bounds, int2 coord,\n" "__global const uchar* srcptr, int srcstep)\n" "{\n" "#ifdef BORDER_CONSTANT\n" "return (WT)(0);\n" "#else\n" "int selected_col = coord.x;\n" "int selected_row = coord.y;\n" "EXTRAPOLATE(selected_col, selected_row,\n" "bounds.x1, bounds.y1,\n" "bounds.x2, bounds.y2\n" ");\n" "coord = (int2)(selected_col, selected_row);\n" "__global const uchar* ptr = srcptr + mul24(coord.y, srcstep) +\n" "coord.x * SRCSIZE;\n" "return convertToWT(loadpix(ptr));\n" "#endif\n" "}\n" "inline WT readSrcPixelSingle(int2 pos, __global 
const uchar* srcptr,\n" "int srcstep, const struct RectCoords srcCoords)\n" "{\n" "if (!isBorder(srcCoords, pos, 1))\n" "{\n" "__global const uchar* ptr = srcptr + mul24(pos.y, srcstep) +\n" "pos.x * SRCSIZE;\n" "return convertToWT(loadpix(ptr));\n" "}\n" "else\n" "{\n" "return getBorderPixel(srcCoords, pos, srcptr, srcstep);\n" "}\n" "}\n" "#define __CAT(x, y) x##y\n" "#define CAT(x, y) __CAT(x, y)\n" "#define vload1(OFFSET, PTR) (*(PTR + OFFSET))\n" "#define PX_LOAD_VEC_TYPE CAT(srcT1, PX_LOAD_VEC_SIZE)\n" "#define PX_LOAD_FLOAT_VEC_TYPE CAT(WT1, PX_LOAD_VEC_SIZE)\n" "#if PX_LOAD_VEC_SIZE == 1\n" "#define PX_LOAD_FLOAT_VEC_CONV (float)\n" "#elif PX_LOAD_VEC_SIZE == 2\n" "#define PX_LOAD_FLOAT_VEC_CONV convert_float2\n" "#elif PX_LOAD_VEC_SIZE == 3\n" "#define PX_LOAD_FLOAT_VEC_CONV convert_float3\n" "#elif PX_LOAD_VEC_SIZE == 4\n" "#define PX_LOAD_FLOAT_VEC_CONV convert_float4\n" "#endif\n" "#define PX_LOAD CAT(vload, PX_LOAD_VEC_SIZE)\n" "#define float1 float\n" "inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar* srcptr,\n" "int srcstep, const struct RectCoords srcCoords)\n" "{\n" "__global const srcT1* ptr = (__global const srcT1*)\n" "(srcptr + mul24(pos.y, srcstep) +\n" "pos.x * SRCSIZE);\n" "return PX_LOAD_FLOAT_VEC_CONV(PX_LOAD(0, ptr));\n" "}\n" "#define LOOP1(VAR, STMT) (STMT); (VAR)++;\n" "#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;\n" 
"#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))\n" "#define DIG(a) a,\n" "__constant WT1 kernelData[] = { COEFF };\n" "__kernel void filter2DSmall(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, float delta)\n" "{\n" "const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY };\n" "const int startX = get_global_id(0) * PX_PER_WI_X;\n" "const int startY = get_global_id(1) * PX_PER_WI_Y;\n" "if ((startX >= cols) || (startY >= rows))\n" "{\n" "return;\n" "}\n" "WT privateData[PX_PER_WI_Y + KERNEL_SIZE_Y - 1][PRIV_DATA_WIDTH];\n" "int py = 0;\n" "LOOP(PX_LOAD_Y_ITERATIONS, py,\n" "{\n" "int y = startY + py;\n" "int px = 0;\n" "LOOP(PX_LOAD_X_ITERATIONS, px,\n" "{\n" "int x = startX + (px * PX_LOAD_NUM_PX);\n" "int2 srcPos = (int2)(srcCoords.x1 + x - ANCHOR_X, srcCoords.y1 + y - ANCHOR_Y);\n" "if (!isBorder(srcCoords, srcPos, PX_LOAD_NUM_PX))\n" "{\n" "PX_LOAD_FLOAT_VEC_TYPE p = readSrcPixelGroup(srcPos, srcptr, src_step, srcCoords);\n" "*((PX_LOAD_FLOAT_VEC_TYPE*)&privateData[py][px * PX_LOAD_NUM_PX]) = p;\n" "}\n" "else\n" "{\n" "int lx = 0;\n" "LOOP(PX_LOAD_NUM_PX, lx,\n" "{\n" "WT p = readSrcPixelSingle(srcPos, srcptr, src_step, srcCoords);\n" "*((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p;\n" "srcPos.x++;\n" "});\n" "}\n" "});\n" "});\n" "py = 0;\n" "LOOP(PX_PER_WI_Y, py,\n" "{\n" "int y = startY + py;\n" "int px = 0;\n" "LOOP(PX_PER_WI_X, px,\n" "{\n" "int x = startX + px;\n" "WT total_sum = 0;\n" "int sy = 0;\n" "int kernelIndex = 0;\n" "LOOP(KERNEL_SIZE_Y, sy,\n" "{\n" "int sx = 0;\n" "LOOP(KERNEL_SIZE_X, sx,\n" "{\n" "total_sum = mad(kernelData[kernelIndex++], privateData[py + sy][px + sx], total_sum);\n" "});\n" "});\n" "__global dstT* dstPtr = (__global 
dstT*)(dstptr + y * dst_step + dst_offset + x * DSTSIZE);\n" "storepix(convertToDstT(total_sum + (WT)(delta)), dstPtr);\n" "});\n" "});\n" "}\n" , "030d23b1d64d51e6485f8941af1e3fc3"}; ProgramSource filter2DSmall_oclsrc(filter2DSmall.programStr); /* filterSepCol entry: OpenCL C program text defining the col_filter kernel, stored as adjacent string literals. Comments are placed between the literals only. */ const struct ProgramEntry filterSepCol={"filterSepCol", "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" /* READ_TIMES_COL: how many source pixels, LSIZE1 rows apart, each work-item loads into temp and LDS_DAT. RADIUS is defined here but not referenced by col_filter. */ "#define READ_TIMES_COL ((2*(RADIUSY+LSIZE1)-1)/LSIZE1)\n" "#define RADIUS 1\n" "#define noconvert\n" "#if CN != 3\n" "#define loadpix(addr) *(__global const srcT *)(addr)\n" "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" "#define SRCSIZE (int)sizeof(srcT)\n" "#define DSTSIZE (int)sizeof(dstT)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" "#define SRCSIZE (int)sizeof(srcT1)*3\n" "#define DSTSIZE (int)sizeof(dstT1)*3\n" "#endif\n" "#define DIG(a) a,\n" "__constant srcT1 mat_kernel[] = { COEFF };\n" /* Kernel col_filter: every work-item loads READ_TIMES_COL pixels (a byte offset at or beyond end_addr is redirected to offset 0), publishes them to local memory, and after the barrier sums rows l_y + RADIUSY - i through l_y + RADIUSY + i of LDS_DAT weighted by mat_kernel. The result plus delta is stored only when x and y are inside dst_cols and dst_rows. The src_offset parameter is not read in this body. */ "__kernel void col_filter(__global const uchar * src, int src_step, int src_offset, int src_whole_rows, int src_whole_cols,\n" "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "int l_x = get_local_id(0);\n" "int l_y = get_local_id(1);\n" "int start_addr = mad24(y, src_step, x * SRCSIZE);\n" "int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * SRCSIZE);\n" "srcT sum, temp[READ_TIMES_COL];\n" "__local srcT LDS_DAT[LSIZE1 * READ_TIMES_COL][LSIZE0 + 1];\n" "for (int i = 0; i < READ_TIMES_COL; ++i)\n" "{\n" "int current_addr = mad24(i, LSIZE1 * src_step, start_addr);\n" "current_addr = current_addr < end_addr ? 
current_addr : 0;\n" "temp[i] = loadpix(src + current_addr);\n" "}\n" "for (int i = 0; i < READ_TIMES_COL; ++i)\n" "LDS_DAT[mad24(i, LSIZE1, l_y)][l_x] = temp[i];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "sum = LDS_DAT[l_y + RADIUSY][l_x] * mat_kernel[RADIUSY];\n" "for (int i = 1; i <= RADIUSY; ++i)\n" "{\n" "temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];\n" "temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];\n" "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n" "sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);\n" "#else\n" "sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);\n" "#endif\n" "}\n" /* INTEGER_ARITHMETIC path: add half of 1 << SHIFT_BITS for rounding, then divide (INTEL_DEVICE) or shift right by SHIFT_BITS. */ "#ifdef INTEGER_ARITHMETIC\n" "#ifdef INTEL_DEVICE\n" "sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);\n" "#else\n" "sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;\n" "#endif\n" "#endif\n" "if (x < dst_cols && y < dst_rows)\n" "{\n" "start_addr = mad24(y, dst_step, mad24(DSTSIZE, x, dst_offset));\n" "storepix(convertToDstT(sum + (srcT)(delta)), dst + start_addr);\n" "}\n" "}\n" , "83a29b40287a01ffdb496951c71bc7cd"}; ProgramSource filterSepCol_oclsrc(filterSepCol.programStr); /* filterSepRow entry: OpenCL C program text; it continues on the following physical lines. */ const struct ProgramEntry filterSepRow={"filterSepRow", "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" "#define READ_TIMES_ROW ((2*(RADIUSX+LSIZE0)-1)/LSIZE0)\n" "#define RADIUS 1\n" "#ifdef BORDER_REPLICATE\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))\n" "#endif\n" "#ifdef BORDER_REFLECT\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))\n" "#endif\n" "#ifdef BORDER_REFLECT_101\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? 
-(i) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))\n" "#endif\n" "#ifdef BORDER_WRAP\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))\n" "#endif\n" "#ifdef EXTRA_EXTRAPOLATION\n" "#ifdef BORDER_CONSTANT\n" "#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(t, minT, maxT) \\\n" "{ \\\n" "t = max(min(t, (maxT) - 1), (minT)); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(x, minT, maxT) \\\n" "{ \\\n" "if (t < (minT)) \\\n" "t -= ((t - (maxT) + 1) / (maxT)) * (maxT); \\\n" "if (t >= (maxT)) \\\n" "t %= (maxT); \\\n" "}\n" "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n" "#define EXTRAPOLATE_(t, minT, maxT, delta) \\\n" "{ \\\n" "if ((maxT) - (minT) == 1) \\\n" "t = (minT); \\\n" "else \\\n" "do \\\n" "{ \\\n" "if (t < (minT)) \\\n" "t = (minT) - (t - (minT)) - 1 + delta; \\\n" "else \\\n" "t = (maxT) - 1 - (t - (maxT)) - delta; \\\n" "} \\\n" "while (t >= (maxT) || t < (minT)); \\\n" "\\\n" "}\n" "#ifdef BORDER_REFLECT\n" "#define EXTRAPOLATE(t, minT, maxT) EXTRAPOLATE_(t, minT, maxT, 0)\n" "#elif defined(BORDER_REFLECT_101)\n" "#define EXTRAPOLATE(t, minT, maxT) EXTRAPOLATE_(t, minT, maxT, 1)\n" "#endif\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#else\n" "#ifdef BORDER_CONSTANT\n" "#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? 
(elem1) : (elem2)\n" "#else\n" "#define EXTRAPOLATE(t, minT, maxT) \\\n" "{ \\\n" "int _delta = t - (minT); \\\n" "_delta = ADDR_L(_delta, 0, (maxT) - (minT)); \\\n" "_delta = ADDR_R(_delta, (maxT) - (minT), _delta); \\\n" "t = _delta + (minT); \\\n" "}\n" "#endif\n" "#endif\n" "#define noconvert\n" "#if CN != 3\n" "#define loadpix(addr) *(__global const srcT *)(addr)\n" "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" "#define SRCSIZE (int)sizeof(srcT)\n" "#define DSTSIZE (int)sizeof(dstT)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" "#define SRCSIZE (int)sizeof(srcT1)*3\n" "#define DSTSIZE (int)sizeof(dstT1)*3\n" "#endif\n" "#define DIG(a) a,\n" "__constant dstT1 mat_kernel[] = { COEFF };\n" "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n" "#define dstT4 int4\n" "#define convertDstVec convert_int4\n" "#else\n" "#define dstT4 float4\n" "#define convertDstVec convert_float4\n" "#endif\n" "__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,\n" "int src_cols, int src_rows, int src_whole_cols, int src_whole_rows,\n" "__global float * dst, int dst_step_in_pixel, int dst_cols, int dst_rows,\n" "int radiusy)\n" "{\n" "int x = get_global_id(0)<<2;\n" "int y = get_global_id(1);\n" "int l_x = get_local_id(0);\n" "int l_y = get_local_id(1);\n" "int start_x = x + src_offset_x - RADIUSX & 0xfffffffc;\n" "int offset = src_offset_x - RADIUSX & 3;\n" "int start_y = y + src_offset_y - radiusy;\n" "int start_addr = mad24(start_y, src_step_in_pixel, start_x);\n" "dstT4 sum;\n" "uchar4 temp[READ_TIMES_ROW];\n" "__local uchar4 LDS_DAT[LSIZE1][READ_TIMES_ROW * LSIZE0 + 1];\n" "#ifdef BORDER_CONSTANT\n" "int end_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "{\n" "int current_addr = mad24(i, LSIZE0 << 2, start_addr);\n" 
"current_addr = current_addr < end_addr && current_addr > 0 ? current_addr : 0;\n" "temp[i] = *(__global const uchar4 *)&src[current_addr];\n" "}\n" "#ifdef BORDER_ISOLATED\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "{\n" "temp[i].x = ELEM(start_x+i*LSIZE0*4, src_offset_x, src_offset_x + src_cols, 0, temp[i].x);\n" "temp[i].y = ELEM(start_x+i*LSIZE0*4+1, src_offset_x, src_offset_x + src_cols, 0, temp[i].y);\n" "temp[i].z = ELEM(start_x+i*LSIZE0*4+2, src_offset_x, src_offset_x + src_cols, 0, temp[i].z);\n" "temp[i].w = ELEM(start_x+i*LSIZE0*4+3, src_offset_x, src_offset_x + src_cols, 0, temp[i].w);\n" "temp[i] = ELEM(start_y, src_offset_y, src_offset_y + src_rows, (uchar4)0, temp[i]);\n" "}\n" "#else\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "{\n" "temp[i].x = ELEM(start_x+i*LSIZE0*4, 0, src_whole_cols, 0, temp[i].x);\n" "temp[i].y = ELEM(start_x+i*LSIZE0*4+1, 0, src_whole_cols, 0, temp[i].y);\n" "temp[i].z = ELEM(start_x+i*LSIZE0*4+2, 0, src_whole_cols, 0, temp[i].z);\n" "temp[i].w = ELEM(start_x+i*LSIZE0*4+3, 0, src_whole_cols, 0, temp[i].w);\n" "temp[i] = ELEM(start_y, 0, src_whole_rows, (uchar4)0, temp[i]);\n" "}\n" "#endif\n" "#else\n" "#ifdef BORDER_ISOLATED\n" "int not_all_in_range = (start_xsrc_offset_x + src_cols)| (start_y= src_offset_y + src_rows);\n" "#else\n" "int not_all_in_range = (start_x<0) | (start_x + READ_TIMES_ROW*LSIZE0*4+4>src_whole_cols)| (start_y<0) | (start_y >= src_whole_rows);\n" "#endif\n" "int4 index[READ_TIMES_ROW], addr;\n" "int s_y;\n" "if (not_all_in_range)\n" "{\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "{\n" "index[i] = (int4)(mad24(i, LSIZE0 << 2, start_x)) + (int4)(0, 1, 2, 3);\n" "#ifdef BORDER_ISOLATED\n" "EXTRAPOLATE(index[i].x, src_offset_x, src_offset_x + src_cols);\n" "EXTRAPOLATE(index[i].y, src_offset_x, src_offset_x + src_cols);\n" "EXTRAPOLATE(index[i].z, src_offset_x, src_offset_x + src_cols);\n" "EXTRAPOLATE(index[i].w, src_offset_x, src_offset_x + src_cols);\n" "#else\n" "EXTRAPOLATE(index[i].x, 0, 
src_whole_cols);\n" "EXTRAPOLATE(index[i].y, 0, src_whole_cols);\n" "EXTRAPOLATE(index[i].z, 0, src_whole_cols);\n" "EXTRAPOLATE(index[i].w, 0, src_whole_cols);\n" "#endif\n" "}\n" "s_y = start_y;\n" "#ifdef BORDER_ISOLATED\n" "EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);\n" "#else\n" "EXTRAPOLATE(s_y, 0, src_whole_rows);\n" "#endif\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "{\n" "addr = mad24((int4)s_y, (int4)src_step_in_pixel, index[i]);\n" "temp[i].x = src[addr.x];\n" "temp[i].y = src[addr.y];\n" "temp[i].z = src[addr.z];\n" "temp[i].w = src[addr.w];\n" "}\n" "}\n" "else\n" "{\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "temp[i] = *(__global uchar4*)&src[mad24(i, LSIZE0 << 2, start_addr)];\n" "}\n" "#endif\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "sum = convertDstVec(vload4(0,(__local uchar *)&LDS_DAT[l_y][l_x]+RADIUSX+offset)) * mat_kernel[RADIUSX];\n" "for (int i = 1; i <= RADIUSX; ++i)\n" "{\n" "temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);\n" "temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);\n" "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n" "sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);\n" "#else\n" "sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);\n" "#endif\n" "}\n" "start_addr = mad24(y, dst_step_in_pixel, x);\n" "if ((x+3= 0 ? 
current_addr : 0;\n" "temp[i] = loadpix(src + current_addr);\n" "}\n" "#ifdef BORDER_ISOLATED\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "{\n" "temp[i] = ELEM(mad24(i, LSIZE0, start_x), src_offset_x, src_offset_x + src_cols, (srcT)(0), temp[i]);\n" "temp[i] = ELEM(start_y, src_offset_y, src_offset_y + src_rows, (srcT)(0), temp[i]);\n" "}\n" "#else\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "{\n" "temp[i] = ELEM(mad24(i, LSIZE0, start_x), 0, src_whole_cols, (srcT)(0), temp[i]);\n" "temp[i] = ELEM(start_y, 0, src_whole_rows, (srcT)(0), temp[i]);\n" "}\n" "#endif\n" "#else\n" "int index[READ_TIMES_ROW], s_x, s_y;\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "{\n" "s_x = mad24(i, LSIZE0, start_x);\n" "s_y = start_y;\n" "#ifdef BORDER_ISOLATED\n" "EXTRAPOLATE(s_x, src_offset_x, src_offset_x + src_cols);\n" "EXTRAPOLATE(s_y, src_offset_y, src_offset_y + src_rows);\n" "#else\n" "EXTRAPOLATE(s_x, 0, src_whole_cols);\n" "EXTRAPOLATE(s_y, 0, src_whole_rows);\n" "#endif\n" "index[i] = mad24(s_y, src_step, s_x * SRCSIZE);\n" "}\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "temp[i] = loadpix(src + index[i]);\n" "#endif\n" "for (int i = 0; i < READ_TIMES_ROW; ++i)\n" "LDS_DAT[l_y][mad24(i, LSIZE0, l_x)] = temp[i];\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "sum = convertToDstT(LDS_DAT[l_y][l_x + RADIUSX]) * mat_kernel[RADIUSX];\n" "for (int i = 1; i <= RADIUSX; ++i)\n" "{\n" "temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];\n" "temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];\n" "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n" "sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);\n" "#else\n" "sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);\n" "#endif\n" "}\n" "if (x < dst_cols && y < dst_rows)\n" "{\n" "start_addr = mad24(y, dst_step, x * DSTSIZE);\n" "storepix(sum, dst + start_addr);\n" "}\n" "}\n" , "e99b92fca8604fe253f3c641802ce117"}; ProgramSource 
filterSepRow_oclsrc(filterSepRow.programStr); /* filterSep_singlePass entry: OpenCL C program text defining the sep_filter kernel, stored as adjacent string literals. Comments are placed between the literals only. */ const struct ProgramEntry filterSep_singlePass={"filterSep_singlePass", /* EXTRAPOLATE(x, maxV): one non-iterated remapping of x against the bounds 0 and maxV per border mode. It expands to nothing for BORDER_CONSTANT, where ELEM supplies const_v for out-of-range coordinates instead. */ "#ifdef BORDER_CONSTANT\n" "#define EXTRAPOLATE(x, maxV)\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "(x) = clamp((x), 0, (maxV)-1); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "(x) = ( (x) + (maxV) ) % (maxV); \\\n" "}\n" "#elif defined BORDER_REFLECT\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "(x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \\\n" "}\n" "#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n" "#define EXTRAPOLATE(x, maxV) \\\n" "{ \\\n" "(x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \\\n" "}\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#if CN != 3\n" "#define loadpix(addr) *(__global const srcT *)(addr)\n" "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" "#define SRCSIZE (int)sizeof(srcT)\n" "#define DSTSIZE (int)sizeof(dstT)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" "#define SRCSIZE (int)sizeof(srcT1)*3\n" "#define DSTSIZE (int)sizeof(dstT1)*3\n" "#endif\n" /* SRC(_x,_y) loads and converts the pixel at column _x, row _y of Src. ELEM wraps SRC and, for BORDER_CONSTANT only, yields const_v when the coordinate is negative or at or beyond r_edge or t_edge. */ "#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))\n" "#ifdef BORDER_CONSTANT\n" "#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? 
(const_v) : SRC((_x),(_y))\n" "#else\n" "#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))\n" "#endif\n" "#define noconvert\n" "#define DIG(a) a,\n" "__constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X };\n" "__constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y };\n" /* Kernel sep_filter: the work-group first fills lsmem, a tile of BLK_Y + 2 * RADIUSY rows by BLK_X + 2 * RADIUSX columns, from Src with border handling. The loop over y then processes dst in bands of BLK_Y rows. */ "__kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,\n" "__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)\n" "{\n" "__local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];\n" "__local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];\n" "int lix = get_local_id(0);\n" "int liy = get_local_id(1);\n" "int x = get_global_id(0);\n" "int srcX = x + srcOffsetX - RADIUSX;\n" "int clocY = liy;\n" "do\n" "{\n" "int yb = clocY + srcOffsetY - RADIUSY;\n" "EXTRAPOLATE(yb, (height));\n" "int clocX = lix;\n" "int cSrcX = srcX;\n" "do\n" "{\n" "int xb = cSrcX;\n" "EXTRAPOLATE(xb,(width));\n" "lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );\n" "clocX += BLK_X;\n" "cSrcX += BLK_X;\n" "}\n" "while(clocX < BLK_X+(RADIUSX*2));\n" "clocY += BLK_Y;\n" "}\n" "while (clocY < BLK_Y+(RADIUSY*2));\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" /* Band loop: apply mat_kernelY down the columns of lsmem into lsmemDy, synchronize, apply mat_kernelX along the row of lsmemDy and store, then scroll lsmem and load the next rows. */ "for (int y = 0; y < dst_rows; y+=BLK_Y)\n" "{\n" "int i, clocX = lix;\n" "WT sum = (WT) 0;\n" "do\n" "{\n" "sum = (WT) 0;\n" "for (i=0; i<=2*RADIUSY; i++)\n" "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n" "sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);\n" "#else\n" "sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);\n" "#endif\n" "lsmemDy[liy][clocX] = sum;\n" "clocX += BLK_X;\n" "}\n" "while(clocX < BLK_X+(RADIUSX*2));\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if ((x < dst_cols) && (y + liy < dst_rows))\n" "{\n" "sum = 0.0f;\n" "for (i=0; i<=2*RADIUSX; i++)\n" "#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)\n" "sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);\n" "#else\n" "sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);\n" "#endif\n" "#ifdef 
INTEGER_ARITHMETIC\n" "#ifdef INTEL_DEVICE\n" "sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);\n" "#else\n" "sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;\n" "#endif\n" "#endif\n" "storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));\n" "}\n" /* Scroll: copy the rows starting at index BLK_Y into rows 0 through 2 * RADIUSY - 1 of lsmem, spreading the elements over the work-items of the group. */ "for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)\n" "{\n" "int clocX = i % (BLK_X+(RADIUSX*2));\n" "int clocY = i / (BLK_X+(RADIUSX*2));\n" "lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" /* Refill: each work-item row liy loads source row y + liy + BLK_Y + srcOffsetY + RADIUSY (after border remapping) into lsmem row liy + 2 * RADIUSY. */ "int yb = y + liy + BLK_Y + srcOffsetY + RADIUSY;\n" "EXTRAPOLATE(yb, (height));\n" "clocX = lix;\n" "int cSrcX = x + srcOffsetX - RADIUSX;\n" "do\n" "{\n" "int xb = cSrcX;\n" "EXTRAPOLATE(xb,(width));\n" "lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, yb, (width), (height), 0 );\n" "clocX += BLK_X;\n" "cSrcX += BLK_X;\n" "}\n" "while(clocX < BLK_X+(RADIUSX*2));\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "}\n" , "1335aadebf2523a98cb069063bdd2ba1"}; ProgramSource filterSep_singlePass_oclsrc(filterSep_singlePass.programStr); /* filterSmall entry: OpenCL C program text; it continues on the following physical lines. */ const struct ProgramEntry filterSmall={"filterSmall", "#ifdef BORDER_REPLICATE\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))\n" "#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))\n" "#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))\n" "#endif\n" "#ifdef BORDER_REFLECT\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))\n" "#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))\n" "#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))\n" "#endif\n" "#ifdef BORDER_REFLECT_101\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? 
-(i)-2+((r_edge)<<1) : (addr))\n" "#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))\n" "#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))\n" "#endif\n" "#ifdef BORDER_WRAP\n" "#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))\n" "#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))\n" "#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))\n" "#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))\n" "#endif\n" "#ifdef BORDER_ISOLATED\n" "#define ISOLATED_MIN(VAL) (VAL)\n" "#else\n" "#define ISOLATED_MIN(VAL) 0\n" "#endif\n" "#ifdef EXTRA_EXTRAPOLATION\n" "#ifdef BORDER_CONSTANT\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n" "{ \\\n" "x = max(min(x, maxX - 1), minX); \\\n" "y = max(min(y, maxY - 1), minY); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n" "{ \\\n" "if (x < minX) \\\n" "x -= ((x - maxX + 1) / maxX) * maxX; \\\n" "if (x >= maxX) \\\n" "x %= maxX; \\\n" "if (y < minY) \\\n" "y -= ((y - maxY + 1) / maxY) * maxY; \\\n" "if (y >= maxY) \\\n" "y %= maxY; \\\n" "}\n" "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n" "#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \\\n" "{ \\\n" "if (maxX - minX == 1) \\\n" "x = minX; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if (x < minX) \\\n" "x = minX - (x - minX) - 1 + delta; \\\n" "else \\\n" "x = maxX - 1 - (x - maxX) - delta; \\\n" "} \\\n" "while (x >= maxX || x < minX); \\\n" "\\\n" "if (maxY - minY == 1) \\\n" "y = minY; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if (y < minY) \\\n" "y = minY - (y - minY) - 1 + delta; \\\n" "else \\\n" "y = maxY - 1 - (y - maxY) - delta; \\\n" "} \\\n" "while (y >= maxY || y < minY); \\\n" "}\n" "#ifdef BORDER_REFLECT\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)\n" "#elif 
defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)\n" "#endif\n" "#else\n" "#error No extrapolation method\n" "#endif\n" "#else\n" "#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \\\n" "{ \\\n" "int _row = y - ISOLATED_MIN(minY), _col = x - ISOLATED_MIN(minX); \\\n" "_row = ADDR_H(_row, 0, maxY - ISOLATED_MIN(minY)); \\\n" "_row = ADDR_B(_row, maxY - ISOLATED_MIN(minY), _row); \\\n" "y = _row + ISOLATED_MIN(minY); \\\n" "\\\n" "_col = ADDR_L(_col, 0, maxX - ISOLATED_MIN(minX)); \\\n" "_col = ADDR_R(_col, maxX - ISOLATED_MIN(minX), _col); \\\n" "x = _col + ISOLATED_MIN(minX); \\\n" "}\n" "#endif\n" "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" "#if cn != 3\n" "#define loadpix(addr) *(__global const srcT *)(addr)\n" "#define storepix(val, addr) *(__global dstT *)(addr) = val\n" "#define SRCSIZE (int)sizeof(srcT)\n" "#define DSTSIZE (int)sizeof(dstT)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n" "#define SRCSIZE (int)sizeof(srcT1) * cn\n" "#define DSTSIZE (int)sizeof(dstT1) * cn\n" "#endif\n" "#define noconvert\n" "struct RectCoords\n" "{\n" "int x1, y1, x2, y2;\n" "};\n" "#ifdef BORDER_ISOLATED\n" "inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)\n" "{\n" "return coord.x < bounds.x1 || coord.y < bounds.y1 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2;\n" "}\n" "#else\n" "inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)\n" "{\n" "return coord.x < 0 || coord.y < 0 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2;\n" "}\n" "#endif\n" "#define float1 float\n" "#define uchar1 uchar\n" "#define int1 int\n" "#define uint1 
unit\n" "#define __CAT(x, y) x##y\n" "#define CAT(x, y) __CAT(x, y)\n" "#define vload1(OFFSET, PTR) (*(PTR + OFFSET))\n" "#define PX_LOAD_VEC_TYPE CAT(srcT1, PX_LOAD_VEC_SIZE)\n" "#define PX_LOAD_FLOAT_VEC_TYPE CAT(WT1, PX_LOAD_VEC_SIZE)\n" "#define PX_LOAD CAT(vload, PX_LOAD_VEC_SIZE)\n" "inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar * srcptr,\n" "int srcstep, const struct RectCoords srcCoords)\n" "{\n" "__global const srcT1 * ptr = (__global const srcT1 *)\n" "(srcptr + mad24(pos.y, srcstep, pos.x * SRCSIZE));\n" "return PX_LOAD_FLOAT_VEC_CONV(PX_LOAD(0, ptr));\n" "}\n" "#define LOOP1(VAR, STMT) (STMT); (VAR)++;\n" "#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;\n" "#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))\n" "#ifdef OP_BOX_FILTER\n" "#define PROCESS_ELEM \\\n" "WT total_sum = (WT)(0); \\\n" "int sy = 0; \\\n" "LOOP(KERNEL_SIZE_Y, sy, \\\n" "{ \\\n" "int sx = 0; \\\n" "LOOP(KERNEL_SIZE_X, sx, \\\n" "{ \\\n" "total_sum += privateData[py + sy][px + sx]; \\\n" "}); \\\n" "})\n" "#elif defined OP_FILTER2D\n" "#define DIG(a) a,\n" "__constant WT1 kernelData[] = { COEFF };\n" "#define PROCESS_ELEM \\\n" "WT total_sum = 0; \\\n" "int sy = 0; \\\n" "int kernelIndex = 0; \\\n" "LOOP(KERNEL_SIZE_Y, sy, \\\n" "{ \\\n" 
"int sx = 0; \\\n" "LOOP(KERNEL_SIZE_X, sx, \\\n" "{ \\\n" "total_sum = fma(kernelData[kernelIndex++], privateData[py + sy][px + sx], total_sum); \\\n" "}); \\\n" "})\n" "#elif defined OP_ERODE || defined OP_DILATE\n" "#ifdef DEPTH_0\n" "#define MIN_VAL 0\n" "#define MAX_VAL UCHAR_MAX\n" "#elif defined DEPTH_1\n" "#define MIN_VAL SCHAR_MIN\n" "#define MAX_VAL SCHAR_MAX\n" "#elif defined DEPTH_2\n" "#define MIN_VAL 0\n" "#define MAX_VAL USHRT_MAX\n" "#elif defined DEPTH_3\n" "#define MIN_VAL SHRT_MIN\n" "#define MAX_VAL SHRT_MAX\n" "#elif defined DEPTH_4\n" "#define MIN_VAL INT_MIN\n" "#define MAX_VAL INT_MAX\n" "#elif defined DEPTH_5\n" "#define MIN_VAL (-FLT_MAX)\n" "#define MAX_VAL FLT_MAX\n" "#elif defined DEPTH_6\n" "#define MIN_VAL (-DBL_MAX)\n" "#define MAX_VAL DBL_MAX\n" "#endif\n" "#ifdef OP_ERODE\n" "#define VAL (WT)MAX_VAL\n" "#elif defined OP_DILATE\n" "#define VAL (WT)MIN_VAL\n" "#else\n" "#error \"Unknown operation\"\n" "#endif\n" "#define convert_float1 convert_float\n" "#define convert_uchar1 convert_uchar\n" "#define convert_int1 convert_int\n" "#define convert_uint1 convert_uint\n" "#ifdef OP_ERODE\n" "#if defined INTEL_DEVICE && defined DEPTH_0\n" "#define WA_CONVERT_1 CAT(convert_uint, cn)\n" "#define WA_CONVERT_2 CAT(convert_, srcT)\n" "#define MORPH_OP(A, B) ((A) < (B) ? 
(A) : (B))\n" "#else\n" "#define MORPH_OP(A, B) min((A), (B))\n" "#endif\n" "#endif\n" "#ifdef OP_DILATE\n" "#define MORPH_OP(A, B) max((A), (B))\n" "#endif\n" "#define PROCESS(_y, _x) \\\n" "total_sum = convertToWT(MORPH_OP(convertToWT(total_sum), convertToWT(privateData[py + _y][px + _x])));\n" "#define PROCESS_ELEM \\\n" "WT total_sum = convertToWT(VAL); \\\n" "PROCESS_ELEM_\n" "#else\n" "#error \"No processing is specified\"\n" "#endif\n" "#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT\n" "#define EXTRA_PARAMS , __global const uchar * matptr, int mat_step, int mat_offset\n" "#else\n" "#define EXTRA_PARAMS\n" "#endif\n" "inline WT getBorderPixel(const struct RectCoords bounds, int2 coord,\n" "__global const uchar * srcptr, int srcstep)\n" "{\n" "#ifdef BORDER_CONSTANT\n" "#ifdef OP_ERODE\n" "return (WT)(MAX_VAL);\n" "#elif defined OP_DILATE\n" "return (WT)(MIN_VAL);\n" "#else\n" "return (WT)(0);\n" "#endif\n" "#else\n" "int selected_col = coord.x;\n" "int selected_row = coord.y;\n" "EXTRAPOLATE(selected_col, selected_row,\n" "bounds.x1, bounds.y1,\n" "bounds.x2, bounds.y2);\n" "__global const uchar* ptr = srcptr + mad24(selected_row, srcstep, selected_col * SRCSIZE);\n" "return convertToWT(loadpix(ptr));\n" "#endif\n" "}\n" "inline WT readSrcPixelSingle(int2 pos, __global const uchar * srcptr,\n" "int srcstep, const struct RectCoords srcCoords)\n" "{\n" "if (!isBorder(srcCoords, pos, 1))\n" "{\n" "__global const uchar * ptr = srcptr + mad24(pos.y, srcstep, pos.x * SRCSIZE);\n" "return convertToWT(loadpix(ptr));\n" "}\n" "else\n" "return getBorderPixel(srcCoords, pos, srcptr, srcstep);\n" "}\n" "__kernel void filterSmall(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols\n" "#ifdef NORMALIZE\n" ", float alpha\n" "#endif\n" "EXTRA_PARAMS )\n" "{\n" "const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, 
srcEndX, srcEndY };\n" "const int startX = get_global_id(0) * PX_PER_WI_X;\n" "const int startY = get_global_id(1) * PX_PER_WI_Y;\n" "if (startX >= cols || startY >= rows)\n" "return;\n" "WT privateData[PX_PER_WI_Y + KERNEL_SIZE_Y - 1][PRIV_DATA_WIDTH];\n" "int py = 0;\n" "LOOP(PX_LOAD_Y_ITERATIONS, py,\n" "{\n" "int y = startY + py;\n" "int px = 0;\n" "LOOP(PX_LOAD_X_ITERATIONS, px,\n" "{\n" "int x = startX + (px * PX_LOAD_NUM_PX);\n" "int2 srcPos = (int2)(srcCoords.x1 + x - ANCHOR_X, srcCoords.y1 + y - ANCHOR_Y);\n" "if (!isBorder(srcCoords, srcPos, PX_LOAD_NUM_PX))\n" "{\n" "PX_LOAD_FLOAT_VEC_TYPE p = readSrcPixelGroup(srcPos, srcptr, src_step, srcCoords);\n" "#ifdef SQR\n" "*((PX_LOAD_FLOAT_VEC_TYPE *)&privateData[py][px * PX_LOAD_NUM_PX]) = p * p;\n" "#else\n" "*((PX_LOAD_FLOAT_VEC_TYPE *)&privateData[py][px * PX_LOAD_NUM_PX]) = p;\n" "#endif\n" "}\n" "else\n" "{\n" "int lx = 0;\n" "LOOP(PX_LOAD_NUM_PX, lx,\n" "{\n" "WT p = readSrcPixelSingle(srcPos, srcptr, src_step, srcCoords);\n" "#ifdef SQR\n" "*((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p * p;\n" "#else\n" "*((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p;\n" "#endif\n" "srcPos.x++;\n" "});\n" "}\n" "});\n" "});\n" "py = 0;\n" "LOOP(PX_PER_WI_Y, py,\n" "{\n" "int y = startY + py;\n" "int px = 0;\n" "LOOP(PX_PER_WI_X, px,\n" "{\n" "int x = startX + px;\n" "PROCESS_ELEM;\n" "int dst_index = mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset));\n" "__global dstT * dstPtr = (__global dstT *)(dstptr + dst_index);\n" "#ifdef NORMALIZE\n" "total_sum *= (WT)(alpha);\n" "#endif\n" "#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT\n" "int mat_index = mad24(y, mat_step, mad24(x, SRCSIZE, mat_offset));\n" "WT value = convertToWT(loadpix(matptr + mat_index));\n" "#ifdef OP_GRADIENT\n" "storepix(convertToDstT(convertToWT(total_sum) - convertToWT(value)), dstPtr );\n" "#elif defined OP_TOPHAT\n" "storepix(convertToDstT(convertToWT(value) - convertToWT(total_sum)), dstPtr );\n" "#elif 
defined OP_BLACKHAT\n" "storepix(convertToDstT(convertToWT(total_sum) - convertToWT(value)), dstPtr );\n" "#endif\n" "#else\n" "storepix(convertToDstT(total_sum), dstPtr);\n" "#endif\n" "});\n" "});\n" "}\n" , "2aafc30dda5e658542c92a9ab2a63d4a"}; ProgramSource filterSmall_oclsrc(filterSmall.programStr); const struct ProgramEntry gftt={"gftt", "#ifdef OP_MAX_EIGEN_VAL\n" "__kernel void maxEigenVal(__global const uchar * srcptr, int src_step, int src_offset, int cols,\n" "int total, __global uchar * dstptr\n" "#ifdef HAVE_MASK\n" ", __global const uchar * maskptr, int mask_step, int mask_offset\n" "#endif\n" ")\n" "{\n" "int lid = get_local_id(0);\n" "int gid = get_group_id(0);\n" "int id = get_global_id(0);\n" "__local float localmem_max[WGS2_ALIGNED];\n" "float maxval = -FLT_MAX;\n" "for (int grain = groupnum * WGS; id < total; id += grain)\n" "{\n" "int src_index = mad24(id / cols, src_step, mad24((id % cols), (int)sizeof(float), src_offset));\n" "#ifdef HAVE_MASK\n" "int mask_index = mad24(id / cols, mask_step, id % cols + mask_offset);\n" "if (maskptr[mask_index])\n" "#endif\n" "maxval = max(maxval, *(__global const float *)(srcptr + src_index));\n" "}\n" "if (lid < WGS2_ALIGNED)\n" "localmem_max[lid] = maxval;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n" "localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid - WGS2_ALIGNED]);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n" "{\n" "if (lid < lsize)\n" "{\n" "int lid2 = lsize + lid;\n" "localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "if (lid == 0)\n" "*(__global float *)(dstptr + (int)sizeof(float) * gid) = localmem_max[0];\n" "}\n" "__kernel void maxEigenValTask(__global float * dst, float qualityLevel,\n" "__global int * cornersptr)\n" "{\n" "float maxval = -FLT_MAX;\n" "#pragma unroll\n" "for (int x = 0; x < groupnum; ++x)\n" "maxval = 
max(maxval, dst[x]);\n" "dst[0] = maxval * qualityLevel;\n" "cornersptr[0] = 0;\n" "}\n" "#elif OP_FIND_CORNERS\n" "#define GET_SRC_32F(_y, _x) *(__global const float *)(eigptr + (_y) * eig_step + (_x) * (int)sizeof(float) )\n" "__kernel void findCorners(__global const uchar * eigptr, int eig_step, int eig_offset,\n" "#ifdef HAVE_MASK\n" "__global const uchar * mask, int mask_step, int mask_offset,\n" "#endif\n" "__global uchar * cornersptr, int rows, int cols,\n" "__constant float * threshold, int max_corners)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "__global int* counter = (__global int*) cornersptr;\n" "__global float2 * corners = (__global float2 *)(cornersptr + (int)sizeof(float2));\n" "if (y < rows && x < cols\n" "#ifdef HAVE_MASK\n" "&& mask[mad24(y, mask_step, x + mask_offset)]\n" "#endif\n" ")\n" "{\n" "++x, ++y;\n" "float val = GET_SRC_32F(y, x);\n" "if (val > threshold[0])\n" "{\n" "float maxVal = val;\n" "maxVal = max(GET_SRC_32F(y - 1, x - 1), maxVal);\n" "maxVal = max(GET_SRC_32F(y - 1, x ), maxVal);\n" "maxVal = max(GET_SRC_32F(y - 1, x + 1), maxVal);\n" "maxVal = max(GET_SRC_32F(y , x - 1), maxVal);\n" "maxVal = max(GET_SRC_32F(y , x + 1), maxVal);\n" "maxVal = max(GET_SRC_32F(y + 1, x - 1), maxVal);\n" "maxVal = max(GET_SRC_32F(y + 1, x ), maxVal);\n" "maxVal = max(GET_SRC_32F(y + 1, x + 1), maxVal);\n" "if (val == maxVal)\n" "{\n" "int ind = atomic_inc(counter);\n" "if (ind < max_corners)\n" "{\n" "corners[ind].x = val;\n" "corners[ind].y = as_float(y | (x << 16));\n" "}\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" , "cb2cfd26f04e14ae047e2f5eb28c8e11"}; ProgramSource gftt_oclsrc(gftt.programStr); const struct ProgramEntry histogram={"histogram", "#ifndef kercn\n" "#define kercn 1\n" "#endif\n" "#ifndef T\n" "#define T uchar\n" "#endif\n" "#define noconvert\n" "__kernel void calculate_histogram(__global const uchar * src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * histptr, int total)\n" 
"{\n" "int lid = get_local_id(0);\n" "int id = get_global_id(0) * kercn;\n" "int gid = get_group_id(0);\n" "__local int localhist[BINS];\n" "#pragma unroll\n" "for (int i = lid; i < BINS; i += WGS)\n" "localhist[i] = 0;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "__global const uchar * src = src_ptr + src_offset;\n" "int src_index;\n" "for (int grain = HISTS_COUNT * WGS * kercn; id < total; id += grain)\n" "{\n" "#ifdef HAVE_SRC_CONT\n" "src_index = id;\n" "#else\n" "src_index = mad24(id / src_cols, src_step, id % src_cols);\n" "#endif\n" "#if kercn == 1\n" "atomic_inc(localhist + convert_int(src[src_index]));\n" "#elif kercn == 4\n" "int value = *(__global const int *)(src + src_index);\n" "atomic_inc(localhist + (value & 0xff));\n" "atomic_inc(localhist + ((value >> 8) & 0xff));\n" "atomic_inc(localhist + ((value >> 16) & 0xff));\n" "atomic_inc(localhist + ((value >> 24) & 0xff));\n" "#elif kercn >= 2\n" "T value = *(__global const T *)(src + src_index);\n" "atomic_inc(localhist + value.s0);\n" "atomic_inc(localhist + value.s1);\n" "#if kercn >= 4\n" "atomic_inc(localhist + value.s2);\n" "atomic_inc(localhist + value.s3);\n" "#if kercn >= 8\n" "atomic_inc(localhist + value.s4);\n" "atomic_inc(localhist + value.s5);\n" "atomic_inc(localhist + value.s6);\n" "atomic_inc(localhist + value.s7);\n" "#if kercn == 16\n" "atomic_inc(localhist + value.s8);\n" "atomic_inc(localhist + value.s9);\n" "atomic_inc(localhist + value.sA);\n" "atomic_inc(localhist + value.sB);\n" "atomic_inc(localhist + value.sC);\n" "atomic_inc(localhist + value.sD);\n" "atomic_inc(localhist + value.sE);\n" "atomic_inc(localhist + value.sF);\n" "#endif\n" "#endif\n" "#endif\n" "#endif\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "__global int * hist = (__global int *)(histptr + gid * BINS * (int)sizeof(int));\n" "#pragma unroll\n" "for (int i = lid; i < BINS; i += WGS)\n" "hist[i] = localhist[i];\n" "}\n" "#ifndef HT\n" "#define HT int\n" "#endif\n" "#ifndef convertToHT\n" "#define convertToHT 
noconvert\n" "#endif\n" "__kernel void merge_histogram(__global const int * ghist, __global uchar * histptr, int hist_step, int hist_offset)\n" "{\n" "int lid = get_local_id(0);\n" "__global HT * hist = (__global HT *)(histptr + hist_offset);\n" "#if WGS >= BINS\n" "HT res = (HT)(0);\n" "#else\n" "#pragma unroll\n" "for (int i = lid; i < BINS; i += WGS)\n" "hist[i] = (HT)(0);\n" "#endif\n" "#pragma unroll\n" "for (int i = 0; i < HISTS_COUNT; ++i)\n" "{\n" "#pragma unroll\n" "for (int j = lid; j < BINS; j += WGS)\n" "#if WGS >= BINS\n" "res += convertToHT(ghist[j]);\n" "#else\n" "hist[j] += convertToHT(ghist[j]);\n" "#endif\n" "ghist += BINS;\n" "}\n" "#if WGS >= BINS\n" "if (lid < BINS)\n" "*(__global HT *)(histptr + mad24(lid, hist_step, hist_offset)) = res;\n" "#endif\n" "}\n" "__kernel void calcLUT(__global uchar * dst, __global const int * ghist, int total)\n" "{\n" "int lid = get_local_id(0);\n" "__local int sumhist[BINS];\n" "__local float scale;\n" "#if WGS >= BINS\n" "int res = 0;\n" "#else\n" "#pragma unroll\n" "for (int i = lid; i < BINS; i += WGS)\n" "sumhist[i] = 0;\n" "#endif\n" "#pragma unroll\n" "for (int i = 0; i < HISTS_COUNT; ++i)\n" "{\n" "#pragma unroll\n" "for (int j = lid; j < BINS; j += WGS)\n" "#if WGS >= BINS\n" "res += ghist[j];\n" "#else\n" "sumhist[j] += ghist[j];\n" "#endif\n" "ghist += BINS;\n" "}\n" "#if WGS >= BINS\n" "if (lid < BINS)\n" "sumhist[lid] = res;\n" "#endif\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (lid == 0)\n" "{\n" "int sum = 0, i = 0;\n" "while (!sumhist[i])\n" "++i;\n" "if (total == sumhist[i])\n" "{\n" "scale = 1;\n" "for (int j = 0; j < BINS; ++j)\n" "sumhist[i] = i;\n" "}\n" "else\n" "{\n" "scale = 255.f / (total - sumhist[i]);\n" "for (sumhist[i++] = 0; i < BINS; i++)\n" "{\n" "sum += sumhist[i];\n" "sumhist[i] = sum;\n" "}\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "#pragma unroll\n" "for (int i = lid; i < BINS; i += WGS)\n" "dst[i]= convert_uchar_sat_rte(convert_float(sumhist[i]) * scale);\n" "}\n" , 
"3bfd6703e639c8a36eb7cdd5f3eefda6"}; ProgramSource histogram_oclsrc(histogram.programStr); const struct ProgramEntry hough_lines={"hough_lines", "#define ACCUM(ptr) *((__global int*)(ptr))\n" "#ifdef MAKE_POINTS_LIST\n" "__kernel void make_point_list(__global const uchar * src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * list_ptr, int list_step, int list_offset, __global int* global_offset)\n" "{\n" "int x = get_local_id(0);\n" "int y = get_group_id(1);\n" "__local int l_index, l_offset;\n" "__local int l_points[LOCAL_SIZE];\n" "__global const uchar * src = src_ptr + mad24(y, src_step, src_offset);\n" "__global int * list = (__global int*)(list_ptr + list_offset);\n" "if (x == 0)\n" "l_index = 0;\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (y < src_rows)\n" "{\n" "y <<= 16;\n" "for (int i=x; i < src_cols; i+=GROUP_SIZE)\n" "{\n" "if (src[i])\n" "{\n" "int val = y | i;\n" "int index = atomic_inc(&l_index);\n" "l_points[index] = val;\n" "}\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (x == 0)\n" "l_offset = atomic_add(global_offset, l_index);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "list += l_offset;\n" "for (int i=x; i < l_index; i+=GROUP_SIZE)\n" "{\n" "list[i] = l_points[i];\n" "}\n" "}\n" "#elif defined FILL_ACCUM_GLOBAL\n" "__kernel void fill_accum_global(__global const uchar * list_ptr, int list_step, int list_offset,\n" "__global uchar * accum_ptr, int accum_step, int accum_offset,\n" "int total_points, float irho, float theta, int numrho, int numangle)\n" "{\n" "int theta_idx = get_global_id(1);\n" "int count_idx = get_global_id(0);\n" "int glob_size = get_global_size(0);\n" "float cosVal;\n" "float sinVal = sincos(theta * ((float)theta_idx), &cosVal);\n" "sinVal *= irho;\n" "cosVal *= irho;\n" "__global const int * list = (__global const int*)(list_ptr + list_offset);\n" "__global int* accum = (__global int*)(accum_ptr + mad24(theta_idx + 1, accum_step, accum_offset));\n" "const int shift = (numrho - 1) / 2;\n" "if 
(theta_idx < numangle)\n" "{\n" "for (int i = count_idx; i < total_points; i += glob_size)\n" "{\n" "const int val = list[i];\n" "const int x = (val & 0xFFFF);\n" "const int y = (val >> 16) & 0xFFFF;\n" "int r = convert_int_rte(mad(x, cosVal, y * sinVal)) + shift;\n" "atomic_inc(accum + r + 1);\n" "}\n" "}\n" "}\n" "#elif defined FILL_ACCUM_LOCAL\n" "__kernel void fill_accum_local(__global const uchar * list_ptr, int list_step, int list_offset,\n" "__global uchar * accum_ptr, int accum_step, int accum_offset,\n" "int total_points, float irho, float theta, int numrho, int numangle)\n" "{\n" "int theta_idx = get_group_id(1);\n" "int count_idx = get_local_id(0);\n" "if (theta_idx > 0 && theta_idx < numangle + 1)\n" "{\n" "float cosVal;\n" "float sinVal = sincos(theta * (float) (theta_idx-1), &cosVal);\n" "sinVal *= irho;\n" "cosVal *= irho;\n" "__local int l_accum[BUFFER_SIZE];\n" "for (int i=count_idx; i> 16;\n" "int r = convert_int_rte(mad(x, cosVal, y * sinVal)) + shift;\n" "atomic_inc(l_accum + r + 1);\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "__global int* accum = (__global int*)(accum_ptr + mad24(theta_idx, accum_step, accum_offset));\n" "for (int i=count_idx; i threshold && curVote > ACCUM(accum - sizeof(int)) && curVote >= ACCUM(accum + sizeof(int)) &&\n" "curVote > ACCUM(accum - accum_step) && curVote >= ACCUM(accum + accum_step))\n" "{\n" "int index = atomic_inc(lines_index);\n" "if (index < linesMax)\n" "{\n" "float radius = (x - (accum_cols - 3) * 0.5f) * rho;\n" "float angle = y * theta;\n" "lines[index] = (float2)(radius, angle);\n" "}\n" "}\n" "accum += glob_size * (int) sizeof(int);\n" "}\n" "}\n" "}\n" "#elif GET_LINES_PROBABOLISTIC\n" "__kernel void get_lines(__global const uchar * accum_ptr, int accum_step, int accum_offset, int accum_rows, int accum_cols,\n" "__global const uchar * src_ptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * lines_ptr, int lines_step, int lines_offset, __global int* lines_index_ptr,\n" 
"int linesMax, int threshold, int lineLength, int lineGap, float rho, float theta)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if (y < accum_rows-2)\n" "{\n" "__global uchar* accum = accum_ptr + mad24(y+1, accum_step, mad24(x+1, (int) sizeof(int), accum_offset));\n" "__global int4* lines = (__global int4*)(lines_ptr + lines_offset);\n" "__global int* lines_index = lines_index_ptr + 1;\n" "int curVote = ACCUM(accum);\n" "if (curVote >= threshold &&\n" "curVote > ACCUM(accum - accum_step - sizeof(int)) &&\n" "curVote > ACCUM(accum - accum_step) &&\n" "curVote > ACCUM(accum - accum_step + sizeof(int)) &&\n" "curVote > ACCUM(accum - sizeof(int)) &&\n" "curVote > ACCUM(accum + sizeof(int)) &&\n" "curVote > ACCUM(accum + accum_step - sizeof(int)) &&\n" "curVote > ACCUM(accum + accum_step) &&\n" "curVote > ACCUM(accum + accum_step + sizeof(int)))\n" "{\n" "const float radius = (x - (accum_cols - 2 - 1) * 0.5f) * rho;\n" "const float angle = y * theta;\n" "float cosa;\n" "float sina = sincos(angle, &cosa);\n" "float2 p0 = (float2)(cosa * radius, sina * radius);\n" "float2 dir = (float2)(-sina, cosa);\n" "float2 pb[4] = { (float2)(-1, -1), (float2)(-1, -1), (float2)(-1, -1), (float2)(-1, -1) };\n" "float a;\n" "if (dir.x != 0)\n" "{\n" "a = -p0.x / dir.x;\n" "pb[0].x = 0;\n" "pb[0].y = p0.y + a * dir.y;\n" "a = (src_cols - 1 - p0.x) / dir.x;\n" "pb[1].x = src_cols - 1;\n" "pb[1].y = p0.y + a * dir.y;\n" "}\n" "if (dir.y != 0)\n" "{\n" "a = -p0.y / dir.y;\n" "pb[2].x = p0.x + a * dir.x;\n" "pb[2].y = 0;\n" "a = (src_rows - 1 - p0.y) / dir.y;\n" "pb[3].x = p0.x + a * dir.x;\n" "pb[3].y = src_rows - 1;\n" "}\n" "if (pb[0].x == 0 && (pb[0].y >= 0 && pb[0].y < src_rows))\n" "{\n" "p0 = pb[0];\n" "if (dir.x < 0)\n" "dir = -dir;\n" "}\n" "else if (pb[1].x == src_cols - 1 && (pb[1].y >= 0 && pb[1].y < src_rows))\n" "{\n" "p0 = pb[1];\n" "if (dir.x > 0)\n" "dir = -dir;\n" "}\n" "else if (pb[2].y == 0 && (pb[2].x >= 0 && pb[2].x < src_cols))\n" "{\n" "p0 = 
pb[2];\n" "if (dir.y < 0)\n" "dir = -dir;\n" "}\n" "else if (pb[3].y == src_rows - 1 && (pb[3].x >= 0 && pb[3].x < src_cols))\n" "{\n" "p0 = pb[3];\n" "if (dir.y > 0)\n" "dir = -dir;\n" "}\n" "dir /= max(fabs(dir.x), fabs(dir.y));\n" "float2 line_end[2];\n" "int gap;\n" "bool inLine = false;\n" "if (p0.x < 0 || p0.x >= src_cols || p0.y < 0 || p0.y >= src_rows)\n" "return;\n" "for (;;)\n" "{\n" "if (*(src_ptr + mad24(p0.y, src_step, p0.x + src_offset)))\n" "{\n" "gap = 0;\n" "if (!inLine)\n" "{\n" "line_end[0] = p0;\n" "line_end[1] = p0;\n" "inLine = true;\n" "}\n" "else\n" "{\n" "line_end[1] = p0;\n" "}\n" "}\n" "else if (inLine)\n" "{\n" "if (++gap > lineGap)\n" "{\n" "bool good_line = fabs(line_end[1].x - line_end[0].x) >= lineLength ||\n" "fabs(line_end[1].y - line_end[0].y) >= lineLength;\n" "if (good_line)\n" "{\n" "int index = atomic_inc(lines_index);\n" "if (index < linesMax)\n" "lines[index] = (int4)(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);\n" "}\n" "gap = 0;\n" "inLine = false;\n" "}\n" "}\n" "p0 = p0 + dir;\n" "if (p0.x < 0 || p0.x >= src_cols || p0.y < 0 || p0.y >= src_rows)\n" "{\n" "if (inLine)\n" "{\n" "bool good_line = fabs(line_end[1].x - line_end[0].x) >= lineLength ||\n" "fabs(line_end[1].y - line_end[0].y) >= lineLength;\n" "if (good_line)\n" "{\n" "int index = atomic_inc(lines_index);\n" "if (index < linesMax)\n" "lines[index] = (int4)(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);\n" "}\n" "}\n" "break;\n" "}\n" "}\n" "}\n" "}\n" "}\n" "#endif\n" , "1a16d01d003274c100d23519d745047f"}; ProgramSource hough_lines_oclsrc(hough_lines.programStr); const struct ProgramEntry integral_sum={"integral_sum", "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" "#ifndef LOCAL_SUM_SIZE\n" "#define LOCAL_SUM_SIZE 16\n" "#endif\n" "#define LOCAL_SUM_STRIDE (LOCAL_SUM_SIZE + 1)\n" 
"kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols,\n" "__global uchar *buf_ptr, int buf_step, int buf_offset\n" "#ifdef SUM_SQUARE\n" ",__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset\n" "#endif\n" ")\n" "{\n" "__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];\n" "#ifdef SUM_SQUARE\n" "__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];\n" "#endif\n" "int lid = get_local_id(0);\n" "int gid = get_group_id(0);\n" "int x = get_global_id(0);\n" "int src_index = x + src_offset;\n" "sumT accum = 0;\n" "#ifdef SUM_SQUARE\n" "sumSQT accum_sq = 0;\n" "#endif\n" "for (int y = 0; y < rows; y += LOCAL_SUM_SIZE)\n" "{\n" "int lsum_index = lid;\n" "#pragma unroll\n" "for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE)\n" "{\n" "if ((x < cols) && (y + yin < rows))\n" "{\n" "__global const uchar *src = src_ptr + src_index;\n" "accum += src[0];\n" "#ifdef SUM_SQUARE\n" "sumSQT temp = src[0] * src[0];\n" "accum_sq += temp;\n" "#endif\n" "}\n" "lm_sum[lsum_index] = accum;\n" "#ifdef SUM_SQUARE\n" "lm_sum_sq[lsum_index] = accum_sq;\n" "#endif\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset));\n" "#ifdef SUM_SQUARE\n" "int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset));\n" "#endif\n" "lsum_index = LOCAL_SUM_STRIDE * lid;\n" "#pragma unroll\n" "for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++)\n" "{\n" "__global sumT *buf = (__global sumT *)(buf_ptr + buf_index);\n" "buf[0] = lm_sum[lsum_index];\n" "buf_index += buf_step;\n" "#ifdef SUM_SQUARE\n" "__global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index);\n" "bufsq[0] = lm_sum_sq[lsum_index];\n" "buf_sq_index += buf_sq_step;\n" "#endif\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "}\n" "kernel void 
integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset,\n" "#ifdef SUM_SQUARE\n" "__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset,\n" "#endif\n" "__global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols\n" "#ifdef SUM_SQUARE\n" ",__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset\n" "#endif\n" ")\n" "{\n" "__local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];\n" "#ifdef SUM_SQUARE\n" "__local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];\n" "#endif\n" "int lid = get_local_id(0);\n" "int gid = get_group_id(0);\n" "int gs = get_global_size(0);\n" "int x = get_global_id(0);\n" "__global sumT *dst = (__global sumT *)(dst_ptr + dst_offset);\n" "for (int xin = x; xin < cols; xin += gs)\n" "{\n" "dst[xin] = 0;\n" "}\n" "dst_offset += dst_step;\n" "if (x < rows - 1)\n" "{\n" "dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset));\n" "dst[0] = 0;\n" "}\n" "int buf_index = mad24((int)sizeof(sumT), x, buf_offset);\n" "sumT accum = 0;\n" "#ifdef SUM_SQUARE\n" "__global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset);\n" "for (int xin = x; xin < cols; xin += gs)\n" "{\n" "dst_sq[xin] = 0;\n" "}\n" "dst_sq_offset += dst_sq_step;\n" "if (x < rows - 1)\n" "{\n" "dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset));\n" "dst_sq[0] = 0;\n" "}\n" "int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset);\n" "sumSQT accum_sq = 0;\n" "#endif\n" "for (int y = 1; y < cols; y += LOCAL_SUM_SIZE)\n" "{\n" "int lsum_index = lid;\n" "#pragma unroll\n" "for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE)\n" "{\n" "__global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index);\n" "accum += buf[0];\n" "lm_sum[lsum_index] = accum;\n" "buf_index += buf_step;\n" "#ifdef SUM_SQUARE\n" "__global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index);\n" "accum_sq += buf_sq[0];\n" "lm_sum_sq[lsum_index] = 
accum_sq;\n" "buf_sq_index += buf_sq_step;\n" "#endif\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "if (y + lid < cols)\n" "{\n" "int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset));\n" "#ifdef SUM_SQUARE\n" "int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset));\n" "#endif\n" "lsum_index = LOCAL_SUM_STRIDE * lid;\n" "int yin_max = min(rows - 1 - LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE);\n" "#pragma unroll\n" "for (int yin = 0; yin < yin_max; yin++, lsum_index++)\n" "{\n" "dst = (__global sumT *)(dst_ptr + dst_index);\n" "dst[0] = lm_sum[lsum_index];\n" "dst_index += dst_step;\n" "#ifdef SUM_SQUARE\n" "dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index);\n" "dst_sq[0] = lm_sum_sq[lsum_index];\n" "dst_sq_index += dst_sq_step;\n" "#endif\n" "}\n" "}\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "}\n" , "ce49fba6c7a369504177acc108203a38"}; ProgramSource integral_sum_oclsrc(integral_sum.programStr); const struct ProgramEntry laplacian5={"laplacian5", "#define noconvert\n" "#ifdef ONLY_SUM_CONVERT\n" "__kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1_offset,\n" "__global const uchar * src2ptr, int src2_step, int src2_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "coeffT scale, coeffT delta)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1);\n" "if (y < dst_rows && x < dst_cols)\n" "{\n" "int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(srcT), src1_offset));\n" "int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(srcT), src2_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));\n" "__global const srcT * src1 = (__global const srcT *)(src1ptr + src1_index);\n" "__global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);\n" "__global dstT * dst = (__global dstT *)(dstptr + dst_index);\n" "#if wdepth <= 4\n" 
/* Tail of the sumConvert kernel inside the embedded OpenCL text of the laplacian5 program entry.
 * Every quoted piece below is one line of kernel source; adjacent literals are concatenated by the
 * compiler into a single string, so the text inside the quotes is runtime data, not C++ code.
 * NOTE(review): the entry is closed by a 32-hex-digit literal that is presumably a digest of this
 * text, and the file header marks the file as auto-generated, so kernel changes most likely belong
 * in the generator input rather than here. Confirm before editing any literal by hand. */
/* wdepth <= 4 path: dst = scale * (src1 + src2) + delta computed through mad24. */
"dst[0] = convertToDT( mad24((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );\n"
"#else\n"
/* Other wdepth values: same formula through mad. */
"dst[0] = convertToDT( mad((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );\n"
"#endif\n"
"}\n"
"}\n"
/* Everything from here to the final #endif of this entry is compiled only when
 * ONLY_SUM_CONVERT is not defined. */
"#else\n"
/* EXTRAPOLATE(x, maxV) rewrites the coordinate x in place according to the selected border mode.
 * It expands to nothing for BORDER_CONSTANT; a build without any BORDER_* definition hits #error. */
"#ifdef BORDER_CONSTANT\n"
"#define EXTRAPOLATE(x, maxV)\n"
"#elif defined BORDER_REPLICATE\n"
"#define EXTRAPOLATE(x, maxV) \\\n"
"{ \\\n"
"(x) = clamp((x), 0, (maxV)-1); \\\n"
"}\n"
"#elif defined BORDER_WRAP\n"
"#define EXTRAPOLATE(x, maxV) \\\n"
"{ \\\n"
"(x) = ( (x) + (maxV) ) % (maxV); \\\n"
"}\n"
"#elif defined BORDER_REFLECT\n"
"#define EXTRAPOLATE(x, maxV) \\\n"
"{ \\\n"
"(x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \\\n"
"}\n"
"#elif defined BORDER_REFLECT_101\n"
"#define EXTRAPOLATE(x, maxV) \\\n"
"{ \\\n"
"(x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \\\n"
"}\n"
"#else\n"
"#error No extrapolation method\n"
"#endif\n"
/* Pixel access helpers. CN == 3 goes through vload3/vstore3 on the scalar types srcT1/dstT1 and
 * uses a byte size of three scalars; every other CN dereferences srcT/dstT directly. */
"#if CN != 3\n"
"#define loadpix(addr) *(__global const srcT *)(addr)\n"
"#define storepix(val, addr) *(__global dstT *)(addr) = val\n"
"#define SRCSIZE (int)sizeof(srcT)\n"
"#define DSTSIZE (int)sizeof(dstT)\n"
"#else\n"
"#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))\n"
"#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))\n"
"#define SRCSIZE (int)sizeof(srcT1)*3\n"
"#define DSTSIZE (int)sizeof(dstT1)*3\n"
"#endif\n"
/* SRC reads the pixel at column _x, row _y relative to Src and converts it with convertToWT.
 * With BORDER_CONSTANT, ELEM yields const_v for coordinates outside [0, r_edge) x [0, t_edge);
 * in every other border mode it is a plain SRC read.
 * NOTE(review): the ELEM expansion is not wrapped in parentheses, so it is only safe as a complete
 * right-hand side, which is how the kernel below uses it. */
"#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))\n"
"#ifdef BORDER_CONSTANT\n"
"#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? 
(const_v) : SRC((_x),(_y))\n"
"#else\n"
"#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))\n"
"#endif\n"
/* Coefficient tables filled from the KERNEL_MATRIX_X / KERNEL_MATRIX_Y build options.
 * NOTE(review): DIG(a) presumably lets the host spell those options as a run of DIG(value) items;
 * confirm against the host code that builds the options string. */
"#define DIG(a) a,\n"
"__constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X };\n"
"__constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y };\n"
/* laplacian kernel.
 * lsmem holds a tile of BLK_Y + 2*RADIUS rows by BLK_X + 2*RADIUS columns of converted source
 * values; lsmemDy1 and lsmemDy2 hold one BLK_Y-row band of intermediate sums. */
"__kernel void laplacian(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,\n"
"__global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"WT1 scale, WT1 delta)\n"
"{\n"
"__local WT lsmem[BLK_Y + 2 * RADIUS][BLK_X + 2 * RADIUS];\n"
"__local WT lsmemDy1[BLK_Y][BLK_X + 2 * RADIUS];\n"
"__local WT lsmemDy2[BLK_Y][BLK_X + 2 * RADIUS];\n"
"int lix = get_local_id(0);\n"
"int liy = get_local_id(1);\n"
"int x = get_global_id(0);\n"
"int srcX = x + srcOffsetX - RADIUS;\n"
/* Initial fill of the whole tile: each work item strides by BLK_Y rows and BLK_X columns,
 * applying EXTRAPOLATE to both coordinates before the ELEM read. */
"int clocY = liy;\n"
"do\n"
"{\n"
"int yb = clocY + srcOffsetY - RADIUS;\n"
"EXTRAPOLATE(yb, (height));\n"
"int clocX = lix;\n"
"int cSrcX = srcX;\n"
"do\n"
"{\n"
"int xb = cSrcX;\n"
"EXTRAPOLATE(xb,(width));\n"
"lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );\n"
"clocX += BLK_X;\n"
"cSrcX += BLK_X;\n"
"}\n"
"while(clocX < BLK_X+(RADIUS*2));\n"
"clocY += BLK_Y;\n"
"}\n"
"while (clocY < BLK_Y+(RADIUS*2));\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"WT scale_v = (WT)scale;\n"
"WT delta_v = (WT)delta;\n"
/* Main loop: the destination is processed in bands of BLK_Y rows. */
"for (int y = 0; y < dst_rows; y+=BLK_Y)\n"
"{\n"
"int i, clocX = lix;\n"
"WT sum1 = (WT) 0;\n"
"WT sum2 = (WT) 0;\n"
/* First pass, over the 2*RADIUS+1 tile rows starting at this work item row:
 * sum1 is weighted by mat_kernelY and sum2 by mat_kernelX. */
"do\n"
"{\n"
"sum1 = (WT) 0;\n"
"sum2 = (WT) 0;\n"
"for (i=0; i<=2*RADIUS; i++)\n"
"{\n"
"sum1 = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum1);\n"
"sum2 = mad(lsmem[liy + i][clocX], mat_kernelX[i], sum2);\n"
"}\n"
"lsmemDy1[liy][clocX] = sum1;\n"
"lsmemDy2[liy][clocX] = sum2;\n"
"clocX += BLK_X;\n"
"}\n"
"while(clocX < BLK_X+(RADIUS*2));\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
/* Second pass, over 2*RADIUS+1 columns of the intermediates with the opposite table,
 * then store scale * (sum1 + sum2) + delta for in-range outputs only. */
"if ((x < dst_cols) && (y + liy < dst_rows))\n"
"{\n"
"sum1 = (WT) 0;\n"
"sum2 = (WT) 0;\n"
"for (i=0; i<=2*RADIUS; i++)\n"
"{\n"
"sum1 = mad(lsmemDy1[liy][lix+i], mat_kernelX[i], sum1);\n"
"sum2 = 
mad(lsmemDy2[liy][lix+i], mat_kernelY[i], sum2);\n"
"}\n"
"WT sum = mad(scale_v, (sum1 + sum2), delta_v);\n"
"storepix(convertToDT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));\n"
"}\n"
/* Copy tile rows BLK_Y and onward into rows 0 .. 2*RADIUS-1 so the next band can reuse them.
 * NOTE(review): this loop redeclares i, clocX and clocY, shadowing the outer variables of the
 * same names. */
"for (int i = liy * BLK_X + lix; i < (RADIUS*2) * (BLK_X+(RADIUS*2)); i += BLK_X * BLK_Y)\n"
"{\n"
"int clocX = i % (BLK_X+(RADIUS*2));\n"
"int clocY = i / (BLK_X+(RADIUS*2));\n"
"lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
/* Load BLK_Y fresh source rows into tile rows 2*RADIUS and onward for the next band. */
"int yb = y + liy + BLK_Y + srcOffsetY + RADIUS;\n"
"EXTRAPOLATE(yb, (height));\n"
"clocX = lix;\n"
"int cSrcX = x + srcOffsetX - RADIUS;\n"
"do\n"
"{\n"
"int xb = cSrcX;\n"
"EXTRAPOLATE(xb,(width));\n"
"lsmem[liy + 2*RADIUS][clocX] = ELEM(xb, yb, (width), (height), 0 );\n"
"clocX += BLK_X;\n"
"cSrcX += BLK_X;\n"
"}\n"
"while(clocX < BLK_X+(RADIUS*2));\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"}\n"
"#endif\n"
/* Closing literal of the laplacian5 entry, followed by the ProgramSource object built from
 * its programStr member. */
, "3ce3fc1a1c2e6be3a8fd0d2f51afeaf1"};
ProgramSource laplacian5_oclsrc(laplacian5.programStr);
/* match_template program entry: name literal, then the kernel text. */
const struct ProgramEntry match_template={"match_template",
/* loadpix/TSIZE: vload3 on T1 when cn is 3, direct access through T otherwise. */
"#if cn != 3\n"
"#define loadpix(addr) *(__global const T *)(addr)\n"
"#define TSIZE (int)sizeof(T)\n"
"#else\n"
"#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
"#define TSIZE ((int)sizeof(T1)*3)\n"
"#endif\n"
/* Index helpers. They rely on x, y and the step/offset variables being in scope where they are
 * expanded. SQSUMS_PTR and SUMS_PTR scale the column by cn; SUMS and SQ_SUMS scale it by
 * sizeof(T1)*cn. */
"#define SQSUMS_PTR(ox, oy) mad24(y + oy, src_sqsums_step, mad24(x + ox, cn, src_sqsums_offset))\n"
"#define SUMS_PTR(ox, oy) mad24(y + oy, src_sums_step, mad24(x + ox, cn, src_sums_offset))\n"
"#define SUMS(ox, oy) mad24(y+oy, src_sums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sums_offset))\n"
"#define SQ_SUMS(ox, oy) mad24(y+oy, src_sqsums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sqsums_offset))\n"
/* normAcc: num / denum when |num| < denum, plus or minus one when |num| < 1.125 * denum,
 * otherwise 0. */
"inline float normAcc(float num, float denum)\n"
"{\n"
"if (fabs(num) < denum)\n"
"return num / denum;\n"
"if (fabs(num) < denum * 1.125f)\n"
"return num > 0 ? 
1 : -1;\n"
"return 0;\n"
"}\n"
/* normAcc_SQDIFF: same rules as normAcc except that the final fallback value is 1. */
"inline float normAcc_SQDIFF(float num, float denum)\n"
"{\n"
"if (fabs(num) < denum)\n"
"return num / denum;\n"
"if (fabs(num) < denum * 1.125f)\n"
"return num > 0 ? 1 : -1;\n"
"return 1;\n"
"}\n"
/* convertToDT adds up the cn components of a value and casts the sum to float. */
"#define noconvert\n"
"#if cn == 1\n"
"#define convertToDT(value) (float)(value)\n"
"#elif cn == 2\n"
"#define convertToDT(value) (float)(value.x + value.y)\n"
"#elif cn == 3\n"
"#define convertToDT(value) (float)(value.x + value.y + value.z)\n"
"#elif cn == 4\n"
"#define convertToDT(value) (float)(value.x + value.y + value.z + value.w)\n"
"#else\n"
"#error \"cn should be 1-4\"\n"
"#endif\n"
/* calcSum (CALC_SUM builds): each work item accumulates squared pixel values for the ids
 * id, id + WGS, and so on below total; the partial sums are then reduced in localmem and
 * work item 0 writes convertToDT of the result to dst[0]. */
"#ifdef CALC_SUM\n"
"__kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset,\n"
"int cols, int total, __global float * dst)\n"
"{\n"
"int lid = get_local_id(0), id = get_global_id(0);\n"
"__local WT localmem[WGS2_ALIGNED];\n"
"WT accumulator = (WT)(0), tmp;\n"
"for ( ; id < total; id += WGS)\n"
"{\n"
"int src_index = mad24(id / cols, src_step, mad24(id % cols, TSIZE, src_offset));\n"
"T src = loadpix(srcptr + src_index);\n"
"tmp = convertToWT(src);\n"
"accumulator = mad(tmp, tmp, accumulator);\n"
"}\n"
/* Work items below WGS2_ALIGNED seed localmem; the remaining ones add their partial sum on top. */
"if (lid < WGS2_ALIGNED)\n"
"localmem[lid] = accumulator;\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)\n"
"localmem[lid - WGS2_ALIGNED] += accumulator;\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
/* Halving reduction over localmem. */
"for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)\n"
"{\n"
"if (lid < lsize)\n"
"{\n"
"int lid2 = lsize + lid;\n"
"localmem[lid] += localmem[lid2];\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"}\n"
"if (lid == 0)\n"
"dst[0] = convertToDT(localmem[0]);\n"
"}\n"
/* extractFirstChannel (FIRST_CHANNEL builds): reads the first T1 component of each pixel and
 * stores it as float, up to PIX_PER_WI_Y rows per work item.
 * NOTE(review): the load statement in the loop body ends in a doubled semicolon. */
"#elif defined FIRST_CHANNEL\n"
"__kernel void extractFirstChannel( const __global uchar* img, int img_step, int img_offset,\n"
"__global uchar* res, int res_step, int res_offset, int rows, int cols)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1)*PIX_PER_WI_Y;\n"
"if(x < cols )\n"
"{\n"
"#pragma unroll\n"
"for (int 
cy=0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y)\n"
"{\n"
"T1 image = *(__global const T1*)(img + mad24(y, img_step, mad24(x, (int)sizeof(T1)*cn, img_offset)));;\n"
"int res_idx = mad24(y, res_step, mad24(x, (int)sizeof(float), res_offset));\n"
"*(__global float *)(res + res_idx) = image;\n"
"}\n"
"}\n"
"}\n"
/* matchTemplate_Naive_CCORR, variant selected when cn == 1 and PIX_PER_WI_X == 4:
 * each work item handles PIX_PER_WI_X adjacent output columns starting at x0 in row y. */
"#elif defined CCORR\n"
"#if cn==1 && PIX_PER_WI_X==4\n"
"__kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset,\n"
"__global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
"{\n"
"int x0 = get_global_id(0)*PIX_PER_WI_X;\n"
"int y = get_global_id(1);\n"
"if (y < dst_rows)\n"
"{\n"
/* All outputs of this work item are in range: one vload4 per template element and a single
 * float4 store at the end. */
"if (x0 + PIX_PER_WI_X <= dst_cols)\n"
"{\n"
"WT sum = (WT)(0);\n"
"int ind = mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset));\n"
"__global const T1 * template = (__global const T1*)(templateptr + template_offset);\n"
"for (int i = 0; i < template_rows; ++i)\n"
"{\n"
"for (int j = 0; j < template_cols; ++j)\n"
"{\n"
"T temp = (T)(template[j]);\n"
"T src = vload4(0, (__global const T1*)(srcptr + ind + j*(int)sizeof(T1)));\n"
"sum = mad(convertToWT(src), convertToWT(temp), sum);\n"
"}\n"
"ind += src_step;\n"
"template = (__global const T1 *)((__global const uchar *)template + template_step);\n"
"}\n"
/* NOTE(review): temp is never used after this load, and template has already been advanced
 * template_rows times, so the load addresses the row after the last template row.
 * Candidate for removal in the kernel source. */
"T temp = (T)(template[0]);\n"
"int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset));\n"
"*(__global float4 *)(dst + dst_idx) = convert_float4(sum);\n"
"}\n"
/* Otherwise: per-column scalar accumulation, limited to columns below dst_cols. */
"else\n"
"{\n"
"WT1 sum [PIX_PER_WI_X];\n"
"#pragma unroll\n"
"for (int i=0; i < PIX_PER_WI_X; i++) sum[i] = 0;\n"
"__global const T1 * src = (__global const T1 *)(srcptr + mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset)));\n"
"__global const T1 * template = (__global const T1 *)(templateptr + template_offset);\n"
"for (int i = 0; i < template_rows; ++i)\n"
"{\n"
"for (int j = 0; j < template_cols; ++j)\n" 
/* Continuation of the embedded OpenCL text of the match_template program entry, starting at the
 * body of the inner template-column loop in the scalar branch of the four-pixel
 * matchTemplate_Naive_CCORR variant. Every quoted piece is one line of kernel source; adjacent
 * literals are concatenated by the compiler, so the text inside the quotes is runtime data.
 * NOTE(review): the entry is closed by a 32-hex-digit literal that is presumably a digest of this
 * text, and the file header marks the file as auto-generated; confirm before editing any literal
 * by hand. */
"{\n"
"#pragma unroll\n"
"for (int cx=0, x = x0; cx < PIX_PER_WI_X && x < dst_cols; ++cx, ++x)\n"
"{\n"
"sum[cx] = mad(convertToWT1(src[j+cx]), convertToWT1(template[j]), sum[cx]);\n"
"}\n"
"}\n"
"src = (__global const T1 *)((__global const uchar *)src + src_step);\n"
"template = (__global const T1 *)((__global const uchar *)template + template_step);\n"
"}\n"
/* Write back only the outputs whose column is below dst_cols. */
"#pragma unroll\n"
"for (int cx=0; cx < PIX_PER_WI_X && x0 < dst_cols; ++cx, ++x0)\n"
"{\n"
"int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset));\n"
"*(__global float *)(dst + dst_idx) = convertToDT(sum[cx]);\n"
"}\n"
"}\n"
"}\n"
"}\n"
/* Generic matchTemplate_Naive_CCORR: one output per work item, accumulating
 * src * template over the template_rows x template_cols window. */
"#else\n"
"__kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset,\n"
"__global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
"WT sum = (WT)(0);\n"
"for (int i = 0; i < template_rows; ++i)\n"
"{\n"
"for (int j = 0; j < template_cols; ++j)\n"
"{\n"
"T src = loadpix(srcptr + mad24(y+i, src_step, mad24(x+j, TSIZE, src_offset)));\n"
"T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset)));\n"
"sum = mad(convertToWT(src), convertToWT(template), sum);\n"
"}\n"
"}\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"*(__global float *)(dst + dst_idx) = convertToDT(sum);\n"
"}\n"
"}\n"
"#endif\n"
/* matchTemplate_CCORR_NORMED: combines four src_sqsums entries (window corners at offsets 0 and
 * template_cols, template_rows) into image_sqsum_ and replaces the existing dst value with
 * normAcc(dst, sqrt(image_sqsum_ * template_sqsum[0])). The step and offset are divided by
 * sizeof(float) so that SQSUMS_PTR indexes float elements. */
"#elif defined CCORR_NORMED\n"
"__kernel void matchTemplate_CCORR_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"int template_rows, int template_cols, __global const float * template_sqsum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < 
dst_rows)\n"
"{\n"
"__global const float * sqsum = (__global const float *)(src_sqsums);\n"
"src_sqsums_step /= sizeof(float);\n"
"src_sqsums_offset /= sizeof(float);\n"
"float image_sqsum_ = (float)(sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)] -\n"
"sqsum[SQSUMS_PTR(0, template_rows)] + sqsum[SQSUMS_PTR(0, 0)]);\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"__global float * dstult = (__global float *)(dst + dst_idx);\n"
"*dstult = normAcc(*dstult, sqrt(image_sqsum_ * template_sqsum[0]));\n"
"}\n"
"}\n"
/* matchTemplate_Naive_SQDIFF: one output per work item, accumulating the squared difference
 * between src and template over the template window. */
"#elif defined SQDIFF\n"
"__kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_step, int src_offset,\n"
"__global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
"WT sum = (WT)(0), value;\n"
"for (int i = 0; i < template_rows; ++i)\n"
"{\n"
"for (int j = 0; j < template_cols; ++j)\n"
"{\n"
"T src = loadpix(srcptr + mad24(y+i, src_step, mad24(x+j, TSIZE, src_offset)));\n"
"T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset)));\n"
"value = convertToWT(src) - convertToWT(template);\n"
"sum = mad(value, value, sum);\n"
"}\n"
"}\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"*(__global float *)(dst + dst_idx) = convertToDT(sum);\n"
"}\n"
"}\n"
/* matchTemplate_Prepared_SQDIFF: dst = image_sqsum_ - 2 * dst + template_sqsum[0], where
 * image_sqsum_ comes from the four-corner lookup in src_sqsums and dst is read before being
 * overwritten. */
"#elif defined SQDIFF_PREPARED\n"
"__kernel void matchTemplate_Prepared_SQDIFF(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"int template_rows, int template_cols, __global const float * template_sqsum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < 
dst_cols && y < dst_rows)\n"
"{\n"
"src_sqsums_step /= sizeof(float);\n"
"src_sqsums_offset /= sizeof(float);\n"
"__global const float * sqsum = (__global const float *)(src_sqsums);\n"
"float image_sqsum_ = (float)(\n"
"(sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) -\n"
"(sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)]));\n"
"float template_sqsum_value = template_sqsum[0];\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"__global float * dstult = (__global float *)(dst + dst_idx);\n"
"*dstult = image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value;\n"
"}\n"
"}\n"
/* matchTemplate_SQDIFF_NORMED: same numerator as the prepared SQDIFF kernel, passed through
 * normAcc_SQDIFF with denominator sqrt(image_sqsum_ * template_sqsum[0]). */
"#elif defined SQDIFF_NORMED\n"
"__kernel void matchTemplate_SQDIFF_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"int template_rows, int template_cols, __global const float * template_sqsum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
"src_sqsums_step /= sizeof(float);\n"
"src_sqsums_offset /= sizeof(float);\n"
"__global const float * sqsum = (__global const float *)(src_sqsums);\n"
"float image_sqsum_ = (float)(\n"
"(sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) -\n"
"(sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)]));\n"
"float template_sqsum_value = template_sqsum[0];\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"__global float * dstult = (__global float *)(dst + dst_idx);\n"
"*dstult = normAcc_SQDIFF(image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value, sqrt(image_sqsum_ * template_sqsum_value));\n"
"}\n"
"}\n"
/* matchTemplate_Prepared_CCOEFF: subtracts template_sum times the window sum (four-corner lookup
 * in src_sums) from the existing dst value. One variant per channel count follows.
 * cn == 1: scalar template_sum, src_sums addressed through a typed pointer and an element step. */
"#elif defined CCOEFF\n"
"#if cn == 1\n"
"__kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
"__global uchar * dst, int dst_step, int dst_offset, 
int dst_rows, int dst_cols,\n"
"int template_rows, int template_cols, float template_sum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
"__global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset)));\n"
"int step = src_sums_step/(int)sizeof(T);\n"
"T image_sum = (T)(0), value;\n"
"value = (T)(sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]);\n"
"image_sum = mad(value, template_sum , image_sum);\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"*(__global float *)(dst + dst_idx) -= convertToDT(image_sum);\n"
"}\n"
"}\n"
/* cn == 3: template_sum arrives as float4, of which x, y and z are used; the corner values are
 * read with vload3 at byte offsets produced by SUMS. */
"#elif cn==3\n"
"__kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"int template_rows, int template_cols, float4 template_sum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
"T image_sum = (T)(0), value, temp_sum;\n"
"temp_sum.x = template_sum.x;\n"
"temp_sum.y = template_sum.y;\n"
"temp_sum.z = template_sum.z;\n"
"value = vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, template_rows)));\n"
"value -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, template_rows)));\n"
"value -= vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, 0)));\n"
"value += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0)));\n"
"image_sum = mad(value, temp_sum , 0);\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"*(__global float *)(dst + dst_idx) -= convertToDT(image_sum);\n"
"}\n"
"}\n"
/* cn == 2 or 4: typed pointer addressing as in the cn == 1 variant; temp_sum takes x and y of
 * template_sum for cn == 2 and the whole float4 otherwise. */
"#elif (cn==2 || cn==4)\n"
"__kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
"__global uchar * dst, int 
dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"int template_rows, int template_cols, float4 template_sum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
"__global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset)));\n"
"int step = src_sums_step/(int)sizeof(T);\n"
"T image_sum = (T)(0), value, temp_sum;\n"
"#if cn==2\n"
"temp_sum.x = template_sum.x;\n"
"temp_sum.y = template_sum.y;\n"
"#else\n"
"temp_sum = template_sum;\n"
"#endif\n"
"value = (sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]);\n"
"image_sum = mad(value, temp_sum , image_sum);\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"*(__global float *)(dst + dst_idx) -= convertToDT(image_sum);\n"
"}\n"
"}\n"
"#else\n"
"#error \"cn should be 1-4\"\n"
"#endif\n"
/* matchTemplate_CCOEFF_NORMED: num is the window sum times template_sum; denum is
 * sqrt(template_sqsum * (window sqsum - weight * window sum squared)), with convertToDT adding
 * up the channels; the existing dst value is replaced by normAcc(dst - num, denum).
 * One variant per channel count follows. */
"#elif defined CCOEFF_NORMED\n"
"#if cn == 1\n"
"__kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
"__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"int t_rows, int t_cols, float weight, float template_sum, float template_sqsum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
/* NOTE(review): sum_ and sqsum_ are declared but never used in this kernel. */
"float sum_[2];\n"
"float sqsum_[2];\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
/* NOTE(review): sqsum below is indexed with step, which is derived from src_sums_step rather than
 * src_sqsums_step; this assumes both buffers share the same row step. Confirm with the host code. */
"int step = src_sums_step/(int)sizeof(T);\n"
"__global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset)));\n"
"__global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset)));\n"
"T value_sum = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0];\n"
"T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] 
- sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0];\n"
"float num = convertToDT(mad(value_sum, template_sum, 0));\n"
"value_sqsum -= weight * value_sum * value_sum;\n"
"float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"__global float * dstult = (__global float *)(dst+dst_idx);\n"
"*dstult = normAcc((*dstult) - num, denum);\n"
"}\n"
"}\n"
/* cn == 3: corner values are read with vload3 at byte offsets produced by SUMS and SQ_SUMS.
 * NOTE(review): step is computed but never used in this variant. */
"#elif cn==3\n"
"__kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
"__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
"int step = src_sums_step/(int)sizeof(T);\n"
"T temp_sum, value_sum, value_sqsum;\n"
"temp_sum.x = template_sum.x;\n"
"temp_sum.y = template_sum.y;\n"
"temp_sum.z = template_sum.z;\n"
"value_sum = vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, t_rows)));\n"
"value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, t_rows)));\n"
"value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, 0)));\n"
"value_sum += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0)));\n"
"value_sqsum = vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, t_rows)));\n"
"value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, t_rows)));\n"
"value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, 0)));\n"
"value_sqsum += vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, 0)));\n"
"float num = convertToDT(mad(value_sum, temp_sum, 0));\n"
"value_sqsum -= weight * value_sum * value_sum;\n"
"float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));\n"
"int dst_idx = mad24(y, 
dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"__global float * dstult = (__global float *)(dst+dst_idx);\n"
"*dstult = normAcc((*dstult) - num, denum);\n"
"}\n"
"}\n"
/* cn == 2 or 4: typed pointer addressing as in the cn == 1 variant.
 * NOTE(review): as in the cn == 1 variant, sqsum is indexed with a step derived from
 * src_sums_step. */
"#elif (cn==2 || cn==4)\n"
"__kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,\n"
"__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
"int step = src_sums_step/(int)sizeof(T);\n"
"T temp_sum;\n"
"__global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset)));\n"
"__global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset)));\n"
"T value_sum = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0];\n"
"T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0];\n"
"#if cn==2\n"
"temp_sum.x = template_sum.x;\n"
"temp_sum.y = template_sum.y;\n"
"#else\n"
"temp_sum = template_sum;\n"
"#endif\n"
"float num = convertToDT(mad(value_sum, temp_sum, 0));\n"
"value_sqsum -= weight * value_sum * value_sum;\n"
"float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));\n"
"int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));\n"
"__global float * dstult = (__global float *)(dst+dst_idx);\n"
"*dstult = normAcc((*dstult) - num, denum);\n"
"}\n"
"}\n"
"#else\n"
"#error \"cn should be 1-4\"\n"
"#endif\n"
"#endif\n"
/* Closing literal of the match_template entry, followed by the ProgramSource object built from
 * its programStr member. */
, "b3c29b8efeb2ed66a052794cb7d162cb"};
ProgramSource match_template_oclsrc(match_template.programStr);
/* medianFilter program entry: name literal, then the start of its kernel text. */
const struct ProgramEntry medianFilter={"medianFilter",
"#if cn != 3\n"
"#define 
loadpix(addr) *(__global const T *)(addr)\n" "#define storepix(val, addr) *(__global T *)(addr) = val\n" "#define TSIZE (int)sizeof(T)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n" "#define TSIZE (int)sizeof(T1) * cn\n" "#endif\n" "#define OP(a,b) { mid=a; a=min(a,b); b=max(mid,b);}\n" "#ifdef USE_4OPT\n" "#if cn == 1\n" "#define LOAD4(val, offs) (val) = vload4(0, (__global T1 *)(srcptr + src_index + (offs)))\n" "#define STORE4(val, offs) vstore4((val), 0, (__global T1 *)(dstptr + (offs)))\n" "#define SHUFFLE4_3(src0, src1, src2, dst0, dst1, dst2) { dst1 = src1; \\\n" "dst0 = (T4)(src0, dst1.xyz); \\\n" "dst2 = (T4)(dst1.yzw, src2); }\n" "#define SHUFFLE4_5(src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, dst4) { dst2 = src2; \\\n" "dst0 = (T4)(src0, src1, dst2.xy); \\\n" "dst1 = (T4)(src1, dst2.xyz); \\\n" "dst3 = (T4)(dst2.yzw, src3); \\\n" "dst4 = (T4)(dst2.zw, src3, src4); }\n" "#elif cn == 2\n" "#define LOAD4(val, offs) (val) = vload8(0, (__global T1 *)(srcptr + src_index + (offs)))\n" "#define STORE4(val, offs) vstore8((val), 0, (__global T1 *)(dstptr + (offs)))\n" "#define SHUFFLE4_3(src0, src1, src2, dst0, dst1, dst2) { dst1 = src1; \\\n" "dst0 = (T4)(src0, dst1.s012345); \\\n" "dst2 = (T4)(dst1.s234567, src2); }\n" "#define SHUFFLE4_5(src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, dst4) { dst2 = src2; \\\n" "dst0 = (T4)(src0, src1, dst2.s0123); \\\n" "dst1 = (T4)(src1, dst2.s012345); \\\n" "dst3 = (T4)(dst2.s234567, src3); \\\n" "dst4 = (T4)(dst2.s4567, src3, src4); }\n" "#elif cn == 4\n" "#define LOAD4(val, offs) (val) = vload16(0, (__global T1 *)(srcptr + src_index + (offs)))\n" "#define STORE4(val, offs) vstore16((val), 0, (__global T1 *)(dstptr + (offs)))\n" "#define SHUFFLE4_3(src0, src1, src2, dst0, dst1, dst2) { dst1 = src1; \\\n" "dst0 = (T4)(src0, dst1.s0123456789ab ); \\\n" "dst2 = (T4)(dst1.s456789abcdef, src2); }\n" "#define 
SHUFFLE4_5(src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, dst4) { dst2 = src2; \\\n" "dst0 = (T4)(src0, src1, dst2.s01234567); \\\n" "dst1 = (T4)(src1, dst2.s0123456789ab); \\\n" "dst3 = (T4)(dst2.s456789abcdef, src3); \\\n" "dst4 = (T4)(dst2.s89abcdef, src3, src4); }\n" "#endif\n" "__kernel void medianFilter3_u(__global const uchar* srcptr, int srcStep, int srcOffset,\n" "__global uchar* dstptr, int dstStep, int dstOffset,\n" "int rows, int cols)\n" "{\n" "int gx= get_global_id(0) << 2;\n" "int gy= get_global_id(1) << 2;\n" "if( gy >= rows || gx >= cols)\n" "return;\n" "T c0; T4 c1; T c2;\n" "T c3; T4 c4; T c5;\n" "T c6; T4 c7; T c8;\n" "int x_left = mad24(max(gx-1, 0), TSIZE, srcOffset);\n" "int x_central = mad24(gx, TSIZE, srcOffset);\n" "int x_right = mad24(min(gx+4, cols-1), TSIZE, srcOffset);\n" "int xdst = mad24(gx, TSIZE, dstOffset);\n" "int src_index = max(gy-1, 0)*srcStep;\n" "c0 = *(__global T *)(srcptr + src_index + x_left);\n" "LOAD4(c1, x_central);\n" "c2 = *(__global T *)(srcptr + src_index + x_right);\n" "src_index = gy*srcStep;\n" "c3 = *(__global T *)(srcptr + src_index + x_left);\n" "LOAD4(c4, x_central);\n" "c5 = *(__global T *)(srcptr + src_index + x_right);\n" "#define ITER3(k) { \\\n" "src_index = min(gy+k+1, rows-1)*srcStep; \\\n" "c6 = *(__global T *)(srcptr + src_index + x_left); \\\n" "LOAD4(c7, x_central); \\\n" "c8 = *(__global T *)(srcptr + src_index + x_right); \\\n" "T4 p0, p1, p2, p3, p4, p5, p6, p7, p8; \\\n" "SHUFFLE4_3(c0, c1, c2, p0, p1, p2); \\\n" "SHUFFLE4_3(c3, c4, c5, p3, p4, p5); \\\n" "SHUFFLE4_3(c6, c7, c8, p6, p7, p8); \\\n" "T4 mid; \\\n" "OP(p1, p2); OP(p4, p5); OP(p7, p8); OP(p0, p1); \\\n" "OP(p3, p4); OP(p6, p7); OP(p1, p2); OP(p4, p5); \\\n" "OP(p7, p8); OP(p0, p3); OP(p5, p8); OP(p4, p7); \\\n" "OP(p3, p6); OP(p1, p4); OP(p2, p5); OP(p4, p7); \\\n" "OP(p4, p2); OP(p6, p4); OP(p4, p2); \\\n" "int dst_index = mad24( gy+k, dstStep, xdst); \\\n" "STORE4(p4, dst_index); \\\n" "c0 = c3; c1 = c4; c2 = c5; \\\n" 
"c3 = c6; c4 = c7; c5 = c8; \\\n" "}\n" "ITER3(0);\n" "ITER3(1);\n" "ITER3(2);\n" "ITER3(3);\n" "}\n" "__kernel void medianFilter5_u(__global const uchar* srcptr, int srcStep, int srcOffset,\n" "__global uchar* dstptr, int dstStep, int dstOffset,\n" "int rows, int cols)\n" "{\n" "int gx= get_global_id(0) << 2;\n" "int gy= get_global_id(1) << 2;\n" "if( gy >= rows || gx >= cols)\n" "return;\n" "T c0; T c1; T4 c2; T c3; T c4;\n" "T c5; T c6; T4 c7; T c8; T c9;\n" "T c10; T c11; T4 c12; T c13; T c14;\n" "T c15; T c16; T4 c17; T c18; T c19;\n" "T c20; T c21; T4 c22; T c23; T c24;\n" "int x_leftmost = mad24(max(gx-2, 0), TSIZE, srcOffset);\n" "int x_left = mad24(max(gx-1, 0), TSIZE, srcOffset);\n" "int x_central = mad24(gx, TSIZE, srcOffset);\n" "int x_right = mad24(min(gx+4, cols-1), TSIZE, srcOffset);\n" "int x_rightmost= mad24(min(gx+5, cols-1), TSIZE, srcOffset);\n" "int xdst = mad24(gx, TSIZE, dstOffset);\n" "int src_index = max(gy-2, 0)*srcStep;\n" "c0 = *(__global T *)(srcptr + src_index + x_leftmost);\n" "c1 = *(__global T *)(srcptr + src_index + x_left);\n" "LOAD4(c2, x_central);\n" "c3 = *(__global T *)(srcptr + src_index + x_right);\n" "c4 = *(__global T *)(srcptr + src_index + x_rightmost);\n" "src_index = max(gy-1, 0)*srcStep;\n" "c5 = *(__global T *)(srcptr + src_index + x_leftmost);\n" "c6 = *(__global T *)(srcptr + src_index + x_left);\n" "LOAD4(c7, x_central);\n" "c8 = *(__global T *)(srcptr + src_index + x_right);\n" "c9 = *(__global T *)(srcptr + src_index + x_rightmost);\n" "src_index = gy*srcStep;\n" "c10 = *(__global T *)(srcptr + src_index + x_leftmost);\n" "c11 = *(__global T *)(srcptr + src_index + x_left);\n" "LOAD4(c12, x_central);\n" "c13 = *(__global T *)(srcptr + src_index + x_right);\n" "c14 = *(__global T *)(srcptr + src_index + x_rightmost);\n" "src_index = (gy+1)*srcStep;\n" "c15 = *(__global T *)(srcptr + src_index + x_leftmost);\n" "c16 = *(__global T *)(srcptr + src_index + x_left);\n" "LOAD4(c17, x_central);\n" "c18 = *(__global T 
*)(srcptr + src_index + x_right);\n" "c19 = *(__global T *)(srcptr + src_index + x_rightmost);\n" "for(int k = 0; k < 4; k++)\n" "{\n" "src_index = min(gy+k+2, rows-1) * srcStep;\n" "c20 = *(__global T *)(srcptr + src_index + x_leftmost);\n" "c21 = *(__global T *)(srcptr + src_index + x_left);\n" "LOAD4(c22, x_central);\n" "c23 = *(__global T *)(srcptr + src_index + x_right);\n" "c24 = *(__global T *)(srcptr + src_index + x_rightmost);\n" "T4 p0, p1, p2, p3, p4,\n" "p5, p6, p7, p8, p9,\n" "p10, p11, p12, p13, p14,\n" "p15, p16, p17, p18, p19,\n" "p20, p21, p22, p23, p24;\n" "SHUFFLE4_5(c0, c1, c2, c3, c4, p0, p1, p2, p3, p4);\n" "SHUFFLE4_5(c5, c6, c7, c8, c9, p5, p6, p7, p8, p9);\n" "SHUFFLE4_5(c10, c11, c12, c13, c14, p10, p11, p12, p13, p14);\n" "SHUFFLE4_5(c15, c16, c17, c18, c19, p15, p16, p17, p18, p19);\n" "SHUFFLE4_5(c20, c21, c22, c23, c24, p20, p21, p22, p23, p24);\n" "T4 mid;\n" "OP(p1, p2); OP(p0, p1); OP(p1, p2); OP(p4, p5); OP(p3, p4);\n" "OP(p4, p5); OP(p0, p3); OP(p2, p5); OP(p2, p3); OP(p1, p4);\n" "OP(p1, p2); OP(p3, p4); OP(p7, p8); OP(p6, p7); OP(p7, p8);\n" "OP(p10, p11); OP(p9, p10); OP(p10, p11); OP(p6, p9); OP(p8, p11);\n" "OP(p8, p9); OP(p7, p10); OP(p7, p8); OP(p9, p10); OP(p0, p6);\n" "OP(p4, p10); OP(p4, p6); OP(p2, p8); OP(p2, p4); OP(p6, p8);\n" "OP(p1, p7); OP(p5, p11); OP(p5, p7); OP(p3, p9); OP(p3, p5);\n" "OP(p7, p9); OP(p1, p2); OP(p3, p4); OP(p5, p6); OP(p7, p8);\n" "OP(p9, p10); OP(p13, p14); OP(p12, p13); OP(p13, p14); OP(p16, p17);\n" "OP(p15, p16); OP(p16, p17); OP(p12, p15); OP(p14, p17); OP(p14, p15);\n" "OP(p13, p16); OP(p13, p14); OP(p15, p16); OP(p19, p20); OP(p18, p19);\n" "OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p21, p23); OP(p22, p24);\n" "OP(p22, p23); OP(p18, p21); OP(p20, p23); OP(p20, p21); OP(p19, p22);\n" "OP(p22, p24); OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p12, p18);\n" "OP(p16, p22); OP(p16, p18); OP(p14, p20); OP(p20, p24); OP(p14, p16);\n" "OP(p18, p20); OP(p22, p24); OP(p13, p19); OP(p17, p23); 
OP(p17, p19);\n" "OP(p15, p21); OP(p15, p17); OP(p19, p21); OP(p13, p14); OP(p15, p16);\n" "OP(p17, p18); OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p0, p12);\n" "OP(p8, p20); OP(p8, p12); OP(p4, p16); OP(p16, p24); OP(p12, p16);\n" "OP(p2, p14); OP(p10, p22); OP(p10, p14); OP(p6, p18); OP(p6, p10);\n" "OP(p10, p12); OP(p1, p13); OP(p9, p21); OP(p9, p13); OP(p5, p17);\n" "OP(p13, p17); OP(p3, p15); OP(p11, p23); OP(p11, p15); OP(p7, p19);\n" "OP(p7, p11); OP(p11, p13); OP(p11, p12);\n" "int dst_index = mad24( gy+k, dstStep, xdst);\n" "STORE4(p12, dst_index);\n" "c0=c5; c1=c6; c2=c7; c3=c8; c4=c9;\n" "c5=c10; c6=c11; c7=c12; c8=c13; c9=c14;\n" "c10=c15; c11=c16; c12=c17; c13=c18; c14=c19;\n" "c15=c20; c16=c21; c17=c22; c18=c23; c19=c24;\n" "}\n" "}\n" "#endif\n" "__kernel void medianFilter3(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n" "{\n" "__local T data[18][18];\n" "int x = get_local_id(0);\n" "int y = get_local_id(1);\n" "int gx = get_global_id(0);\n" "int gy = get_global_id(1);\n" "int dx = gx - x - 1;\n" "int dy = gy - y - 1;\n" "int id = min(mad24(x, 16, y), 9*18-1);\n" "int dr = id / 18;\n" "int dc = id % 18;\n" "int c = clamp(dx + dc, 0, dst_cols - 1);\n" "int r = clamp(dy + dr, 0, dst_rows - 1);\n" "int index1 = mad24(r, src_step, mad24(c, TSIZE, src_offset));\n" "r = clamp(dy + dr + 9, 0, dst_rows - 1);\n" "int index9 = mad24(r, src_step, mad24(c, TSIZE, src_offset));\n" "data[dr][dc] = loadpix(srcptr + index1);\n" "data[dr+9][dc] = loadpix(srcptr + index9);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "T p0 = data[y][x], p1 = data[y][(x+1)], p2 = data[y][(x+2)];\n" "T p3 = data[y+1][x], p4 = data[y+1][(x+1)], p5 = data[y+1][(x+2)];\n" "T p6 = data[y+2][x], p7 = data[y+2][(x+1)], p8 = data[y+2][(x+2)];\n" "T mid;\n" "OP(p1, p2); OP(p4, p5); OP(p7, p8); OP(p0, p1);\n" "OP(p3, p4); OP(p6, p7); OP(p1, p2); OP(p4, p5);\n" "OP(p7, p8); OP(p0, p3); OP(p5, p8); 
OP(p4, p7);\n" "OP(p3, p6); OP(p1, p4); OP(p2, p5); OP(p4, p7);\n" "OP(p4, p2); OP(p6, p4); OP(p4, p2);\n" "int dst_index = mad24( gy, dst_step, mad24(gx, TSIZE, dst_offset));\n" "if (gy < dst_rows && gx < dst_cols)\n" "storepix(p4, dstptr + dst_index);\n" "}\n" "__kernel void medianFilter5(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n" "{\n" "__local T data[20][20];\n" "int x = get_local_id(0);\n" "int y = get_local_id(1);\n" "int gx = get_global_id(0);\n" "int gy = get_global_id(1);\n" "int dx = gx - x - 2;\n" "int dy = gy - y - 2;\n" "int id = min(mad24(x, 16, y), 10*20-1);\n" "int dr = id / 20;\n" "int dc = id % 20;\n" "int c = clamp(dx + dc, 0, dst_cols - 1);\n" "int r = clamp(dy + dr, 0, dst_rows - 1);\n" "int index1 = mad24(r, src_step, mad24(c, TSIZE, src_offset));\n" "r = clamp(dy + dr + 10, 0, dst_rows - 1);\n" "int index10 = mad24(r, src_step, mad24(c, TSIZE, src_offset));\n" "data[dr][dc] = loadpix(srcptr + index1);\n" "data[dr+10][dc] = loadpix(srcptr + index10);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n" "T p0 = data[y][x], p1 = data[y][x+1], p2 = data[y][x+2], p3 = data[y][x+3], p4 = data[y][x+4];\n" "T p5 = data[y+1][x], p6 = data[y+1][x+1], p7 = data[y+1][x+2], p8 = data[y+1][x+3], p9 = data[y+1][x+4];\n" "T p10 = data[y+2][x], p11 = data[y+2][x+1], p12 = data[y+2][x+2], p13 = data[y+2][x+3], p14 = data[y+2][x+4];\n" "T p15 = data[y+3][x], p16 = data[y+3][x+1], p17 = data[y+3][x+2], p18 = data[y+3][x+3], p19 = data[y+3][x+4];\n" "T p20 = data[y+4][x], p21 = data[y+4][x+1], p22 = data[y+4][x+2], p23 = data[y+4][x+3], p24 = data[y+4][x+4];\n" "T mid;\n" "OP(p1, p2); OP(p0, p1); OP(p1, p2); OP(p4, p5); OP(p3, p4);\n" "OP(p4, p5); OP(p0, p3); OP(p2, p5); OP(p2, p3); OP(p1, p4);\n" "OP(p1, p2); OP(p3, p4); OP(p7, p8); OP(p6, p7); OP(p7, p8);\n" "OP(p10, p11); OP(p9, p10); OP(p10, p11); OP(p6, p9); OP(p8, p11);\n" "OP(p8, p9); OP(p7, p10); OP(p7, p8); OP(p9, 
p10); OP(p0, p6);\n"
// NOTE: this range opens in the middle of the medianFilter5 kernel text; the
// "medianFilter" program entry itself starts on an earlier line. OP, storepix
// and TSIZE are macros defined earlier in that program string, outside this range.
"OP(p4, p10); OP(p4, p6); OP(p2, p8); OP(p2, p4); OP(p6, p8);\n"
"OP(p1, p7); OP(p5, p11); OP(p5, p7); OP(p3, p9); OP(p3, p5);\n"
"OP(p7, p9); OP(p1, p2); OP(p3, p4); OP(p5, p6); OP(p7, p8);\n"
"OP(p9, p10); OP(p13, p14); OP(p12, p13); OP(p13, p14); OP(p16, p17);\n"
"OP(p15, p16); OP(p16, p17); OP(p12, p15); OP(p14, p17); OP(p14, p15);\n"
"OP(p13, p16); OP(p13, p14); OP(p15, p16); OP(p19, p20); OP(p18, p19);\n"
"OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p21, p23); OP(p22, p24);\n"
"OP(p22, p23); OP(p18, p21); OP(p20, p23); OP(p20, p21); OP(p19, p22);\n"
"OP(p22, p24); OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p12, p18);\n"
"OP(p16, p22); OP(p16, p18); OP(p14, p20); OP(p20, p24); OP(p14, p16);\n"
"OP(p18, p20); OP(p22, p24); OP(p13, p19); OP(p17, p23); OP(p17, p19);\n"
"OP(p15, p21); OP(p15, p17); OP(p19, p21); OP(p13, p14); OP(p15, p16);\n"
"OP(p17, p18); OP(p19, p20); OP(p21, p22); OP(p23, p24); OP(p0, p12);\n"
"OP(p8, p20); OP(p8, p12); OP(p4, p16); OP(p16, p24); OP(p12, p16);\n"
"OP(p2, p14); OP(p10, p22); OP(p10, p14); OP(p6, p18); OP(p6, p10);\n"
"OP(p10, p12); OP(p1, p13); OP(p9, p21); OP(p9, p13); OP(p5, p17);\n"
"OP(p13, p17); OP(p3, p15); OP(p11, p23); OP(p11, p15); OP(p7, p19);\n"
"OP(p7, p11); OP(p11, p13); OP(p11, p12);\n"
// After the fixed OP sequence the kernel stores p12 at (gx, gy), but only for
// work-items whose global coordinates lie inside dst_cols x dst_rows.
"int dst_index = mad24(gy, dst_step, mad24(gx, TSIZE, dst_offset));\n"
"if (gy < dst_rows && gx < dst_cols)\n"
"storepix(p12, dstptr + dst_index);\n"
"}\n"
// Third initializer of the entry: a 32-hex-digit string (presumably a digest of
// the program text above -- TODO confirm against the generator).
, "f082457348bfbcb2e2de3014f46093a8"};
// Builds a ProgramSource object from medianFilter.programStr.
ProgramSource medianFilter_oclsrc(medianFilter.programStr);
// ---------------------------------------------------------------------------
// Program entry "moments": OpenCL source of the `moments` kernel.
// The kernel text refuses to compile unless TILE_SIZE == 32.
// Per work-item: x0 (global id 0) and y0 (group id 1) select a tile, y (local
// id 1) selects a row inside it. The work-item scans up to TILE_SIZE bytes of
// image row ypix starting at column x_min and accumulates
//   S = (sum p, sum x*p, sum x^2*p, sum x^3*p)
// where x is the column offset inside the tile. With OP_MOMENTS_BINARY every
// sample is first replaced by min(p, 1).
// ---------------------------------------------------------------------------
const struct ProgramEntry moments={"moments",
"#if TILE_SIZE != 32\n"
"#error \"TILE SIZE should be 32\"\n"
"#endif\n"
"__kernel void moments(__global const uchar* src, int src_step, int src_offset,\n"
"int src_rows, int src_cols, __global int* mom0, int xtiles)\n"
"{\n"
"int x0 = get_global_id(0);\n"
"int y0 = get_group_id(1);\n"
"int x, y = get_local_id(1);\n"
"int x_min = x0*TILE_SIZE;\n"
"int ypix = y0*TILE_SIZE + y;\n"
// One row of ten partial sums per in-tile row index y.
"__local int mom[TILE_SIZE][10];\n"
// Only tiles whose first column and first row lie inside the image do any work.
"if (x_min < src_cols && y0*TILE_SIZE < src_rows)\n"
"{\n"
"if (ypix < src_rows)\n"
"{\n"
// x_max: number of valid columns in this tile row (at most TILE_SIZE).
"int x_max = min(src_cols - x_min, TILE_SIZE);\n"
"__global const uchar* ptr = src + src_offset + ypix*src_step + x_min;\n"
"int4 S = (int4)(0, 0, 0, 0), p;\n"
// SUM_ELEM(elem, ofs) = elem * (1, ofs, ofs^2, ofs^3): one sample at offset ofs.
"#define SUM_ELEM(elem, ofs) \\\n"
"(int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem\n"
// x = x_max rounded down to a multiple of 4, i.e. the first column that the
// vload4 groups below do not cover.
"x = x_max & -4;\n"
// Columns 0..15, four per vload4. The literal multipliers are ofs, ofs^2 and
// ofs^3 of each column (e.g. column 3 -> 3, 9, 27).
"if (x_max >= 4)\n"
"{\n"
"p = convert_int4(vload4(0, ptr));\n"
"#ifdef OP_MOMENTS_BINARY\n"
"p = min(p, 1);\n"
"#endif\n"
"S += (int4)(p.s0, 0, 0, 0) + (int4)(p.s1, p.s1, p.s1, p.s1) +\n"
"(int4)(p.s2, p.s2 * 2, p.s2 * 4, p.s2 * 8) + (int4)(p.s3, p.s3 * 3, p.s3 * 9, p.s3 * 27);\n"
"if (x_max >= 8)\n"
"{\n"
"p = convert_int4(vload4(0, ptr + 4));\n"
"#ifdef OP_MOMENTS_BINARY\n"
"p = min(p, 1);\n"
"#endif\n"
"S += (int4)(p.s0, p.s0 * 4, p.s0 * 16, p.s0 * 64) + (int4)(p.s1, p.s1 * 5, p.s1 * 25, p.s1 * 125) +\n"
"(int4)(p.s2, p.s2 * 6, p.s2 * 36, p.s2 * 216) + (int4)(p.s3, p.s3 * 7, p.s3 * 49, p.s3 * 343);\n"
"if (x_max >= 12)\n"
"{\n"
"p = convert_int4(vload4(0, ptr + 8));\n"
"#ifdef OP_MOMENTS_BINARY\n"
"p = min(p, 1);\n"
"#endif\n"
"S += (int4)(p.s0, p.s0 * 8, p.s0 * 64, p.s0 * 512) + (int4)(p.s1, p.s1 * 9, p.s1 * 81, p.s1 * 729) +\n"
"(int4)(p.s2, p.s2 * 10, p.s2 * 100, p.s2 * 1000) + (int4)(p.s3, p.s3 * 11, p.s3 * 121, p.s3 * 1331);\n"
"if (x_max >= 16)\n"
"{\n"
"p = convert_int4(vload4(0, ptr + 12));\n"
"#ifdef OP_MOMENTS_BINARY\n"
"p = min(p, 1);\n"
"#endif\n"
"S += (int4)(p.s0, p.s0 * 12, p.s0 * 144, p.s0 * 1728) + (int4)(p.s1, p.s1 * 13, p.s1 * 169, p.s1 * 2197) +\n"
"(int4)(p.s2, p.s2 * 14, p.s2 * 196, p.s2 * 2744) + (int4)(p.s3, p.s3 * 15, p.s3 * 225, p.s3 * 3375);\n"
"}\n"
"}\n"
"}\n"
"}\n"
// Columns 16..31, same scheme, in a second nest of conditions.
"if (x_max >= 20)\n"
"{\n"
"p = convert_int4(vload4(0, ptr + 16));\n"
"#ifdef OP_MOMENTS_BINARY\n"
"p = min(p, 1);\n"
"#endif\n"
"S += (int4)(p.s0, p.s0 * 16, p.s0 * 256, p.s0 * 4096) + (int4)(p.s1, p.s1 * 17, p.s1 * 289, p.s1 * 4913) +\n"
"(int4)(p.s2, p.s2 * 18, p.s2 * 324, p.s2 * 5832) + (int4)(p.s3, p.s3 * 19, p.s3 * 361, p.s3 * 6859);\n"
"if (x_max >= 24)\n"
"{\n"
"p = convert_int4(vload4(0, ptr + 20));\n"
"#ifdef OP_MOMENTS_BINARY\n"
"p = min(p, 1);\n"
"#endif\n"
"S += (int4)(p.s0, p.s0 * 20, p.s0 * 400, p.s0 * 8000) + (int4)(p.s1, p.s1 * 21, p.s1 * 441, p.s1 * 9261) +\n"
"(int4)(p.s2, p.s2 * 22, p.s2 * 484, p.s2 * 10648) + (int4)(p.s3, p.s3 * 23, p.s3 * 529, p.s3 * 12167);\n"
"if (x_max >= 28)\n"
"{\n"
"p = convert_int4(vload4(0, ptr + 24));\n"
"#ifdef OP_MOMENTS_BINARY\n"
"p = min(p, 1);\n"
"#endif\n"
"S += (int4)(p.s0, p.s0 * 24, p.s0 * 576, p.s0 * 13824) + (int4)(p.s1, p.s1 * 25, p.s1 * 625, p.s1 * 15625) +\n"
"(int4)(p.s2, p.s2 * 26, p.s2 * 676, p.s2 * 17576) + (int4)(p.s3, p.s3 * 27, p.s3 * 729, p.s3 * 19683);\n"
"if (x_max >= 32)\n"
"{\n"
"p = convert_int4(vload4(0, ptr + 28));\n"
"#ifdef OP_MOMENTS_BINARY\n"
"p = min(p, 1);\n"
"#endif\n"
"S += (int4)(p.s0, p.s0 * 28, p.s0 * 784, p.s0 * 21952) + (int4)(p.s1, p.s1 * 29, p.s1 * 841, p.s1 * 24389) +\n"
"(int4)(p.s2, p.s2 * 30, p.s2 * 900, p.s2 * 27000) + (int4)(p.s3, p.s3 * 31, p.s3 * 961, p.s3 * 29791);\n"
"}\n"
"}\n"
"}\n"
"}\n"
// Up to three leftover columns (x, x + 1, x + 2), one sample at a time.
"if (x < x_max)\n"
"{\n"
"int ps = ptr[x];\n"
"#ifdef OP_MOMENTS_BINARY\n"
"ps = min(ps, 1);\n"
"#endif\n"
"S += SUM_ELEM(ps, x);\n"
"if (x + 1 < x_max)\n"
"{\n"
"ps = ptr[x + 1];\n"
"#ifdef OP_MOMENTS_BINARY\n"
"ps = min(ps, 1);\n"
"#endif\n"
"S += SUM_ELEM(ps, x + 1);\n"
"if (x + 2 < x_max)\n"
"{\n"
"ps = ptr[x + 2];\n"
"#ifdef OP_MOMENTS_BINARY\n"
"ps = min(ps, 1);\n"
"#endif\n"
"S += SUM_ELEM(ps, x + 2);\n"
"}\n"
"}\n"
"}\n"
// Per-row result: the four x-sums combined with powers of the in-tile row
// index y (1, y, y^2, y^3), ten values in total.
"int sy = y*y;\n"
"mom[y][0] = S.s0;\n"
"mom[y][1] = S.s1;\n"
"mom[y][2] = y*S.s0;\n"
"mom[y][3] = S.s2;\n"
"mom[y][4] = y*S.s1;\n"
"mom[y][5] = sy*S.s0;\n"
"mom[y][6] = S.s3;\n"
"mom[y][7] = y*S.s2;\n"
"mom[y][8] = sy*S.s1;\n"
"mom[y][9] = y*sy*S.s0;\n"
"}\n"
// Rows at or beyond src_rows contribute zeros.
"else\n"
"mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] =\n"
"mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0;\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// REDUCE(d): rows y < d add row y + d into themselves; every work-item then
// reaches a local barrier.
"#define REDUCE(d) \\\n"
"if (y < d) \\\n"
"{ \\\n"
"mom[y][0] += mom[y + d][0]; \\\n"
"mom[y][1] += mom[y + d][1]; \\\n"
"mom[y][2] += mom[y + d][2]; \\\n"
"mom[y][3] += mom[y + d][3]; \\\n"
"mom[y][4] += mom[y + d][4]; \\\n"
"mom[y][5] += mom[y + d][5]; \\\n"
"mom[y][6] += mom[y + d][6]; \\\n"
"mom[y][7] += mom[y + d][7]; \\\n"
"mom[y][8] += mom[y + d][8]; \\\n"
"mom[y][9] += mom[y + d][9]; \\\n"
"} \\\n"
"barrier(CLK_LOCAL_MEM_FENCE)\n"
// Halving steps 16, 8, 4, 2 leave the totals split across rows 0 and 1.
"REDUCE(16);\n"
"REDUCE(8);\n"
"REDUCE(4);\n"
"REDUCE(2);\n"
// Work-items y = 0..9 each emit one of the ten sums (rows 0 and 1 added) into
// mom0 at int index (y0*xtiles + x0)*10 + y.
"if (y < 10)\n"
"{\n"
"__global int* momout = mom0 + (y0*xtiles + x0) * 10;\n"
"momout[y] = mom[0][y] + mom[1][y];\n"
"}\n"
"}\n"
"}\n"
// 32-hex-digit string stored with the entry (presumably a digest of the text
// above -- TODO confirm against the generator).
, "1d0545282b5860ed7eeeb6860fa9edc3"};
// Builds a ProgramSource object from moments.programStr.
ProgramSource moments_oclsrc(moments.programStr);
// ---------------------------------------------------------------------------
// Program entry "morph": OpenCL source of the `morph` kernel. The program text
// continues past the end of this range.
// ---------------------------------------------------------------------------
const struct ProgramEntry morph={"morph",
// Enable an fp64 extension only when DOUBLE_SUPPORT is defined.
"#ifdef DOUBLE_SUPPORT\n"
"#ifdef cl_amd_fp64\n"
"#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
"#elif defined (cl_khr_fp64)\n"
"#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
"#endif\n"
"#endif\n"
"#define noconvert\n"
// Pixel access: direct T load/store unless cn == 3, where vload3/vstore3 on
// the T1 type are used and TSIZE is three T1 elements.
"#if cn != 3\n"
"#define loadpix(addr) *(__global const T *)(addr)\n"
"#define storepix(val, addr) *(__global T *)(addr) = val\n"
"#define TSIZE (int)sizeof(T)\n"
"#else\n"
"#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
"#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
"#define TSIZE ((int)sizeof(T1)*3)\n"
"#endif\n"
// MIN_VAL / MAX_VAL are selected by whichever DEPTH_<n> macro is defined.
"#ifdef DEPTH_0\n"
"#define MIN_VAL 0\n"
"#define MAX_VAL UCHAR_MAX\n"
"#elif defined DEPTH_1\n"
"#define MIN_VAL SCHAR_MIN\n"
"#define MAX_VAL SCHAR_MAX\n"
"#elif defined DEPTH_2\n"
"#define MIN_VAL 0\n"
"#define MAX_VAL USHRT_MAX\n"
"#elif defined DEPTH_3\n"
"#define MIN_VAL SHRT_MIN\n"
"#define MAX_VAL SHRT_MAX\n"
"#elif defined DEPTH_4\n"
"#define MIN_VAL INT_MIN\n"
"#define MAX_VAL INT_MAX\n"
"#elif defined DEPTH_5\n"
"#define MIN_VAL (-FLT_MAX)\n"
"#define MAX_VAL FLT_MAX\n"
"#elif defined DEPTH_6\n"
"#define MIN_VAL (-DBL_MAX)\n"
"#define MAX_VAL DBL_MAX\n"
"#endif\n"
// VAL (used as the out-of-image fill and the initial result): MAX_VAL for
// OP_ERODE, MIN_VAL for OP_DILATE; any other configuration hits #error.
"#ifdef OP_ERODE\n"
"#define VAL MAX_VAL\n"
"#elif defined OP_DILATE\n"
"#define VAL MIN_VAL\n"
"#else\n"
"#error \"Unknown operation\"\n"
"#endif\n"
// MORPH_OP: min for OP_ERODE (spelled as a ternary when INTEL_DEVICE and
// DEPTH_0 are both defined), max for OP_DILATE.
"#ifdef OP_ERODE\n"
"#if defined INTEL_DEVICE && defined DEPTH_0\n"
"#define MORPH_OP(A, B) ((A) < (B) ? (A) : (B))\n"
"#else\n"
"#define MORPH_OP(A, B) min((A), (B))\n"
"#endif\n"
"#endif\n"
"#ifdef OP_DILATE\n"
"#define MORPH_OP(A, B) max((A), (B))\n"
"#endif\n"
// PROCESS(y, x): fold LDS_DAT[(l_y + y) * width + (l_x + x)] into res.
"#define PROCESS(y, x) \\\n"
"temp = LDS_DAT[mad24(l_y + y, width, l_x + x)]; \\\n"
"res = MORPH_OP(res, temp);\n"
// ELEM(i, l, r, e1, e2): e1 when i lies outside [l, r), otherwise e2. The two
// range tests are combined with the non-short-circuit '|'.
"#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) < (l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)\n"
// EXTRA_PARAMS appends matptr/mat_step/mat_offset to the kernel signature for
// OP_GRADIENT / OP_TOPHAT / OP_BLACKHAT builds and is empty otherwise.
"#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT\n"
"#define EXTRA_PARAMS , __global const uchar * matptr, int mat_step, int mat_offset\n"
"#else\n"
"#define EXTRA_PARAMS\n"
"#endif\n"
"__kernel void morph(__global const uchar * srcptr, int src_step, int src_offset,\n"
"__global uchar * dstptr, int dst_step, int dst_offset,\n"
"int src_offset_x, int src_offset_y, int cols, int rows,\n"
"int src_whole_cols, int src_whole_rows EXTRA_PARAMS)\n"
"{\n"
"int gidx = get_global_id(0), gidy = get_global_id(1);\n"
"int l_x = get_local_id(0), l_y = get_local_id(1);\n"
// (x, y): origin of this work-group; start_x/start_y shift it by the source
// offsets and back by RADIUSX/RADIUSY.
"int x = get_group_id(0) * LSIZE0, y = get_group_id(1) * LSIZE1;\n"
"int start_x = x + src_offset_x - RADIUSX;\n"
// width = 2*RADIUSX + LSIZE0 + 1: row pitch used when indexing LDS_DAT.
"int width = mad24(RADIUSX, 2, LSIZE0 + 1);\n"
"int start_y = y + src_offset_y - RADIUSY;\n"
// Each work-item stages two samples: linear slots point1 and
// point2 = point1 + LSIZE0*LSIZE1, mapped to (column, row) through width.
"int point1 = mad24(l_y, LSIZE0, l_x);\n"
"int point2 = point1 + LSIZE0 * LSIZE1;\n"
"int tl_x = point1 % width, tl_y = point1 / width;\n"
"int tl_x2 = point2 % width, tl_y2 = point2 / width;\n"
"int cur_x = start_x + tl_x, cur_y = start_y + tl_y;\n"
"int cur_x2 = start_x + tl_x2, cur_y2 = start_y + tl_y2;\n"
// Byte addresses of the two samples relative to srcptr.
"int start_addr = mad24(cur_y, src_step, cur_x * TSIZE);\n"
"int start_addr2 = mad24(cur_y2, src_step, cur_x2 * TSIZE);\n"
"__local T LDS_DAT[2 * LSIZE1 * LSIZE0];\n"
// Addresses that are not strictly between 0 and end_addr are redirected to 0
// before loading (the statement continues on the next physical line).
"int end_addr = mad24(src_whole_rows - 1, src_step, src_whole_cols * TSIZE);\n"
"start_addr = start_addr < end_addr && start_addr > 0 ? 
start_addr : 0;\n"
// NOTE: this range opens inside the `morph` kernel text; the "morph" program
// entry and its macros (ELEM, VAL, loadpix, storepix, TSIZE) start on earlier
// lines.
"start_addr2 = start_addr2 < end_addr && start_addr2 > 0 ? start_addr2 : 0;\n"
// Load the two staged samples, then replace each with (T)(VAL) when its column
// or row lies outside the whole source image.
"T temp0 = loadpix(srcptr + start_addr);\n"
"T temp1 = loadpix(srcptr + start_addr2);\n"
"temp0 = ELEM(cur_x, 0, src_whole_cols, (T)(VAL), temp0);\n"
"temp0 = ELEM(cur_y, 0, src_whole_rows, (T)(VAL), temp0);\n"
"temp1 = ELEM(cur_x2, 0, src_whole_cols, (T)(VAL), temp1);\n"
"temp1 = ELEM(cur_y2, 0, src_whole_rows, (T)(VAL), temp1);\n"
"LDS_DAT[point1] = temp0;\n"
"LDS_DAT[point2] = temp1;\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Only work-items inside cols x rows produce output. PROCESS_ELEMS is not
// defined anywhere in this program text (presumably passed in as a build
// option that expands to PROCESS(...) calls -- TODO confirm on the host side).
"if (gidx < cols && gidy < rows)\n"
"{\n"
"T res = (T)(VAL), temp;\n"
"PROCESS_ELEMS;\n"
"int dst_index = mad24(gidy, dst_step, mad24(gidx, TSIZE, dst_offset));\n"
// Combined operations read `value` from matptr at the destination coordinates:
// OP_GRADIENT and OP_BLACKHAT store res - value, OP_TOPHAT stores value - res.
// Without any of the three, res is stored unchanged.
"#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT\n"
"int mat_index = mad24(gidy, mat_step, mad24(gidx, TSIZE, mat_offset));\n"
"T value = loadpix(matptr + mat_index);\n"
"#ifdef OP_GRADIENT\n"
"storepix(convertToT(convertToWT(res) - convertToWT(value)), dstptr + dst_index);\n"
"#elif defined OP_TOPHAT\n"
"storepix(convertToT(convertToWT(value) - convertToWT(res)), dstptr + dst_index);\n"
"#elif defined OP_BLACKHAT\n"
"storepix(convertToT(convertToWT(res) - convertToWT(value)), dstptr + dst_index);\n"
"#endif\n"
"#else\n"
"storepix(res, dstptr + dst_index);\n"
"#endif\n"
"}\n"
"}\n"
// 32-hex-digit string stored with the entry (presumably a digest of the
// program text -- TODO confirm against the generator).
, "232e712bff362e53c55027da6e1e1584"};
// Builds a ProgramSource object from morph.programStr.
ProgramSource morph_oclsrc(morph.programStr);
// ---------------------------------------------------------------------------
// Program entry "precornerdetect": OpenCL source of `preCornerDetect`.
// For every (x, y) inside dst_cols x dst_rows the kernel reads one float from
// each of the five inputs (Dx, Dy, D2x, D2y, Dxy) and writes
//   dst = factor * (dx*dx*d2y + dy*dy*d2x - 2*dx*dy*dxy)
// All six buffers are addressed as float elements via a byte step and offset.
// ---------------------------------------------------------------------------
const struct ProgramEntry precornerdetect={"precornerdetect",
"__kernel void preCornerDetect(__global const uchar * Dxptr, int dx_step, int dx_offset,\n"
"__global const uchar * Dyptr, int dy_step, int dy_offset,\n"
"__global const uchar * D2xptr, int d2x_step, int d2x_offset,\n"
"__global const uchar * D2yptr, int d2y_step, int d2y_offset,\n"
"__global const uchar * Dxyptr, int dxy_step, int dxy_offset,\n"
"__global uchar * dstptr, int dst_step, int dst_offset,\n"
"int dst_rows, int dst_cols, float factor)\n"
"{\n"
"int x = get_global_id(0);\n"
"int y = get_global_id(1);\n"
"if (x < dst_cols && y < dst_rows)\n"
"{\n"
// Byte index of element (x, y) in each buffer: step * y + sizeof(float) * x + offset.
"int dx_index = mad24(dx_step, y, (int)sizeof(float) * x + dx_offset);\n"
"int dy_index = mad24(dy_step, y, (int)sizeof(float) * x + dy_offset);\n"
"int d2x_index = mad24(d2x_step, y, (int)sizeof(float) * x + d2x_offset);\n"
"int d2y_index = mad24(d2y_step, y, (int)sizeof(float) * x + d2y_offset);\n"
"int dxy_index = mad24(dxy_step, y, (int)sizeof(float) * x + dxy_offset);\n"
"int dst_index = mad24(dst_step, y, (int)sizeof(float) * x + dst_offset);\n"
"float dx = *(__global const float *)(Dxptr + dx_index);\n"
"float dy = *(__global const float *)(Dyptr + dy_index);\n"
"float d2x = *(__global const float *)(D2xptr + d2x_index);\n"
"float d2y = *(__global const float *)(D2yptr + d2y_index);\n"
"float dxy = *(__global const float *)(Dxyptr + dxy_index);\n"
"__global float * dst = (__global float *)(dstptr + dst_index);\n"
"dst[0] = factor * (dx*dx*d2y + dy*dy*d2x - 2*dx*dy*dxy);\n"
"}\n"
"}\n"
// 32-hex-digit string stored with the entry (presumably a digest of the
// program text -- TODO confirm against the generator).
, "14a94db70b88aa76ff8840f03f3ad556"};
// Builds a ProgramSource object from precornerdetect.programStr.
ProgramSource precornerdetect_oclsrc(precornerdetect.programStr);
// ---------------------------------------------------------------------------
// Program entry "pyr_down". The program text continues past the end of this
// range.
// ---------------------------------------------------------------------------
const struct ProgramEntry pyr_down={"pyr_down",
// Enable an fp64 extension only when DOUBLE_SUPPORT is defined.
"#ifdef DOUBLE_SUPPORT\n"
"#ifdef cl_amd_fp64\n"
"#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
"#elif defined (cl_khr_fp64)\n"
"#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
"#endif\n"
"#endif\n"
// EXTRAPOLATE(x, maxV) maps a coordinate according to the BORDER_* macro in
// effect; building without one of the listed macros hits #error.
"#if defined BORDER_REPLICATE\n"
"#define EXTRAPOLATE(x, maxV) clamp((x), 0, (maxV)-1)\n"
"#elif defined BORDER_WRAP\n"
"#define EXTRAPOLATE(x, maxV) ( (x) + (maxV) ) % (maxV)\n"
"#elif defined BORDER_REFLECT\n"
"#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1)\n"
"#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101\n"
"#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1)\n"
"#else\n"
"#error No extrapolation method\n"
"#endif\n"
// Pixel access macros; the cn == 3 variants follow on the next physical line.
"#if cn != 3\n"
"#define loadpix(addr) *(__global const T*)(addr)\n"
"#define storepix(val, addr) *(__global T*)(addr) = (val)\n"
"#define PIXSIZE ((int)sizeof(T))\n"
"#else\n"
"#define loadpix(addr) 
vload3(0, (__global const T1*)(addr))\n"
// NOTE: this range opens inside the "pyr_down" program text; the entry and its
// EXTRAPOLATE macro start on the previous physical line.
"#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr))\n"
"#define PIXSIZE ((int)sizeof(T1)*3)\n"
"#endif\n"
// SRC(_x, _y): pixel at column _x, row _y of srcData, passed through
// convertToFT. SRC4 is the vload4/convert_float4 variant for kercn == 4.
"#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))\n"
"#if kercn == 4\n"
"#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))\n"
"#endif\n"
// MAD: fma on INTEL_DEVICE, mad otherwise.
"#ifdef INTEL_DEVICE\n"
"#define MAD(x,y,z) fma((x),(y),(z))\n"
"#else\n"
"#define MAD(x,y,z) mad((x),(y),(z))\n"
"#endif\n"
// LOAD_LOCAL(col_gl, col_lcl): vertical 5-tap pass for one source column.
// sum0 weights rows src_y-2 .. src_y+2 and sum1 weights rows src_y .. src_y+4,
// both with (co3, co2, co1, co2, co3); rows src_y .. src_y+2 are loaded once
// and shared. Results go to smem[0][col_lcl] and smem[1][col_lcl].
"#define LOAD_LOCAL(col_gl, col_lcl) \\\n"
"sum0 = co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \\\n"
"sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0); \\\n"
"temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows)); \\\n"
"sum0 = MAD(co1, temp, sum0); \\\n"
"sum1 = co3 * temp; \\\n"
"temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows)); \\\n"
"sum0 = MAD(co2, temp, sum0); \\\n"
"sum1 = MAD(co2, temp, sum1); \\\n"
"temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows)); \\\n"
"sum0 = MAD(co3, temp, sum0); \\\n"
"sum1 = MAD(co1, temp, sum1); \\\n"
"smem[0][col_lcl] = sum0; \\\n"
"sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1); \\\n"
"sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1); \\\n"
"smem[1][col_lcl] = sum1;\n"
// LOAD_LOCAL4: the same computation on four adjacent values at once, stored
// with vstore4 at element offset 4*col_lcl from &smem[r][2].
"#if kercn == 4\n"
"#define LOAD_LOCAL4(col_gl, col_lcl) \\\n"
"sum40 = co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \\\n"
"sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40); \\\n"
"temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y, src_rows)); \\\n"
"sum40 = MAD(co1, temp4, sum40); \\\n"
"sum41 = co3 * temp4; \\\n"
"temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y + 1, src_rows)); \\\n"
"sum40 = MAD(co2, temp4, sum40); \\\n"
"sum41 = MAD(co2, temp4, sum41); \\\n"
"temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y + 2, src_rows)); \\\n"
"sum40 = MAD(co3, temp4, sum40); \\\n"
"sum41 = MAD(co1, temp4, sum41); \\\n"
"vstore4(sum40, col_lcl, (__local float*) &smem[0][2]); \\\n"
"sum41 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum41); \\\n"
"sum41 = MAD(co3, SRC4(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum41); \\\n"
"vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);\n"
"#endif\n"
"#define noconvert\n"
// pyrDown: each work-item covers kercn source columns starting at x and two
// destination rows (y and y + 1); the matching source row is src_y = 2*y.
"__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
"{\n"
"const int x = get_global_id(0)*kercn;\n"
"const int y = 2*get_global_id(1);\n"
// Two rows of vertically filtered values, with two halo columns on each side.
"__local FT smem[2][LOCAL_SIZE + 4];\n"
"__global uchar * dstData = dst + dst_offset;\n"
"__global const uchar * srcData = src + src_offset;\n"
"FT sum0, sum1, temp;\n"
// Filter weights: centre, +/-1 and +/-2 taps.
"FT co1 = 0.375f;\n"
"FT co2 = 0.25f;\n"
"FT co3 = 0.0625f;\n"
"const int src_y = 2*y;\n"
"int col;\n"
// Fast path: rows src_y-2 .. src_y+4 are all inside the image, so the row
// argument is used as is (EXTRAPOLATE_ expands to its first argument).
// NOTE(review): EXTRAPOLATE_ is #defined again in the else-branch below with a
// different body and no #undef in between -- confirm that the target OpenCL
// compilers accept the redefinition.
"if (src_y >= 2 && src_y < src_rows - 4)\n"
"{\n"
"#define EXTRAPOLATE_(val, maxVal) val\n"
"#if kercn == 1\n"
"col = EXTRAPOLATE(x, src_cols);\n"
"LOAD_LOCAL(col, 2 + get_local_id(0))\n"
"#else\n"
// kercn != 1: vector load when four columns fit before src_cols - 4, else
// four scalar loads with column extrapolation.
"if (x < src_cols-4)\n"
"{\n"
"float4 sum40, sum41, temp4;\n"
"LOAD_LOCAL4(x, get_local_id(0))\n"
"}\n"
"else\n"
"{\n"
"for (int i=0; i<4; i++)\n"
"{\n"
"col = EXTRAPOLATE(x+i, src_cols);\n"
"LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)\n"
"}\n"
"}\n"
"#endif\n"
// Local ids 0..1 fill the two left halo columns (smem columns 0..1); local
// ids 2..3 fill the two right halo columns (smem columns LOCAL_SIZE+2..+3).
"if (get_local_id(0) < 2)\n"
"{\n"
"col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);\n"
"LOAD_LOCAL(col, get_local_id(0))\n"
"}\n"
"else if (get_local_id(0) < 4)\n"
"{\n"
"col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);\n"
"LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))\n"
"}\n"
"}\n"
// Slow path: identical loads, but every row index goes through EXTRAPOLATE.
"else\n"
"{\n"
"#define EXTRAPOLATE_(val, maxVal) EXTRAPOLATE(val, maxVal)\n"
"#if kercn == 1\n"
"col = EXTRAPOLATE(x, src_cols);\n"
"LOAD_LOCAL(col, 2 + get_local_id(0))\n"
"#else\n"
"if (x < src_cols-4)\n"
"{\n"
"float4 sum40, sum41, temp4;\n"
"LOAD_LOCAL4(x, get_local_id(0))\n"
"}\n"
"else\n"
"{\n"
"for (int i=0; i<4; i++)\n"
"{\n"
"col = EXTRAPOLATE(x+i, src_cols);\n"
"LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)\n"
"}\n"
"}\n"
"#endif\n"
"if (get_local_id(0) < 2)\n"
"{\n"
"col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);\n"
"LOAD_LOCAL(col, get_local_id(0))\n"
"}\n"
"else if (get_local_id(0) < 4)\n"
"{\n"
"col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);\n"
"LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))\n"
"}\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Horizontal pass, kercn == 1: work-items with local id < LOCAL_SIZE/2 each
// produce one destination column for up to two rows (yin = y, y + 1, limited
// by dst_rows), reading five neighbouring smem columns around 2 + tid2.
"#if kercn == 1\n"
"if (get_local_id(0) < LOCAL_SIZE / 2)\n"
"{\n"
"const int tid2 = get_local_id(0) * 2;\n"
"const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;\n"
"if (dst_x < dst_cols)\n"
"{\n"
"for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)\n"
"{\n"
// cn == 1: the first four taps are taken as one vload4 + dot (float or double
// depending on fdepth); otherwise they are four separate MADs. The fifth tap
// is added after the #endif in both cases.
"#if cn == 1\n"
"#if fdepth <= 5\n"
"FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));\n"
"#else\n"
"FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));\n"
"#endif\n"
"#else\n"
"FT sum = co3 * smem[yin - y][2 + tid2 - 2];\n"
"sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);\n"
"sum = MAD(co1, smem[yin - y][2 + tid2 ], sum);\n"
"sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);\n"
"#endif\n"
"sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);\n"
"storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);\n"
"}\n"
"}\n"
"}\n"
// Horizontal pass, kercn != 1: each work-item produces two adjacent destination
// columns (dst_x and dst_x + 1) when both fit, otherwise only dst_x.
"#else\n"
"int tid4 = get_local_id(0) * 4;\n"
"int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;\n"
"if (dst_x < dst_cols - 1)\n"
"{\n"
"for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)\n"
"{\n"
"FT sum = co3* smem[yin - y][2 + tid4 + 2];\n"
"sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);\n"
"sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);\n"
"sum = MAD(co1, smem[yin - y][2 + tid4 ], sum);\n"
"sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);\n"
"storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));\n"
// Second output column: same filter centred two smem columns further right;
// dst_x is bumped for the store and restored before the next row.
"dst_x ++;\n"
"sum = co3* smem[yin - y][2 + tid4 + 4];\n"
"sum = MAD(co3, smem[yin - y][2 + tid4 ], sum);\n"
"sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);\n"
"sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);\n"
"sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);\n"
"storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));\n"
"dst_x --;\n"
"}\n"
"}\n"
"else if (dst_x < dst_cols)\n"
"{\n"
"for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)\n"
"{\n"
"FT sum = co3* smem[yin - y][2 + tid4 + 2];\n"
"sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);\n"
"sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);\n"
"sum = MAD(co1, smem[yin - y][2 + tid4 ], sum);\n"
"sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);\n"
"storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));\n"
"}\n"
"}\n"
"#endif\n"
"}\n"
// 32-hex-digit string stored with the entry (presumably a digest of the
// program text -- TODO confirm against the generator).
, "3266de56ccdc2bcb8226bf97c932e272"};
// Builds a ProgramSource object from pyr_down.programStr.
ProgramSource pyr_down_oclsrc(pyr_down.programStr);
// ---------------------------------------------------------------------------
// Program entry "pyr_up": kernels pyrUp and pyrUp_unrolled. The program text
// continues past the end of this range.
// ---------------------------------------------------------------------------
const struct ProgramEntry pyr_up={"pyr_up",
// Enable an fp64 extension only when DOUBLE_SUPPORT is defined.
"#ifdef DOUBLE_SUPPORT\n"
"#ifdef cl_amd_fp64\n"
"#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
"#elif defined (cl_khr_fp64)\n"
"#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
"#endif\n"
"#endif\n"
// Pixel access: direct T load/store unless cn == 3 (vload3/vstore3 on T1).
"#if cn != 3\n"
"#define loadpix(addr) *(__global const T*)(addr)\n"
"#define storepix(val, addr) *(__global T*)(addr) = (val)\n"
"#define PIXSIZE ((int)sizeof(T))\n"
"#else\n"
"#define loadpix(addr) vload3(0, (__global const T1*)(addr))\n"
"#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr))\n"
"#define PIXSIZE ((int)sizeof(T1)*3)\n"
"#endif\n"
// EXTRAPOLATE(x, maxV) = min(maxV - 1, |x|).
"#define EXTRAPOLATE(x, maxV) min(maxV - 1, (int) abs(x))\n"
"#define noconvert\n"
// pyrUp: one destination pixel (x, y) per work-item; the body continues on the
// following physical lines.
"__kernel void pyrUp(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
"{\n"
"const int x = get_global_id(0);\n"
"const int y = get_global_id(1);\n"
"const int tidx = get_local_id(0);\n"
"const int tidy = get_local_id(1);\n"
"__local FT 
s_srcPatch[LOCAL_SIZE/2 + 2][LOCAL_SIZE/2 + 2];\n"
// NOTE: this range opens inside the `pyrUp` kernel text of the "pyr_up" entry;
// the entry, its macros (loadpix, storepix, PIXSIZE, EXTRAPOLATE) and the
// kernel signature start on the previous physical line.
"__local FT s_dstPatch[LOCAL_SIZE/2 + 2][LOCAL_SIZE];\n"
"__global uchar * dstData = dst + dst_offset;\n"
"__global const uchar * srcData = src + src_offset;\n"
// Stage 1: work-items with tidx, tidy < LOCAL_SIZE/2 + 2 each load one source
// pixel into s_srcPatch. The patch starts one pixel before the group's source
// origin; coordinates go through EXTRAPOLATE.
"if( tidx < (LOCAL_SIZE/2 + 2) && tidy < LOCAL_SIZE/2 + 2 )\n"
"{\n"
"int srcx = EXTRAPOLATE(mad24((int)get_group_id(0), LOCAL_SIZE/2, tidx) - 1, src_cols);\n"
"int srcy = EXTRAPOLATE(mad24((int)get_group_id(1), LOCAL_SIZE/2, tidy) - 1, src_rows);\n"
"s_srcPatch[tidy][tidx] = convertToFT(loadpix(srcData + srcy * src_step + srcx * PIXSIZE));\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"FT sum = 0.f;\n"
"const FT co1 = 0.75f;\n"
"const FT co2 = 0.5f;\n"
"const FT co3 = 0.125f;\n"
// Weights by parity: even indices use (co3, co1, co3) over three source
// samples, odd indices use (co2, 0, co2).
"const FT coef1 = (tidx & 1) == 0 ? co1 : (FT) 0;\n"
"const FT coef2 = (tidx & 1) == 0 ? co3 : co2;\n"
"const FT coefy1 = (tidy & 1) == 0 ? co1 : (FT) 0;\n"
"const FT coefy2 = (tidy & 1) == 0 ? co3 : co2;\n"
// Stage 2 (horizontal): rows tidy < LOCAL_SIZE/2 + 2 of s_srcPatch are
// filtered into s_dstPatch.
"if(tidy < LOCAL_SIZE/2 + 2)\n"
"{\n"
"sum = coef2* s_srcPatch[tidy][1 + ((tidx - 1) >> 1)];\n"
"sum = mad(coef1, s_srcPatch[tidy][1 + ((tidx ) >> 1)], sum);\n"
"sum = mad(coef2, s_srcPatch[tidy][1 + ((tidx + 2) >> 1)], sum);\n"
"s_dstPatch[tidy][tidx] = sum;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Stage 3 (vertical): same scheme along tidy; the result is stored only when
// (x, y) lies inside the destination.
"sum = coefy2* s_dstPatch[1 + ((tidy - 1) >> 1)][tidx];\n"
"sum = mad(coefy1, s_dstPatch[1 + ((tidy ) >> 1)][tidx], sum);\n"
"sum = mad(coefy2, s_dstPatch[1 + ((tidy + 2) >> 1)][tidx], sum);\n"
"if ((x < dst_cols) && (y < dst_rows))\n"
"storepix(convertToT(sum), dstData + y * dst_step + x * PIXSIZE);\n"
"}\n"
// pyrUp_unrolled: each work-item handles a 2x2 block of destination pixels;
// lx, ly are twice the local ids.
"__kernel void pyrUp_unrolled(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
"__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
"{\n"
"const int lx = 2*get_local_id(0);\n"
"const int ly = 2*get_local_id(1);\n"
"__local FT s_srcPatch[LOCAL_SIZE+2][LOCAL_SIZE+2];\n"
"__local FT s_dstPatch[LOCAL_SIZE+2][2*LOCAL_SIZE];\n"
"__global uchar * dstData = dst + dst_offset;\n"
"__global const uchar * srcData = src + src_offset;\n"
// Stage 1: work-items with lx, ly < LOCAL_SIZE + 2 each load a 2x2 block of
// source pixels (coordinates through EXTRAPOLATE) into s_srcPatch; the patch
// starts one pixel before the group's source origin.
"if( lx < (LOCAL_SIZE+2) && ly < (LOCAL_SIZE+2) )\n"
"{\n"
"int srcx = mad24((int)get_group_id(0), LOCAL_SIZE, lx) - 1;\n"
"int srcy = mad24((int)get_group_id(1), LOCAL_SIZE, ly) - 1;\n"
"int srcx1 = EXTRAPOLATE(srcx, src_cols);\n"
"int srcx2 = EXTRAPOLATE(srcx+1, src_cols);\n"
"int srcy1 = EXTRAPOLATE(srcy, src_rows);\n"
"int srcy2 = EXTRAPOLATE(srcy+1, src_rows);\n"
"s_srcPatch[ly][lx] = convertToFT(loadpix(srcData + srcy1 * src_step + srcx1 * PIXSIZE));\n"
"s_srcPatch[ly+1][lx] = convertToFT(loadpix(srcData + srcy2 * src_step + srcx1 * PIXSIZE));\n"
"s_srcPatch[ly][lx+1] = convertToFT(loadpix(srcData + srcy1 * src_step + srcx2 * PIXSIZE));\n"
"s_srcPatch[ly+1][lx+1] = convertToFT(loadpix(srcData + srcy2 * src_step + srcx2 * PIXSIZE));\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
"FT sum;\n"
"const FT co1 = 0.75f;\n"
"const FT co2 = 0.5f;\n"
"const FT co3 = 0.125f;\n"
// Stage 2 (horizontal) for patch row 1 + local id 1: the even output column lx
// uses (co3, co1, co3), the odd column lx + 1 uses (co2, co2).
"sum = co3 * s_srcPatch[1 + (ly >> 1)][1 + ((lx - 2) >> 1)];\n"
"sum = mad(co1, s_srcPatch[1 + (ly >> 1)][1 + ((lx ) >> 1)], sum);\n"
"sum = mad(co3, s_srcPatch[1 + (ly >> 1)][1 + ((lx + 2) >> 1)], sum);\n"
"s_dstPatch[1 + get_local_id(1)][lx] = sum;\n"
"sum = co2 * s_srcPatch[1 + (ly >> 1)][1 + ((lx + 1 - 1) >> 1)];\n"
"sum = mad(co2, s_srcPatch[1 + (ly >> 1)][1 + ((lx + 1 + 1) >> 1)], sum);\n"
"s_dstPatch[1 + get_local_id(1)][lx+1] = sum;\n"
// Work-items with ly < 1 also filter halo row 0 of the patch.
"if (ly < 1)\n"
"{\n"
"sum = co3 * s_srcPatch[0][1 + ((lx - 2) >> 1)];\n"
"sum = mad(co1, s_srcPatch[0][1 + ((lx ) >> 1)], sum);\n"
"sum = mad(co3, s_srcPatch[0][1 + ((lx + 2) >> 1)], sum);\n"
"s_dstPatch[0][lx] = sum;\n"
"sum = co2 * s_srcPatch[0][1 + ((lx + 1 - 1) >> 1)];\n"
"sum = mad(co2, s_srcPatch[0][1 + ((lx + 1 + 1) >> 1)], sum);\n"
"s_dstPatch[0][lx+1] = sum;\n"
"}\n"
// Work-items with ly > 2*LOCAL_SIZE - 3 also filter halo row LOCAL_SIZE + 1.
"if (ly > 2*LOCAL_SIZE-3)\n"
"{\n"
"sum = co3 * s_srcPatch[LOCAL_SIZE+1][1 + ((lx - 2) >> 1)];\n"
"sum = mad(co1, s_srcPatch[LOCAL_SIZE+1][1 + ((lx ) >> 1)], sum);\n"
"sum = mad(co3, s_srcPatch[LOCAL_SIZE+1][1 + ((lx + 2) >> 1)], sum);\n"
"s_dstPatch[LOCAL_SIZE+1][lx] = sum;\n"
"sum = co2 * s_srcPatch[LOCAL_SIZE+1][1 + ((lx + 1 - 1) >> 1)];\n"
"sum = mad(co2, s_srcPatch[LOCAL_SIZE+1][1 + ((lx + 1 + 1) >> 1)], sum);\n"
"s_dstPatch[LOCAL_SIZE+1][lx+1] = sum;\n"
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n"
// Stage 3 (vertical) and store: the 2x2 block at (dst_x, dst_y) is written
// when its top-left pixel is inside the destination. The upper row uses
// (co3, co1, co3) over three s_dstPatch rows, the lower row uses (co2, co2).
// NOTE(review): only (dst_x, dst_y) is range-checked before the stores to
// dst_x + 1 and dst_y + 1 -- confirm the host guarantees the extra row/column.
"int dst_x = 2*get_global_id(0);\n"
"int dst_y = 2*get_global_id(1);\n"
"if ((dst_x < dst_cols) && (dst_y < dst_rows))\n"
"{\n"
"sum = co3 * s_dstPatch[1 + get_local_id(1) - 1][lx];\n"
"sum = mad(co1, s_dstPatch[1 + get_local_id(1) ][lx], sum);\n"
"sum = mad(co3, s_dstPatch[1 + get_local_id(1) + 1][lx], sum);\n"
"storepix(convertToT(sum), dstData + dst_y * dst_step + dst_x * PIXSIZE);\n"
"sum = co3 * s_dstPatch[1 + get_local_id(1) - 1][lx+1];\n"
"sum = mad(co1, s_dstPatch[1 + get_local_id(1) ][lx+1], sum);\n"
"sum = mad(co3, s_dstPatch[1 + get_local_id(1) + 1][lx+1], sum);\n"
"storepix(convertToT(sum), dstData + dst_y * dst_step + (dst_x+1) * PIXSIZE);\n"
"sum = co2 * s_dstPatch[1 + get_local_id(1) ][lx];\n"
"sum = mad(co2, s_dstPatch[1 + get_local_id(1) + 1][lx], sum);\n"
"storepix(convertToT(sum), dstData + (dst_y+1) * dst_step + dst_x * PIXSIZE);\n"
"sum = co2 * s_dstPatch[1 + get_local_id(1) ][lx+1];\n"
"sum = mad(co2, s_dstPatch[1 + get_local_id(1) + 1][lx+1], sum);\n"
"storepix(convertToT(sum), dstData + (dst_y+1) * dst_step + (dst_x+1) * PIXSIZE);\n"
"}\n"
"}\n"
// 32-hex-digit string stored with the entry (presumably a digest of the
// program text -- TODO confirm against the generator).
, "e48abb0036bd5e090ad06600b018eec9"};
// Builds a ProgramSource object from pyr_up.programStr.
ProgramSource pyr_up_oclsrc(pyr_up.programStr);
// ---------------------------------------------------------------------------
// Program entry "remap". Only its first preprocessor lines fall inside this
// range; the program text continues on the following physical lines.
// ---------------------------------------------------------------------------
const struct ProgramEntry remap={"remap",
"#ifdef DOUBLE_SUPPORT\n"
"#ifdef cl_amd_fp64\n"
"#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
"#elif defined (cl_khr_fp64)\n"
"#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
"#endif\n"
"#endif\n"
"#define noconvert\n"
// Pixel access: direct T load/store unless cn == 3 (vload3/vstore3 on T1);
// convertScalar(a) is the identity in the cn != 3 case.
"#if cn != 3\n"
"#define loadpix(addr) *(__global const T*)(addr)\n"
"#define storepix(val, addr) *(__global T*)(addr) = val\n"
"#define TSIZE ((int)sizeof(T))\n"
"#define convertScalar(a) (a)\n"
"#else\n"
"#define loadpix(addr) vload3(0, (__global const T1*)(addr))\n"
"#define storepix(val, addr) vstore3(val, 0, (__global 
T1*)(addr))\n" "#define TSIZE ((int)sizeof(T1)*3)\n" "#define convertScalar(a) (T)(a.x, a.y, a.z)\n" "#endif\n" "enum\n" "{\n" "INTER_BITS = 5,\n" "INTER_TAB_SIZE = 1 << INTER_BITS,\n" "INTER_TAB_SIZE2 = INTER_TAB_SIZE * INTER_TAB_SIZE\n" "};\n" "#ifdef INTER_NEAREST\n" "#define convertToWT\n" "#endif\n" "#ifdef BORDER_CONSTANT\n" "#define EXTRAPOLATE(v2, v) v = scalar;\n" "#elif defined BORDER_REPLICATE\n" "#define EXTRAPOLATE(v2, v) \\\n" "{ \\\n" "v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \\\n" "v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \\\n" "}\n" "#elif defined BORDER_WRAP\n" "#define EXTRAPOLATE(v2, v) \\\n" "{ \\\n" "if (v2.x < 0) \\\n" "v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \\\n" "if (v2.x >= src_cols) \\\n" "v2.x %= src_cols; \\\n" "\\\n" "if (v2.y < 0) \\\n" "v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \\\n" "if( v2.y >= src_rows ) \\\n" "v2.y %= src_rows; \\\n" "v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \\\n" "}\n" "#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)\n" "#ifdef BORDER_REFLECT\n" "#define DELTA int delta = 0\n" "#else\n" "#define DELTA int delta = 1\n" "#endif\n" "#define EXTRAPOLATE(v2, v) \\\n" "{ \\\n" "DELTA; \\\n" "if (src_cols == 1) \\\n" "v2.x = 0; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if( v2.x < 0 ) \\\n" "v2.x = -v2.x - 1 + delta; \\\n" "else \\\n" "v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \\\n" "} \\\n" "while (v2.x >= src_cols || v2.x < 0); \\\n" "\\\n" "if (src_rows == 1) \\\n" "v2.y = 0; \\\n" "else \\\n" "do \\\n" "{ \\\n" "if( v2.y < 0 ) \\\n" "v2.y = -v2.y - 1 + delta; \\\n" "else \\\n" "v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \\\n" "} \\\n" "while (v2.y >= src_rows || v2.y < 0); \\\n" "v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \\\n" "}\n" "#else\n" "#error 
No extrapolation method\n" "#endif\n" "#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)\n" "#ifdef INTER_NEAREST\n" "__kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__global const uchar * map1ptr, int map1_step, int map1_offset,\n" "__global const uchar * map2ptr, int map2_step, int map2_offset,\n" "ST nVal)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * rowsPerWI;\n" "if (x < dst_cols)\n" "{\n" "T scalar = convertScalar(nVal);\n" "int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));\n" "int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));\n" "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n" "#pragma unroll\n" "for (int i = 0; i < rowsPerWI; ++i, ++y,\n" "map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)\n" "if (y < dst_rows)\n" "{\n" "__global const float * map1 = (__global const float *)(map1ptr + map1_index);\n" "__global const float * map2 = (__global const float *)(map2ptr + map2_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "int gx = convert_int_sat_rte(map1[0]);\n" "int gy = convert_int_sat_rte(map2[0]);\n" "if (NEED_EXTRAPOLATION(gx, gy))\n" "{\n" "#ifndef BORDER_CONSTANT\n" "int2 gxy = (int2)(gx, gy);\n" "#endif\n" "T v;\n" "EXTRAPOLATE(gxy, v)\n" "storepix(v, dst);\n" "}\n" "else\n" "{\n" "int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));\n" "storepix(loadpix((__global const T*)(srcptr + src_index)), dst);\n" "}\n" "}\n" "}\n" "}\n" "__kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__global const uchar * mapptr, int map_step, int map_offset,\n" "ST 
nVal)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * rowsPerWI;\n" "if (x < dst_cols)\n" "{\n" "T scalar = convertScalar(nVal);\n" "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n" "int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));\n" "#pragma unroll\n" "for (int i = 0; i < rowsPerWI; ++i, ++y,\n" "map_index += map_step, dst_index += dst_step)\n" "if (y < dst_rows)\n" "{\n" "__global const float2 * map = (__global const float2 *)(mapptr + map_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "int2 gxy = convert_int2_sat_rte(map[0]);\n" "int gx = gxy.x, gy = gxy.y;\n" "if (NEED_EXTRAPOLATION(gx, gy))\n" "{\n" "T v;\n" "EXTRAPOLATE(gxy, v)\n" "storepix(v, dst);\n" "}\n" "else\n" "{\n" "int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));\n" "storepix(loadpix((__global const T *)(srcptr + src_index)), dst);\n" "}\n" "}\n" "}\n" "}\n" "__kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__global const uchar * mapptr, int map_step, int map_offset,\n" "ST nVal)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * rowsPerWI;\n" "if (x < dst_cols)\n" "{\n" "T scalar = convertScalar(nVal);\n" "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n" "int map_index = mad24(y, map_step, mad24(x, (int)sizeof(short2), map_offset));\n" "#pragma unroll\n" "for (int i = 0; i < rowsPerWI; ++i, ++y,\n" "map_index += map_step, dst_index += dst_step)\n" "if (y < dst_rows)\n" "{\n" "__global const short2 * map = (__global const short2 *)(mapptr + map_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "int2 gxy = convert_int2(map[0]);\n" "int gx = gxy.x, gy = gxy.y;\n" "if (NEED_EXTRAPOLATION(gx, gy))\n" "{\n" "T v;\n" "EXTRAPOLATE(gxy, v)\n" "storepix(v, dst);\n" "}\n" "else\n" "{\n" "int 
src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));\n" "storepix(loadpix((__global const T *)(srcptr + src_index)), dst);\n" "}\n" "}\n" "}\n" "}\n" "__kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__global const uchar * map1ptr, int map1_step, int map1_offset,\n" "__global const uchar * map2ptr, int map2_step, int map2_offset,\n" "ST nVal)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * rowsPerWI;\n" "if (x < dst_cols)\n" "{\n" "T scalar = convertScalar(nVal);\n" "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n" "int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));\n" "int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));\n" "#pragma unroll\n" "for (int i = 0; i < rowsPerWI; ++i, ++y,\n" "map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)\n" "if (y < dst_rows)\n" "{\n" "__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);\n" "__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "int map2Value = convert_int(map2[0]) & (INTER_TAB_SIZE2 - 1);\n" "int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0;\n" "int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 
1 : 0;\n" "int2 gxy = convert_int2(map1[0]) + (int2)(dx, dy);\n" "int gx = gxy.x, gy = gxy.y;\n" "if (NEED_EXTRAPOLATION(gx, gy))\n" "{\n" "T v;\n" "EXTRAPOLATE(gxy, v)\n" "storepix(v, dst);\n" "}\n" "else\n" "{\n" "int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset));\n" "storepix(loadpix((__global const T *)(srcptr + src_index)), dst);\n" "}\n" "}\n" "}\n" "}\n" "#elif defined INTER_LINEAR\n" "__constant float coeffs[64] =\n" "{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,\n" "0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,\n" "0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,\n" "0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,\n" "0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,\n" "0.062500f, 0.937500f, 0.031250f, 0.968750f };\n" "__kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__global const uchar * map1ptr, int map1_step, int map1_offset,\n" "__global const uchar * map2ptr, int map2_step, int map2_offset,\n" "ST nVal)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * rowsPerWI;\n" "if (x < dst_cols)\n" "{\n" "WT scalar = convertToWT(convertScalar(nVal));\n" "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n" "int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));\n" "int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));\n" "#pragma unroll\n" "for (int i = 0; i < rowsPerWI; ++i, ++y,\n" 
"map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)\n" "if (y < dst_rows)\n" "{\n" "__global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index);\n" "__global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "int2 map_dataA = convert_int2(map1[0]);\n" "int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);\n" "int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);\n" "int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);\n" "ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1));\n" "WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE);\n" "WT a = scalar, b = scalar, c = scalar, d = scalar;\n" "if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))\n" "a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataA, a);\n" "if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))\n" "b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataB, b);\n" "if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))\n" "c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataC, c);\n" "if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))\n" "d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataD, d);\n" "WT dst_data = a * (1 - u.x) * (1 - u.y) +\n" "b * (u.x) * (1 - u.y) +\n" "c * (1 - u.x) * (u.y) +\n" "d * (u.x) * (u.y);\n" "storepix(convertToT(dst_data), dst);\n" "}\n" "}\n" "}\n" "__kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int 
dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__global const uchar * map1ptr, int map1_step, int map1_offset,\n" "__global const uchar * map2ptr, int map2_step, int map2_offset,\n" "ST nVal)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * rowsPerWI;\n" "if (x < dst_cols)\n" "{\n" "WT scalar = convertToWT(convertScalar(nVal));\n" "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n" "int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));\n" "int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));\n" "#pragma unroll\n" "for (int i = 0; i < rowsPerWI; ++i, ++y,\n" "map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)\n" "if (y < dst_rows)\n" "{\n" "__global const float * map1 = (__global const float *)(map1ptr + map1_index);\n" "__global const float * map2 = (__global const float *)(map2ptr + map2_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "#if defined BORDER_CONSTANT\n" "float xf = map1[0], yf = map2[0];\n" "int sx = convert_int_sat_rtz(mad(xf, INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;\n" "int sy = convert_int_sat_rtz(mad(yf, INTER_TAB_SIZE, 0.5f)) >> INTER_BITS;\n" "__constant float * coeffs_x = coeffs + ((convert_int_rte(xf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);\n" "__constant float * coeffs_y = coeffs + ((convert_int_rte(yf * INTER_TAB_SIZE) & (INTER_TAB_SIZE - 1)) << 1);\n" "WT sum = (WT)(0), xsum;\n" "int src_index = mad24(sy, src_step, mad24(sx, TSIZE, src_offset));\n" "#pragma unroll\n" "for (int yp = 0; yp < 2; ++yp, src_index += src_step)\n" "{\n" "if (sy + yp >= 0 && sy + yp < src_rows)\n" "{\n" "xsum = (WT)(0);\n" "if (sx >= 0 && sx + 2 < src_cols)\n" "{\n" "#if depth == 0 && cn == 1\n" "uchar2 value = vload2(0, srcptr + src_index);\n" "xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));\n" "#else\n" "#pragma unroll\n" "for (int xp = 0; xp < 2; ++xp)\n" "xsum = 
fma(convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);\n" "#endif\n" "}\n" "else\n" "{\n" "#pragma unroll\n" "for (int xp = 0; xp < 2; ++xp)\n" "xsum = fma(sx + xp >= 0 && sx + xp < src_cols ?\n" "convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);\n" "}\n" "sum = fma(xsum, coeffs_y[yp], sum);\n" "}\n" "else\n" "sum = fma(scalar, coeffs_y[yp], sum);\n" "}\n" "storepix(convertToT(sum), dst);\n" "#else\n" "float2 map_data = (float2)(map1[0], map2[0]);\n" "int2 map_dataA = convert_int2_sat_rtn(map_data);\n" "int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);\n" "int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);\n" "int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);\n" "float2 _u = map_data - convert_float2(map_dataA);\n" "WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;\n" "WT scalar = convertToWT(convertScalar(nVal));\n" "WT a = scalar, b = scalar, c = scalar, d = scalar;\n" "if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))\n" "a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataA, a);\n" "if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))\n" "b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataB, b);\n" "if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))\n" "c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataC, c);\n" "if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))\n" "d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataD, d);\n" "WT dst_data = a * (1 - u.x) * (1 - u.y) +\n" "b * (u.x) * (1 - u.y) +\n" "c * (1 - u.x) * 
(u.y) +\n" "d * (u.x) * (u.y);\n" "storepix(convertToT(dst_data), dst);\n" "#endif\n" "}\n" "}\n" "}\n" "__kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__global const uchar * mapptr, int map_step, int map_offset,\n" "ST nVal)\n" "{\n" "int x = get_global_id(0);\n" "int y = get_global_id(1) * rowsPerWI;\n" "if (x < dst_cols)\n" "{\n" "WT scalar = convertToWT(convertScalar(nVal));\n" "int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));\n" "int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));\n" "#pragma unroll\n" "for (int i = 0; i < rowsPerWI; ++i, ++y,\n" "map_index += map_step, dst_index += dst_step)\n" "if (y < dst_rows)\n" "{\n" "__global const float2 * map = (__global const float2 *)(mapptr + map_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "float2 map_data = map[0];\n" "int2 map_dataA = convert_int2_sat_rtn(map_data);\n" "int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);\n" "int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);\n" "int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);\n" "float2 _u = map_data - convert_float2(map_dataA);\n" "WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;\n" "WT a = scalar, b = scalar, c = scalar, d = scalar;\n" "if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))\n" "a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataA, a);\n" "if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))\n" "b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataB, b);\n" "if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))\n" "c = convertToWT(loadpix((__global const T 
*)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataC, c);\n" "if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))\n" "d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));\n" "else\n" "EXTRAPOLATE(map_dataD, d);\n" "WT dst_data = a * (1 - u.x) * (1 - u.y) +\n" "b * (u.x) * (1 - u.y) +\n" "c * (1 - u.x) * (u.y) +\n" "d * (u.x) * (u.y);\n" "storepix(convertToT(dst_data), dst);\n" "}\n" "}\n" "}\n" "#endif\n" , "6833b9a226d061c1ff80509eed0dd178"}; ProgramSource remap_oclsrc(remap.programStr);

// ---------------------------------------------------------------------------
// resize: embedded OpenCL C source for the image-resize kernels.
//
// The adjacent string literals below are concatenated by the compiler into one
// kernel source string. The comments in this initializer sit between the
// literals, so they are not part of that string.
//
// Exactly one kernel is compiled, selected by the preprocessor chain
//   USE_SAMPLER / INTER_LINEAR_INTEGER / INTER_LINEAR / INTER_NEAREST /
//   INTER_AREA (with INTER_AREA_FAST as a sub-variant).
// T, T1, WT, WTV, WT2V, cn, depth, the convertTo* helpers,
// INTER_RESIZE_COEF_BITS, XSCALE, YSCALE and SCALE are not defined in this
// text; they must be supplied from outside (presumably as OpenCL build
// options -- confirm against the host-side caller).
// ---------------------------------------------------------------------------
const struct ProgramEntry resize={"resize",
    // Optional fp64 extension enable.
    "#ifdef DOUBLE_SUPPORT\n"
    "#ifdef cl_amd_fp64\n"
    "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
    "#elif defined (cl_khr_fp64)\n"
    "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
    "#endif\n"
    "#endif\n"
    // Fixed-point helpers and pixel load/store macros; 3-channel pixels use
    // vload3/vstore3 on the scalar element type T1 instead of a T pointer.
    "#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)\n"
    "#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)\n"
    "#define INC(x,l) min(x+1,l-1)\n"
    "#define noconvert\n"
    "#if cn != 3\n"
    "#define loadpix(addr) *(__global const T *)(addr)\n"
    "#define storepix(val, addr) *(__global T *)(addr) = val\n"
    "#define TSIZE (int)sizeof(T)\n"
    "#else\n"
    "#define loadpix(addr) vload3(0, (__global const T1 *)(addr))\n"
    "#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))\n"
    "#define TSIZE (int)sizeof(T1)*cn\n"
    "#endif\n"
    // --- USE_SAMPLER: resizeSampler -----------------------------------------
    // Reads the source through an image sampler (unnormalized coordinates,
    // clamp-to-edge addressing, linear filtering), multiplies the result by
    // RESULT_SCALE (chosen from `depth`) and stores it when (dx, dy) lies
    // inside the destination.
    "#if defined USE_SAMPLER\n"
    "#if cn == 1\n"
    "#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).x\n"
    "#define INTERMEDIATE_TYPE float\n"
    "#elif cn == 2\n"
    "#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).xy\n"
    "#define INTERMEDIATE_TYPE float2\n"
    "#elif cn == 3\n"
    "#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).xyz\n"
    "#define INTERMEDIATE_TYPE float3\n"
    "#elif cn == 4\n"
    "#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z)\n"
    "#define INTERMEDIATE_TYPE float4\n"
    "#endif\n"
    "#define __CAT(x, y) x##y\n"
    "#define CAT(x, y) __CAT(x, y)\n"
    "#define float1 float\n"
    "#if depth == 0\n"
    "#define RESULT_SCALE 255.0f\n"
    "#elif depth == 1\n"
    "#define RESULT_SCALE 127.0f\n"
    "#elif depth == 2\n"
    "#define RESULT_SCALE 65535.0f\n"
    "#elif depth == 3\n"
    "#define RESULT_SCALE 32767.0f\n"
    "#else\n"
    "#define RESULT_SCALE 1.0f\n"
    "#endif\n"
    "__kernel void resizeSampler(__read_only image2d_t srcImage,\n"
    "__global uchar* dstptr, int dststep, int dstoffset,\n"
    "int dstrows, int dstcols,\n"
    "float ifx, float ify)\n"
    "{\n"
    "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
    "CLK_ADDRESS_CLAMP_TO_EDGE |\n"
    "CLK_FILTER_LINEAR;\n"
    "int dx = get_global_id(0);\n"
    "int dy = get_global_id(1);\n"
    "float sx = ((dx+0.5f) * ifx), sy = ((dy+0.5f) * ify);\n"
    "INTERMEDIATE_TYPE intermediate = READ_IMAGE(srcImage, sampler, (float2)(sx, sy));\n"
    "#if depth <= 4\n"
    "T uval = convertToDT(round(intermediate * RESULT_SCALE));\n"
    "#else\n"
    "T uval = convertToDT(intermediate * RESULT_SCALE);\n"
    "#endif\n"
    "if(dx < dstcols && dy < dstrows)\n"
    "{\n"
    "storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*TSIZE));\n"
    "}\n"
    "}\n"
    // --- INTER_LINEAR_INTEGER: resizeLN -------------------------------------
    // Reads per-column / per-row tables out of `buffer`, laid out as:
    // xofs[dst_cols], yofs[dst_rows], then short weight pairs (ialpha for
    // columns, ibeta for rows). Four source pixels are combined with these
    // integer weights using the >>4, >>16 and (val + 2) >> 2 shifts below.
    "#elif defined INTER_LINEAR_INTEGER\n"
    "__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
    "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
    "__global const uchar * buffer)\n"
    "{\n"
    "int dx = get_global_id(0);\n"
    "int dy = get_global_id(1);\n"
    "if (dx < dst_cols && dy < dst_rows)\n"
    "{\n"
    "__global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols;\n"
    "__global const short * ialpha = (__global const short *)(yofs + dst_rows);\n"
    "__global const short * ibeta = ialpha + ((dst_cols + dy) << 1);\n"
    "ialpha += dx << 1;\n"
    "int sx0 = xofs[dx], sy0 = clamp(yofs[dy], 0, src_rows - 1),\n"
    "sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);\n"
    "short a0 = ialpha[0], a1 = ialpha[1];\n"
    "short b0 = ibeta[0], b1 = ibeta[1];\n"
    "int src_index0 = mad24(sy0, src_step, mad24(sx0, TSIZE, src_offset)),\n"
    "src_index1 = mad24(sy1, src_step, mad24(sx0, TSIZE, src_offset));\n"
    "WT data0 = convertToWT(loadpix(srcptr + src_index0));\n"
    "WT data1 = convertToWT(loadpix(srcptr + src_index0 + TSIZE));\n"
    "WT data2 = convertToWT(loadpix(srcptr + src_index1));\n"
    "WT data3 = convertToWT(loadpix(srcptr + src_index1 + TSIZE));\n"
    "WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +\n"
    "( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);\n"
    "storepix(convertToDT((val + 2) >> 2),\n"
    "dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));\n"
    "}\n"
    "}\n"
    // --- INTER_LINEAR: resizeLN ---------------------------------------------
    // Maps the destination pixel centre to source coordinates with ifx/ify,
    // clamps x/y into the source (zeroing the fractional weight when clamped)
    // and blends the 2x2 neighbourhood. For depth <= 4 the weights are scaled
    // by INTER_RESIZE_COEF_SCALE and the sum is rounded and shifted back by
    // CAST_BITS; otherwise the blend is done directly with float weights.
    "#elif defined INTER_LINEAR\n"
    "__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
    "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
    "float ifx, float ify)\n"
    "{\n"
    "int dx = get_global_id(0);\n"
    "int dy = get_global_id(1);\n"
    "if (dx < dst_cols && dy < dst_rows)\n"
    "{\n"
    "float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);\n"
    "int x = floor(sx), y = floor(sy);\n"
    "float u = sx - x, v = sy - y;\n"
    "if ( x<0 ) x=0,u=0;\n"
    "if ( x>=src_cols ) x=src_cols-1,u=0;\n"
    "if ( y<0 ) y=0,v=0;\n"
    "if ( y>=src_rows ) y=src_rows-1,v=0;\n"
    "int y_ = INC(y, src_rows);\n"
    "int x_ = INC(x, src_cols);\n"
    "#if depth <= 4\n"
    "u = u * INTER_RESIZE_COEF_SCALE;\n"
    "v = v * INTER_RESIZE_COEF_SCALE;\n"
    "int U = rint(u);\n"
    "int V = rint(v);\n"
    "int U1 = rint(INTER_RESIZE_COEF_SCALE - u);\n"
    "int V1 = rint(INTER_RESIZE_COEF_SCALE - v);\n"
    "WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));\n"
    "WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));\n"
    "WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));\n"
    "WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));\n"
    "WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +\n"
    "mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);\n"
    "T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);\n"
    "#else\n"
    "float u1 = 1.f - u;\n"
    "float v1 = 1.f - v;\n"
    "WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));\n"
    "WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));\n"
    "WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));\n"
    "WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));\n"
    "T uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;\n"
    "#endif\n"
    "storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));\n"
    "}\n"
    "}\n"
    // --- INTER_NEAREST: resizeNN --------------------------------------------
    // Copies one source pixel per destination pixel: the source index is
    // dx*ifx / dy*ify converted with round-toward-zero and capped at the last
    // column / row via min().
    "#elif defined INTER_NEAREST\n"
    "__kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
    "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
    "float ifx, float ify)\n"
    "{\n"
    "int dx = get_global_id(0);\n"
    "int dy = get_global_id(1);\n"
    "if (dx < dst_cols && dy < dst_rows)\n"
    "{\n"
    "float s1 = dx * ifx;\n"
    "float s2 = dy * ify;\n"
    "int sx = min(convert_int_rtz(s1), src_cols - 1);\n"
    "int sy = min(convert_int_rtz(s2), src_rows - 1);\n"
    "storepix(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))),\n"
    "dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));\n"
    "}\n"
    "}\n"
    // --- INTER_AREA / INTER_AREA_FAST: resizeAREA_FAST ----------------------
    // Sums an XSCALE x YSCALE block of source pixels (coordinates capped at
    // the last column / row) and multiplies the sum by SCALE before storing.
    "#elif defined INTER_AREA\n"
    "#ifdef INTER_AREA_FAST\n"
    "__kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
    "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)\n"
    "{\n"
    "int dx = get_global_id(0);\n"
    "int dy = get_global_id(1);\n"
    "if (dx < dst_cols && dy < dst_rows)\n"
    "{\n"
    "int dst_index = mad24(dy, dst_step, dst_offset);\n"
    "int sx = XSCALE * dx;\n"
    "int sy = YSCALE * dy;\n"
    "WTV sum = (WTV)(0);\n"
    "#pragma unroll\n"
    "for (int py = 0; py < YSCALE; ++py)\n"
    "{\n"
    "int y = min(sy + py, src_rows - 1);\n"
    "int src_index = mad24(y, src_step, src_offset);\n"
    "#pragma unroll\n"
    "for (int px = 0; px < XSCALE; ++px)\n"
    "{\n"
    "int x = min(sx + px, src_cols - 1);\n"
    "sum += convertToWTV(loadpix(src + src_index + x*TSIZE));\n"
    "}\n"
    "}\n"
    "storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));\n"
    "}\n"
    "}\n"
    // --- INTER_AREA (general): resizeAREA -----------------------------------
    // Weighted sum over a source rectangle described by lookup tables:
    // ofs_tab gives the [k0, k1) table range per destination column / row
    // (row part starts at ofs_tab + dst_cols + 1); map_tab gives source
    // indices and alpha_tab the weights (row parts start at + 2*src_cols).
    "#else\n"
    "__kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,\n"
    "__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
    "float ifx, float ify, __global const int * ofs_tab,\n"
    "__global const int * map_tab, __global const float * alpha_tab)\n"
    "{\n"
    "int dx = get_global_id(0);\n"
    "int dy = get_global_id(1);\n"
    "if (dx < dst_cols && dy < dst_rows)\n"
    "{\n"
    "int dst_index = mad24(dy, dst_step, dst_offset);\n"
    "__global const int * xmap_tab = map_tab;\n"
    "__global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));\n"
    "__global const float * xalpha_tab = alpha_tab;\n"
    "__global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));\n"
    "__global const int * xofs_tab = ofs_tab;\n"
    "__global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);\n"
    "int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];\n"
    "int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];\n"
    "int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];\n"
    "int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];\n"
    "WTV sum = (WTV)(0), buf;\n"
    "int src_index = mad24(sy0, src_step, src_offset);\n"
    "for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)\n"
    "{\n"
    "WTV beta = (WTV)(yalpha_tab[yk]);\n"
    "buf = (WTV)(0);\n"
    "for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)\n"
    "{\n"
    "WTV alpha = (WTV)(xalpha_tab[xk]);\n"
    "buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;\n"
    "}\n"
    "sum += buf * beta;\n"
    "}\n"
    "storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));\n"
    "}\n"
    "}\n"
    "#endif\n"
    "#endif\n"
    // NOTE(review): the second field looks like a digest of the kernel text
    // above -- confirm before changing any of the string literals by hand.
    , "3e1ea3c21fc70a7a9166d5cc66b7ff80"};
ProgramSource resize_oclsrc(resize.programStr); const struct ProgramEntry threshold={"threshold", "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#endif\n" "__kernel void threshold(__global const uchar * srcptr, int src_step, int src_offset,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols,\n" "T1 thresh, T1 max_val, T1 min_val)\n" "{\n" "int gx = get_global_id(0);\n" "int gy = get_global_id(1) * STRIDE_SIZE;\n" "if (gx < cols)\n" "{\n" "int src_index = mad24(gy, src_step, mad24(gx, (int)sizeof(T), src_offset));\n" "int dst_index = mad24(gy, dst_step, mad24(gx, (int)sizeof(T), dst_offset));\n" "#pragma unroll\n" "for (int i = 0; i < STRIDE_SIZE; i++)\n" "{\n" "if (gy < rows)\n" "{\n" "T sdata = *(__global const T *)(srcptr + src_index);\n" "__global T * dst = (__global T *)(dstptr + dst_index);\n" "#ifdef THRESH_BINARY\n" "dst[0] = sdata > (thresh) ? (T)(max_val) : (T)(0);\n" "#elif defined THRESH_BINARY_INV\n" "dst[0] = sdata > (thresh) ? (T)(0) : (T)(max_val);\n" "#elif defined THRESH_TRUNC\n" "dst[0] = clamp(sdata, (T)min_val, (T)(thresh));\n" "#elif defined THRESH_TOZERO\n" "dst[0] = sdata > (thresh) ? sdata : (T)(0);\n" "#elif defined THRESH_TOZERO_INV\n" "dst[0] = sdata > (thresh) ? 
(T)(0) : sdata;\n" "#endif\n" "gy++;\n" "src_index += src_step;\n" "dst_index += dst_step;\n" "}\n" "}\n" "}\n" "}\n" , "f464151682565a20de380a62e09ae458"}; ProgramSource threshold_oclsrc(threshold.programStr); const struct ProgramEntry warp_affine={"warp_affine", "#ifdef DOUBLE_SUPPORT\n" "#ifdef cl_amd_fp64\n" "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" "#elif defined (cl_khr_fp64)\n" "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" "#endif\n" "#define CT double\n" "#else\n" "#define CT float\n" "#endif\n" "#define INTER_BITS 5\n" "#define INTER_TAB_SIZE (1 << INTER_BITS)\n" "#define INTER_SCALE 1.f/INTER_TAB_SIZE\n" "#define AB_BITS max(10, (int)INTER_BITS)\n" "#define AB_SCALE (1 << AB_BITS)\n" "#define INTER_REMAP_COEF_BITS 15\n" "#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)\n" "#define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))\n" "#define noconvert\n" "#ifndef ST\n" "#define ST T\n" "#endif\n" "#if cn != 3\n" "#define loadpix(addr) *(__global const T*)(addr)\n" "#define storepix(val, addr) *(__global T*)(addr) = val\n" "#define scalar scalar_\n" "#define pixsize (int)sizeof(T)\n" "#else\n" "#define loadpix(addr) vload3(0, (__global const T1*)(addr))\n" "#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))\n" "#ifdef INTER_NEAREST\n" "#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)\n" "#else\n" "#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)\n" "#endif\n" "#define pixsize ((int)sizeof(T1)*3)\n" "#endif\n" "#ifdef INTER_NEAREST\n" "__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__constant CT * M, ST scalar_)\n" "{\n" "int dx = get_global_id(0);\n" "int dy0 = get_global_id(1) * rowsPerWI;\n" "if (dx < dst_cols)\n" "{\n" "int round_delta = (AB_SCALE >> 1);\n" "int X0_ = rint(M[0] * dx * AB_SCALE);\n" "int Y0_ = rint(M[3] * dx * AB_SCALE);\n" "int dst_index = 
mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));\n" "for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)\n" "{\n" "int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;\n" "int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;\n" "short sx = convert_short_sat(X0 >> AB_BITS);\n" "short sy = convert_short_sat(Y0 >> AB_BITS);\n" "if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)\n" "{\n" "int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset));\n" "storepix(loadpix(srcptr + src_index), dstptr + dst_index);\n" "}\n" "else\n" "storepix(scalar, dstptr + dst_index);\n" "}\n" "}\n" "}\n" "#elif defined INTER_LINEAR\n" "__constant float coeffs[64] =\n" "{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,\n" "0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,\n" "0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,\n" "0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,\n" "0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,\n" "0.062500f, 0.937500f, 0.031250f, 0.968750f };\n" "__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__constant CT * M, ST scalar_)\n" "{\n" "int dx = get_global_id(0);\n" "int dy0 = get_global_id(1) * rowsPerWI;\n" "if (dx < dst_cols)\n" "{\n" "int tmp = dx << AB_BITS;\n" "int X0_ = rint(M[0] * tmp);\n" "int Y0_ = rint(M[3] * tmp);\n" "for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)\n" 
"{\n" "int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;\n" "int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;\n" "X0 = X0 >> (AB_BITS - INTER_BITS);\n" "Y0 = Y0 >> (AB_BITS - INTER_BITS);\n" "short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);\n" "short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));\n" "#if defined AMD_DEVICE || depth > 4\n" "WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;\n" "if (sx >= 0 && sx < src_cols)\n" "{\n" "if (sy >= 0 && sy < src_rows)\n" "v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));\n" "if (sy+1 >= 0 && sy+1 < src_rows)\n" "v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));\n" "}\n" "if (sx+1 >= 0 && sx+1 < src_cols)\n" "{\n" "if (sy >= 0 && sy < src_rows)\n" "v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));\n" "if (sy+1 >= 0 && sy+1 < src_rows)\n" "v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));\n" "}\n" "float taby = 1.f/INTER_TAB_SIZE*ay;\n" "float tabx = 1.f/INTER_TAB_SIZE*ax;\n" "int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));\n" "#if depth <= 4\n" "int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );\n" "int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );\n" "int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );\n" "int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );\n" "WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3)));\n" "storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);\n" "#else\n" "float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;\n" "WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));\n" 
"storepix(convertToT(val), dstptr + dst_index);\n" "#endif\n" "#else\n" "__constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);\n" "int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;\n" "int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));\n" "WT sum = (WT)(0), xsum;\n" "#pragma unroll\n" "for (int y = 0; y < 2; y++)\n" "{\n" "src_index = mad24(y, src_step, src_index0);\n" "if (sy + y >= 0 && sy + y < src_rows)\n" "{\n" "xsum = (WT)(0);\n" "if (sx >= 0 && sx + 2 < src_cols)\n" "{\n" "#if depth == 0 && cn == 1\n" "uchar2 value = vload2(0, srcptr + src_index);\n" "xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));\n" "#else\n" "#pragma unroll\n" "for (int x = 0; x < 2; x++)\n" "xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);\n" "#endif\n" "}\n" "else\n" "{\n" "#pragma unroll\n" "for (int x = 0; x < 2; x++)\n" "xsum = fma(sx + x >= 0 && sx + x < src_cols ?\n" "convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);\n" "}\n" "sum = fma(xsum, coeffs_y[y], sum);\n" "}\n" "else\n" "sum = fma(scalar, coeffs_y[y], sum);\n" "}\n" "storepix(convertToT(sum), dstptr + dst_index);\n" "#endif\n" "}\n" "}\n" "}\n" "#elif defined INTER_CUBIC\n" "#ifdef AMD_DEVICE\n" "inline void interpolateCubic( float x, float* coeffs )\n" "{\n" "const float A = -0.75f;\n" "coeffs[0] = fma(fma(fma(A, (x + 1.f), - 5.0f*A), (x + 1.f), 8.0f*A), x + 1.f, - 4.0f*A);\n" "coeffs[1] = fma(fma(A + 2.f, x, - (A + 3.f)), x*x, 1.f);\n" "coeffs[2] = fma(fma(A + 2.f, 1.f - x, - (A + 3.f)), (1.f - x)*(1.f - x), 1.f);\n" "coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];\n" "}\n" "#else\n" "__constant float coeffs[128] =\n" "{ 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,\n" "-0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 
0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,\n" "-0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,\n" "-0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,\n" "-0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,\n" "-0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,\n" "-0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,\n" "-0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,\n" "-0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,\n" "-0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,\n" "-0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };\n" "#endif\n" "__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n" "__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n" "__constant CT * M, ST scalar_)\n" "{\n" "int dx = get_global_id(0);\n" "int dy = get_global_id(1);\n" "if (dx < dst_cols && dy < dst_rows)\n" "{\n" "int tmp = (dx << AB_BITS);\n" "int X0 = rint(M[0] * tmp) + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;\n" "int Y0 = rint(M[3] * tmp) + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;\n" "X0 = X0 >> (AB_BITS - INTER_BITS);\n" "Y0 = Y0 >> (AB_BITS - INTER_BITS);\n" "int sx = (short)(X0 >> INTER_BITS) - 1, sy = 
(short)(Y0 >> INTER_BITS) - 1;\n" "int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));\n" "#ifdef AMD_DEVICE\n" "WT v[16];\n" "#pragma unroll\n" "for (int y = 0; y < 4; y++)\n" "{\n" "if (sy+y >= 0 && sy+y < src_rows)\n" "{\n" "#pragma unroll\n" "for (int x = 0; x < 4; x++)\n" "v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ?\n" "convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;\n" "}\n" "else\n" "{\n" "#pragma unroll\n" "for (int x = 0; x < 4; x++)\n" "v[mad24(y, 4, x)] = scalar;\n" "}\n" "}\n" "float tab1y[4], tab1x[4];\n" "float ayy = INTER_SCALE * ay;\n" "float axx = INTER_SCALE * ax;\n" "interpolateCubic(ayy, tab1y);\n" "interpolateCubic(axx, tab1x);\n" "int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));\n" "WT sum = (WT)(0);\n" "#if depth <= 4\n" "int itab[16];\n" "#pragma unroll\n" "for (int i = 0; i < 16; i++)\n" "itab[i] = rint(tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE);\n" "#pragma unroll\n" "for (int i = 0; i < 16; i++)\n" "sum = mad24(v[i], itab[i], sum);\n" "storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);\n" "#else\n" "#pragma unroll\n" "for (int i = 0; i < 16; i++)\n" "sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);\n" "storepix(convertToT( sum ), dstptr + dst_index);\n" "#endif\n" "#else\n" "__constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);\n" "int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;\n" "int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));\n" "WT sum = (WT)(0), xsum;\n" "#pragma unroll\n" "for (int y = 0; y < 4; y++)\n" "{\n" "src_index = mad24(y, src_step, src_index0);\n" "if (sy + y >= 0 && sy + y < src_rows)\n" "{\n" "xsum = (WT)(0);\n" "if (sx >= 0 && sx + 4 < src_cols)\n" "{\n" "#if depth == 0 && cn == 1\n" "uchar4 value = vload4(0, srcptr + src_index);\n" "xsum = 
dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));\n" "#else\n" "#pragma unroll\n" "for (int x = 0; x < 4; x++)\n" "xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);\n" "#endif\n" "}\n" "else\n" "{\n" "#pragma unroll\n" "for (int x = 0; x < 4; x++)\n" "xsum = fma(sx + x >= 0 && sx + x < src_cols ?\n" "convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);\n" "}\n" "sum = fma(xsum, coeffs_y[y], sum);\n" "}\n" "else\n" "sum = fma(scalar, coeffs_y[y], sum);\n" "}\n" "storepix(convertToT(sum), dstptr + dst_index);\n" "#endif\n" "}\n" "}\n" "#endif\n" , "582cfe4cf8dd76973e63698796247546"}; ProgramSource warp_affine_oclsrc(warp_affine.programStr);
// Embedded OpenCL program `warp_perspective`. The entry is initialised with
// the program name, the kernel source assembled from the adjacent string
// literals below, and a trailing 32-character hex string (presumably a
// digest of the source written by the generator -- TODO confirm).
// The source defines a single kernel, warpPerspective; which of its three
// bodies is compiled is selected by INTER_NEAREST / INTER_LINEAR / INTER_CUBIC.
// T, T1, WT, cn, depth, convertToT and convertToWT are used but never defined
// in this text, so they are presumably supplied as build options by the host
// code -- verify at the caller.
const struct ProgramEntry warp_perspective={"warp_perspective",
// --- Preamble: with DOUBLE_SUPPORT, enable whichever fp64 extension is
// available and use double for the matrix element type CT; otherwise float.
"#ifdef DOUBLE_SUPPORT\n"
"#ifdef cl_amd_fp64\n"
"#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
"#elif defined (cl_khr_fp64)\n"
"#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
"#endif\n"
"#define CT double\n"
"#else\n"
"#define CT float\n"
"#endif\n"
// --- Interpolation constants: INTER_BITS fractional bits give
// INTER_TAB_SIZE (32) sub-pixel steps; on the integer path the filter
// weights are scaled by INTER_REMAP_COEF_SCALE (1 << 15).
// AB_BITS / AB_SCALE are defined here but not referenced anywhere else in
// this program text.
"#define INTER_BITS 5\n"
"#define INTER_TAB_SIZE (1 << INTER_BITS)\n"
"#define INTER_SCALE 1.f / INTER_TAB_SIZE\n"
"#define AB_BITS max(10, (int)INTER_BITS)\n"
"#define AB_SCALE (1 << AB_BITS)\n"
"#define INTER_REMAP_COEF_BITS 15\n"
"#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)\n"
"#define noconvert\n"
// ST is the type of the scalar_ kernel argument; it falls back to T when
// the build has not defined it.
"#ifndef ST\n"
"#define ST T\n"
"#endif\n"
// --- Pixel access helpers. For cn != 3 a pixel is read and written through
// a T pointer and `scalar` is scalar_ itself. For cn == 3 pixels go through
// vload3 / vstore3 over T1 elements, pixsize is three T1 elements, and
// `scalar` is rebuilt from the x/y/z components of scalar_ (as T for the
// nearest variant, as WT otherwise).
"#if cn != 3\n"
"#define loadpix(addr) *(__global const T*)(addr)\n"
"#define storepix(val, addr) *(__global T*)(addr) = val\n"
"#define scalar scalar_\n"
"#define pixsize (int)sizeof(T)\n"
"#else\n"
"#define loadpix(addr) vload3(0, (__global const T1*)(addr))\n"
"#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))\n"
"#ifdef INTER_NEAREST\n"
"#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)\n"
"#else\n"
"#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)\n"
"#endif\n"
"#define pixsize ((int)sizeof(T1)*3)\n"
"#endif\n"
// --- INTER_NEAREST variant: each work-item handles the destination pixel
// (dx, dy) given by its global ids and does nothing outside dst_cols x dst_rows.
// X0, Y0 and W are computed from the nine elements of M; W is replaced by its
// reciprocal, or by 0 when W is 0 so no division by zero occurs. The
// resulting source coordinates are converted to short with saturation and
// rounding (convert_short_sat_rte). A coordinate inside
// [0, src_cols) x [0, src_rows) copies that source pixel; anything else
// stores `scalar`.
"#ifdef INTER_NEAREST\n"
"__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
"__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"__constant CT * M, ST scalar_)\n"
"{\n"
"int dx = get_global_id(0);\n"
"int dy = get_global_id(1);\n"
"if (dx < dst_cols && dy < dst_rows)\n"
"{\n"
"CT X0 = M[0] * dx + M[1] * dy + M[2];\n"
"CT Y0 = M[3] * dx + M[4] * dy + M[5];\n"
"CT W = M[6] * dx + M[7] * dy + M[8];\n"
"W = W != 0.0f ? 1.f / W : 0.0f;\n"
"short sx = convert_short_sat_rte(X0*W);\n"
"short sy = convert_short_sat_rte(Y0*W);\n"
"int dst_index = mad24(dy, dst_step, dx * pixsize + dst_offset);\n"
"if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)\n"
"{\n"
"int src_index = mad24(sy, src_step, sx * pixsize + src_offset);\n"
"storepix(loadpix(srcptr + src_index), dstptr + dst_index);\n"
"}\n"
"else\n"
"storepix(scalar, dstptr + dst_index);\n"
"}\n"
"}\n"
// --- INTER_LINEAR variant. Here W is scaled by INTER_TAB_SIZE so that the
// rounded X and Y carry INTER_BITS fractional bits: the bits above
// INTER_BITS become the sample position (sx, sy) and the low bits become
// the fractions (ax, ay).
"#elif defined INTER_LINEAR\n"
"__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
"__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"__constant CT * M, ST scalar_)\n"
"{\n"
"int dx = get_global_id(0);\n"
"int dy = get_global_id(1);\n"
"if (dx < dst_cols && dy < dst_rows)\n"
"{\n"
"CT X0 = M[0] * dx + M[1] * dy + M[2];\n"
"CT Y0 = M[3] * dx + M[4] * dy + M[5];\n"
"CT W = M[6] * dx + M[7] * dy + M[8];\n"
"W = W != 0.0f ? INTER_TAB_SIZE / W : 0.0f;\n"
"int X = rint(X0 * W), Y = rint(Y0 * W);\n"
"short sx = convert_short_sat(X >> INTER_BITS);\n"
"short sy = convert_short_sat(Y >> INTER_BITS);\n"
"short ay = (short)(Y & (INTER_TAB_SIZE - 1));\n"
"short ax = (short)(X & (INTER_TAB_SIZE - 1));\n"
// Four neighbours: v0 = (sx, sy), v1 = (sx+1, sy), v2 = (sx, sy+1),
// v3 = (sx+1, sy+1). Each one is bounds-checked on its own and replaced by
// `scalar` when it lies outside the source.
"WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?\n"
"convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;\n"
"WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?\n"
"convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;\n"
"WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?\n"
"convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;\n"
"WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?\n"
"convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;\n"
// Fractions converted to floats in [0, 1).
"float taby = 1.f/INTER_TAB_SIZE*ay;\n"
"float tabx = 1.f/INTER_TAB_SIZE*ax;\n"
"int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);\n"
// depth <= 4: the four weights are converted to integers scaled by
// INTER_REMAP_COEF_SCALE, and half of that scale is added before the final
// right shift so the result is rounded. Otherwise the blend is done with
// float weights and no rescaling.
"#if depth <= 4\n"
"int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );\n"
"int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );\n"
"int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );\n"
"int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );\n"
"WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;\n"
"storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);\n"
"#else\n"
"float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;\n"
"WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;\n"
"storepix(convertToT(val), dstptr + dst_index);\n"
"#endif\n"
"}\n"
"}\n"
// --- INTER_CUBIC variant. interpolateCubic() fills coeffs[0..3] with the
// four tap weights for the fraction x, using A = -0.75. The last weight is
// computed as one minus the other three, so the four sum to 1.
"#elif defined INTER_CUBIC\n"
"inline void interpolateCubic( float x, float* coeffs )\n"
"{\n"
"const float A = -0.75f;\n"
"coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A;\n"
"coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;\n"
"coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f;\n"
"coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];\n"
"}\n"
// Cubic kernel: X and Y are computed exactly as in the linear variant, but
// one is subtracted from the integer position so that the 4x4 block
// v[y*4 + x] covers columns sx..sx+3 and rows sy..sy+3.
"__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,\n"
"__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,\n"
"__constant CT * M, ST scalar_)\n"
"{\n"
"int dx = get_global_id(0);\n"
"int dy = get_global_id(1);\n"
"if (dx < dst_cols && dy < dst_rows)\n"
"{\n"
"CT X0 = M[0] * dx + M[1] * dy + M[2];\n"
"CT Y0 = M[3] * dx + M[4] * dy + M[5];\n"
"CT W = M[6] * dx + M[7] * dy + M[8];\n"
"W = W != 0.0f ? INTER_TAB_SIZE / W : 0.0f;\n"
"int X = rint(X0 * W), Y = rint(Y0 * W);\n"
"short sx = convert_short_sat(X >> INTER_BITS) - 1;\n"
"short sy = convert_short_sat(Y >> INTER_BITS) - 1;\n"
"short ay = (short)(Y & (INTER_TAB_SIZE-1));\n"
"short ax = (short)(X & (INTER_TAB_SIZE-1));\n"
// Gather the 16 taps; every tap outside the source takes `scalar`.
"WT v[16];\n"
"#pragma unroll\n"
"for (int y = 0; y < 4; y++)\n"
"#pragma unroll\n"
"for (int x = 0; x < 4; x++)\n"
"v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?\n"
"convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;\n"
// Per-axis weights: tab1y is indexed by row (i >> 2) and tab1x by column
// (i & 3), so tap i is weighted by tab1y[i >> 2] * tab1x[i & 3].
"float tab1y[4], tab1x[4];\n"
"float ayy = INTER_SCALE * ay;\n"
"float axx = INTER_SCALE * ax;\n"
"interpolateCubic(ayy, tab1y);\n"
"interpolateCubic(axx, tab1x);\n"
"int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);\n"
"WT sum = (WT)(0);\n"
// depth <= 4: weights are rounded to integers scaled by
// INTER_REMAP_COEF_SCALE and the sum is rounded on the final shift;
// otherwise the accumulation uses the float weights directly.
"#if depth <= 4\n"
"int itab[16];\n"
"#pragma unroll\n"
"for (int i = 0; i < 16; i++)\n"
"itab[i] = rint(tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE);\n"
"#pragma unroll\n"
"for (int i = 0; i < 16; i++)\n"
"sum += v[i] * itab[i];\n"
"storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);\n"
"#else\n"
"#pragma unroll\n"
"for (int i = 0; i < 16; i++)\n"
"sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];\n"
"storepix(convertToT( sum ), dstptr + dst_index);\n"
"#endif\n"
"}\n"
"}\n"
"#endif\n"
// Hex string stored after the source text (see the note at the top of this
// entry); it has to be regenerated together with the text above, never
// edited by hand.
, "1449b5059b082c4595846a86ed5702ad"};
// ProgramSource object constructed from the program text of the entry above.
ProgramSource warp_perspective_oclsrc(warp_perspective.programStr);
// Closes the three namespaces opened at the top of the file (cv, ocl, imgproc).
} }}