opencl_kernels_video.cpp - OpenGrok cross reference for /external/opencv3/modules/video/opencl_kernels

Lines Matching full:n
14 "#if CN==1\n"
15 "#define T_MEAN float\n"
16 "#define F_ZERO (0.0f)\n"
17 "#define cnMode 1\n"
18 "#define frameToMean(a, b) (b) = *(a);\n"
19 "#define meanToFrame(a, b) *b = convert_uchar_sat(a);\n"
20 "inline float sum(float val)\n"
21 "{\n"
22 "return val;\n"
23 "}\n"
24 "#else\n"
25 "#define T_MEAN float4\n"
26 "#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)\n"
27 "#define cnMode 4\n"
28 "#define meanToFrame(a, b)\\\n"
29 "b[0] = convert_uchar_sat(a.x); \\\n"
30 "b[1] = convert_uchar_sat(a.y); \\\n"
31 "b[2] = convert_uchar_sat(a.z);\n"
32 "#define frameToMean(a, b)\\\n"
33 "b.x = a[0]; \\\n"
34 "b.y = a[1]; \\\n"
35 "b.z = a[2]; \\\n"
36 "b.w = 0.0f;\n"
37 "inline float sum(const float4 val)\n"
38 "{\n"
39 "return (val.x + val.y + val.z);\n"
40 "}\n"
41 "#endif\n"
42 …nel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,\n"
43 "__global uchar* modesUsed,\n"
44 "__global uchar* weight,\n"
45 "__global uchar* mean,\n"
46 "__global uchar* variance,\n"
47 "__global uchar* fgmask, int fgmask_step, int fgmask_offset,\n"
48 "float alphaT, float alpha1, float prune,\n"
49 "float c_Tb, float c_TB, float c_Tg, float c_varMin,\n"
50 "float c_varMax, float c_varInit, float c_tau\n"
51 "#ifdef SHADOW_DETECT\n"
52 ", uchar c_shadowVal\n"
53 "#endif\n"
54 ")\n"
55 "{\n"
56 "int x = get_global_id(0);\n"
57 "int y = get_global_id(1);\n"
58 "if( x < frame_col && y < frame_row)\n"
59 "{\n"
60 "__global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));\n"
61 "T_MEAN pix;\n"
62 "frameToMean(_frame, pix);\n"
63 "uchar foreground = 255;\n"
64 "bool fitsPDF = false;\n"
65 "int pt_idx =  mad24(y, frame_col, x);\n"
66 "int idx_step = frame_row * frame_col;\n"
67 "__global uchar* _modesUsed = modesUsed + pt_idx;\n"
68 "uchar nmodes = _modesUsed[0];\n"
69 "float totalWeight = 0.0f;\n"
70 "__global float* _weight = (__global float*)(weight);\n"
71 "__global float* _variance = (__global float*)(variance);\n"
72 "__global T_MEAN* _mean = (__global T_MEAN*)(mean);\n"
73 "uchar mode = 0;\n"
74 "for (; mode < nmodes; ++mode)\n"
75 "{\n"
76 "int mode_idx = mad24(mode, idx_step, pt_idx);\n"
77 "float c_weight = mad(alpha1, _weight[mode_idx], prune);\n"
78 "float c_var = _variance[mode_idx];\n"
79 "T_MEAN c_mean = _mean[mode_idx];\n"
80 "T_MEAN diff = c_mean - pix;\n"
81 "float dist2 = dot(diff, diff);\n"
82 "if (totalWeight < c_TB && dist2 < c_Tb * c_var)\n"
83 "foreground = 0;\n"
84 "if (dist2 < c_Tg * c_var)\n"
85 "{\n"
86 "fitsPDF = true;\n"
87 "c_weight += alphaT;\n"
88 "float k = alphaT / c_weight;\n"
89 "T_MEAN mean_new = mad((T_MEAN)-k, diff, c_mean);\n"
90 "float variance_new  = clamp(mad(k, (dist2 - c_var), c_var), c_varMin, c_varMax);\n"
91 "for (int i = mode; i > 0; --i)\n"
92 "{\n"
93 "int prev_idx = mode_idx - idx_step;\n"
94 "if (c_weight < _weight[prev_idx])\n"
95 "break;\n"
96 "_weight[mode_idx]   = _weight[prev_idx];\n"
97 "_variance[mode_idx] = _variance[prev_idx];\n"
98 "_mean[mode_idx]     = _mean[prev_idx];\n"
99 "mode_idx = prev_idx;\n"
100 "}\n"
101 "_mean[mode_idx]     = mean_new;\n"
102 "_variance[mode_idx] = variance_new;\n"
103 "_weight[mode_idx]   = c_weight;\n"
104 "totalWeight += c_weight;\n"
105 "mode ++;\n"
106 "break;\n"
107 "}\n"
108 "if (c_weight < -prune)\n"
109 "c_weight = 0.0f;\n"
110 "_weight[mode_idx] = c_weight;\n"
111 "totalWeight += c_weight;\n"
112 "}\n"
113 "for (; mode < nmodes; ++mode)\n"
114 "{\n"
115 "int mode_idx = mad24(mode, idx_step, pt_idx);\n"
116 "float c_weight = mad(alpha1, _weight[mode_idx], prune);\n"
117 "if (c_weight < -prune)\n"
118 "{\n"
119 "c_weight = 0.0f;\n"
120 "nmodes = mode;\n"
121 "break;\n"
122 "}\n"
123 "_weight[mode_idx] = c_weight;\n"
124 "totalWeight += c_weight;\n"
125 "}\n"
126 "if (0.f < totalWeight)\n"
127 "{\n"
128 "totalWeight = 1.f / totalWeight;\n"
129 "for (int mode = 0; mode < nmodes; ++mode)\n"
130 "_weight[mad24(mode, idx_step, pt_idx)] *= totalWeight;\n"
131 "}\n"
132 "if (!fitsPDF)\n"
133 "{\n"
134 "uchar mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;\n"
135 "int mode_idx = mad24(mode, idx_step, pt_idx);\n"
136 "if (nmodes == 1)\n"
137 "_weight[mode_idx] = 1.f;\n"
138 "else\n"
139 "{\n"
140 "_weight[mode_idx] = alphaT;\n"
141 "for (int i = pt_idx; i < mode_idx; i += idx_step)\n"
142 "_weight[i] *= alpha1;\n"
143 "}\n"
144 "for (int i = nmodes - 1; i > 0; --i)\n"
145 "{\n"
146 "int prev_idx = mode_idx - idx_step;\n"
147 "if (alphaT < _weight[prev_idx])\n"
148 "break;\n"
149 "_weight[mode_idx]   = _weight[prev_idx];\n"
150 "_variance[mode_idx] = _variance[prev_idx];\n"
151 "_mean[mode_idx]     = _mean[prev_idx];\n"
152 "mode_idx = prev_idx;\n"
153 "}\n"
154 "_mean[mode_idx] = pix;\n"
155 "_variance[mode_idx] = c_varInit;\n"
156 "}\n"
157 "_modesUsed[0] = nmodes;\n"
158 "#ifdef SHADOW_DETECT\n"
159 "if (foreground)\n"
160 "{\n"
161 "float tWeight = 0.0f;\n"
162 "for (uchar mode = 0; mode < nmodes; ++mode)\n"
163 "{\n"
164 "int mode_idx = mad24(mode, idx_step, pt_idx);\n"
165 "T_MEAN c_mean = _mean[mode_idx];\n"
166 "T_MEAN pix_mean = pix * c_mean;\n"
167 "float numerator = sum(pix_mean);\n"
168 "float denominator = dot(c_mean, c_mean);\n"
169 "if (denominator == 0)\n"
170 "break;\n"
171 "if (numerator <= denominator && numerator >= c_tau * denominator)\n"
172 "{\n"
173 "float a = numerator / denominator;\n"
174 "T_MEAN dD = mad(a, c_mean, -pix);\n"
175 "if (dot(dD, dD) < c_Tb * _variance[mode_idx] * a * a)\n"
176 "{\n"
177 "foreground = c_shadowVal;\n"
178 "break;\n"
179 "}\n"
180 "}\n"
181 "tWeight += _weight[mode_idx];\n"
182 "if (tWeight > c_TB)\n"
183 "break;\n"
184 "}\n"
185 "}\n"
186 "#endif\n"
187 "__global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);\n"
188 "*_fgmask = (uchar)foreground;\n"
189 "}\n"
190 "}\n"
191 "__kernel void getBackgroundImage2_kernel(__global const uchar* modesUsed,\n"
192 "__global const uchar* weight,\n"
193 "__global const uchar* mean,\n"
194 "__global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col,\n"
195 "float c_TB)\n"
196 "{\n"
197 "int x = get_global_id(0);\n"
198 "int y = get_global_id(1);\n"
199 "if(x < dst_col && y < dst_row)\n"
200 "{\n"
201 "int pt_idx =  mad24(y, dst_col, x);\n"
202 "__global const uchar* _modesUsed = modesUsed + pt_idx;\n"
203 "uchar nmodes = _modesUsed[0];\n"
204 "T_MEAN meanVal = (T_MEAN)F_ZERO;\n"
205 "float totalWeight = 0.0f;\n"
206 "__global const float* _weight = (__global const float*)weight;\n"
207 "__global const T_MEAN* _mean = (__global const T_MEAN*)(mean);\n"
208 "int idx_step = dst_row * dst_col;\n"
209 "for (uchar mode = 0; mode < nmodes; ++mode)\n"
210 "{\n"
211 "int mode_idx = mad24(mode, idx_step, pt_idx);\n"
212 "float c_weight = _weight[mode_idx];\n"
213 "T_MEAN c_mean = _mean[mode_idx];\n"
214 "meanVal = mad(c_weight, c_mean, meanVal);\n"
215 "totalWeight += c_weight;\n"
216 "if (totalWeight > c_TB)\n"
217 "break;\n"
218 "}\n"
219 "if (0.f < totalWeight)\n"
220 "meanVal = meanVal / totalWeight;\n"
221 "else\n"
222 "meanVal = (T_MEAN)(0.f);\n"
223 "__global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));\n"
224 "meanToFrame(meanVal, _dst);\n"
225 "}\n"
226 "}\n"
230 "#define tx  (int)get_local_id(0)\n"
231 "#define ty  get_local_id(1)\n"
232 "#define bx  get_group_id(0)\n"
233 "#define bdx (int)get_local_size(0)\n"
234 "#define BORDER_SIZE 5\n"
235 "#define MAX_KSIZE_HALF 100\n"
236 "#ifndef polyN\n"
237 "#define polyN 5\n"
238 "#endif\n"
239 "#if USE_DOUBLE\n"
240 "#ifdef cl_amd_fp64\n"
241 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n"
242 "#elif defined (cl_khr_fp64)\n"
243 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n"
244 "#endif\n"
245 "#define TYPE double\n"
246 "#define VECTYPE double4\n"
247 "#else\n"
248 "#define TYPE float\n"
249 "#define VECTYPE float4\n"
250 "#endif\n"
251 "__kernel void polynomialExpansion(__global __const float * src, int srcStep,\n"
252 "__global float * dst, int dstStep,\n"
253 "const int rows, const  int cols,\n"
254 "__global __const float * c_g,\n"
255 "__global __const float * c_xg,\n"
256 "__global __const float * c_xxg,\n"
257 "__local float * smem,\n"
258 "const VECTYPE ig)\n"
259 "{\n"
260 "const int y = get_global_id(1);\n"
261 "const int x = bx * (bdx - 2*polyN) + tx - polyN;\n"
262 "int xWarped;\n"
263 "__local float *row = smem + tx;\n"
264 "if (y < rows && y >= 0)\n"
265 "{\n"
266 "xWarped = min(max(x, 0), cols - 1);\n"
267 "row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];\n"
268 "row[bdx] = 0.f;\n"
269 "row[2*bdx] = 0.f;\n"
270 "#pragma unroll\n"
271 "for (int k = 1; k <= polyN; ++k)\n"
272 "{\n"
273 "float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];\n"
274 "float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)];\n"
275 "row[0] += c_g[k] * (t0 + t1);\n"
276 "row[bdx] += c_xg[k] * (t1 - t0);\n"
277 "row[2*bdx] += c_xxg[k] * (t0 + t1);\n"
278 "}\n"
279 "}\n"
280 "barrier(CLK_LOCAL_MEM_FENCE);\n"
281 "if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols)\n"
282 "{\n"
283 "TYPE b1 = c_g[0] * row[0];\n"
284 "TYPE b3 = c_g[0] * row[bdx];\n"
285 "TYPE b5 = c_g[0] * row[2*bdx];\n"
286 "TYPE b2 = 0, b4 = 0, b6 = 0;\n"
287 "#pragma unroll\n"
288 "for (int k = 1; k <= polyN; ++k)\n"
289 "{\n"
290 "b1 += (row[k] + row[-k]) * c_g[k];\n"
291 "b4 += (row[k] + row[-k]) * c_xxg[k];\n"
292 "b2 += (row[k] - row[-k]) * c_xg[k];\n"
293 "b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];\n"
294 "b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];\n"
295 "b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];\n"
296 "}\n"
297 "dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0);\n"
298 "dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0);\n"
299 "dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2);\n"
300 "dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2);\n"
301 "dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3);\n"
302 "}\n"
303 "}\n"
304 "inline int idx_row_low(const int y, const int last_row)\n"
305 "{\n"
306 "return abs(y) % (last_row + 1);\n"
307 "}\n"
308 "inline int idx_row_high(const int y, const int last_row)\n"
309 "{\n"
310 "return abs(last_row - abs(last_row - y)) % (last_row + 1);\n"
311 "}\n"
312 "inline int idx_col_low(const int x, const int last_col)\n"
313 "{\n"
314 "return abs(x) % (last_col + 1);\n"
315 "}\n"
316 "inline int idx_col_high(const int x, const int last_col)\n"
317 "{\n"
318 "return abs(last_col - abs(last_col - x)) % (last_col + 1);\n"
319 "}\n"
320 "inline int idx_col(const int x, const int last_col)\n"
321 "{\n"
322 "return idx_col_low(idx_col_high(x, last_col), last_col);\n"
323 "}\n"
324 "__kernel void gaussianBlur(__global const float * src, int srcStep,\n"
325 "__global float * dst, int dstStep, const int rows, const  int cols,\n"
326 "__global const float * c_gKer, const int ksizeHalf,\n"
327 "__local float * smem)\n"
328 "{\n"
329 "const int y = get_global_id(1);\n"
330 "const int x = get_global_id(0);\n"
331 "__local float *row = smem + ty * (bdx + 2*ksizeHalf);\n"
332 "if (y < rows)\n"
333 "{\n"
334 "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n"
335 "{\n"
336 "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n"
337 "xExt = idx_col(xExt, cols - 1);\n"
338 "row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];\n"
339 "for (int j = 1; j <= ksizeHalf; ++j)\n"
340 "row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)]\n"
341 "+ src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n"
342 "}\n"
343 "}\n"
344 "barrier(CLK_LOCAL_MEM_FENCE);\n"
345 "if (y < rows && y >= 0 && x < cols && x >= 0)\n"
346 "{\n"
347 "row += tx + ksizeHalf;\n"
348 "float res = row[0] * c_gKer[0];\n"
349 "for (int i = 1; i <= ksizeHalf; ++i)\n"
350 "res += (row[-i] + row[i]) * c_gKer[i];\n"
351 "dst[mad24(y, dstStep, x)] = res;\n"
352 "}\n"
353 "}\n"
354 "__kernel void gaussianBlur5(__global const float * src, int srcStep,\n"
355 "__global float * dst, int dstStep,\n"
356 "const int rows, const  int cols,\n"
357 "__global const float * c_gKer, const int ksizeHalf,\n"
358 "__local float * smem)\n"
359 "{\n"
360 "const int y = get_global_id(1);\n"
361 "const int x = get_global_id(0);\n"
362 "const int smw = bdx + 2*ksizeHalf;\n"
363 "__local volatile float *row = smem + 5 * ty * smw;\n"
364 "if (y < rows)\n"
365 "{\n"
366 "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n"
367 "{\n"
368 "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n"
369 "xExt = idx_col(xExt, cols - 1);\n"
370 "#pragma unroll\n"
371 "for (int k = 0; k < 5; ++k)\n"
372 "row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0];\n"
373 "for (int j = 1; j <= ksizeHalf; ++j)\n"
374 "#pragma unroll\n"
375 "for (int k = 0; k < 5; ++k)\n"
376 "row[k*smw + i] +=\n"
377 "(src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] +\n"
378 "src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n"
379 "}\n"
380 "}\n"
381 "barrier(CLK_LOCAL_MEM_FENCE);\n"
382 "if (y < rows && y >= 0 && x < cols && x >= 0)\n"
383 "{\n"
384 "row += tx + ksizeHalf;\n"
385 "float res[5];\n"
386 "#pragma unroll\n"
387 "for (int k = 0; k < 5; ++k)\n"
388 "res[k] = row[k*smw] * c_gKer[0];\n"
389 "for (int i = 1; i <= ksizeHalf; ++i)\n"
390 "#pragma unroll\n"
391 "for (int k = 0; k < 5; ++k)\n"
392 "res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];\n"
393 "#pragma unroll\n"
394 "for (int k = 0; k < 5; ++k)\n"
395 "dst[mad24(k*rows + y, dstStep, x)] = res[k];\n"
396 "}\n"
397 "}\n"
398 "__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };\n"
399 "__kernel void updateMatrices(__global const float * flowx, int xStep,\n"
400 "__global const float * flowy, int yStep,\n"
401 "const int rows, const int cols,\n"
402 "__global const float * R0, int R0Step,\n"
403 "__global const float * R1, int R1Step,\n"
404 "__global float * M, int mStep)\n"
405 "{\n"
406 "const int y = get_global_id(1);\n"
407 "const int x = get_global_id(0);\n"
408 "if (y < rows && y >= 0 && x < cols && x >= 0)\n"
409 "{\n"
410 "float dx = flowx[mad24(y, xStep, x)];\n"
411 "float dy = flowy[mad24(y, yStep, x)];\n"
412 "float fx = x + dx;\n"
413 "float fy = y + dy;\n"
414 "int x1 = convert_int(floor(fx));\n"
415 "int y1 = convert_int(floor(fy));\n"
416 "fx -= x1;\n"
417 "fy -= y1;\n"
418 "float r2, r3, r4, r5, r6;\n"
419 "if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1)\n"
420 "{\n"
421 "float a00 = (1.f - fx) * (1.f - fy);\n"
422 "float a01 = fx * (1.f - fy);\n"
423 "float a10 = (1.f - fx) * fy;\n"
424 "float a11 = fx * fy;\n"
425 "r2 = a00 * R1[mad24(y1, R1Step, x1)] +\n"
426 "a01 * R1[mad24(y1, R1Step, x1 + 1)] +\n"
427 "a10 * R1[mad24(y1 + 1, R1Step, x1)] +\n"
428 "a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];\n"
429 "r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] +\n"
430 "a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] +\n"
431 "a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] +\n"
432 "a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)];\n"
433 "r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] +\n"
434 "a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] +\n"
435 "a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] +\n"
436 "a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)];\n"
437 "r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] +\n"
438 "a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] +\n"
439 "a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] +\n"
440 "a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)];\n"
441 "r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] +\n"
442 "a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] +\n"
443 "a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] +\n"
444 "a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)];\n"
445 "r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f;\n"
446 "r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f;\n"
447 "r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f;\n"
448 "}\n"
449 "else\n"
450 "{\n"
451 "r2 = r3 = 0.f;\n"
452 "r4 = R0[mad24(2*rows + y, R0Step, x)];\n"
453 "r5 = R0[mad24(3*rows + y, R0Step, x)];\n"
454 "r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f;\n"
455 "}\n"
456 "r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;\n"
457 "r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f;\n"
458 "r2 += r4*dy + r6*dx;\n"
459 "r3 += r6*dy + r5*dx;\n"
460 "float scale =\n"
461 "c_border[min(x, BORDER_SIZE)] *\n"
462 "c_border[min(y, BORDER_SIZE)] *\n"
463 "c_border[min(cols - x - 1, BORDER_SIZE)] *\n"
464 "c_border[min(rows - y - 1, BORDER_SIZE)];\n"
465 "r2 *= scale;\n"
466 "r3 *= scale;\n"
467 "r4 *= scale;\n"
468 "r5 *= scale;\n"
469 "r6 *= scale;\n"
470 "M[mad24(y, mStep, x)] = r4*r4 + r6*r6;\n"
471 "M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6;\n"
472 "M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6;\n"
473 "M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3;\n"
474 "M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3;\n"
475 "}\n"
476 "}\n"
477 "__kernel void boxFilter5(__global const float * src, int srcStep,\n"
478 "__global float * dst, int dstStep,\n"
479 "const int rows, const  int cols,\n"
480 "const int ksizeHalf,\n"
481 "__local float * smem)\n"
482 "{\n"
483 "const int y = get_global_id(1);\n"
484 "const int x = get_global_id(0);\n"
485 "const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));\n"
486 "const int smw = bdx + 2*ksizeHalf;\n"
487 "__local float *row = smem + 5 * ty * smw;\n"
488 "if (y < rows)\n"
489 "{\n"
490 "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n"
491 "{\n"
492 "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n"
493 "xExt = min(max(xExt, 0), cols - 1);\n"
494 "#pragma unroll\n"
495 "for (int k = 0; k < 5; ++k)\n"
496 "row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)];\n"
497 "for (int j = 1; j <= ksizeHalf; ++j)\n"
498 "#pragma unroll\n"
499 "for (int k = 0; k < 5; ++k)\n"
500 "row[k*smw + i] +=\n"
501 "src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] +\n"
502 "src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)];\n"
503 "}\n"
504 "}\n"
505 "barrier(CLK_LOCAL_MEM_FENCE);\n"
506 "if (y < rows && y >= 0 && x < cols && x >= 0)\n"
507 "{\n"
508 "row += tx + ksizeHalf;\n"
509 "float res[5];\n"
510 "#pragma unroll\n"
511 "for (int k = 0; k < 5; ++k)\n"
512 "res[k] = row[k*smw];\n"
513 "for (int i = 1; i <= ksizeHalf; ++i)\n"
514 "#pragma unroll\n"
515 "for (int k = 0; k < 5; ++k)\n"
516 "res[k] += row[k*smw - i] + row[k*smw + i];\n"
517 "#pragma unroll\n"
518 "for (int k = 0; k < 5; ++k)\n"
519 "dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv;\n"
520 "}\n"
521 "}\n"
522 "__kernel void updateFlow(__global const float * M, int mStep,\n"
523 "__global float * flowx, int xStep,\n"
524 "__global float * flowy, int yStep,\n"
525 "const int rows, const int cols)\n"
526 "{\n"
527 "const int y = get_global_id(1);\n"
528 "const int x = get_global_id(0);\n"
529 "if (y < rows && y >= 0 && x < cols && x >= 0)\n"
530 "{\n"
531 "float g11 = M[mad24(y, mStep, x)];\n"
532 "float g12 = M[mad24(rows + y, mStep, x)];\n"
533 "float g22 = M[mad24(2*rows + y, mStep, x)];\n"
534 "float h1 =  M[mad24(3*rows + y, mStep, x)];\n"
535 "float h2 =  M[mad24(4*rows + y, mStep, x)];\n"
536 "float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);\n"
537 "flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;\n"
538 "flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;\n"
539 "}\n"
540 "}\n"
544 …id centeredGradientKernel(__global const float* src_ptr, int src_col, int src_row, int src_step,\n"
545 "__global float* dx, __global float* dy, int d_step)\n"
546 "{\n"
547 "int x = get_global_id(0);\n"
548 "int y = get_global_id(1);\n"
549 "if((x < src_col)&&(y < src_row))\n"
550 "{\n"
551 "int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);\n"
552 "int src_x2 = (x - 1) > 0 ? (x -1) : 0;\n"
553 "dx[y * d_step+ x] = 0.5f * (src_ptr[y * src_step + src_x1] - src_ptr[y * src_step+ src_x2]);\n"
554 "int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);\n"
555 "int src_y2 = (y - 1) > 0 ? (y - 1) : 0;\n"
556 "dy[y * d_step+ x] = 0.5f * (src_ptr[src_y1 * src_step + x] - src_ptr[src_y2 * src_step+ x]);\n"
557 "}\n"
558 "}\n"
559 "inline float bicubicCoeff(float x_)\n"
560 "{\n"
561 "float x = fabs(x_);\n"
562 "if (x <= 1.0f)\n"
563 "return x * x * (1.5f * x - 2.5f) + 1.0f;\n"
564 "else if (x < 2.0f)\n"
565 "return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;\n"
566 "else\n"
567 "return 0.0f;\n"
568 "}\n"
569 "__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,\n"
570 "image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,\n"
571 "__global const float* u1, int u1_step,\n"
572 "__global const float* u2,\n"
573 "__global float* I1w,\n"
574 "__global float* I1wx, \n"
575 "__global float* I1wy, \n"
576 "__global float* grad, \n"
577 "__global float* rho,\n"
578 "int I1w_step,\n"
579 "int u2_step,\n"
580 "int u1_offset_x,\n"
581 "int u1_offset_y,\n"
582 "int u2_offset_x,\n"
583 "int u2_offset_y)\n"
584 "{\n"
585 "int x = get_global_id(0);\n"
586 "int y = get_global_id(1);\n"
587 "if(x < I0_col&&y < I0_row)\n"
588 "{\n"
589 "float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n"
590 "float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n"
591 "float wx = x + u1Val;\n"
592 "float wy = y + u2Val;\n"
593 "int xmin = ceil(wx - 2.0f);\n"
594 "int xmax = floor(wx + 2.0f);\n"
595 "int ymin = ceil(wy - 2.0f);\n"
596 "int ymax = floor(wy + 2.0f);\n"
597 "float sum  = 0.0f;\n"
598 "float sumx = 0.0f;\n"
599 "float sumy = 0.0f;\n"
600 "float wsum = 0.0f;\n"
601 …mpler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
602 "for (int cy = ymin; cy <= ymax; ++cy)\n"
603 "{\n"
604 "for (int cx = xmin; cx <= xmax; ++cx)\n"
605 "{\n"
606 "float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);\n"
607 "int2 cood = (int2)(cx, cy);\n"
608 "sum += w * read_imagef(tex_I1, sampleri, cood).x;\n"
609 "sumx += w * read_imagef(tex_I1x, sampleri, cood).x;\n"
610 "sumy += w * read_imagef(tex_I1y, sampleri, cood).x;\n"
611 "wsum += w;\n"
612 "}\n"
613 "}\n"
614 "float coeff = 1.0f / wsum;\n"
615 "float I1wVal  = sum  * coeff;\n"
616 "float I1wxVal = sumx * coeff;\n"
617 "float I1wyVal = sumy * coeff;\n"
618 "I1w[y * I1w_step + x]  = I1wVal;\n"
619 "I1wx[y * I1w_step + x] = I1wxVal;\n"
620 "I1wy[y * I1w_step + x] = I1wyVal;\n"
621 "float Ix2 = I1wxVal * I1wxVal;\n"
622 "float Iy2 = I1wyVal * I1wyVal;\n"
623 "grad[y * I1w_step + x] = Ix2 + Iy2;\n"
624 "float I0Val = I0[y * I0_step + x];\n"
625 "rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;\n"
626 "}\n"
627 "}\n"
628 … float readImage(__global float *image,  int x,  int y,  int rows,  int cols, int elemCntPerRow)\n"
629 "{\n"
630 "int i0 = clamp(x, 0, cols - 1);\n"
631 "int j0 = clamp(y, 0, rows - 1);\n"
632 "return image[j0 * elemCntPerRow + i0];\n"
633 "}\n"
634 … void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,\n"
635 "__global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,\n"
636 "__global const float* u1, int u1_step,\n"
637 "__global const float* u2,\n"
638 "__global float* I1w,\n"
639 "__global float* I1wx, \n"
640 "__global float* I1wy, \n"
641 "__global float* grad, \n"
642 "__global float* rho,\n"
643 "int I1w_step,\n"
644 "int u2_step,\n"
645 "int I1_step,\n"
646 "int I1x_step)\n"
647 "{\n"
648 "int x = get_global_id(0);\n"
649 "int y = get_global_id(1);\n"
650 "if(x < I0_col&&y < I0_row)\n"
651 "{\n"
652 "float u1Val = u1[y * u1_step + x];\n"
653 "float u2Val = u2[y * u2_step + x];\n"
654 "float wx = x + u1Val;\n"
655 "float wy = y + u2Val;\n"
656 "int xmin = ceil(wx - 2.0f);\n"
657 "int xmax = floor(wx + 2.0f);\n"
658 "int ymin = ceil(wy - 2.0f);\n"
659 "int ymax = floor(wy + 2.0f);\n"
660 "float sum  = 0.0f;\n"
661 "float sumx = 0.0f;\n"
662 "float sumy = 0.0f;\n"
663 "float wsum = 0.0f;\n"
664 "for (int cy = ymin; cy <= ymax; ++cy)\n"
665 "{\n"
666 "for (int cx = xmin; cx <= xmax; ++cx)\n"
667 "{\n"
668 "float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);\n"
669 "int2 cood = (int2)(cx, cy);\n"
670 "sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);\n"
671 "sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);\n"
672 "sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);\n"
673 "wsum += w;\n"
674 "}\n"
675 "}\n"
676 "float coeff = 1.0f / wsum;\n"
677 "float I1wVal  = sum  * coeff;\n"
678 "float I1wxVal = sumx * coeff;\n"
679 "float I1wyVal = sumy * coeff;\n"
680 "I1w[y * I1w_step + x]  = I1wVal;\n"
681 "I1wx[y * I1w_step + x] = I1wxVal;\n"
682 "I1wy[y * I1w_step + x] = I1wyVal;\n"
683 "float Ix2 = I1wxVal * I1wxVal;\n"
684 "float Iy2 = I1wyVal * I1wyVal;\n"
685 "grad[y * I1w_step + x] = Ix2 + Iy2;\n"
686 "float I0Val = I0[y * I0_step + x];\n"
687 "rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;\n"
688 "}\n"
689 "}\n"
690 … void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,\n"
691 "__global const float* u2,\n"
692 "__global float* p11, int p11_step,\n"
693 "__global float* p12,\n"
694 "__global float* p21,\n"
695 "__global float* p22,\n"
696 "float taut,\n"
697 "int u2_step,\n"
698 "int u1_offset_x,\n"
699 "int u1_offset_y,\n"
700 "int u2_offset_x,\n"
701 "int u2_offset_y)\n"
702 "{\n"
703 "int x = get_global_id(0);\n"
704 "int y = get_global_id(1);\n"
705 "if(x < u1_col && y < u1_row)\n"
706 "{\n"
707 "int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);\n"
708 …offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n"
709 "int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);\n"
710 …+ u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n"
711 "int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);\n"
712 …offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n"
713 "int src_y2 = (y + 1) <  (u1_row - 1) ? (y + 1) : (u1_row - 1);\n"
714 …+ u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n"
715 "float g1 = hypot(u1x, u1y);\n"
716 "float g2 = hypot(u2x, u2y);\n"
717 "float ng1 = 1.0f + taut * g1;\n"
718 "float ng2 = 1.0f + taut * g2;\n"
719 "p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;\n"
720 "p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;\n"
721 "p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;\n"
722 "p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;\n"
723 "}\n"
724 "}\n"
725 …ence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)\n"
726 "{\n"
727 "if (x > 0 && y > 0)\n"
728 "{\n"
729 "float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];\n"
730 "float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];\n"
731 "return v1x + v2y;\n"
732 "}\n"
733 "else\n"
734 "{\n"
735 "if (y > 0)\n"
736 "return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];\n"
737 "else\n"
738 "{\n"
739 "if (x > 0)\n"
740 "return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];\n"
741 "else\n"
742 "return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];\n"
743 "}\n"
744 "}\n"
745 "}\n"
746 …rnel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,\n"
747 "__global const float* I1wy, \n"
748 "__global const float* grad, \n"
749 "__global const float* rho_c, \n"
750 "__global const float* p11, \n"
751 "__global const float* p12, \n"
752 "__global const float* p21, \n"
753 "__global const float* p22, \n"
754 "__global float* u1, int u1_step,\n"
755 "__global float* u2,\n"
756 "__global float* error, float l_t, float theta, int u2_step,\n"
757 "int u1_offset_x,\n"
758 "int u1_offset_y,\n"
759 "int u2_offset_x,\n"
760 "int u2_offset_y,\n"
761 "char calc_error)\n"
762 "{\n"
763 "int x = get_global_id(0);\n"
764 "int y = get_global_id(1);\n"
765 "if(x < I1wx_col && y < I1wx_row)\n"
766 "{\n"
767 "float I1wxVal = I1wx[y * I1wx_step + x];\n"
768 "float I1wyVal = I1wy[y * I1wx_step + x];\n"
769 "float gradVal = grad[y * I1wx_step + x];\n"
770 "float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n"
771 "float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n"
772 "float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);\n"
773 "float d1 = 0.0f;\n"
774 "float d2 = 0.0f;\n"
775 "if (rho < -l_t * gradVal)\n"
776 "{\n"
777 "d1 = l_t * I1wxVal;\n"
778 "d2 = l_t * I1wyVal;\n"
779 "}\n"
780 "else if (rho > l_t * gradVal)\n"
781 "{\n"
782 "d1 = -l_t * I1wxVal;\n"
783 "d2 = -l_t * I1wyVal;\n"
784 "}\n"
785 "else if (gradVal > 1.192092896e-07f)\n"
786 "{\n"
787 "float fi = -rho / gradVal;\n"
788 "d1 = fi * I1wxVal;\n"
789 "d2 = fi * I1wyVal;\n"
790 "}\n"
791 "float v1 = u1OldVal + d1;\n"
792 "float v2 = u2OldVal + d2;\n"
793 "float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);\n"
794 "float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);\n"
795 "float u1NewVal = v1 + theta * div_p1;\n"
796 "float u2NewVal = v2 + theta * div_p2;\n"
797 "u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;\n"
798 "u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;\n"
799 "if(calc_error)\n"
800 "{\n"
801 "float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);\n"
802 "float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);\n"
803 "error[y * I1wx_step + x] = n1 + n2;\n"
804 "}\n"
805 "}\n"
806 "}\n"
810 "#define GRIDSIZE    3\n"
811 "#define LSx 8\n"
812 "#define LSy 8\n"
813 "#define LM_W (LSx*GRIDSIZE+2)\n"
814 "#define LM_H (LSy*GRIDSIZE+2)\n"
815 "#define BUFFER  (LSx*LSy)\n"
816 "#define BUFFER2 BUFFER>>1\n"
817 "#ifndef WAVE_SIZE\n"
818 "#define WAVE_SIZE 1\n"
819 "#endif\n"
820 "#ifdef CPU\n"
821 … val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)\n"
822 "{\n"
823 "smem1[tid] = val1;\n"
824 "smem2[tid] = val2;\n"
825 "smem3[tid] = val3;\n"
826 "barrier(CLK_LOCAL_MEM_FENCE);\n"
827 "for(int i = BUFFER2; i > 0; i >>= 1)\n"
828 "{\n"
829 "if(tid < i)\n"
830 "{\n"
831 "smem1[tid] += smem1[tid + i];\n"
832 "smem2[tid] += smem2[tid + i];\n"
833 "smem3[tid] += smem3[tid + i];\n"
834 "}\n"
835 "barrier(CLK_LOCAL_MEM_FENCE);\n"
836 "}\n"
837 "}\n"
838 …2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)\n"
839 "{\n"
840 "smem1[tid] = val1;\n"
841 "smem2[tid] = val2;\n"
842 "barrier(CLK_LOCAL_MEM_FENCE);\n"
843 "for(int i = BUFFER2; i > 0; i >>= 1)\n"
844 "{\n"
845 "if(tid < i)\n"
846 "{\n"
847 "smem1[tid] += smem1[tid + i];\n"
848 "smem2[tid] += smem2[tid + i];\n"
849 "}\n"
850 "barrier(CLK_LOCAL_MEM_FENCE);\n"
851 "}\n"
852 "}\n"
853 "inline void reduce1(float val1, volatile __local float* smem1, int tid)\n"
854 "{\n"
855 "smem1[tid] = val1;\n"
856 "barrier(CLK_LOCAL_MEM_FENCE);\n"
857 "for(int i = BUFFER2; i > 0; i >>= 1)\n"
858 "{\n"
859 "if(tid < i)\n"
860 "{\n"
861 "smem1[tid] += smem1[tid + i];\n"
862 "}\n"
863 "barrier(CLK_LOCAL_MEM_FENCE);\n"
864 "}\n"
865 "}\n"
866 "#else\n"
867 "inline void reduce3(float val1, float val2, float val3,\n"
868 …al volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)\n"
869 "{\n"
870 "smem1[tid] = val1;\n"
871 "smem2[tid] = val2;\n"
872 "smem3[tid] = val3;\n"
873 "barrier(CLK_LOCAL_MEM_FENCE);\n"
874 "if (tid < 32)\n"
875 "{\n"
876 "smem1[tid] += smem1[tid + 32];\n"
877 "smem2[tid] += smem2[tid + 32];\n"
878 "smem3[tid] += smem3[tid + 32];\n"
879 "#if WAVE_SIZE < 32\n"
880 "}\n"
881 "barrier(CLK_LOCAL_MEM_FENCE);\n"
882 "if (tid < 16)\n"
883 "{\n"
884 "#endif\n"
885 "smem1[tid] += smem1[tid + 16];\n"
886 "smem2[tid] += smem2[tid + 16];\n"
887 "smem3[tid] += smem3[tid + 16];\n"
888 "#if WAVE_SIZE <16\n"
889 "}\n"
890 "barrier(CLK_LOCAL_MEM_FENCE);\n"
891 "if (tid<1)\n"
892 "{\n"
893 "#endif\n"
894 "local float8* m1 = (local float8*)smem1;\n"
895 "local float8* m2 = (local float8*)smem2;\n"
896 "local float8* m3 = (local float8*)smem3;\n"
897 "float8 t1 = m1[0]+m1[1];\n"
898 "float8 t2 = m2[0]+m2[1];\n"
899 "float8 t3 = m3[0]+m3[1];\n"
900 "float4 t14 = t1.lo + t1.hi;\n"
901 "float4 t24 = t2.lo + t2.hi;\n"
902 "float4 t34 = t3.lo + t3.hi;\n"
903 "smem1[0] = t14.x+t14.y+t14.z+t14.w;\n"
904 "smem2[0] = t24.x+t24.y+t24.z+t24.w;\n"
905 "smem3[0] = t34.x+t34.y+t34.z+t34.w;\n"
906 "}\n"
907 "barrier(CLK_LOCAL_MEM_FENCE);\n"
908 "}\n"
909 …2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)\n"
910 "{\n"
911 "smem1[tid] = val1;\n"
912 "smem2[tid] = val2;\n"
913 "barrier(CLK_LOCAL_MEM_FENCE);\n"
914 "if (tid < 32)\n"
915 "{\n"
916 "smem1[tid] += smem1[tid + 32];\n"
917 "smem2[tid] += smem2[tid + 32];\n"
918 "#if WAVE_SIZE < 32\n"
919 "}\n"
920 "barrier(CLK_LOCAL_MEM_FENCE);\n"
921 "if (tid < 16)\n"
922 "{\n"
923 "#endif\n"
924 "smem1[tid] += smem1[tid + 16];\n"
925 "smem2[tid] += smem2[tid + 16];\n"
926 "#if WAVE_SIZE <16\n"
927 "}\n"
928 "barrier(CLK_LOCAL_MEM_FENCE);\n"
929 "if (tid<1)\n"
930 "{\n"
931 "#endif\n"
932 "local float8* m1 = (local float8*)smem1;\n"
933 "local float8* m2 = (local float8*)smem2;\n"
934 "float8 t1 = m1[0]+m1[1];\n"
935 "float8 t2 = m2[0]+m2[1];\n"
936 "float4 t14 = t1.lo + t1.hi;\n"
937 "float4 t24 = t2.lo + t2.hi;\n"
938 "smem1[0] = t14.x+t14.y+t14.z+t14.w;\n"
939 "smem2[0] = t24.x+t24.y+t24.z+t24.w;\n"
940 "}\n"
941 "barrier(CLK_LOCAL_MEM_FENCE);\n"
942 "}\n"
943 "inline void reduce1(float val1, __local volatile float* smem1, int tid)\n"
944 "{\n"
945 "smem1[tid] = val1;\n"
946 "barrier(CLK_LOCAL_MEM_FENCE);\n"
947 "if (tid < 32)\n"
948 "{\n"
949 "smem1[tid] += smem1[tid + 32];\n"
950 "#if WAVE_SIZE < 32\n"
951 "}\n"
952 "barrier(CLK_LOCAL_MEM_FENCE);\n"
953 "if (tid < 16)\n"
954 "{\n"
955 "#endif\n"
956 "smem1[tid] += smem1[tid + 16];\n"
957 "#if WAVE_SIZE <16\n"
958 "}\n"
959 "barrier(CLK_LOCAL_MEM_FENCE);\n"
960 "if (tid<1)\n"
961 "{\n"
962 "#endif\n"
963 "local float8* m1 = (local float8*)smem1;\n"
964 "float8 t1 = m1[0]+m1[1];\n"
965 "float4 t14 = t1.lo + t1.hi;\n"
966 "smem1[0] = t14.x+t14.y+t14.z+t14.w;\n"
967 "}\n"
968 "barrier(CLK_LOCAL_MEM_FENCE);\n"
969 "}\n"
970 "#endif\n"
971 "#define SCALE (1.0f / (1 << 20))\n"
972 "#define  THRESHOLD  0.01f\n"
973 …pler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
974 …fine VAL(_y,_x,_yy,_xx)    (IPatchLocal[(yid+((_y)*LSy)+1+(_yy))*LM_W+(xid+((_x)*LSx)+1+(_xx))])\n"
975 "inline void SetPatch(local float* IPatchLocal, int TileY, int TileX,\n"
976 "float* Pch, float* Dx, float* Dy,\n"
977 "float* A11, float* A12, float* A22, float w)\n"
978 "{\n"
979 "unsigned int xid=get_local_id(0);\n"
980 "unsigned int yid=get_local_id(1);\n"
981 "*Pch = VAL(TileY,TileX,0,0);\n"
982 …eX,+1,1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,0,-1)+3.0f*VAL(TileY,TileX,+1,-1));\n"
983 …eX,1,+1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,-1,0)+3.0f*VAL(TileY,TileX,-1,+1));\n"
984 "dIdx *= w;\n"
985 "dIdy *= w;\n"
986 "*Dx = dIdx;\n"
987 "*Dy = dIdy;\n"
988 "*A11 += dIdx * dIdx;\n"
989 "*A12 += dIdx * dIdy;\n"
990 "*A22 += dIdy * dIdy;\n"
991 "}\n"
992 "#undef VAL\n"
993 "inline void GetPatch(image2d_t J, float x, float y,\n"
994 "float* Pch, float* Dx, float* Dy,\n"
995 "float* b1, float* b2)\n"
996 "{\n"
997 "float J_val = read_imagef(J, sampler, (float2)(x, y)).x;\n"
998 "float diff = (J_val - *Pch) * 32.0f;\n"
999 "*b1 += diff**Dx;\n"
1000 "*b2 += diff**Dy;\n"
1001 "}\n"
1002 "inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)\n"
1003 "{\n"
1004 "float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;\n"
1005 "*errval += fabs(diff);\n"
1006 "}\n"
1007 …imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x;\n"
1008 "void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal)\n"
1009 "{\n"
1010 "unsigned int xid=get_local_id(0);\n"
1011 "unsigned int yid=get_local_id(1);\n"
1012 "READI(0,0);READI(0,1);READI(0,2);\n"
1013 "READI(1,0);READI(1,1);READI(1,2);\n"
1014 "READI(2,0);READI(2,1);READI(2,2);\n"
1015 "if(xid<2)\n"
1016 "{\n"
1017 "READI(0,3);\n"
1018 "READI(1,3);\n"
1019 "READI(2,3);\n"
1020 "}\n"
1021 "if(yid<2)\n"
1022 "{\n"
1023 "READI(3,0);READI(3,1);READI(3,2);\n"
1024 "}\n"
1025 "if(yid<2 && xid<2)\n"
1026 "{\n"
1027 "READI(3,3);\n"
1028 "}\n"
1029 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1030 "}\n"
1031 "#undef READI\n"
1032 "__attribute__((reqd_work_group_size(LSx, LSy, 1)))\n"
1033 "__kernel void lkSparse(image2d_t I, image2d_t J,\n"
1034 …al const float2* prevPts, __global float2* nextPts, __global uchar* status, __global float* err,\n"
1035 …int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)\n"
1036 "{\n"
1037 "__local float smem1[BUFFER];\n"
1038 "__local float smem2[BUFFER];\n"
1039 "__local float smem3[BUFFER];\n"
1040 "unsigned int xid=get_local_id(0);\n"
1041 "unsigned int yid=get_local_id(1);\n"
1042 "unsigned int gid=get_group_id(0);\n"
1043 "unsigned int xsize=get_local_size(0);\n"
1044 "unsigned int ysize=get_local_size(1);\n"
1045 "int xBase, yBase, k;\n"
1046 "float wx = ((xid+2*xsize)<c_winSize_x)?1:0;\n"
1047 "float wy = ((yid+2*ysize)<c_winSize_y)?1:0;\n"
1048 "float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);\n"
1049 "const int tid = mad24(yid, xsize, xid);\n"
1050 "float2 prevPt = prevPts[gid] / (float2)(1 << level);\n"
1051 "if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)\n"
1052 "{\n"
1053 "if (tid == 0 && level == 0)\n"
1054 "{\n"
1055 "status[gid] = 0;\n"
1056 "}\n"
1057 "return;\n"
1058 "}\n"
1059 "prevPt -= c_halfWin;\n"
1060 "float A11 = 0;\n"
1061 "float A12 = 0;\n"
1062 "float A22 = 0;\n"
1063 "float I_patch[GRIDSIZE][GRIDSIZE];\n"
1064 "float dIdx_patch[GRIDSIZE][GRIDSIZE];\n"
1065 "float dIdy_patch[GRIDSIZE][GRIDSIZE];\n"
1066 "local float IPatchLocal[LM_W*LM_H];\n"
1067 "ReadPatchIToLocalMem(I,prevPt,IPatchLocal);\n"
1068 "{\n"
1069 "SetPatch(IPatchLocal, 0, 0,\n"
1070 "&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n"
1071 "&A11, &A12, &A22,1);\n"
1072 "SetPatch(IPatchLocal, 0, 1,\n"
1073 "&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n"
1074 "&A11, &A12, &A22,1);\n"
1075 "SetPatch(IPatchLocal, 0, 2,\n"
1076 "&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n"
1077 "&A11, &A12, &A22,wx);\n"
1078 "}\n"
1079 "{\n"
1080 "SetPatch(IPatchLocal, 1, 0,\n"
1081 "&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n"
1082 "&A11, &A12, &A22,1);\n"
1083 "SetPatch(IPatchLocal, 1,1,\n"
1084 "&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n"
1085 "&A11, &A12, &A22,1);\n"
1086 "SetPatch(IPatchLocal, 1,2,\n"
1087 "&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n"
1088 "&A11, &A12, &A22,wx);\n"
1089 "}\n"
1090 "{\n"
1091 "SetPatch(IPatchLocal, 2,0,\n"
1092 "&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n"
1093 "&A11, &A12, &A22,wy);\n"
1094 "SetPatch(IPatchLocal, 2,1,\n"
1095 "&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n"
1096 "&A11, &A12, &A22,wy);\n"
1097 "SetPatch(IPatchLocal, 2,2,\n"
1098 "&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n"
1099 "&A11, &A12, &A22,wx*wy);\n"
1100 "}\n"
1101 "reduce3(A11, A12, A22, smem1, smem2, smem3, tid);\n"
1102 "A11 = smem1[0];\n"
1103 "A12 = smem2[0];\n"
1104 "A22 = smem3[0];\n"
1105 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1106 "float D = A11 * A22 - A12 * A12;\n"
1107 "if (D < 1.192092896e-07f)\n"
1108 "{\n"
1109 "if (tid == 0 && level == 0)\n"
1110 "status[gid] = 0;\n"
1111 "return;\n"
1112 "}\n"
1113 "A11 /= D;\n"
1114 "A12 /= D;\n"
1115 "A22 /= D;\n"
1116 "prevPt = nextPts[gid] * 2.0f - c_halfWin;\n"
1117 "for (k = 0; k < c_iters; ++k)\n"
1118 "{\n"
1119 "if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)\n"
1120 "{\n"
1121 "if (tid == 0 && level == 0)\n"
1122 "status[gid] = 0;\n"
1123 "break;\n"
1124 "}\n"
1125 "float b1 = 0;\n"
1126 "float b2 = 0;\n"
1127 "yBase=yid;\n"
1128 "{\n"
1129 "xBase=xid;\n"
1130 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1131 "&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n"
1132 "&b1, &b2);\n"
1133 "xBase+=xsize;\n"
1134 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1135 "&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n"
1136 "&b1, &b2);\n"
1137 "xBase+=xsize;\n"
1138 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1139 "&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n"
1140 "&b1, &b2);\n"
1141 "}\n"
1142 "yBase+=ysize;\n"
1143 "{\n"
1144 "xBase=xid;\n"
1145 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1146 "&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n"
1147 "&b1, &b2);\n"
1148 "xBase+=xsize;\n"
1149 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1150 "&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n"
1151 "&b1, &b2);\n"
1152 "xBase+=xsize;\n"
1153 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1154 "&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n"
1155 "&b1, &b2);\n"
1156 "}\n"
1157 "yBase+=ysize;\n"
1158 "{\n"
1159 "xBase=xid;\n"
1160 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1161 "&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n"
1162 "&b1, &b2);\n"
1163 "xBase+=xsize;\n"
1164 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1165 "&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n"
1166 "&b1, &b2);\n"
1167 "xBase+=xsize;\n"
1168 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1169 "&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n"
1170 "&b1, &b2);\n"
1171 "}\n"
1172 "reduce2(b1, b2, smem1, smem2, tid);\n"
1173 "b1 = smem1[0];\n"
1174 "b2 = smem2[0];\n"
1175 "barrier(CLK_LOCAL_MEM_FENCE);\n"
1176 "float2 delta;\n"
1177 "delta.x = A12 * b2 - A22 * b1;\n"
1178 "delta.y = A12 * b1 - A11 * b2;\n"
1179 "prevPt += delta;\n"
1180 "if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)\n"
1181 "break;\n"
1182 "}\n"
1183 "D = 0.0f;\n"
1184 "if (calcErr)\n"
1185 "{\n"
1186 "yBase=yid;\n"
1187 "{\n"
1188 "xBase=xid;\n"
1189 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1190 "&I_patch[0][0], &D);\n"
1191 "xBase+=xsize;\n"
1192 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1193 "&I_patch[0][1], &D);\n"
1194 "xBase+=xsize;\n"
1195 "if(xBase<c_winSize_x)\n"
1196 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1197 "&I_patch[0][2], &D);\n"
1198 "}\n"
1199 "yBase+=ysize;\n"
1200 "{\n"
1201 "xBase=xid;\n"
1202 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1203 "&I_patch[1][0], &D);\n"
1204 "xBase+=xsize;\n"
1205 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1206 "&I_patch[1][1], &D);\n"
1207 "xBase+=xsize;\n"
1208 "if(xBase<c_winSize_x)\n"
1209 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1210 "&I_patch[1][2], &D);\n"
1211 "}\n"
1212 "yBase+=ysize;\n"
1213 "if(yBase<c_winSize_y)\n"
1214 "{\n"
1215 "xBase=xid;\n"
1216 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1217 "&I_patch[2][0], &D);\n"
1218 "xBase+=xsize;\n"
1219 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1220 "&I_patch[2][1], &D);\n"
1221 "xBase+=xsize;\n"
1222 "if(xBase<c_winSize_x)\n"
1223 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n"
1224 "&I_patch[2][2], &D);\n"
1225 "}\n"
1226 "reduce1(D, smem1, tid);\n"
1227 "}\n"
1228 "if (tid == 0)\n"
1229 "{\n"
1230 "prevPt += c_halfWin;\n"
1231 "nextPts[gid] = prevPt;\n"
1232 "if (calcErr)\n"
1233 "err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);\n"
1234 "}\n"
1235 "}\n"