Lines Matching full:n
14 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable\n"
15 "#define MAX_FLOAT 3.40282e+038f\n"
16 "#ifndef T\n"
17 "#define T float\n"
18 "#endif\n"
19 "#ifndef BLOCK_SIZE\n"
20 "#define BLOCK_SIZE 16\n"
21 "#endif\n"
22 "#ifndef MAX_DESC_LEN\n"
23 "#define MAX_DESC_LEN 64\n"
24 "#endif\n"
25 "#define BLOCK_SIZE_ODD (BLOCK_SIZE + 1)\n"
26 "#ifndef SHARED_MEM_SZ\n"
27 "# if (BLOCK_SIZE < MAX_DESC_LEN)\n"
28 "# define SHARED_MEM_SZ (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))\n"
29 "# else\n"
30 "# define SHARED_MEM_SZ (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)\n"
31 "# endif\n"
32 "#endif\n"
33 "#ifndef DIST_TYPE\n"
34 "#define DIST_TYPE 2\n"
35 "#endif\n"
36 "#if (DIST_TYPE == 2)\n"
37 "# ifdef T_FLOAT\n"
38 "typedef float result_type;\n"
39 "# if (8 == kercn)\n"
40 "typedef float8 value_type;\n"
41 …alue_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}\n"
42 "# elif (4 == kercn)\n"
43 "typedef float4 value_type;\n"
44 … define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}\n"
45 "# else\n"
46 "typedef float value_type;\n"
47 "# define DIST(x, y) result += fabs((x) - (y))\n"
48 "# endif\n"
49 "# else\n"
50 "typedef int result_type;\n"
51 "# if (8 == kercn)\n"
52 "typedef int8 value_type;\n"
53 …value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}\n"
54 "# elif (4 == kercn)\n"
55 "typedef int4 value_type;\n"
56 … define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}\n"
57 "# else\n"
58 "typedef int value_type;\n"
59 "# define DIST(x, y) result += abs((x) - (y))\n"
60 "# endif\n"
61 "# endif\n"
62 "# define DIST_RES(x) (x)\n"
63 "#elif (DIST_TYPE == 4)\n"
64 "typedef float result_type;\n"
65 "# if (8 == kercn)\n"
66 "typedef float8 value_type;\n"
67 …T(x, y) {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}\n"
68 "# elif (4 == kercn)\n"
69 "typedef float4 value_type;\n"
70 "# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d, d);}\n"
71 "# else\n"
72 "typedef float value_type;\n"
73 "# define DIST(x, y) {value_type d = ((x) - (y)); result = mad(d, d, result);}\n"
74 "# endif\n"
75 "# define DIST_RES(x) sqrt(x)\n"
76 "#elif (DIST_TYPE == 6)\n"
77 "# if (8 == kercn)\n"
78 "typedef int8 value_type;\n"
79 "# elif (4 == kercn)\n"
80 "typedef int4 value_type;\n"
81 "# else\n"
82 "typedef int value_type;\n"
83 "# endif\n"
84 "typedef int result_type;\n"
85 "# define DIST(x, y) result += popcount( (x) ^ (y) )\n"
86 "# define DIST_RES(x) (x)\n"
87 "#endif\n"
88 "inline result_type reduce_block(\n"
89 "__local value_type *s_query,\n"
90 "__local value_type *s_train,\n"
91 "int lidx,\n"
92 "int lidy\n"
93 ")\n"
94 "{\n"
95 "result_type result = 0;\n"
96 "#pragma unroll\n"
97 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n"
98 "{\n"
99 "DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);\n"
100 "}\n"
101 "return DIST_RES(result);\n"
102 "}\n"
103 "inline result_type reduce_block_match(\n"
104 "__local value_type *s_query,\n"
105 "__local value_type *s_train,\n"
106 "int lidx,\n"
107 "int lidy\n"
108 ")\n"
109 "{\n"
110 "result_type result = 0;\n"
111 "#pragma unroll\n"
112 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n"
113 "{\n"
114 "DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);\n"
115 "}\n"
116 "return result;\n"
117 "}\n"
118 "inline result_type reduce_multi_block(\n"
119 "__local value_type *s_query,\n"
120 "__local value_type *s_train,\n"
121 "int block_index,\n"
122 "int lidx,\n"
123 "int lidy\n"
124 ")\n"
125 "{\n"
126 "result_type result = 0;\n"
127 "#pragma unroll\n"
128 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n"
129 "{\n"
130 …ST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);\n"
131 "}\n"
132 "return result;\n"
133 "}\n"
134 "__kernel void BruteForceMatch_Match(\n"
135 "__global T *query,\n"
136 "__global T *train,\n"
137 "__global int *bestTrainIdx,\n"
138 "__global float *bestDistance,\n"
139 "int query_rows,\n"
140 "int query_cols,\n"
141 "int train_rows,\n"
142 "int train_cols,\n"
143 "int step\n"
144 ")\n"
145 "{\n"
146 "const int lidx = get_local_id(0);\n"
147 "const int lidy = get_local_id(1);\n"
148 "const int groupidx = get_group_id(0);\n"
149 "const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);\n"
150 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n"
151 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n"
152 "query_cols /= kercn;\n"
153 "__local float sharebuffer[SHARED_MEM_SZ];\n"
154 "__local value_type *s_query = (__local value_type *)sharebuffer;\n"
155 "#if 0 < MAX_DESC_LEN\n"
156 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;\n"
157 "#pragma unroll\n"
158 "for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)\n"
159 "{\n"
160 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
161 "s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;\n"
162 "}\n"
163 "#else\n"
164 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n"
165 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n"
166 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n"
167 "#endif\n"
168 "float myBestDistance = MAX_FLOAT;\n"
169 "int myBestTrainIdx = -1;\n"
170 "for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)\n"
171 "{\n"
172 "result_type result = 0;\n"
173 "const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;\n"
174 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n"
175 "#if 0 < MAX_DESC_LEN\n"
176 "#pragma unroll\n"
177 "for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)\n"
178 "{\n"
179 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
180 "s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;\n"
181 "barrier(CLK_LOCAL_MEM_FENCE);\n"
182 "result += reduce_multi_block(s_query, s_train, i, lidx, lidy);\n"
183 "barrier(CLK_LOCAL_MEM_FENCE);\n"
184 "}\n"
185 "#else\n"
186 "for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)\n"
187 "{\n"
188 "const int loadx = mad24(i, BLOCK_SIZE, lidx);\n"
189 "if (loadx < query_cols)\n"
190 "{\n"
191 "s_query[s_query_i] = query_vec[loadx];\n"
192 "s_train[s_train_i] = train_vec[loadx];\n"
193 "}\n"
194 "else\n"
195 "{\n"
196 "s_query[s_query_i] = 0;\n"
197 "s_train[s_train_i] = 0;\n"
198 "}\n"
199 "barrier(CLK_LOCAL_MEM_FENCE);\n"
200 "result += reduce_block_match(s_query, s_train, lidx, lidy);\n"
201 "barrier(CLK_LOCAL_MEM_FENCE);\n"
202 "}\n"
203 "#endif\n"
204 "result = DIST_RES(result);\n"
205 "const int trainIdx = mad24(BLOCK_SIZE, t, lidx);\n"
206 "if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance )\n"
207 "{\n"
208 "myBestDistance = result;\n"
209 "myBestTrainIdx = trainIdx;\n"
210 "}\n"
211 "}\n"
212 "barrier(CLK_LOCAL_MEM_FENCE);\n"
213 "__local float *s_distance = (__local float *)sharebuffer;\n"
214 "__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);\n"
215 "s_distance += lidy * BLOCK_SIZE_ODD;\n"
216 "s_trainIdx += lidy * BLOCK_SIZE_ODD;\n"
217 "s_distance[lidx] = myBestDistance;\n"
218 "s_trainIdx[lidx] = myBestTrainIdx;\n"
219 "barrier(CLK_LOCAL_MEM_FENCE);\n"
220 "#pragma unroll\n"
221 "for (int k = 0 ; k < BLOCK_SIZE; k++)\n"
222 "{\n"
223 "if (myBestDistance > s_distance[k])\n"
224 "{\n"
225 "myBestDistance = s_distance[k];\n"
226 "myBestTrainIdx = s_trainIdx[k];\n"
227 "}\n"
228 "}\n"
229 "if (queryIdx < query_rows && lidx == 0)\n"
230 "{\n"
231 "bestTrainIdx[queryIdx] = myBestTrainIdx;\n"
232 "bestDistance[queryIdx] = myBestDistance;\n"
233 "}\n"
234 "}\n"
235 "__kernel void BruteForceMatch_RadiusMatch(\n"
236 "__global T *query,\n"
237 "__global T *train,\n"
238 "float maxDistance,\n"
239 "__global int *bestTrainIdx,\n"
240 "__global float *bestDistance,\n"
241 "__global int *nMatches,\n"
242 "int query_rows,\n"
243 "int query_cols,\n"
244 "int train_rows,\n"
245 "int train_cols,\n"
246 "int bestTrainIdx_cols,\n"
247 "int step,\n"
248 "int ostep\n"
249 ")\n"
250 "{\n"
251 "const int lidx = get_local_id(0);\n"
252 "const int lidy = get_local_id(1);\n"
253 "const int groupidx = get_group_id(0);\n"
254 "const int groupidy = get_group_id(1);\n"
255 "const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);\n"
256 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n"
257 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n"
258 "const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);\n"
259 "const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;\n"
260 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n"
261 "query_cols /= kercn;\n"
262 "__local float sharebuffer[SHARED_MEM_SZ];\n"
263 "__local value_type *s_query = (__local value_type *)sharebuffer;\n"
264 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n"
265 "result_type result = 0;\n"
266 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n"
267 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n"
268 "for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)\n"
269 "{\n"
270 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
271 "if (loadx < query_cols)\n"
272 "{\n"
273 "s_query[s_query_i] = query_vec[loadx];\n"
274 "s_train[s_train_i] = train_vec[loadx];\n"
275 "}\n"
276 "else\n"
277 "{\n"
278 "s_query[s_query_i] = 0;\n"
279 "s_train[s_train_i] = 0;\n"
280 "}\n"
281 "barrier(CLK_LOCAL_MEM_FENCE);\n"
282 "result += reduce_block(s_query, s_train, lidx, lidy);\n"
283 "barrier(CLK_LOCAL_MEM_FENCE);\n"
284 "}\n"
285 "if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)\n"
286 "{\n"
287 "int ind = atom_inc(nMatches + queryIdx);\n"
288 "if(ind < bestTrainIdx_cols)\n"
289 "{\n"
290 "bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;\n"
291 "bestDistance[mad24(queryIdx, ostep, ind)] = result;\n"
292 "}\n"
293 "}\n"
294 "}\n"
295 "__kernel void BruteForceMatch_knnMatch(\n"
296 "__global T *query,\n"
297 "__global T *train,\n"
298 "__global int2 *bestTrainIdx,\n"
299 "__global float2 *bestDistance,\n"
300 "int query_rows,\n"
301 "int query_cols,\n"
302 "int train_rows,\n"
303 "int train_cols,\n"
304 "int step\n"
305 ")\n"
306 "{\n"
307 "const int lidx = get_local_id(0);\n"
308 "const int lidy = get_local_id(1);\n"
309 "const int groupidx = get_group_id(0);\n"
310 "const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);\n"
311 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n"
312 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n"
313 "query_cols /= kercn;\n"
314 "__local float sharebuffer[SHARED_MEM_SZ];\n"
315 "__local value_type *s_query = (__local value_type *)sharebuffer;\n"
316 "#if 0 < MAX_DESC_LEN\n"
317 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;\n"
318 "#pragma unroll\n"
319 "for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)\n"
320 "{\n"
321 "int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
322 "s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;\n"
323 "}\n"
324 "#else\n"
325 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n"
326 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n"
327 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n"
328 "#endif\n"
329 "float myBestDistance1 = MAX_FLOAT;\n"
330 "float myBestDistance2 = MAX_FLOAT;\n"
331 "int myBestTrainIdx1 = -1;\n"
332 "int myBestTrainIdx2 = -1;\n"
333 "for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)\n"
334 "{\n"
335 "result_type result = 0;\n"
336 "int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;\n"
337 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n"
338 "#if 0 < MAX_DESC_LEN\n"
339 "#pragma unroll\n"
340 "for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)\n"
341 "{\n"
342 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
343 "s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;\n"
344 "barrier(CLK_LOCAL_MEM_FENCE);\n"
345 "result += reduce_multi_block(s_query, s_train, i, lidx, lidy);\n"
346 "barrier(CLK_LOCAL_MEM_FENCE);\n"
347 "}\n"
348 "#else\n"
349 "for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)\n"
350 "{\n"
351 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
352 "if (loadx < query_cols)\n"
353 "{\n"
354 "s_query[s_query_i] = query_vec[loadx];\n"
355 "s_train[s_train_i] = train_vec[loadx];\n"
356 "}\n"
357 "else\n"
358 "{\n"
359 "s_query[s_query_i] = 0;\n"
360 "s_train[s_train_i] = 0;\n"
361 "}\n"
362 "barrier(CLK_LOCAL_MEM_FENCE);\n"
363 "result += reduce_block_match(s_query, s_train, lidx, lidy);\n"
364 "barrier(CLK_LOCAL_MEM_FENCE);\n"
365 "}\n"
366 "#endif\n"
367 "result = DIST_RES(result);\n"
368 "const int trainIdx = mad24(BLOCK_SIZE, t, lidx);\n"
369 "if (queryIdx < query_rows && trainIdx < train_rows)\n"
370 "{\n"
371 "if (result < myBestDistance1)\n"
372 "{\n"
373 "myBestDistance2 = myBestDistance1;\n"
374 "myBestTrainIdx2 = myBestTrainIdx1;\n"
375 "myBestDistance1 = result;\n"
376 "myBestTrainIdx1 = trainIdx;\n"
377 "}\n"
378 "else if (result < myBestDistance2)\n"
379 "{\n"
380 "myBestDistance2 = result;\n"
381 "myBestTrainIdx2 = trainIdx;\n"
382 "}\n"
383 "}\n"
384 "}\n"
385 "barrier(CLK_LOCAL_MEM_FENCE);\n"
386 "__local float *s_distance = (__local float *)sharebuffer;\n"
387 "__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);\n"
388 "s_distance += lidy * BLOCK_SIZE_ODD;\n"
389 "s_trainIdx += lidy * BLOCK_SIZE_ODD;\n"
390 "s_distance[lidx] = myBestDistance1;\n"
391 "s_trainIdx[lidx] = myBestTrainIdx1;\n"
392 "float bestDistance1 = MAX_FLOAT;\n"
393 "float bestDistance2 = MAX_FLOAT;\n"
394 "int bestTrainIdx1 = -1;\n"
395 "int bestTrainIdx2 = -1;\n"
396 "barrier(CLK_LOCAL_MEM_FENCE);\n"
397 "if (lidx == 0)\n"
398 "{\n"
399 "for (int i = 0 ; i < BLOCK_SIZE ; i++)\n"
400 "{\n"
401 "float val = s_distance[i];\n"
402 "if (val < bestDistance1)\n"
403 "{\n"
404 "bestDistance2 = bestDistance1;\n"
405 "bestTrainIdx2 = bestTrainIdx1;\n"
406 "bestDistance1 = val;\n"
407 "bestTrainIdx1 = s_trainIdx[i];\n"
408 "}\n"
409 "else if (val < bestDistance2)\n"
410 "{\n"
411 "bestDistance2 = val;\n"
412 "bestTrainIdx2 = s_trainIdx[i];\n"
413 "}\n"
414 "}\n"
415 "}\n"
416 "barrier(CLK_LOCAL_MEM_FENCE);\n"
417 "s_distance[lidx] = myBestDistance2;\n"
418 "s_trainIdx[lidx] = myBestTrainIdx2;\n"
419 "barrier(CLK_LOCAL_MEM_FENCE);\n"
420 "if (lidx == 0)\n"
421 "{\n"
422 "for (int i = 0 ; i < BLOCK_SIZE ; i++)\n"
423 "{\n"
424 "float val = s_distance[i];\n"
425 "if (val < bestDistance2)\n"
426 "{\n"
427 "bestDistance2 = val;\n"
428 "bestTrainIdx2 = s_trainIdx[i];\n"
429 "}\n"
430 "}\n"
431 "}\n"
432 "myBestDistance1 = bestDistance1;\n"
433 "myBestDistance2 = bestDistance2;\n"
434 "myBestTrainIdx1 = bestTrainIdx1;\n"
435 "myBestTrainIdx2 = bestTrainIdx2;\n"
436 "if (queryIdx < query_rows && lidx == 0)\n"
437 "{\n"
438 "bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);\n"
439 "bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);\n"
440 "}\n"
441 "}\n"
445 "inline int cornerScore(__global const uchar* img, int step)\n"
446 "{\n"
447 "int k, tofs, v = img[0], a0 = 0, b0;\n"
448 "int d[16];\n"
449 "#define LOAD2(idx, ofs) \\\n"
450 "tofs = ofs; d[idx] = (short)(v - img[tofs]); d[idx+8] = (short)(v - img[-tofs])\n"
451 "LOAD2(0, 3);\n"
452 "LOAD2(1, -step+3);\n"
453 "LOAD2(2, -step*2+2);\n"
454 "LOAD2(3, -step*3+1);\n"
455 "LOAD2(4, -step*3);\n"
456 "LOAD2(5, -step*3-1);\n"
457 "LOAD2(6, -step*2-2);\n"
458 "LOAD2(7, -step-3);\n"
459 "#pragma unroll\n"
460 "for( k = 0; k < 16; k += 2 )\n"
461 "{\n"
462 "int a = min((int)d[(k+1)&15], (int)d[(k+2)&15]);\n"
463 "a = min(a, (int)d[(k+3)&15]);\n"
464 "a = min(a, (int)d[(k+4)&15]);\n"
465 "a = min(a, (int)d[(k+5)&15]);\n"
466 "a = min(a, (int)d[(k+6)&15]);\n"
467 "a = min(a, (int)d[(k+7)&15]);\n"
468 "a = min(a, (int)d[(k+8)&15]);\n"
469 "a0 = max(a0, min(a, (int)d[k&15]));\n"
470 "a0 = max(a0, min(a, (int)d[(k+9)&15]));\n"
471 "}\n"
472 "b0 = -a0;\n"
473 "#pragma unroll\n"
474 "for( k = 0; k < 16; k += 2 )\n"
475 "{\n"
476 "int b = max((int)d[(k+1)&15], (int)d[(k+2)&15]);\n"
477 "b = max(b, (int)d[(k+3)&15]);\n"
478 "b = max(b, (int)d[(k+4)&15]);\n"
479 "b = max(b, (int)d[(k+5)&15]);\n"
480 "b = max(b, (int)d[(k+6)&15]);\n"
481 "b = max(b, (int)d[(k+7)&15]);\n"
482 "b = max(b, (int)d[(k+8)&15]);\n"
483 "b0 = min(b0, max(b, (int)d[k]));\n"
484 "b0 = min(b0, max(b, (int)d[(k+9)&15]));\n"
485 "}\n"
486 "return -b0-1;\n"
487 "}\n"
488 "__kernel\n"
489 "void FAST_findKeypoints(\n"
490 "__global const uchar * _img, int step, int img_offset,\n"
491 "int img_rows, int img_cols,\n"
492 "volatile __global int* kp_loc,\n"
493 "int max_keypoints, int threshold )\n"
494 "{\n"
495 "int j = get_global_id(0) + 3;\n"
496 "int i = get_global_id(1) + 3;\n"
497 "if (i < img_rows - 3 && j < img_cols - 3)\n"
498 "{\n"
499 "__global const uchar* img = _img + mad24(i, step, j + img_offset);\n"
500 "int v = img[0], t0 = v - threshold, t1 = v + threshold;\n"
501 "int k, tofs, v0, v1;\n"
502 "int m0 = 0, m1 = 0;\n"
503 "#define UPDATE_MASK(idx, ofs) \\\n"
504 "tofs = ofs; v0 = img[tofs]; v1 = img[-tofs]; \\\n"
505 "m0 |= ((v0 < t0) << idx) | ((v1 < t0) << (8 + idx)); \\\n"
506 "m1 |= ((v0 > t1) << idx) | ((v1 > t1) << (8 + idx))\n"
507 "UPDATE_MASK(0, 3);\n"
508 "if( (m0 | m1) == 0 )\n"
509 "return;\n"
510 "UPDATE_MASK(2, -step*2+2);\n"
511 "UPDATE_MASK(4, -step*3);\n"
512 "UPDATE_MASK(6, -step*2-2);\n"
513 "#define EVEN_MASK (1+4+16+64)\n"
514 "if( ((m0 | (m0 >> 8)) & EVEN_MASK) != EVEN_MASK &&\n"
515 "((m1 | (m1 >> 8)) & EVEN_MASK) != EVEN_MASK )\n"
516 "return;\n"
517 "UPDATE_MASK(1, -step+3);\n"
518 "UPDATE_MASK(3, -step*3+1);\n"
519 "UPDATE_MASK(5, -step*3-1);\n"
520 "UPDATE_MASK(7, -step-3);\n"
521 "if( ((m0 | (m0 >> 8)) & 255) != 255 &&\n"
522 "((m1 | (m1 >> 8)) & 255) != 255 )\n"
523 "return;\n"
524 "m0 |= m0 << 16;\n"
525 "m1 |= m1 << 16;\n"
526 "#define CHECK0(i) ((m0 & (511 << i)) == (511 << i))\n"
527 "#define CHECK1(i) ((m1 & (511 << i)) == (511 << i))\n"
528 "if( CHECK0(0) + CHECK0(1) + CHECK0(2) + CHECK0(3) +\n"
529 "CHECK0(4) + CHECK0(5) + CHECK0(6) + CHECK0(7) +\n"
530 "CHECK0(8) + CHECK0(9) + CHECK0(10) + CHECK0(11) +\n"
531 "CHECK0(12) + CHECK0(13) + CHECK0(14) + CHECK0(15) +\n"
532 "CHECK1(0) + CHECK1(1) + CHECK1(2) + CHECK1(3) +\n"
533 "CHECK1(4) + CHECK1(5) + CHECK1(6) + CHECK1(7) +\n"
534 "CHECK1(8) + CHECK1(9) + CHECK1(10) + CHECK1(11) +\n"
535 "CHECK1(12) + CHECK1(13) + CHECK1(14) + CHECK1(15) == 0 )\n"
536 "return;\n"
537 "{\n"
538 "int idx = atomic_inc(kp_loc);\n"
539 "if( idx < max_keypoints )\n"
540 "{\n"
541 "kp_loc[1 + 2*idx] = j;\n"
542 "kp_loc[2 + 2*idx] = i;\n"
543 "}\n"
544 "}\n"
545 "}\n"
546 "}\n"
547 "__kernel\n"
548 "void FAST_nonmaxSupression(\n"
549 "__global const int* kp_in, volatile __global int* kp_out,\n"
550 "__global const uchar * _img, int step, int img_offset,\n"
551 "int rows, int cols, int counter, int max_keypoints)\n"
552 "{\n"
553 "const int idx = get_global_id(0);\n"
554 "if (idx < counter)\n"
555 "{\n"
556 "int x = kp_in[1 + 2*idx];\n"
557 "int y = kp_in[2 + 2*idx];\n"
558 "__global const uchar* img = _img + mad24(y, step, x + img_offset);\n"
559 "int s = cornerScore(img, step);\n"
560 "if( (x < 4 || s > cornerScore(img-1, step)) +\n"
561 "(y < 4 || s > cornerScore(img-step, step)) != 2 )\n"
562 "return;\n"
563 "if( (x >= cols - 4 || s > cornerScore(img+1, step)) +\n"
564 "(y >= rows - 4 || s > cornerScore(img+step, step)) +\n"
565 "(x < 4 || y < 4 || s > cornerScore(img-step-1, step)) +\n"
566 "(x >= cols - 4 || y < 4 || s > cornerScore(img-step+1, step)) +\n"
567 "(x < 4 || y >= rows - 4 || s > cornerScore(img+step-1, step)) +\n"
568 "(x >= cols - 4 || y >= rows - 4 || s > cornerScore(img+step+1, step)) == 6)\n"
569 "{\n"
570 "int new_idx = atomic_inc(kp_out);\n"
571 "if( new_idx < max_keypoints )\n"
572 "{\n"
573 "kp_out[1 + 3*new_idx] = x;\n"
574 "kp_out[2 + 3*new_idx] = y;\n"
575 "kp_out[3 + 3*new_idx] = s;\n"
576 "}\n"
577 "}\n"
578 "}\n"
579 "}\n"
583 "#define LAYERINFO_SIZE 1\n"
584 "#define LAYERINFO_OFS 0\n"
585 "#define KEYPOINT_SIZE 3\n"
586 "#define ORIENTED_KEYPOINT_SIZE 4\n"
587 "#define KEYPOINT_X 0\n"
588 "#define KEYPOINT_Y 1\n"
589 "#define KEYPOINT_Z 2\n"
590 "#define KEYPOINT_ANGLE 3\n"
591 "#ifdef ORB_RESPONSES\n"
592 "__kernel void\n"
593 "ORB_HarrisResponses(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n"
594 "__global const int* layerinfo, __global const int* keypoints,\n"
595 "__global float* responses, int nkeypoints )\n"
596 "{\n"
597 "int idx = get_global_id(0);\n"
598 "if( idx < nkeypoints )\n"
599 "{\n"
600 "__global const int* kpt = keypoints + idx*KEYPOINT_SIZE;\n"
601 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n"
602 "__global const uchar* img = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n"
603 "(kpt[KEYPOINT_Y] - blockSize/2)*imgstep + (kpt[KEYPOINT_X] - blockSize/2);\n"
604 "int i, j;\n"
605 "int a = 0, b = 0, c = 0;\n"
606 "for( i = 0; i < blockSize; i++, img += imgstep-blockSize )\n"
607 "{\n"
608 "for( j = 0; j < blockSize; j++, img++ )\n"
609 "{\n"
610 …Ix = (img[1] - img[-1])*2 + img[-imgstep+1] - img[-imgstep-1] + img[imgstep+1] - img[imgstep-1];\n"
611 …gstep] - img[-imgstep])*2 + img[imgstep-1] - img[-imgstep-1] + img[imgstep+1] - img[-imgstep+1];\n"
612 "a += Ix*Ix;\n"
613 "b += Iy*Iy;\n"
614 "c += Ix*Iy;\n"
615 "}\n"
616 "}\n"
617 …esponses[idx] = ((float)a * b - (float)c * c - HARRIS_K * (float)(a + b) * (a + b))*scale_sq_sq;\n"
618 "}\n"
619 "}\n"
620 "#endif\n"
621 "#ifdef ORB_ANGLES\n"
622 "#define _DBL_EPSILON 2.2204460492503131e-16f\n"
623 "#define atan2_p1 (0.9997878412794807f*57.29577951308232f)\n"
624 "#define atan2_p3 (-0.3258083974640975f*57.29577951308232f)\n"
625 "#define atan2_p5 (0.1555786518463281f*57.29577951308232f)\n"
626 "#define atan2_p7 (-0.04432655554792128f*57.29577951308232f)\n"
627 "inline float fastAtan2( float y, float x )\n"
628 "{\n"
629 "float ax = fabs(x), ay = fabs(y);\n"
630 "float a, c, c2;\n"
631 "if( ax >= ay )\n"
632 "{\n"
633 "c = ay/(ax + _DBL_EPSILON);\n"
634 "c2 = c*c;\n"
635 "a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;\n"
636 "}\n"
637 "else\n"
638 "{\n"
639 "c = ax/(ay + _DBL_EPSILON);\n"
640 "c2 = c*c;\n"
641 "a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;\n"
642 "}\n"
643 "if( x < 0 )\n"
644 "a = 180.f - a;\n"
645 "if( y < 0 )\n"
646 "a = 360.f - a;\n"
647 "return a;\n"
648 "}\n"
649 "__kernel void\n"
650 "ORB_ICAngle(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n"
651 "__global const int* layerinfo, __global const int* keypoints,\n"
652 "__global float* responses, const __global int* u_max,\n"
653 "int nkeypoints, int half_k )\n"
654 "{\n"
655 "int idx = get_global_id(0);\n"
656 "if( idx < nkeypoints )\n"
657 "{\n"
658 "__global const int* kpt = keypoints + idx*KEYPOINT_SIZE;\n"
659 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n"
660 "__global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n"
661 "kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];\n"
662 "int u, v, m_01 = 0, m_10 = 0;\n"
663 "for( u = -half_k; u <= half_k; u++ )\n"
664 "m_10 += u * center[u];\n"
665 "for( v = 1; v <= half_k; v++ )\n"
666 "{\n"
667 "int v_sum = 0;\n"
668 "int d = u_max[v];\n"
669 "for( u = -d; u <= d; u++ )\n"
670 "{\n"
671 "int val_plus = center[u + v*imgstep], val_minus = center[u - v*imgstep];\n"
672 "v_sum += (val_plus - val_minus);\n"
673 "m_10 += u * (val_plus + val_minus);\n"
674 "}\n"
675 "m_01 += v * v_sum;\n"
676 "}\n"
677 "responses[idx] = fastAtan2((float)m_01, (float)m_10);\n"
678 "}\n"
679 "}\n"
680 "#endif\n"
681 "#ifdef ORB_DESCRIPTORS\n"
682 "__kernel void\n"
683 "ORB_computeDescriptor(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n"
684 "__global const int* layerinfo, __global const int* keypoints,\n"
685 "__global uchar* _desc, const __global int* pattern,\n"
686 "int nkeypoints, int dsize )\n"
687 "{\n"
688 "int idx = get_global_id(0);\n"
689 "if( idx < nkeypoints )\n"
690 "{\n"
691 "int i;\n"
692 "__global const int* kpt = keypoints + idx*ORIENTED_KEYPOINT_SIZE;\n"
693 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n"
694 "__global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n"
695 "kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];\n"
696 "float angle = as_float(kpt[KEYPOINT_ANGLE]);\n"
697 "angle *= 0.01745329251994329547f;\n"
698 "float cosa;\n"
699 "float sina = sincos(angle, &cosa);\n"
700 "__global uchar* desc = _desc + idx*dsize;\n"
701 "#define GET_VALUE(idx) \\\n"
702 "center[mad24(convert_int_rte(pattern[(idx)*2] * sina + pattern[(idx)*2+1] * cosa), imgstep, \\\n"
703 "convert_int_rte(pattern[(idx)*2] * cosa - pattern[(idx)*2+1] * sina))]\n"
704 "for( i = 0; i < dsize; i++ )\n"
705 "{\n"
706 "int val;\n"
707 "#if WTA_K == 2\n"
708 "int t0, t1;\n"
709 "t0 = GET_VALUE(0); t1 = GET_VALUE(1);\n"
710 "val = t0 < t1;\n"
711 "t0 = GET_VALUE(2); t1 = GET_VALUE(3);\n"
712 "val |= (t0 < t1) << 1;\n"
713 "t0 = GET_VALUE(4); t1 = GET_VALUE(5);\n"
714 "val |= (t0 < t1) << 2;\n"
715 "t0 = GET_VALUE(6); t1 = GET_VALUE(7);\n"
716 "val |= (t0 < t1) << 3;\n"
717 "t0 = GET_VALUE(8); t1 = GET_VALUE(9);\n"
718 "val |= (t0 < t1) << 4;\n"
719 "t0 = GET_VALUE(10); t1 = GET_VALUE(11);\n"
720 "val |= (t0 < t1) << 5;\n"
721 "t0 = GET_VALUE(12); t1 = GET_VALUE(13);\n"
722 "val |= (t0 < t1) << 6;\n"
723 "t0 = GET_VALUE(14); t1 = GET_VALUE(15);\n"
724 "val |= (t0 < t1) << 7;\n"
725 "pattern += 16*2;\n"
726 "#elif WTA_K == 3\n"
727 "int t0, t1, t2;\n"
728 "t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);\n"
729 "val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);\n"
730 "t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);\n"
731 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;\n"
732 "t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);\n"
733 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;\n"
734 "t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);\n"
735 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;\n"
736 "pattern += 12*2;\n"
737 "#elif WTA_K == 4\n"
738 "int t0, t1, t2, t3, k;\n"
739 "int a, b;\n"
740 "t0 = GET_VALUE(0); t1 = GET_VALUE(1);\n"
741 "t2 = GET_VALUE(2); t3 = GET_VALUE(3);\n"
742 "a = 0, b = 2;\n"
743 "if( t1 > t0 ) t0 = t1, a = 1;\n"
744 "if( t3 > t2 ) t2 = t3, b = 3;\n"
745 "k = t0 > t2 ? a : b;\n"
746 "val = k;\n"
747 "t0 = GET_VALUE(4); t1 = GET_VALUE(5);\n"
748 "t2 = GET_VALUE(6); t3 = GET_VALUE(7);\n"
749 "a = 0, b = 2;\n"
750 "if( t1 > t0 ) t0 = t1, a = 1;\n"
751 "if( t3 > t2 ) t2 = t3, b = 3;\n"
752 "k = t0 > t2 ? a : b;\n"
753 "val |= k << 2;\n"
754 "t0 = GET_VALUE(8); t1 = GET_VALUE(9);\n"
755 "t2 = GET_VALUE(10); t3 = GET_VALUE(11);\n"
756 "a = 0, b = 2;\n"
757 "if( t1 > t0 ) t0 = t1, a = 1;\n"
758 "if( t3 > t2 ) t2 = t3, b = 3;\n"
759 "k = t0 > t2 ? a : b;\n"
760 "val |= k << 4;\n"
761 "t0 = GET_VALUE(12); t1 = GET_VALUE(13);\n"
762 "t2 = GET_VALUE(14); t3 = GET_VALUE(15);\n"
763 "a = 0, b = 2;\n"
764 "if( t1 > t0 ) t0 = t1, a = 1;\n"
765 "if( t3 > t2 ) t2 = t3, b = 3;\n"
766 "k = t0 > t2 ? a : b;\n"
767 "val |= k << 6;\n"
768 "pattern += 16*2;\n"
769 "#else\n"
770 "#error \"unknown/undefined WTA_K value; should be 2, 3 or 4\"\n"
771 "#endif\n"
772 "desc[i] = (uchar)val;\n"
773 "}\n"
774 "}\n"
775 "}\n"
776 "#endif\n"