1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// @Authors
18//    Peng Xiao, pengxiao@multicorewareinc.com
19//
20// Redistribution and use in source and binary forms, with or without modification,
21// are permitted provided that the following conditions are met:
22//
23//   * Redistribution's of source code must retain the above copyright notice,
24//     this list of conditions and the following disclaimer.
25//
26//   * Redistribution's in binary form must reproduce the above copyright notice,
27//     this list of conditions and the following disclaimer in the documentation
28//     and/or other materials provided with the distribution.
29//
30//   * The name of the copyright holders may not be used to endorse or promote products
31//     derived from this software without specific prior written permission.
32//
33// This software is provided by the copyright holders and contributors as is and
34// any express or implied warranties, including, but not limited to, the implied
35// warranties of merchantability and fitness for a particular purpose are disclaimed.
36// In no event shall the Intel Corporation or contributors be liable for any direct,
37// indirect, incidental, special, exemplary, or consequential damages
38// (including, but not limited to, procurement of substitute goods or services;
39// loss of use, data, or profits; or business interruption) however caused
40// and on any theory of liability, whether in contract, strict liability,
41// or tort (including negligence or otherwise) arising in any way out of
42// the use of this software, even if advised of the possibility of such damage.
43//
44//M*/
45
46#define TG22 0.4142135623730950488016887242097f
47#define TG67 2.4142135623730950488016887242097f
48
49#ifdef WITH_SOBEL
50
51#if cn == 1
52#define loadpix(addr) convert_floatN(*(__global const TYPE *)(addr))
53#else
54#define loadpix(addr) convert_floatN(vload3(0, (__global const TYPE *)(addr)))
55#endif
56#define storepix(value, addr) *(__global int *)(addr) = (int)(value)
57
58/*
59    stage1_with_sobel:
60        Sobel operator
61        Calc magnitudes
62        Non maxima suppression
63        Double thresholding
64*/
65
66__constant int prev[4][2] = {
67    { 0, -1 },
68    { -1, 1 },
69    { -1, 0 },
70    { -1, -1 }
71};
72
73__constant int next[4][2] = {
74    { 0, 1 },
75    { 1, -1 },
76    { 1, 0 },
77    { 1, 1 }
78};
79
80inline float3 sobel(int idx, __local const floatN *smem)
81{
82    // result: x, y, mag
83    float3 res;
84
85    floatN dx = fma(2, smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4],
86        smem[idx + 2] - smem[idx] + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8]);
87
88    floatN dy = fma(2, smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9],
89        smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10] + smem[idx] - smem[idx + 2 * GRP_SIZEX + 8]);
90
91#ifdef L2GRAD
92    floatN magN = fma(dx, dx, dy * dy);
93#else
94    floatN magN = fabs(dx) + fabs(dy);
95#endif
96#if cn == 1
97    res.z = magN;
98    res.x = dx;
99    res.y = dy;
100#else
101    res.z = max(magN.x, max(magN.y, magN.z));
102    if (res.z == magN.y)
103    {
104        dx.x = dx.y;
105        dy.x = dy.y;
106    }
107    else if (res.z == magN.z)
108    {
109        dx.x = dx.z;
110        dy.x = dy.z;
111    }
112    res.x = dx.x;
113    res.y = dy.x;
114#endif
115
116    return res;
117}
118
119__kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src_offset, int rows, int cols,
120                                __global uchar *map, int map_step, int map_offset,
121                                float low_thr, float high_thr)
122{
123    __local floatN smem[(GRP_SIZEX + 4) * (GRP_SIZEY + 4)];
124
125    int lidx = get_local_id(0);
126    int lidy = get_local_id(1);
127
128    int start_x = GRP_SIZEX * get_group_id(0);
129    int start_y = GRP_SIZEY * get_group_id(1);
130
131    int i = lidx + lidy * GRP_SIZEX;
132    for (int j = i;  j < (GRP_SIZEX + 4) * (GRP_SIZEY + 4); j += GRP_SIZEX * GRP_SIZEY)
133    {
134        int x = clamp(start_x - 2 + (j % (GRP_SIZEX + 4)), 0, cols - 1);
135        int y = clamp(start_y - 2 + (j / (GRP_SIZEX + 4)), 0, rows - 1);
136        smem[j] = loadpix(src + mad24(y, src_step, mad24(x, cn * (int)sizeof(TYPE), src_offset)));
137    }
138
139    barrier(CLK_LOCAL_MEM_FENCE);
140
141    //// Sobel, Magnitude
142    //
143
144    __local float mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
145
146    lidx++;
147    lidy++;
148
149    if (i < GRP_SIZEX + 2)
150    {
151        int grp_sizey = min(GRP_SIZEY + 1, rows - start_y);
152        mag[i] = (sobel(i, smem)).z;
153        mag[i + grp_sizey * (GRP_SIZEX + 2)] = (sobel(i + grp_sizey * (GRP_SIZEX + 4), smem)).z;
154    }
155    if (i < GRP_SIZEY + 2)
156    {
157        int grp_sizex = min(GRP_SIZEX + 1, cols - start_x);
158        mag[i * (GRP_SIZEX + 2)] = (sobel(i * (GRP_SIZEX + 4), smem)).z;
159        mag[i * (GRP_SIZEX + 2) + grp_sizex] = (sobel(i * (GRP_SIZEX + 4) + grp_sizex, smem)).z;
160    }
161
162    int idx = lidx + lidy * (GRP_SIZEX + 4);
163    i = lidx + lidy * (GRP_SIZEX + 2);
164
165    float3 res = sobel(idx, smem);
166    mag[i] = res.z;
167    barrier(CLK_LOCAL_MEM_FENCE);
168
169    int x = (int) res.x;
170    int y = (int) res.y;
171
172    //// Threshold + Non maxima suppression
173    //
174
175    /*
176        Sector numbers
177
178        3   2   1
179         *  *  *
180          * * *
181        0*******0
182          * * *
183         *  *  *
184        1   2   3
185
186        We need to determine arctg(dy / dx) to one of the four directions: 0, 45, 90 or 135 degrees.
187        Therefore if abs(dy / dx) belongs to the interval
188        [0, tg(22.5)]           -> 0 direction
189        [tg(22.5), tg(67.5)]    -> 1 or 3
190        [tg(67,5), +oo)         -> 2
191
192        Since tg(67.5) = 1 / tg(22.5), if we take
193        a = abs(dy / dx) * tg(22.5) and b = abs(dy / dx) * tg(67.5)
194        we can get another intervals
195
196        in case a:
197        [0, tg(22.5)^2]     -> 0
198        [tg(22.5)^2, 1]     -> 1, 3
199        [1, +oo)            -> 2
200
201        in case b:
202        [0, 1]              -> 0
203        [1, tg(67.5)^2]     -> 1,3
204        [tg(67.5)^2, +oo)   -> 2
205
206        that can help to find direction without conditions.
207
208        0 - might belong to an edge
209        1 - pixel doesn't belong to an edge
210        2 - belong to an edge
211    */
212
213    int gidx = get_global_id(0);
214    int gidy = get_global_id(1);
215
216    if (gidx >= cols || gidy >= rows)
217        return;
218
219    float mag0 = mag[i];
220
221    int value = 1;
222    if (mag0 > low_thr)
223    {
224        int a = (y / (float)x) * TG22;
225        int b = (y / (float)x) * TG67;
226
227        a = min((int)abs(a), 1) + 1;
228        b = min((int)abs(b), 1);
229
230        //  a = { 1, 2 }
231        //  b = { 0, 1 }
232        //  a * b = { 0, 1, 2 } - directions that we need ( + 3 if x ^ y < 0)
233
234        int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31); // if a = 1, b = 1, dy ^ dx < 0
235        int dir = a * b + 2 * dir3;
236        float prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];
237        float next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);
238
239        if (mag0 > prev_mag && mag0 >= next_mag)
240        {
241            value = (mag0 > high_thr) ? 2 : 0;
242        }
243    }
244
245    storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));
246}
247
248#elif defined WITHOUT_SOBEL
249
250/*
251    stage1_without_sobel:
252        Calc magnitudes
253        Non maxima suppression
254        Double thresholding
255*/
256
257#define loadpix(addr) (__global short *)(addr)
258#define storepix(val, addr) *(__global int *)(addr) = (int)(val)
259
260#ifdef L2GRAD
261#define dist(x, y) ((int)(x) * (x) + (int)(y) * (y))
262#else
263#define dist(x, y) (abs(x) + abs(y))
264#endif
265
266__constant int prev[4][2] = {
267    { 0, -1 },
268    { -1, -1 },
269    { -1, 0 },
270    { -1, 1 }
271};
272
273__constant int next[4][2] = {
274    { 0, 1 },
275    { 1, 1 },
276    { 1, 0 },
277    { 1, -1 }
278};
279
280__kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int dx_offset,
281                                   __global const uchar *dyptr, int dy_step, int dy_offset,
282                                   __global uchar *map, int map_step, int map_offset, int rows, int cols,
283                                   int low_thr, int high_thr)
284{
285    int start_x = get_group_id(0) * GRP_SIZEX;
286    int start_y = get_group_id(1) * GRP_SIZEY;
287
288    int lidx = get_local_id(0);
289    int lidy = get_local_id(1);
290
291    __local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
292    __local short2 sigma[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
293
294#pragma unroll
295    for (int i = lidx + lidy * GRP_SIZEX; i < (GRP_SIZEX + 2) * (GRP_SIZEY + 2); i += GRP_SIZEX * GRP_SIZEY)
296    {
297        int x = clamp(start_x - 1 + i % (GRP_SIZEX + 2), 0, cols - 1);
298        int y = clamp(start_y - 1 + i / (GRP_SIZEX + 2), 0, rows - 1);
299
300        int dx_index = mad24(y, dx_step, mad24(x, cn * (int)sizeof(short), dx_offset));
301        int dy_index = mad24(y, dy_step, mad24(x, cn * (int)sizeof(short), dy_offset));
302
303        __global short *dx = loadpix(dxptr + dx_index);
304        __global short *dy = loadpix(dyptr + dy_index);
305
306        int mag0 = dist(dx[0], dy[0]);
307#if cn > 1
308        short cdx = dx[0], cdy = dy[0];
309#pragma unroll
310        for (int j = 1; j < cn; ++j)
311        {
312            int mag1 = dist(dx[j], dy[j]);
313            if (mag1 > mag0)
314            {
315                mag0 = mag1;
316                cdx = dx[j];
317                cdy = dy[j];
318            }
319        }
320        dx[0] = cdx;
321        dy[0] = cdy;
322#endif
323        mag[i] = mag0;
324        sigma[i] = (short2)(dx[0], dy[0]);
325    }
326
327    barrier(CLK_LOCAL_MEM_FENCE);
328
329    int gidx = get_global_id(0);
330    int gidy = get_global_id(1);
331
332    if (gidx >= cols || gidy >= rows)
333        return;
334
335    lidx++;
336    lidy++;
337
338    int mag0 = mag[lidx + lidy * (GRP_SIZEX + 2)];
339    short x = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).x;
340    short y = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).y;
341
342    int value = 1;
343    if (mag0 > low_thr)
344    {
345        int a = (y / (float)x) * TG22;
346        int b = (y / (float)x) * TG67;
347
348        a = min((int)abs(a), 1) + 1;
349        b = min((int)abs(b), 1);
350
351        int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31);
352        int dir = a * b + 2 * dir3;
353        int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];
354        int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);
355
356        if (mag0 > prev_mag && mag0 >= next_mag)
357        {
358            value = (mag0 > high_thr) ? 2 : 0;
359        }
360    }
361
362    storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));
363}
364
365#undef TG22
366#undef CANNY_SHIFT
367
368#elif defined STAGE2
369/*
370    stage2:
371        hysteresis (add edges labeled 0 if they are connected with an edge labeled 2)
372*/
373
374#define loadpix(addr) *(__global int *)(addr)
375#define storepix(val, addr) *(__global int *)(addr) = (int)(val)
376#define LOCAL_TOTAL (LOCAL_X*LOCAL_Y)
377#define l_stack_size (4*LOCAL_TOTAL)
378#define p_stack_size 8
379
380__constant short move_dir[2][8] = {
381    { -1, -1, -1, 0, 0, 1, 1, 1 },
382    { -1, 0, 1, -1, 1, -1, 0, 1 }
383};
384
385__kernel void stage2_hysteresis(__global uchar *map_ptr, int map_step, int map_offset, int rows, int cols)
386{
387    map_ptr += map_offset;
388
389    int x = get_global_id(0);
390    int y = get_global_id(1) * PIX_PER_WI;
391
392    int lid = get_local_id(0) + get_local_id(1) * LOCAL_X;
393
394    __local ushort2 l_stack[l_stack_size];
395    __local int l_counter;
396
397    if (lid == 0)
398        l_counter = 0;
399    barrier(CLK_LOCAL_MEM_FENCE);
400
401    if (x < cols)
402    {
403        __global uchar* map = map_ptr + mad24(y, map_step, x * (int)sizeof(int));
404
405        #pragma unroll
406        for (int cy = 0; cy < PIX_PER_WI; ++cy)
407        {
408            if (y < rows)
409            {
410                int type = loadpix(map);
411                if (type == 2)
412                {
413                    l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);
414                }
415
416                y++;
417                map += map_step;
418            }
419        }
420    }
421    barrier(CLK_LOCAL_MEM_FENCE);
422
423    ushort2 p_stack[p_stack_size];
424    int p_counter = 0;
425
426    while(l_counter != 0)
427    {
428        int mod = l_counter % LOCAL_TOTAL;
429        int pix_per_thr = l_counter / LOCAL_TOTAL + ((lid < mod) ? 1 : 0);
430
431        for (int i = 0; i < pix_per_thr; ++i)
432        {
433            int index = atomic_dec(&l_counter) - 1;
434            if (index < 0)
435               continue;
436            ushort2 pos = l_stack[ index ];
437
438            #pragma unroll
439            for (int j = 0; j < 8; ++j)
440            {
441                ushort posx = pos.x + move_dir[0][j];
442                ushort posy = pos.y + move_dir[1][j];
443                if (posx < 0 || posy < 0 || posx >= cols || posy >= rows)
444                    continue;
445                __global uchar *addr = map_ptr + mad24(posy, map_step, posx * (int)sizeof(int));
446                int type = loadpix(addr);
447                if (type == 0)
448                {
449                    p_stack[p_counter++] = (ushort2)(posx, posy);
450                    storepix(2, addr);
451                }
452            }
453        }
454        barrier(CLK_LOCAL_MEM_FENCE);
455        if (l_counter < 0)
456            l_counter = 0;
457        barrier(CLK_LOCAL_MEM_FENCE);
458
459        while (p_counter > 0)
460        {
461            l_stack[ atomic_inc(&l_counter) ] = p_stack[--p_counter];
462        }
463        barrier(CLK_LOCAL_MEM_FENCE);
464    }
465}
466
467#elif defined GET_EDGES
468
469// Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0.
470// map      edge type mappings
471// dst      edge output
472
473__kernel void getEdges(__global const uchar *mapptr, int map_step, int map_offset, int rows, int cols,
474                       __global uchar *dst, int dst_step, int dst_offset)
475{
476    int x = get_global_id(0);
477    int y = get_global_id(1) * PIX_PER_WI;
478
479    if (x < cols)
480    {
481        int map_index = mad24(map_step, y, mad24(x, (int)sizeof(int), map_offset));
482        int dst_index = mad24(dst_step, y, x + dst_offset);
483
484        #pragma unroll
485        for (int cy = 0; cy < PIX_PER_WI; ++cy)
486        {
487            if (y < rows)
488            {
489                __global const int * map = (__global const int *)(mapptr + map_index);
490                dst[dst_index] = (uchar)(-(map[0] >> 1));
491
492                y++;
493                map_index += map_step;
494                dst_index += dst_step;
495            }
496        }
497    }
498}
499
500#endif
501