1// OpenCL port of the FAST corner detector.
2// Copyright (C) 2014, Itseez Inc. See the license at http://opencv.org
3
4inline int cornerScore(__global const uchar* img, int step)
5{
6    int k, tofs, v = img[0], a0 = 0, b0;
7    int d[16];
8    #define LOAD2(idx, ofs) \
9        tofs = ofs; d[idx] = (short)(v - img[tofs]); d[idx+8] = (short)(v - img[-tofs])
10    LOAD2(0, 3);
11    LOAD2(1, -step+3);
12    LOAD2(2, -step*2+2);
13    LOAD2(3, -step*3+1);
14    LOAD2(4, -step*3);
15    LOAD2(5, -step*3-1);
16    LOAD2(6, -step*2-2);
17    LOAD2(7, -step-3);
18
19    #pragma unroll
20    for( k = 0; k < 16; k += 2 )
21    {
22        int a = min((int)d[(k+1)&15], (int)d[(k+2)&15]);
23        a = min(a, (int)d[(k+3)&15]);
24        a = min(a, (int)d[(k+4)&15]);
25        a = min(a, (int)d[(k+5)&15]);
26        a = min(a, (int)d[(k+6)&15]);
27        a = min(a, (int)d[(k+7)&15]);
28        a = min(a, (int)d[(k+8)&15]);
29        a0 = max(a0, min(a, (int)d[k&15]));
30        a0 = max(a0, min(a, (int)d[(k+9)&15]));
31    }
32
33    b0 = -a0;
34    #pragma unroll
35    for( k = 0; k < 16; k += 2 )
36    {
37        int b = max((int)d[(k+1)&15], (int)d[(k+2)&15]);
38        b = max(b, (int)d[(k+3)&15]);
39        b = max(b, (int)d[(k+4)&15]);
40        b = max(b, (int)d[(k+5)&15]);
41        b = max(b, (int)d[(k+6)&15]);
42        b = max(b, (int)d[(k+7)&15]);
43        b = max(b, (int)d[(k+8)&15]);
44
45        b0 = min(b0, max(b, (int)d[k]));
46        b0 = min(b0, max(b, (int)d[(k+9)&15]));
47    }
48
49    return -b0-1;
50}
51
52__kernel
53void FAST_findKeypoints(
54    __global const uchar * _img, int step, int img_offset,
55    int img_rows, int img_cols,
56    volatile __global int* kp_loc,
57    int max_keypoints, int threshold )
58{
59    int j = get_global_id(0) + 3;
60    int i = get_global_id(1) + 3;
61
62    if (i < img_rows - 3 && j < img_cols - 3)
63    {
64        __global const uchar* img = _img + mad24(i, step, j + img_offset);
65        int v = img[0], t0 = v - threshold, t1 = v + threshold;
66        int k, tofs, v0, v1;
67        int m0 = 0, m1 = 0;
68
69        #define UPDATE_MASK(idx, ofs) \
70            tofs = ofs; v0 = img[tofs]; v1 = img[-tofs]; \
71            m0 |= ((v0 < t0) << idx) | ((v1 < t0) << (8 + idx)); \
72            m1 |= ((v0 > t1) << idx) | ((v1 > t1) << (8 + idx))
73
74        UPDATE_MASK(0, 3);
75        if( (m0 | m1) == 0 )
76            return;
77
78        UPDATE_MASK(2, -step*2+2);
79        UPDATE_MASK(4, -step*3);
80        UPDATE_MASK(6, -step*2-2);
81
82        #define EVEN_MASK (1+4+16+64)
83
84        if( ((m0 | (m0 >> 8)) & EVEN_MASK) != EVEN_MASK &&
85            ((m1 | (m1 >> 8)) & EVEN_MASK) != EVEN_MASK )
86            return;
87
88        UPDATE_MASK(1, -step+3);
89        UPDATE_MASK(3, -step*3+1);
90        UPDATE_MASK(5, -step*3-1);
91        UPDATE_MASK(7, -step-3);
92        if( ((m0 | (m0 >> 8)) & 255) != 255 &&
93            ((m1 | (m1 >> 8)) & 255) != 255 )
94            return;
95
96        m0 |= m0 << 16;
97        m1 |= m1 << 16;
98
99        #define CHECK0(i) ((m0 & (511 << i)) == (511 << i))
100        #define CHECK1(i) ((m1 & (511 << i)) == (511 << i))
101
102        if( CHECK0(0) + CHECK0(1) + CHECK0(2) + CHECK0(3) +
103            CHECK0(4) + CHECK0(5) + CHECK0(6) + CHECK0(7) +
104            CHECK0(8) + CHECK0(9) + CHECK0(10) + CHECK0(11) +
105            CHECK0(12) + CHECK0(13) + CHECK0(14) + CHECK0(15) +
106
107            CHECK1(0) + CHECK1(1) + CHECK1(2) + CHECK1(3) +
108            CHECK1(4) + CHECK1(5) + CHECK1(6) + CHECK1(7) +
109            CHECK1(8) + CHECK1(9) + CHECK1(10) + CHECK1(11) +
110            CHECK1(12) + CHECK1(13) + CHECK1(14) + CHECK1(15) == 0 )
111            return;
112
113        {
114            int idx = atomic_inc(kp_loc);
115            if( idx < max_keypoints )
116            {
117                kp_loc[1 + 2*idx] = j;
118                kp_loc[2 + 2*idx] = i;
119            }
120        }
121    }
122}
123
124///////////////////////////////////////////////////////////////////////////
125// nonmaxSupression
126
127__kernel
128void FAST_nonmaxSupression(
129    __global const int* kp_in, volatile __global int* kp_out,
130    __global const uchar * _img, int step, int img_offset,
131    int rows, int cols, int counter, int max_keypoints)
132{
133    const int idx = get_global_id(0);
134
135    if (idx < counter)
136    {
137        int x = kp_in[1 + 2*idx];
138        int y = kp_in[2 + 2*idx];
139        __global const uchar* img = _img + mad24(y, step, x + img_offset);
140
141        int s = cornerScore(img, step);
142
143        if( (x < 4 || s > cornerScore(img-1, step)) +
144            (y < 4 || s > cornerScore(img-step, step)) != 2 )
145            return;
146        if( (x >= cols - 4 || s > cornerScore(img+1, step)) +
147            (y >= rows - 4 || s > cornerScore(img+step, step)) +
148            (x < 4 || y < 4 || s > cornerScore(img-step-1, step)) +
149            (x >= cols - 4 || y < 4 || s > cornerScore(img-step+1, step)) +
150            (x < 4 || y >= rows - 4 || s > cornerScore(img+step-1, step)) +
151            (x >= cols - 4 || y >= rows - 4 || s > cornerScore(img+step+1, step)) == 6)
152        {
153            int new_idx = atomic_inc(kp_out);
154            if( new_idx < max_keypoints )
155            {
156                kp_out[1 + 3*new_idx] = x;
157                kp_out[2 + 3*new_idx] = y;
158                kp_out[3 + 3*new_idx] = s;
159            }
160        }
161    }
162}
163