1/* See LICENSE file in the root OpenCV directory */
2
3#if TILE_SIZE != 32
4#error "TILE SIZE should be 32"
5#endif
6
7
8__kernel void moments(__global const uchar* src, int src_step, int src_offset,
9    int src_rows, int src_cols, __global int* mom0, int xtiles)
10{
11    int x0 = get_global_id(0);
12    int y0 = get_group_id(1);
13    int x, y = get_local_id(1);
14    int x_min = x0*TILE_SIZE;
15    int ypix = y0*TILE_SIZE + y;
16    __local int mom[TILE_SIZE][10];
17
18    if (x_min < src_cols && y0*TILE_SIZE < src_rows)
19    {
20        if (ypix < src_rows)
21        {
22            int x_max = min(src_cols - x_min, TILE_SIZE);
23            __global const uchar* ptr = src + src_offset + ypix*src_step + x_min;
24            int4 S = (int4)(0, 0, 0, 0), p;
25
26#define SUM_ELEM(elem, ofs) \
27    (int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem
28
29            x = x_max & -4;
30            if (x_max >= 4)
31            {
32                p = convert_int4(vload4(0, ptr));
33#ifdef OP_MOMENTS_BINARY
34                p = min(p, 1);
35#endif
36                S += (int4)(p.s0, 0, 0, 0) + (int4)(p.s1, p.s1, p.s1, p.s1) +
37                    (int4)(p.s2, p.s2 * 2, p.s2 * 4, p.s2 * 8) + (int4)(p.s3, p.s3 * 3, p.s3 * 9, p.s3 * 27);
38                //SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3);
39
40                if (x_max >= 8)
41                {
42                    p = convert_int4(vload4(0, ptr + 4));
43#ifdef OP_MOMENTS_BINARY
44                    p = min(p, 1);
45#endif
46                    S += (int4)(p.s0, p.s0 * 4, p.s0 * 16, p.s0 * 64) + (int4)(p.s1, p.s1 * 5, p.s1 * 25, p.s1 * 125) +
47                        (int4)(p.s2, p.s2 * 6, p.s2 * 36, p.s2 * 216) + (int4)(p.s3, p.s3 * 7, p.s3 * 49, p.s3 * 343);
48                    //SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7);
49
50                    if (x_max >= 12)
51                    {
52                        p = convert_int4(vload4(0, ptr + 8));
53#ifdef OP_MOMENTS_BINARY
54                        p = min(p, 1);
55#endif
56                        S += (int4)(p.s0, p.s0 * 8, p.s0 * 64, p.s0 * 512) + (int4)(p.s1, p.s1 * 9, p.s1 * 81, p.s1 * 729) +
57                            (int4)(p.s2, p.s2 * 10, p.s2 * 100, p.s2 * 1000) + (int4)(p.s3, p.s3 * 11, p.s3 * 121, p.s3 * 1331);
58                        //SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11);
59
60                        if (x_max >= 16)
61                        {
62                            p = convert_int4(vload4(0, ptr + 12));
63#ifdef OP_MOMENTS_BINARY
64                            p = min(p, 1);
65#endif
66                            S += (int4)(p.s0, p.s0 * 12, p.s0 * 144, p.s0 * 1728) + (int4)(p.s1, p.s1 * 13, p.s1 * 169, p.s1 * 2197) +
67                                (int4)(p.s2, p.s2 * 14, p.s2 * 196, p.s2 * 2744) + (int4)(p.s3, p.s3 * 15, p.s3 * 225, p.s3 * 3375);
68                            //SUM_ELEM(p.s0, 12) + SUM_ELEM(p.s1, 13) + SUM_ELEM(p.s2, 14) + SUM_ELEM(p.s3, 15);
69                        }
70                    }
71                }
72            }
73
74            if (x_max >= 20)
75            {
76                p = convert_int4(vload4(0, ptr + 16));
77#ifdef OP_MOMENTS_BINARY
78                p = min(p, 1);
79#endif
80                S += (int4)(p.s0, p.s0 * 16, p.s0 * 256, p.s0 * 4096) + (int4)(p.s1, p.s1 * 17, p.s1 * 289, p.s1 * 4913) +
81                    (int4)(p.s2, p.s2 * 18, p.s2 * 324, p.s2 * 5832) + (int4)(p.s3, p.s3 * 19, p.s3 * 361, p.s3 * 6859);
82                //SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19);
83
84                if (x_max >= 24)
85                {
86                    p = convert_int4(vload4(0, ptr + 20));
87#ifdef OP_MOMENTS_BINARY
88                    p = min(p, 1);
89#endif
90                    S += (int4)(p.s0, p.s0 * 20, p.s0 * 400, p.s0 * 8000) + (int4)(p.s1, p.s1 * 21, p.s1 * 441, p.s1 * 9261) +
91                        (int4)(p.s2, p.s2 * 22, p.s2 * 484, p.s2 * 10648) + (int4)(p.s3, p.s3 * 23, p.s3 * 529, p.s3 * 12167);
92                    //SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23);
93
94                    if (x_max >= 28)
95                    {
96                        p = convert_int4(vload4(0, ptr + 24));
97#ifdef OP_MOMENTS_BINARY
98                        p = min(p, 1);
99#endif
100                        S += (int4)(p.s0, p.s0 * 24, p.s0 * 576, p.s0 * 13824) + (int4)(p.s1, p.s1 * 25, p.s1 * 625, p.s1 * 15625) +
101                            (int4)(p.s2, p.s2 * 26, p.s2 * 676, p.s2 * 17576) + (int4)(p.s3, p.s3 * 27, p.s3 * 729, p.s3 * 19683);
102                        //SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27);
103
104                        if (x_max >= 32)
105                        {
106                            p = convert_int4(vload4(0, ptr + 28));
107#ifdef OP_MOMENTS_BINARY
108                            p = min(p, 1);
109#endif
110                            S += (int4)(p.s0, p.s0 * 28, p.s0 * 784, p.s0 * 21952) + (int4)(p.s1, p.s1 * 29, p.s1 * 841, p.s1 * 24389) +
111                                (int4)(p.s2, p.s2 * 30, p.s2 * 900, p.s2 * 27000) + (int4)(p.s3, p.s3 * 31, p.s3 * 961, p.s3 * 29791);
112                            //SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31);
113                        }
114                    }
115                }
116            }
117
118            if (x < x_max)
119            {
120                int ps = ptr[x];
121#ifdef OP_MOMENTS_BINARY
122                ps = min(ps, 1);
123#endif
124                S += SUM_ELEM(ps, x);
125                if (x + 1 < x_max)
126                {
127                    ps = ptr[x + 1];
128#ifdef OP_MOMENTS_BINARY
129                    ps = min(ps, 1);
130#endif
131                    S += SUM_ELEM(ps, x + 1);
132                    if (x + 2 < x_max)
133                    {
134                        ps = ptr[x + 2];
135#ifdef OP_MOMENTS_BINARY
136                        ps = min(ps, 1);
137#endif
138                        S += SUM_ELEM(ps, x + 2);
139                    }
140                }
141            }
142
143            int sy = y*y;
144
145            mom[y][0] = S.s0;
146            mom[y][1] = S.s1;
147            mom[y][2] = y*S.s0;
148            mom[y][3] = S.s2;
149            mom[y][4] = y*S.s1;
150            mom[y][5] = sy*S.s0;
151            mom[y][6] = S.s3;
152            mom[y][7] = y*S.s2;
153            mom[y][8] = sy*S.s1;
154            mom[y][9] = y*sy*S.s0;
155        }
156        else
157            mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] =
158            mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0;
159        barrier(CLK_LOCAL_MEM_FENCE);
160
161#define REDUCE(d) \
162        if (y < d) \
163        { \
164        mom[y][0] += mom[y + d][0]; \
165        mom[y][1] += mom[y + d][1]; \
166        mom[y][2] += mom[y + d][2]; \
167        mom[y][3] += mom[y + d][3]; \
168        mom[y][4] += mom[y + d][4]; \
169        mom[y][5] += mom[y + d][5]; \
170        mom[y][6] += mom[y + d][6]; \
171        mom[y][7] += mom[y + d][7]; \
172        mom[y][8] += mom[y + d][8]; \
173        mom[y][9] += mom[y + d][9]; \
174        } \
175        barrier(CLK_LOCAL_MEM_FENCE)
176
177        REDUCE(16);
178        REDUCE(8);
179        REDUCE(4);
180        REDUCE(2);
181
182        if (y < 10)
183        {
184            __global int* momout = mom0 + (y0*xtiles + x0) * 10;
185            momout[y] = mom[0][y] + mom[1][y];
186        }
187    }
188}
189