1/*M///////////////////////////////////////////////////////////////////////////////////////
2// This file is part of OpenCV project.
3// It is subject to the license terms in the LICENSE file found in the top-level directory
4// of this distribution and at http://opencv.org/license.html.
5// Copyright (C) 2014, Itseez, Inc., all rights reserved.
6// Third party copyrights are property of their respective owners.
7//M*/
8
9#ifdef DOUBLE_SUPPORT
10#ifdef cl_amd_fp64
11#pragma OPENCL EXTENSION cl_amd_fp64:enable
12#elif defined (cl_khr_fp64)
13#pragma OPENCL EXTENSION cl_khr_fp64:enable
14#endif
15#endif
16
17#ifndef LOCAL_SUM_SIZE
18#define LOCAL_SUM_SIZE      16
19#endif
20
21#define LOCAL_SUM_STRIDE    (LOCAL_SUM_SIZE + 1)
22
23
24kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols,
25                              __global uchar *buf_ptr, int buf_step, int buf_offset
26#ifdef SUM_SQUARE
27                              ,__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset
28#endif
29                              )
30{
31    __local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
32#ifdef SUM_SQUARE
33    __local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
34#endif
35    int lid = get_local_id(0);
36    int gid = get_group_id(0);
37
38    int x = get_global_id(0);
39    int src_index = x + src_offset;
40
41    sumT accum = 0;
42#ifdef SUM_SQUARE
43    sumSQT accum_sq = 0;
44#endif
45    for (int y = 0; y < rows; y += LOCAL_SUM_SIZE)
46    {
47        int lsum_index = lid;
48        #pragma unroll
49        for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE)
50        {
51            if ((x < cols) && (y + yin < rows))
52            {
53                __global const uchar *src = src_ptr + src_index;
54                accum += src[0];
55#ifdef SUM_SQUARE
56                sumSQT temp = src[0] * src[0];
57                accum_sq += temp;
58#endif
59            }
60            lm_sum[lsum_index] = accum;
61#ifdef SUM_SQUARE
62            lm_sum_sq[lsum_index] = accum_sq;
63#endif
64        }
65        barrier(CLK_LOCAL_MEM_FENCE);
66
67        //int buf_index = buf_offset + buf_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
68        int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset));
69#ifdef SUM_SQUARE
70        int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset));
71#endif
72
73        lsum_index = LOCAL_SUM_STRIDE * lid;
74        #pragma unroll
75        for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++)
76        {
77            __global sumT *buf = (__global sumT *)(buf_ptr + buf_index);
78            buf[0] = lm_sum[lsum_index];
79            buf_index += buf_step;
80#ifdef SUM_SQUARE
81            __global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index);
82            bufsq[0] = lm_sum_sq[lsum_index];
83            buf_sq_index += buf_sq_step;
84#endif
85        }
86        barrier(CLK_LOCAL_MEM_FENCE);
87    }
88}
89
90kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset,
91#ifdef SUM_SQUARE
92                              __global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset,
93#endif
94                              __global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols
95#ifdef SUM_SQUARE
96                              ,__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset
97#endif
98                              )
99{
100    __local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
101#ifdef SUM_SQUARE
102    __local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
103#endif
104    int lid = get_local_id(0);
105    int gid = get_group_id(0);
106
107    int gs = get_global_size(0);
108
109    int x = get_global_id(0);
110
111    __global sumT *dst = (__global sumT *)(dst_ptr + dst_offset);
112    for (int xin = x; xin < cols; xin += gs)
113    {
114        dst[xin] = 0;
115    }
116    dst_offset += dst_step;
117
118    if (x < rows - 1)
119    {
120        dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset));
121        dst[0] = 0;
122    }
123
124    int buf_index = mad24((int)sizeof(sumT), x, buf_offset);
125    sumT accum = 0;
126
127#ifdef SUM_SQUARE
128    __global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset);
129    for (int xin = x; xin < cols; xin += gs)
130    {
131        dst_sq[xin] = 0;
132    }
133    dst_sq_offset += dst_sq_step;
134
135    if (x < rows - 1)
136    {
137        dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset));
138        dst_sq[0] = 0;
139    }
140
141    int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset);
142    sumSQT accum_sq = 0;
143#endif
144
145    for (int y = 1; y < cols; y += LOCAL_SUM_SIZE)
146    {
147        int lsum_index = lid;
148        #pragma unroll
149        for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE)
150        {
151            __global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index);
152            accum += buf[0];
153            lm_sum[lsum_index] = accum;
154            buf_index += buf_step;
155#ifdef SUM_SQUARE
156            __global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index);
157            accum_sq += buf_sq[0];
158            lm_sum_sq[lsum_index] = accum_sq;
159            buf_sq_index += buf_sq_step;
160#endif
161        }
162        barrier(CLK_LOCAL_MEM_FENCE);
163
164        if (y + lid < cols)
165        {
166            //int dst_index = dst_offset + dst_step *  LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
167            int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset));
168#ifdef SUM_SQUARE
169            int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset));
170#endif
171            lsum_index = LOCAL_SUM_STRIDE * lid;
172            int yin_max = min(rows - 1 -  LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE);
173            #pragma unroll
174            for (int yin = 0; yin < yin_max; yin++, lsum_index++)
175            {
176                dst = (__global sumT *)(dst_ptr + dst_index);
177                dst[0] = lm_sum[lsum_index];
178                dst_index += dst_step;
179#ifdef SUM_SQUARE
180                dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index);
181                dst_sq[0] = lm_sum_sq[lsum_index];
182                dst_sq_index += dst_sq_step;
183#endif
184            }
185        }
186        barrier(CLK_LOCAL_MEM_FENCE);
187    }
188}
189