1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// This file is part of OpenCV project. 3// It is subject to the license terms in the LICENSE file found in the top-level directory 4// of this distribution and at http://opencv.org/license.html. 5// Copyright (C) 2014, Itseez, Inc., all rights reserved. 6// Third party copyrights are property of their respective owners. 7//M*/ 8 9#ifdef DOUBLE_SUPPORT 10#ifdef cl_amd_fp64 11#pragma OPENCL EXTENSION cl_amd_fp64:enable 12#elif defined (cl_khr_fp64) 13#pragma OPENCL EXTENSION cl_khr_fp64:enable 14#endif 15#endif 16 17#ifndef LOCAL_SUM_SIZE 18#define LOCAL_SUM_SIZE 16 19#endif 20 21#define LOCAL_SUM_STRIDE (LOCAL_SUM_SIZE + 1) 22 23 24kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols, 25 __global uchar *buf_ptr, int buf_step, int buf_offset 26#ifdef SUM_SQUARE 27 ,__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset 28#endif 29 ) 30{ 31 __local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE]; 32#ifdef SUM_SQUARE 33 __local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE]; 34#endif 35 int lid = get_local_id(0); 36 int gid = get_group_id(0); 37 38 int x = get_global_id(0); 39 int src_index = x + src_offset; 40 41 sumT accum = 0; 42#ifdef SUM_SQUARE 43 sumSQT accum_sq = 0; 44#endif 45 for (int y = 0; y < rows; y += LOCAL_SUM_SIZE) 46 { 47 int lsum_index = lid; 48 #pragma unroll 49 for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE) 50 { 51 if ((x < cols) && (y + yin < rows)) 52 { 53 __global const uchar *src = src_ptr + src_index; 54 accum += src[0]; 55#ifdef SUM_SQUARE 56 sumSQT temp = src[0] * src[0]; 57 accum_sq += temp; 58#endif 59 } 60 lm_sum[lsum_index] = accum; 61#ifdef SUM_SQUARE 62 lm_sum_sq[lsum_index] = accum_sq; 63#endif 64 } 65 barrier(CLK_LOCAL_MEM_FENCE); 66 67 //int buf_index = buf_offset + buf_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid; 68 int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset)); 69#ifdef SUM_SQUARE 70 int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset)); 71#endif 72 73 lsum_index = LOCAL_SUM_STRIDE * lid; 74 #pragma unroll 75 for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++) 76 { 77 __global sumT *buf = (__global sumT *)(buf_ptr + buf_index); 78 buf[0] = lm_sum[lsum_index]; 79 buf_index += buf_step; 80#ifdef SUM_SQUARE 81 __global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index); 82 bufsq[0] = lm_sum_sq[lsum_index]; 83 buf_sq_index += buf_sq_step; 84#endif 85 } 86 barrier(CLK_LOCAL_MEM_FENCE); 87 } 88} 89 90kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset, 91#ifdef SUM_SQUARE 92 __global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset, 93#endif 94 __global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols 95#ifdef SUM_SQUARE 96 ,__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset 97#endif 98 ) 99{ 100 __local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE]; 101#ifdef SUM_SQUARE 102 __local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE]; 103#endif 104 int lid = get_local_id(0); 105 int gid = get_group_id(0); 106 107 int gs = get_global_size(0); 108 109 int x = get_global_id(0); 110 111 __global sumT *dst = (__global sumT *)(dst_ptr + dst_offset); 112 for (int xin = x; xin < cols; xin += gs) 113 { 114 dst[xin] = 0; 115 } 116 dst_offset += dst_step; 117 118 if (x < rows - 1) 119 { 120 dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset)); 121 dst[0] = 0; 122 } 123 124 int buf_index = mad24((int)sizeof(sumT), x, buf_offset); 125 sumT accum = 0; 126 127#ifdef SUM_SQUARE 128 __global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset); 129 for (int xin = x; xin < cols; xin += gs) 130 { 131 dst_sq[xin] = 0; 132 } 133 dst_sq_offset += dst_sq_step; 134 135 if (x < rows - 1) 136 { 137 dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset)); 138 dst_sq[0] = 0; 139 } 140 141 int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset); 142 sumSQT accum_sq = 0; 143#endif 144 145 for (int y = 1; y < cols; y += LOCAL_SUM_SIZE) 146 { 147 int lsum_index = lid; 148 #pragma unroll 149 for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE) 150 { 151 __global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index); 152 accum += buf[0]; 153 lm_sum[lsum_index] = accum; 154 buf_index += buf_step; 155#ifdef SUM_SQUARE 156 __global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index); 157 accum_sq += buf_sq[0]; 158 lm_sum_sq[lsum_index] = accum_sq; 159 buf_sq_index += buf_sq_step; 160#endif 161 } 162 barrier(CLK_LOCAL_MEM_FENCE); 163 164 if (y + lid < cols) 165 { 166 //int dst_index = dst_offset + dst_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid; 167 int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset)); 168#ifdef SUM_SQUARE 169 int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset)); 170#endif 171 lsum_index = LOCAL_SUM_STRIDE * lid; 172 int yin_max = min(rows - 1 - LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE); 173 #pragma unroll 174 for (int yin = 0; yin < yin_max; yin++, lsum_index++) 175 { 176 dst = (__global sumT *)(dst_ptr + dst_index); 177 dst[0] = lm_sum[lsum_index]; 178 dst_index += dst_step; 179#ifdef SUM_SQUARE 180 dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index); 181 dst_sq[0] = lm_sum_sq[lsum_index]; 182 dst_sq_index += dst_sq_step; 183#endif 184 } 185 } 186 barrier(CLK_LOCAL_MEM_FENCE); 187 } 188} 189