1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. 14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// @Authors 18// Dachuan Zhao, dachuan@multicorewareinc.com 19// 20// Redistribution and use in source and binary forms, with or without modification, 21// are permitted provided that the following conditions are met: 22// 23// * Redistribution's of source code must retain the above copyright notice, 24// this list of conditions and the following disclaimer. 25// 26// * Redistribution's in binary form must reproduce the above copyright notice, 27// this list of conditions and the following disclaimer in the documentation 28// and/or other materials provided with the distribution. 29// 30// * The name of the copyright holders may not be used to endorse or promote products 31// derived from this software without specific prior written permission. 32// 33// This software is provided by the copyright holders and contributors as is and 34// any express or implied warranties, including, but not limited to, the implied 35// warranties of merchantability and fitness for a particular purpose are disclaimed. 36// In no event shall the Intel Corporation or contributors be liable for any direct, 37// indirect, incidental, special, exemplary, or consequential damages 38// (including, but not limited to, procurement of substitute goods or services; 39// loss of use, data, or profits; or business interruption) however caused 40// and on any theory of liability, whether in contract, strict liability, 41// or tort (including negligence or otherwise) arising in any way out of 42// the use of this software, even if advised of the possibility of such damage. 43// 44//M*/ 45 46#ifdef DOUBLE_SUPPORT 47#ifdef cl_amd_fp64 48#pragma OPENCL EXTENSION cl_amd_fp64:enable 49#elif defined (cl_khr_fp64) 50#pragma OPENCL EXTENSION cl_khr_fp64:enable 51#endif 52#endif 53 54#if defined BORDER_REPLICATE 55// aaaaaa|abcdefgh|hhhhhhh 56#define EXTRAPOLATE(x, maxV) clamp((x), 0, (maxV)-1) 57#elif defined BORDER_WRAP 58// cdefgh|abcdefgh|abcdefg 59#define EXTRAPOLATE(x, maxV) ( (x) + (maxV) ) % (maxV) 60#elif defined BORDER_REFLECT 61// fedcba|abcdefgh|hgfedcb 62#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1) 63#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101 64// gfedcb|abcdefgh|gfedcba 65#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1) 66#else 67#error No extrapolation method 68#endif 69 70#if cn != 3 71#define loadpix(addr) *(__global const T*)(addr) 72#define storepix(val, addr) *(__global T*)(addr) = (val) 73#define PIXSIZE ((int)sizeof(T)) 74#else 75#define loadpix(addr) vload3(0, (__global const T1*)(addr)) 76#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr)) 77#define PIXSIZE ((int)sizeof(T1)*3) 78#endif 79 80#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x))) 81 82#if kercn == 4 83#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x))) 84#endif 85 86#ifdef INTEL_DEVICE 87#define MAD(x,y,z) fma((x),(y),(z)) 88#else 89#define MAD(x,y,z) mad((x),(y),(z)) 90#endif 91 92#define LOAD_LOCAL(col_gl, col_lcl) \ 93 sum0 = co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \ 94 sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0); \ 95 temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows)); \ 96 sum0 = MAD(co1, temp, sum0); \ 97 sum1 = co3 * temp; \ 98 temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows)); \ 99 sum0 = MAD(co2, temp, sum0); \ 100 sum1 = MAD(co2, temp, sum1); \ 101 temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows)); \ 102 sum0 = MAD(co3, temp, sum0); \ 103 sum1 = MAD(co1, temp, sum1); \ 104 smem[0][col_lcl] = sum0; \ 105 sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1); \ 106 sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1); \ 107 smem[1][col_lcl] = sum1; 108 109 110#if kercn == 4 111#define LOAD_LOCAL4(col_gl, col_lcl) \ 112 sum40 = co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \ 113 sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40); \ 114 temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y, src_rows)); \ 115 sum40 = MAD(co1, temp4, sum40); \ 116 sum41 = co3 * temp4; \ 117 temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y + 1, src_rows)); \ 118 sum40 = MAD(co2, temp4, sum40); \ 119 sum41 = MAD(co2, temp4, sum41); \ 120 temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y + 2, src_rows)); \ 121 sum40 = MAD(co3, temp4, sum40); \ 122 sum41 = MAD(co1, temp4, sum41); \ 123 vstore4(sum40, col_lcl, (__local float*) &smem[0][2]); \ 124 sum41 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum41); \ 125 sum41 = MAD(co3, SRC4(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum41); \ 126 vstore4(sum41, col_lcl, (__local float*) &smem[1][2]); 127#endif 128 129#define noconvert 130 131__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, 132 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) 133{ 134 const int x = get_global_id(0)*kercn; 135 const int y = 2*get_global_id(1); 136 137 __local FT smem[2][LOCAL_SIZE + 4]; 138 __global uchar * dstData = dst + dst_offset; 139 __global const uchar * srcData = src + src_offset; 140 141 FT sum0, sum1, temp; 142 FT co1 = 0.375f; 143 FT co2 = 0.25f; 144 FT co3 = 0.0625f; 145 146 const int src_y = 2*y; 147 int col; 148 149 if (src_y >= 2 && src_y < src_rows - 4) 150 { 151#define EXTRAPOLATE_(val, maxVal) val 152#if kercn == 1 153 col = EXTRAPOLATE(x, src_cols); 154 LOAD_LOCAL(col, 2 + get_local_id(0)) 155#else 156 if (x < src_cols-4) 157 { 158 float4 sum40, sum41, temp4; 159 LOAD_LOCAL4(x, get_local_id(0)) 160 } 161 else 162 { 163 for (int i=0; i<4; i++) 164 { 165 col = EXTRAPOLATE(x+i, src_cols); 166 LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i) 167 } 168 } 169#endif 170 if (get_local_id(0) < 2) 171 { 172 col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); 173 LOAD_LOCAL(col, get_local_id(0)) 174 } 175 else if (get_local_id(0) < 4) 176 { 177 col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); 178 LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0)) 179 } 180 } 181 else // need extrapolate y 182 { 183#define EXTRAPOLATE_(val, maxVal) EXTRAPOLATE(val, maxVal) 184#if kercn == 1 185 col = EXTRAPOLATE(x, src_cols); 186 LOAD_LOCAL(col, 2 + get_local_id(0)) 187#else 188 if (x < src_cols-4) 189 { 190 float4 sum40, sum41, temp4; 191 LOAD_LOCAL4(x, get_local_id(0)) 192 } 193 else 194 { 195 for (int i=0; i<4; i++) 196 { 197 col = EXTRAPOLATE(x+i, src_cols); 198 LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i) 199 } 200 } 201#endif 202 if (get_local_id(0) < 2) 203 { 204 col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); 205 LOAD_LOCAL(col, get_local_id(0)) 206 } 207 else if (get_local_id(0) < 4) 208 { 209 col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); 210 LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0)) 211 } 212 } 213 214 barrier(CLK_LOCAL_MEM_FENCE); 215 216#if kercn == 1 217 if (get_local_id(0) < LOCAL_SIZE / 2) 218 { 219 const int tid2 = get_local_id(0) * 2; 220 221 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; 222 223 if (dst_x < dst_cols) 224 { 225 for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++) 226 { 227#if cn == 1 228#if fdepth <= 5 229 FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2)); 230#else 231 FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2)); 232#endif 233#else 234 FT sum = co3 * smem[yin - y][2 + tid2 - 2]; 235 sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum); 236 sum = MAD(co1, smem[yin - y][2 + tid2 ], sum); 237 sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum); 238#endif 239 sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum); 240 storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE); 241 } 242 } 243 } 244#else 245 int tid4 = get_local_id(0) * 4; 246 int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2; 247 if (dst_x < dst_cols - 1) 248 { 249 for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++) 250 { 251 252 FT sum = co3* smem[yin - y][2 + tid4 + 2]; 253 sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum); 254 sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum); 255 sum = MAD(co1, smem[yin - y][2 + tid4 ], sum); 256 sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum); 257 storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE)); 258 259 dst_x ++; 260 sum = co3* smem[yin - y][2 + tid4 + 4]; 261 sum = MAD(co3, smem[yin - y][2 + tid4 ], sum); 262 sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum); 263 sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum); 264 sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum); 265 storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE)); 266 dst_x --; 267 } 268 269 } 270 else if (dst_x < dst_cols) 271 { 272 for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++) 273 { 274 FT sum = co3* smem[yin - y][2 + tid4 + 2]; 275 sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum); 276 sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum); 277 sum = MAD(co1, smem[yin - y][2 + tid4 ], sum); 278 sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum); 279 280 storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE)); 281 } 282 } 283#endif 284 285} 286