1// This file is part of OpenCV project. 2// It is subject to the license terms in the LICENSE file found in the top-level directory 3// of this distribution and at http://opencv.org/license.html. 4 5// Copyright (C) 2014, Itseez, Inc., all rights reserved. 6// Third party copyrights are property of their respective owners. 7 8#define SQRT_2 0.707106781188f 9#define sin_120 0.866025403784f 10#define fft5_2 0.559016994374f 11#define fft5_3 -0.951056516295f 12#define fft5_4 -1.538841768587f 13#define fft5_5 0.363271264002f 14 15#ifdef DOUBLE_SUPPORT 16#ifdef cl_amd_fp64 17#pragma OPENCL EXTENSION cl_amd_fp64:enable 18#elif defined (cl_khr_fp64) 19#pragma OPENCL EXTENSION cl_khr_fp64:enable 20#endif 21#endif 22 23__attribute__((always_inline)) 24CT mul_complex(CT a, CT b) { 25 return (CT)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); 26} 27 28__attribute__((always_inline)) 29CT twiddle(CT a) { 30 return (CT)(a.y, -a.x); 31} 32 33__attribute__((always_inline)) 34void butterfly2(CT a0, CT a1, __local CT* smem, __global const CT* twiddles, 35 const int x, const int block_size) 36{ 37 const int k = x & (block_size - 1); 38 a1 = mul_complex(twiddles[k], a1); 39 const int dst_ind = (x << 1) - k; 40 41 smem[dst_ind] = a0 + a1; 42 smem[dst_ind+block_size] = a0 - a1; 43} 44 45__attribute__((always_inline)) 46void butterfly4(CT a0, CT a1, CT a2, CT a3, __local CT* smem, __global const CT* twiddles, 47 const int x, const int block_size) 48{ 49 const int k = x & (block_size - 1); 50 a1 = mul_complex(twiddles[k], a1); 51 a2 = mul_complex(twiddles[k + block_size], a2); 52 a3 = mul_complex(twiddles[k + 2*block_size], a3); 53 54 const int dst_ind = ((x - k) << 2) + k; 55 56 CT b0 = a0 + a2; 57 a2 = a0 - a2; 58 CT b1 = a1 + a3; 59 a3 = twiddle(a1 - a3); 60 61 smem[dst_ind] = b0 + b1; 62 smem[dst_ind + block_size] = a2 + a3; 63 smem[dst_ind + 2*block_size] = b0 - b1; 64 smem[dst_ind + 3*block_size] = a2 - a3; 65} 66 67__attribute__((always_inline)) 68void butterfly3(CT a0, CT a1, CT a2, __local CT* smem, __global const CT* twiddles, 69 const int x, const int block_size) 70{ 71 const int k = x % block_size; 72 a1 = mul_complex(twiddles[k], a1); 73 a2 = mul_complex(twiddles[k+block_size], a2); 74 const int dst_ind = ((x - k) * 3) + k; 75 76 CT b1 = a1 + a2; 77 a2 = twiddle(sin_120*(a1 - a2)); 78 CT b0 = a0 - (CT)(0.5f)*b1; 79 80 smem[dst_ind] = a0 + b1; 81 smem[dst_ind + block_size] = b0 + a2; 82 smem[dst_ind + 2*block_size] = b0 - a2; 83} 84 85__attribute__((always_inline)) 86void butterfly5(CT a0, CT a1, CT a2, CT a3, CT a4, __local CT* smem, __global const CT* twiddles, 87 const int x, const int block_size) 88{ 89 const int k = x % block_size; 90 a1 = mul_complex(twiddles[k], a1); 91 a2 = mul_complex(twiddles[k + block_size], a2); 92 a3 = mul_complex(twiddles[k+2*block_size], a3); 93 a4 = mul_complex(twiddles[k+3*block_size], a4); 94 95 const int dst_ind = ((x - k) * 5) + k; 96 __local CT* dst = smem + dst_ind; 97 98 CT b0, b1, b5; 99 100 b1 = a1 + a4; 101 a1 -= a4; 102 103 a4 = a3 + a2; 104 a3 -= a2; 105 106 a2 = b1 + a4; 107 b0 = a0 - (CT)0.25f * a2; 108 109 b1 = fft5_2 * (b1 - a4); 110 a4 = fft5_3 * (CT)(-a1.y - a3.y, a1.x + a3.x); 111 b5 = (CT)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); 112 113 a4.x += fft5_4 * a3.y; 114 a4.y -= fft5_4 * a3.x; 115 116 a1 = b0 + b1; 117 b0 -= b1; 118 119 dst[0] = a0 + a2; 120 dst[block_size] = a1 + a4; 121 dst[2 * block_size] = b0 + b5; 122 dst[3 * block_size] = b0 - b5; 123 dst[4 * block_size] = a1 - a4; 124} 125 126__attribute__((always_inline)) 127void fft_radix2(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 128{ 129 CT a0, a1; 130 131 if (x < t) 132 { 133 a0 = smem[x]; 134 a1 = smem[x+t]; 135 } 136 137 barrier(CLK_LOCAL_MEM_FENCE); 138 139 if (x < t) 140 butterfly2(a0, a1, smem, twiddles, x, block_size); 141 142 barrier(CLK_LOCAL_MEM_FENCE); 143} 144 145__attribute__((always_inline)) 146void fft_radix2_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 147{ 148 const int x2 = x1 + t/2; 149 CT a0, a1, a2, a3; 150 151 if (x1 < t/2) 152 { 153 a0 = smem[x1]; a1 = smem[x1+t]; 154 a2 = smem[x2]; a3 = smem[x2+t]; 155 } 156 157 barrier(CLK_LOCAL_MEM_FENCE); 158 159 if (x1 < t/2) 160 { 161 butterfly2(a0, a1, smem, twiddles, x1, block_size); 162 butterfly2(a2, a3, smem, twiddles, x2, block_size); 163 } 164 165 barrier(CLK_LOCAL_MEM_FENCE); 166} 167 168__attribute__((always_inline)) 169void fft_radix2_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 170{ 171 const int x2 = x1 + t/3; 172 const int x3 = x1 + 2*t/3; 173 CT a0, a1, a2, a3, a4, a5; 174 175 if (x1 < t/3) 176 { 177 a0 = smem[x1]; a1 = smem[x1+t]; 178 a2 = smem[x2]; a3 = smem[x2+t]; 179 a4 = smem[x3]; a5 = smem[x3+t]; 180 } 181 182 barrier(CLK_LOCAL_MEM_FENCE); 183 184 if (x1 < t/3) 185 { 186 butterfly2(a0, a1, smem, twiddles, x1, block_size); 187 butterfly2(a2, a3, smem, twiddles, x2, block_size); 188 butterfly2(a4, a5, smem, twiddles, x3, block_size); 189 } 190 191 barrier(CLK_LOCAL_MEM_FENCE); 192} 193 194__attribute__((always_inline)) 195void fft_radix2_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 196{ 197 const int thread_block = t/4; 198 const int x2 = x1 + thread_block; 199 const int x3 = x1 + 2*thread_block; 200 const int x4 = x1 + 3*thread_block; 201 CT a0, a1, a2, a3, a4, a5, a6, a7; 202 203 if (x1 < t/4) 204 { 205 a0 = smem[x1]; a1 = smem[x1+t]; 206 a2 = smem[x2]; a3 = smem[x2+t]; 207 a4 = smem[x3]; a5 = smem[x3+t]; 208 a6 = smem[x4]; a7 = smem[x4+t]; 209 } 210 211 barrier(CLK_LOCAL_MEM_FENCE); 212 213 if (x1 < t/4) 214 { 215 butterfly2(a0, a1, smem, twiddles, x1, block_size); 216 butterfly2(a2, a3, smem, twiddles, x2, block_size); 217 butterfly2(a4, a5, smem, twiddles, x3, block_size); 218 butterfly2(a6, a7, smem, twiddles, x4, block_size); 219 } 220 221 barrier(CLK_LOCAL_MEM_FENCE); 222} 223 224__attribute__((always_inline)) 225void fft_radix2_B5(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 226{ 227 const int thread_block = t/5; 228 const int x2 = x1 + thread_block; 229 const int x3 = x1 + 2*thread_block; 230 const int x4 = x1 + 3*thread_block; 231 const int x5 = x1 + 4*thread_block; 232 CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; 233 234 if (x1 < t/5) 235 { 236 a0 = smem[x1]; a1 = smem[x1+t]; 237 a2 = smem[x2]; a3 = smem[x2+t]; 238 a4 = smem[x3]; a5 = smem[x3+t]; 239 a6 = smem[x4]; a7 = smem[x4+t]; 240 a8 = smem[x5]; a9 = smem[x5+t]; 241 } 242 243 barrier(CLK_LOCAL_MEM_FENCE); 244 245 if (x1 < t/5) 246 { 247 butterfly2(a0, a1, smem, twiddles, x1, block_size); 248 butterfly2(a2, a3, smem, twiddles, x2, block_size); 249 butterfly2(a4, a5, smem, twiddles, x3, block_size); 250 butterfly2(a6, a7, smem, twiddles, x4, block_size); 251 butterfly2(a8, a9, smem, twiddles, x5, block_size); 252 } 253 254 barrier(CLK_LOCAL_MEM_FENCE); 255} 256 257__attribute__((always_inline)) 258void fft_radix4(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 259{ 260 CT a0, a1, a2, a3; 261 262 if (x < t) 263 { 264 a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; 265 } 266 267 barrier(CLK_LOCAL_MEM_FENCE); 268 269 if (x < t) 270 butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size); 271 272 barrier(CLK_LOCAL_MEM_FENCE); 273} 274 275__attribute__((always_inline)) 276void fft_radix4_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 277{ 278 const int x2 = x1 + t/2; 279 CT a0, a1, a2, a3, a4, a5, a6, a7; 280 281 if (x1 < t/2) 282 { 283 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; 284 a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t]; 285 } 286 287 barrier(CLK_LOCAL_MEM_FENCE); 288 289 if (x1 < t/2) 290 { 291 butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size); 292 butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size); 293 } 294 295 barrier(CLK_LOCAL_MEM_FENCE); 296} 297 298__attribute__((always_inline)) 299void fft_radix4_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 300{ 301 const int x2 = x1 + t/3; 302 const int x3 = x2 + t/3; 303 CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; 304 305 if (x1 < t/3) 306 { 307 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; 308 a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t]; 309 a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t]; 310 } 311 312 barrier(CLK_LOCAL_MEM_FENCE); 313 314 if (x1 < t/3) 315 { 316 butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size); 317 butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size); 318 butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size); 319 } 320 321 barrier(CLK_LOCAL_MEM_FENCE); 322} 323 324__attribute__((always_inline)) 325void fft_radix8(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 326{ 327 const int k = x % block_size; 328 CT a0, a1, a2, a3, a4, a5, a6, a7; 329 330 if (x < t) 331 { 332 int tw_ind = block_size / 8; 333 334 a0 = smem[x]; 335 a1 = mul_complex(twiddles[k], smem[x + t]); 336 a2 = mul_complex(twiddles[k + block_size],smem[x+2*t]); 337 a3 = mul_complex(twiddles[k+2*block_size],smem[x+3*t]); 338 a4 = mul_complex(twiddles[k+3*block_size],smem[x+4*t]); 339 a5 = mul_complex(twiddles[k+4*block_size],smem[x+5*t]); 340 a6 = mul_complex(twiddles[k+5*block_size],smem[x+6*t]); 341 a7 = mul_complex(twiddles[k+6*block_size],smem[x+7*t]); 342 343 CT b0, b1, b6, b7; 344 345 b0 = a0 + a4; 346 a4 = a0 - a4; 347 b1 = a1 + a5; 348 a5 = a1 - a5; 349 a5 = (CT)(SQRT_2) * (CT)(a5.x + a5.y, -a5.x + a5.y); 350 b6 = twiddle(a2 - a6); 351 a2 = a2 + a6; 352 b7 = a3 - a7; 353 b7 = (CT)(SQRT_2) * (CT)(-b7.x + b7.y, -b7.x - b7.y); 354 a3 = a3 + a7; 355 356 a0 = b0 + a2; 357 a2 = b0 - a2; 358 a1 = b1 + a3; 359 a3 = twiddle(b1 - a3); 360 a6 = a4 - b6; 361 a4 = a4 + b6; 362 a7 = twiddle(a5 - b7); 363 a5 = a5 + b7; 364 365 } 366 367 barrier(CLK_LOCAL_MEM_FENCE); 368 369 if (x < t) 370 { 371 const int dst_ind = ((x - k) << 3) + k; 372 __local CT* dst = smem + dst_ind; 373 374 dst[0] = a0 + a1; 375 dst[block_size] = a4 + a5; 376 dst[2 * block_size] = a2 + a3; 377 dst[3 * block_size] = a6 + a7; 378 dst[4 * block_size] = a0 - a1; 379 dst[5 * block_size] = a4 - a5; 380 dst[6 * block_size] = a2 - a3; 381 dst[7 * block_size] = a6 - a7; 382 } 383 384 barrier(CLK_LOCAL_MEM_FENCE); 385} 386 387__attribute__((always_inline)) 388void fft_radix3(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 389{ 390 CT a0, a1, a2; 391 392 if (x < t) 393 { 394 a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; 395 } 396 397 barrier(CLK_LOCAL_MEM_FENCE); 398 399 if (x < t) 400 butterfly3(a0, a1, a2, smem, twiddles, x, block_size); 401 402 barrier(CLK_LOCAL_MEM_FENCE); 403} 404 405__attribute__((always_inline)) 406void fft_radix3_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 407{ 408 const int x2 = x1 + t/2; 409 CT a0, a1, a2, a3, a4, a5; 410 411 if (x1 < t/2) 412 { 413 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; 414 a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; 415 } 416 417 barrier(CLK_LOCAL_MEM_FENCE); 418 419 if (x1 < t/2) 420 { 421 butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); 422 butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); 423 } 424 425 barrier(CLK_LOCAL_MEM_FENCE); 426} 427 428__attribute__((always_inline)) 429void fft_radix3_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 430{ 431 const int x2 = x1 + t/3; 432 const int x3 = x2 + t/3; 433 CT a0, a1, a2, a3, a4, a5, a6, a7, a8; 434 435 if (x1 < t/3) 436 { 437 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; 438 a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; 439 a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t]; 440 } 441 442 barrier(CLK_LOCAL_MEM_FENCE); 443 444 if (x1 < t/3) 445 { 446 butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); 447 butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); 448 butterfly3(a6, a7, a8, smem, twiddles, x3, block_size); 449 } 450 451 barrier(CLK_LOCAL_MEM_FENCE); 452} 453 454__attribute__((always_inline)) 455void fft_radix3_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 456{ 457 const int thread_block = t/4; 458 const int x2 = x1 + thread_block; 459 const int x3 = x1 + 2*thread_block; 460 const int x4 = x1 + 3*thread_block; 461 CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; 462 463 if (x1 < t/4) 464 { 465 a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; 466 a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; 467 a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t]; 468 a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t]; 469 } 470 471 barrier(CLK_LOCAL_MEM_FENCE); 472 473 if (x1 < t/4) 474 { 475 butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); 476 butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); 477 butterfly3(a6, a7, a8, smem, twiddles, x3, block_size); 478 butterfly3(a9, a10, a11, smem, twiddles, x4, block_size); 479 } 480 481 barrier(CLK_LOCAL_MEM_FENCE); 482} 483 484__attribute__((always_inline)) 485void fft_radix5(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t) 486{ 487 const int k = x % block_size; 488 CT a0, a1, a2, a3, a4; 489 490 if (x < t) 491 { 492 a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t]; 493 } 494 495 barrier(CLK_LOCAL_MEM_FENCE); 496 497 if (x < t) 498 butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size); 499 500 barrier(CLK_LOCAL_MEM_FENCE); 501} 502 503__attribute__((always_inline)) 504void fft_radix5_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t) 505{ 506 const int x2 = x1+t/2; 507 CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; 508 509 if (x1 < t/2) 510 { 511 a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t]; 512 a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t]; 513 } 514 515 barrier(CLK_LOCAL_MEM_FENCE); 516 517 if (x1 < t/2) 518 { 519 butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size); 520 butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size); 521 } 522 523 barrier(CLK_LOCAL_MEM_FENCE); 524} 525 526#ifdef DFT_SCALE 527#define SCALE_VAL(x, scale) x*scale 528#else 529#define SCALE_VAL(x, scale) x 530#endif 531 532__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, 533 __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, 534 __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) 535{ 536 const int x = get_global_id(0); 537 const int y = get_group_id(1); 538 const int block_size = LOCAL_SIZE/kercn; 539 if (y < nz) 540 { 541 __local CT smem[LOCAL_SIZE]; 542 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 543 const int ind = x; 544#ifdef IS_1D 545 FT scale = (FT) 1/dst_cols; 546#else 547 FT scale = (FT) 1/(dst_cols*dst_rows); 548#endif 549 550#ifdef COMPLEX_INPUT 551 __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset))); 552 #pragma unroll 553 for (int i=0; i<kercn; i++) 554 smem[x+i*block_size] = src[i*block_size]; 555#else 556 __global const FT* src = (__global const FT*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(FT), src_offset))); 557 #pragma unroll 558 for (int i=0; i<kercn; i++) 559 smem[x+i*block_size] = (CT)(src[i*block_size], 0.f); 560#endif 561 barrier(CLK_LOCAL_MEM_FENCE); 562 563 RADIX_PROCESS; 564 565#ifdef COMPLEX_OUTPUT 566#ifdef NO_CONJUGATE 567 // copy result without complex conjugate 568 const int cols = dst_cols/2 + 1; 569#else 570 const int cols = dst_cols; 571#endif 572 573 __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 574 #pragma unroll 575 for (int i=x; i<cols; i+=block_size) 576 dst[i] = SCALE_VAL(smem[i], scale); 577#ifdef REAL_INPUT 578#ifdef COMPLEX_OUTPUT 579#ifdef IS_1D 580 for(int i=x+1; i < (dst_cols+1)/2; i+=block_size) 581 { 582 dst[dst_cols-i] = (CT)(SCALE_VAL(smem[i].x, scale), SCALE_VAL(-smem[i].y, scale)); 583 } 584#endif 585#endif 586#endif 587#else 588 // pack row to CCS 589 __local FT* smem_1cn = (__local FT*) smem; 590 __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 591 for (int i=x; i<dst_cols-1; i+=block_size) 592 dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale); 593 if (x == 0) 594 dst[0] = SCALE_VAL(smem_1cn[0], scale); 595#endif 596 } 597 else 598 { 599 // fill with zero other rows 600#ifdef COMPLEX_OUTPUT 601 __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 602#else 603 __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 604#endif 605 #pragma unroll 606 for (int i=x; i<dst_cols; i+=block_size) 607 dst[i] = 0.f; 608 } 609} 610 611__kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, 612 __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, 613 __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) 614{ 615 const int x = get_group_id(0); 616 const int y = get_global_id(1); 617 618 if (x < nz) 619 { 620 __local CT smem[LOCAL_SIZE]; 621 __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)); 622 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 623 const int ind = y; 624 const int block_size = LOCAL_SIZE/kercn; 625 FT scale = 1.f/(dst_rows*dst_cols); 626 627 #pragma unroll 628 for (int i=0; i<kercn; i++) 629 smem[y+i*block_size] = *((__global const CT*)(src + i*block_size*src_step)); 630 631 barrier(CLK_LOCAL_MEM_FENCE); 632 633 RADIX_PROCESS; 634 635#ifdef COMPLEX_OUTPUT 636 __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)); 637 #pragma unroll 638 for (int i=0; i<kercn; i++) 639 *((__global CT*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale); 640#else 641 if (x == 0) 642 { 643 // pack first column to CCS 644 __local FT* smem_1cn = (__local FT*) smem; 645 __global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset); 646 for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size) 647 *((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale); 648 if (y == 0) 649 *((__global FT*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale); 650 } 651 else if (x == (dst_cols+1)/2) 652 { 653 // pack last column to CCS (if needed) 654 __local FT* smem_1cn = (__local FT*) smem; 655 __global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), mad24(y+1, dst_step, dst_offset)); 656 for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size) 657 *((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale); 658 if (y == 0) 659 *((__global FT*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), dst_offset))) = SCALE_VAL(smem_1cn[0], scale); 660 } 661 else 662 { 663 __global uchar* dst = dst_ptr + mad24(x, (int)sizeof(FT)*2, mad24(y, dst_step, dst_offset - (int)sizeof(FT))); 664 #pragma unroll 665 for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step) 666 vstore2(SCALE_VAL(smem[i], scale), 0, (__global FT*) dst); 667 } 668#endif 669 } 670} 671 672__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, 673 __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, 674 __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) 675{ 676 const int x = get_global_id(0); 677 const int y = get_group_id(1); 678 const int block_size = LOCAL_SIZE/kercn; 679#ifdef IS_1D 680 const FT scale = (FT) 1/dst_cols; 681#else 682 const FT scale = (FT) 1/(dst_cols*dst_rows); 683#endif 684 685 if (y < nz) 686 { 687 __local CT smem[LOCAL_SIZE]; 688 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 689 const int ind = x; 690 691#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE) 692 __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset))); 693 #pragma unroll 694 for (int i=0; i<kercn; i++) 695 { 696 smem[x+i*block_size].x = src[i*block_size].x; 697 smem[x+i*block_size].y = -src[i*block_size].y; 698 } 699#else 700 701 #if !defined(REAL_INPUT) && defined(NO_CONJUGATE) 702 __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(FT), src_offset))); 703 704 #pragma unroll 705 for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size) 706 { 707 smem[i+1].x = src[i].x; 708 smem[i+1].y = -src[i].y; 709 smem[LOCAL_SIZE-i-1] = src[i]; 710 } 711 #else 712 713 #pragma unroll 714 for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size) 715 { 716 CT src = vload2(0, (__global const FT*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(FT), src_offset)))); 717 718 smem[i+1].x = src.x; 719 smem[i+1].y = -src.y; 720 smem[LOCAL_SIZE-i-1] = src; 721 } 722 723 #endif 724 725 if (x==0) 726 { 727 smem[0].x = *(__global const FT*)(src_ptr + mad24(y, src_step, src_offset)); 728 smem[0].y = 0.f; 729 730 if(LOCAL_SIZE % 2 ==0) 731 { 732 #if !defined(REAL_INPUT) && defined(NO_CONJUGATE) 733 smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x; 734 #else 735 smem[LOCAL_SIZE/2].x = *(__global const FT*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(FT), src_offset))); 736 #endif 737 smem[LOCAL_SIZE/2].y = 0.f; 738 } 739 } 740#endif 741 742 barrier(CLK_LOCAL_MEM_FENCE); 743 744 RADIX_PROCESS; 745 746 // copy data to dst 747#ifdef COMPLEX_OUTPUT 748 __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset))); 749 #pragma unroll 750 for (int i=0; i<kercn; i++) 751 { 752 dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale); 753 dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale); 754 } 755#else 756 __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(FT)), dst_offset))); 757 #pragma unroll 758 for (int i=0; i<kercn; i++) 759 { 760 dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale); 761 } 762#endif 763 } 764 else 765 { 766 // fill with zero other rows 767#ifdef COMPLEX_OUTPUT 768 __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 769#else 770 __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset)); 771#endif 772 #pragma unroll 773 for (int i=x; i<dst_cols; i+=block_size) 774 dst[i] = 0.f; 775 } 776} 777 778__kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, 779 __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, 780 __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) 781{ 782 const int x = get_group_id(0); 783 const int y = get_global_id(1); 784 785#ifdef COMPLEX_INPUT 786 if (x < nz) 787 { 788 __local CT smem[LOCAL_SIZE]; 789 __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)); 790 __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)); 791 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 792 const int ind = y; 793 const int block_size = LOCAL_SIZE/kercn; 794 795 #pragma unroll 796 for (int i=0; i<kercn; i++) 797 { 798 CT temp = *((__global const CT*)(src + i*block_size*src_step)); 799 smem[y+i*block_size].x = temp.x; 800 smem[y+i*block_size].y = -temp.y; 801 } 802 803 barrier(CLK_LOCAL_MEM_FENCE); 804 805 RADIX_PROCESS; 806 807 // copy data to dst 808 #pragma unroll 809 for (int i=0; i<kercn; i++) 810 { 811 __global CT* res = (__global CT*)(dst + i*block_size*dst_step); 812 res[0].x = smem[y + i*block_size].x; 813 res[0].y = -smem[y + i*block_size].y; 814 } 815 } 816#else 817 if (x < nz) 818 { 819 __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset); 820 const int ind = y; 821 const int block_size = LOCAL_SIZE/kercn; 822 823 __local CT smem[LOCAL_SIZE]; 824#ifdef EVEN 825 if (x!=0 && (x!=(nz-1))) 826#else 827 if (x!=0) 828#endif 829 { 830 __global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(FT), src_offset)); 831 #pragma unroll 832 for (int i=0; i<kercn; i++) 833 { 834 CT temp = vload2(0, (__global const FT*)(src + i*block_size*src_step)); 835 smem[y+i*block_size].x = temp.x; 836 smem[y+i*block_size].y = -temp.y; 837 } 838 } 839 else 840 { 841 int ind = x==0 ? 0: 2*x-1; 842 __global const FT* src = (__global const FT*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(FT), src_offset))); 843 int step = src_step/(int)sizeof(FT); 844 845 #pragma unroll 846 for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size) 847 { 848 smem[i+1].x = src[2*i*step]; 849 smem[i+1].y = -src[(2*i+1)*step]; 850 851 smem[LOCAL_SIZE-i-1].x = src[2*i*step];; 852 smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step]; 853 } 854 if (y==0) 855 { 856 smem[0].x = *(__global const FT*)(src_ptr + mad24(ind, (int)sizeof(FT), src_offset)); 857 smem[0].y = 0.f; 858 859 if(LOCAL_SIZE % 2 ==0) 860 { 861 smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step]; 862 smem[LOCAL_SIZE/2].y = 0.f; 863 } 864 } 865 } 866 barrier(CLK_LOCAL_MEM_FENCE); 867 868 RADIX_PROCESS; 869 870 // copy data to dst 871 __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)); 872 873 #pragma unroll 874 for (int i=0; i<kercn; i++) 875 { 876 __global CT* res = (__global CT*)(dst + i*block_size*dst_step); 877 res[0].x = smem[y + i*block_size].x; 878 res[0].y = -smem[y + i*block_size].y; 879 } 880 } 881#endif 882}