1// This file is part of OpenCV project.
2// It is subject to the license terms in the LICENSE file found in the top-level directory
3// of this distribution and at http://opencv.org/license.html.
4
5// Copyright (C) 2014, Itseez, Inc., all rights reserved.
6// Third party copyrights are property of their respective owners.
7
8#define SQRT_2 0.707106781188f
9#define sin_120 0.866025403784f
10#define fft5_2  0.559016994374f
11#define fft5_3 -0.951056516295f
12#define fft5_4 -1.538841768587f
13#define fft5_5  0.363271264002f
14
15#ifdef DOUBLE_SUPPORT
16#ifdef cl_amd_fp64
17#pragma OPENCL EXTENSION cl_amd_fp64:enable
18#elif defined (cl_khr_fp64)
19#pragma OPENCL EXTENSION cl_khr_fp64:enable
20#endif
21#endif
22
23__attribute__((always_inline))
24CT mul_complex(CT a, CT b) {
25    return (CT)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x));
26}
27
28__attribute__((always_inline))
29CT twiddle(CT a) {
30    return (CT)(a.y, -a.x);
31}
32
33__attribute__((always_inline))
34void butterfly2(CT a0, CT a1, __local CT* smem, __global const CT* twiddles,
35                const int x, const int block_size)
36{
37    const int k = x & (block_size - 1);
38    a1 = mul_complex(twiddles[k], a1);
39    const int dst_ind = (x << 1) - k;
40
41    smem[dst_ind] = a0 + a1;
42    smem[dst_ind+block_size] = a0 - a1;
43}
44
45__attribute__((always_inline))
46void butterfly4(CT a0, CT a1, CT a2, CT a3, __local CT* smem, __global const CT* twiddles,
47                const int x, const int block_size)
48{
49    const int k = x & (block_size - 1);
50    a1 = mul_complex(twiddles[k], a1);
51    a2 = mul_complex(twiddles[k + block_size], a2);
52    a3 = mul_complex(twiddles[k + 2*block_size], a3);
53
54    const int dst_ind = ((x - k) << 2) + k;
55
56    CT b0 = a0 + a2;
57    a2 = a0 - a2;
58    CT b1 = a1 + a3;
59    a3 = twiddle(a1 - a3);
60
61    smem[dst_ind]                = b0 + b1;
62    smem[dst_ind + block_size]   = a2 + a3;
63    smem[dst_ind + 2*block_size] = b0 - b1;
64    smem[dst_ind + 3*block_size] = a2 - a3;
65}
66
67__attribute__((always_inline))
68void butterfly3(CT a0, CT a1, CT a2, __local CT* smem, __global const CT* twiddles,
69                const int x, const int block_size)
70{
71    const int k = x % block_size;
72    a1 = mul_complex(twiddles[k], a1);
73    a2 = mul_complex(twiddles[k+block_size], a2);
74    const int dst_ind = ((x - k) * 3) + k;
75
76    CT b1 = a1 + a2;
77    a2 = twiddle(sin_120*(a1 - a2));
78    CT b0 = a0 - (CT)(0.5f)*b1;
79
80    smem[dst_ind] = a0 + b1;
81    smem[dst_ind + block_size] = b0 + a2;
82    smem[dst_ind + 2*block_size] = b0 - a2;
83}
84
85__attribute__((always_inline))
86void butterfly5(CT a0, CT a1, CT a2, CT a3, CT a4, __local CT* smem, __global const CT* twiddles,
87                const int x, const int block_size)
88{
89    const int k = x % block_size;
90    a1 = mul_complex(twiddles[k], a1);
91    a2 = mul_complex(twiddles[k + block_size], a2);
92    a3 = mul_complex(twiddles[k+2*block_size], a3);
93    a4 = mul_complex(twiddles[k+3*block_size], a4);
94
95    const int dst_ind = ((x - k) * 5) + k;
96    __local CT* dst = smem + dst_ind;
97
98    CT b0, b1, b5;
99
100    b1 = a1 + a4;
101    a1 -= a4;
102
103    a4 = a3 + a2;
104    a3 -= a2;
105
106    a2 = b1 + a4;
107    b0 = a0 - (CT)0.25f * a2;
108
109    b1 = fft5_2 * (b1 - a4);
110    a4 = fft5_3 * (CT)(-a1.y - a3.y, a1.x + a3.x);
111    b5 = (CT)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x);
112
113    a4.x += fft5_4 * a3.y;
114    a4.y -= fft5_4 * a3.x;
115
116    a1 = b0 + b1;
117    b0 -= b1;
118
119    dst[0] = a0 + a2;
120    dst[block_size] = a1 + a4;
121    dst[2 * block_size] = b0 + b5;
122    dst[3 * block_size] = b0 - b5;
123    dst[4 * block_size] = a1 - a4;
124}
125
126__attribute__((always_inline))
127void fft_radix2(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
128{
129    CT a0, a1;
130
131    if (x < t)
132    {
133        a0 = smem[x];
134        a1 = smem[x+t];
135    }
136
137    barrier(CLK_LOCAL_MEM_FENCE);
138
139    if (x < t)
140        butterfly2(a0, a1, smem, twiddles, x, block_size);
141
142    barrier(CLK_LOCAL_MEM_FENCE);
143}
144
145__attribute__((always_inline))
146void fft_radix2_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
147{
148    const int x2 = x1 + t/2;
149    CT a0, a1, a2, a3;
150
151    if (x1 < t/2)
152    {
153        a0 = smem[x1]; a1 = smem[x1+t];
154        a2 = smem[x2]; a3 = smem[x2+t];
155    }
156
157    barrier(CLK_LOCAL_MEM_FENCE);
158
159    if (x1 < t/2)
160    {
161        butterfly2(a0, a1, smem, twiddles, x1, block_size);
162        butterfly2(a2, a3, smem, twiddles, x2, block_size);
163    }
164
165    barrier(CLK_LOCAL_MEM_FENCE);
166}
167
168__attribute__((always_inline))
169void fft_radix2_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
170{
171    const int x2 = x1 + t/3;
172    const int x3 = x1 + 2*t/3;
173    CT a0, a1, a2, a3, a4, a5;
174
175    if (x1 < t/3)
176    {
177        a0 = smem[x1]; a1 = smem[x1+t];
178        a2 = smem[x2]; a3 = smem[x2+t];
179        a4 = smem[x3]; a5 = smem[x3+t];
180    }
181
182    barrier(CLK_LOCAL_MEM_FENCE);
183
184    if (x1 < t/3)
185    {
186        butterfly2(a0, a1, smem, twiddles, x1, block_size);
187        butterfly2(a2, a3, smem, twiddles, x2, block_size);
188        butterfly2(a4, a5, smem, twiddles, x3, block_size);
189    }
190
191    barrier(CLK_LOCAL_MEM_FENCE);
192}
193
194__attribute__((always_inline))
195void fft_radix2_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
196{
197    const int thread_block = t/4;
198    const int x2 = x1 + thread_block;
199    const int x3 = x1 + 2*thread_block;
200    const int x4 = x1 + 3*thread_block;
201    CT a0, a1, a2, a3, a4, a5, a6, a7;
202
203    if (x1 < t/4)
204    {
205        a0 = smem[x1]; a1 = smem[x1+t];
206        a2 = smem[x2]; a3 = smem[x2+t];
207        a4 = smem[x3]; a5 = smem[x3+t];
208        a6 = smem[x4]; a7 = smem[x4+t];
209    }
210
211    barrier(CLK_LOCAL_MEM_FENCE);
212
213    if (x1 < t/4)
214    {
215        butterfly2(a0, a1, smem, twiddles, x1, block_size);
216        butterfly2(a2, a3, smem, twiddles, x2, block_size);
217        butterfly2(a4, a5, smem, twiddles, x3, block_size);
218        butterfly2(a6, a7, smem, twiddles, x4, block_size);
219    }
220
221    barrier(CLK_LOCAL_MEM_FENCE);
222}
223
224__attribute__((always_inline))
225void fft_radix2_B5(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
226{
227    const int thread_block = t/5;
228    const int x2 = x1 + thread_block;
229    const int x3 = x1 + 2*thread_block;
230    const int x4 = x1 + 3*thread_block;
231    const int x5 = x1 + 4*thread_block;
232    CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
233
234    if (x1 < t/5)
235    {
236        a0 = smem[x1]; a1 = smem[x1+t];
237        a2 = smem[x2]; a3 = smem[x2+t];
238        a4 = smem[x3]; a5 = smem[x3+t];
239        a6 = smem[x4]; a7 = smem[x4+t];
240        a8 = smem[x5]; a9 = smem[x5+t];
241    }
242
243    barrier(CLK_LOCAL_MEM_FENCE);
244
245    if (x1 < t/5)
246    {
247        butterfly2(a0, a1, smem, twiddles, x1, block_size);
248        butterfly2(a2, a3, smem, twiddles, x2, block_size);
249        butterfly2(a4, a5, smem, twiddles, x3, block_size);
250        butterfly2(a6, a7, smem, twiddles, x4, block_size);
251        butterfly2(a8, a9, smem, twiddles, x5, block_size);
252    }
253
254    barrier(CLK_LOCAL_MEM_FENCE);
255}
256
257__attribute__((always_inline))
258void fft_radix4(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
259{
260    CT a0, a1, a2, a3;
261
262    if (x < t)
263    {
264        a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t];
265    }
266
267    barrier(CLK_LOCAL_MEM_FENCE);
268
269    if (x < t)
270        butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size);
271
272    barrier(CLK_LOCAL_MEM_FENCE);
273}
274
275__attribute__((always_inline))
276void fft_radix4_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
277{
278    const int x2 = x1 + t/2;
279    CT a0, a1, a2, a3, a4, a5, a6, a7;
280
281    if (x1 < t/2)
282    {
283        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
284        a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
285    }
286
287    barrier(CLK_LOCAL_MEM_FENCE);
288
289    if (x1 < t/2)
290    {
291        butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
292        butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
293    }
294
295    barrier(CLK_LOCAL_MEM_FENCE);
296}
297
298__attribute__((always_inline))
299void fft_radix4_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
300{
301    const int x2 = x1 + t/3;
302    const int x3 = x2 + t/3;
303    CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
304
305    if (x1 < t/3)
306    {
307        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t];
308        a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t];
309        a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t];
310    }
311
312    barrier(CLK_LOCAL_MEM_FENCE);
313
314    if (x1 < t/3)
315    {
316        butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size);
317        butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size);
318        butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size);
319    }
320
321    barrier(CLK_LOCAL_MEM_FENCE);
322}
323
324__attribute__((always_inline))
325void fft_radix8(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
326{
327    const int k = x % block_size;
328    CT a0, a1, a2, a3, a4, a5, a6, a7;
329
330    if (x < t)
331    {
332        int tw_ind = block_size / 8;
333
334        a0 = smem[x];
335        a1 = mul_complex(twiddles[k], smem[x + t]);
336        a2 = mul_complex(twiddles[k + block_size],smem[x+2*t]);
337        a3 = mul_complex(twiddles[k+2*block_size],smem[x+3*t]);
338        a4 = mul_complex(twiddles[k+3*block_size],smem[x+4*t]);
339        a5 = mul_complex(twiddles[k+4*block_size],smem[x+5*t]);
340        a6 = mul_complex(twiddles[k+5*block_size],smem[x+6*t]);
341        a7 = mul_complex(twiddles[k+6*block_size],smem[x+7*t]);
342
343        CT b0, b1, b6, b7;
344
345        b0 = a0 + a4;
346        a4 = a0 - a4;
347        b1 = a1 + a5;
348        a5 = a1 - a5;
349        a5 = (CT)(SQRT_2) * (CT)(a5.x + a5.y, -a5.x + a5.y);
350        b6 = twiddle(a2 - a6);
351        a2 = a2 + a6;
352        b7 = a3 - a7;
353        b7 = (CT)(SQRT_2) * (CT)(-b7.x + b7.y, -b7.x - b7.y);
354        a3 = a3 + a7;
355
356        a0 = b0 + a2;
357        a2 = b0 - a2;
358        a1 = b1 + a3;
359        a3 = twiddle(b1 - a3);
360        a6 = a4 - b6;
361        a4 = a4 + b6;
362        a7 = twiddle(a5 - b7);
363        a5 = a5 + b7;
364
365    }
366
367    barrier(CLK_LOCAL_MEM_FENCE);
368
369    if (x < t)
370    {
371        const int dst_ind = ((x - k) << 3) + k;
372        __local CT* dst = smem + dst_ind;
373
374        dst[0] = a0 + a1;
375        dst[block_size] = a4 + a5;
376        dst[2 * block_size] = a2 + a3;
377        dst[3 * block_size] = a6 + a7;
378        dst[4 * block_size] = a0 - a1;
379        dst[5 * block_size] = a4 - a5;
380        dst[6 * block_size] = a2 - a3;
381        dst[7 * block_size] = a6 - a7;
382    }
383
384    barrier(CLK_LOCAL_MEM_FENCE);
385}
386
387__attribute__((always_inline))
388void fft_radix3(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
389{
390    CT a0, a1, a2;
391
392    if (x < t)
393    {
394        a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t];
395    }
396
397    barrier(CLK_LOCAL_MEM_FENCE);
398
399    if (x < t)
400        butterfly3(a0, a1, a2, smem, twiddles, x, block_size);
401
402    barrier(CLK_LOCAL_MEM_FENCE);
403}
404
405__attribute__((always_inline))
406void fft_radix3_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
407{
408    const int x2 = x1 + t/2;
409    CT a0, a1, a2, a3, a4, a5;
410
411    if (x1 < t/2)
412    {
413        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
414        a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
415    }
416
417    barrier(CLK_LOCAL_MEM_FENCE);
418
419    if (x1 < t/2)
420    {
421        butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
422        butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
423    }
424
425    barrier(CLK_LOCAL_MEM_FENCE);
426}
427
428__attribute__((always_inline))
429void fft_radix3_B3(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
430{
431    const int x2 = x1 + t/3;
432    const int x3 = x2 + t/3;
433    CT a0, a1, a2, a3, a4, a5, a6, a7, a8;
434
435    if (x1 < t/3)
436    {
437        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
438        a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
439        a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
440    }
441
442    barrier(CLK_LOCAL_MEM_FENCE);
443
444    if (x1 < t/3)
445    {
446        butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
447        butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
448        butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
449    }
450
451    barrier(CLK_LOCAL_MEM_FENCE);
452}
453
454__attribute__((always_inline))
455void fft_radix3_B4(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
456{
457    const int thread_block = t/4;
458    const int x2 = x1 + thread_block;
459    const int x3 = x1 + 2*thread_block;
460    const int x4 = x1 + 3*thread_block;
461    CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11;
462
463    if (x1 < t/4)
464    {
465        a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t];
466        a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t];
467        a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t];
468        a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t];
469    }
470
471    barrier(CLK_LOCAL_MEM_FENCE);
472
473    if (x1 < t/4)
474    {
475        butterfly3(a0, a1, a2, smem, twiddles, x1, block_size);
476        butterfly3(a3, a4, a5, smem, twiddles, x2, block_size);
477        butterfly3(a6, a7, a8, smem, twiddles, x3, block_size);
478        butterfly3(a9, a10, a11, smem, twiddles, x4, block_size);
479    }
480
481    barrier(CLK_LOCAL_MEM_FENCE);
482}
483
484__attribute__((always_inline))
485void fft_radix5(__local CT* smem, __global const CT* twiddles, const int x, const int block_size, const int t)
486{
487    const int k = x % block_size;
488    CT a0, a1, a2, a3, a4;
489
490    if (x < t)
491    {
492        a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t];
493    }
494
495    barrier(CLK_LOCAL_MEM_FENCE);
496
497    if (x < t)
498        butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size);
499
500    barrier(CLK_LOCAL_MEM_FENCE);
501}
502
503__attribute__((always_inline))
504void fft_radix5_B2(__local CT* smem, __global const CT* twiddles, const int x1, const int block_size, const int t)
505{
506    const int x2 = x1+t/2;
507    CT a0, a1, a2, a3, a4, a5, a6, a7, a8, a9;
508
509    if (x1 < t/2)
510    {
511        a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t];
512        a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t];
513    }
514
515    barrier(CLK_LOCAL_MEM_FENCE);
516
517    if (x1 < t/2)
518    {
519        butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size);
520        butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size);
521    }
522
523    barrier(CLK_LOCAL_MEM_FENCE);
524}
525
526#ifdef DFT_SCALE
527#define SCALE_VAL(x, scale) x*scale
528#else
529#define SCALE_VAL(x, scale) x
530#endif
531
532__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
533                                   __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
534                                   __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)
535{
536    const int x = get_global_id(0);
537    const int y = get_group_id(1);
538    const int block_size = LOCAL_SIZE/kercn;
539    if (y < nz)
540    {
541        __local CT smem[LOCAL_SIZE];
542        __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
543        const int ind = x;
544#ifdef IS_1D
545        FT scale = (FT) 1/dst_cols;
546#else
547        FT scale = (FT) 1/(dst_cols*dst_rows);
548#endif
549
550#ifdef COMPLEX_INPUT
551        __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)));
552        #pragma unroll
553        for (int i=0; i<kercn; i++)
554            smem[x+i*block_size] = src[i*block_size];
555#else
556        __global const FT* src = (__global const FT*)(src_ptr + mad24(y, src_step, mad24(x, (int)sizeof(FT), src_offset)));
557        #pragma unroll
558        for (int i=0; i<kercn; i++)
559            smem[x+i*block_size] = (CT)(src[i*block_size], 0.f);
560#endif
561        barrier(CLK_LOCAL_MEM_FENCE);
562
563        RADIX_PROCESS;
564
565#ifdef COMPLEX_OUTPUT
566#ifdef NO_CONJUGATE
567        // copy result without complex conjugate
568        const int cols = dst_cols/2 + 1;
569#else
570        const int cols = dst_cols;
571#endif
572
573        __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));
574        #pragma unroll
575        for (int i=x; i<cols; i+=block_size)
576            dst[i] = SCALE_VAL(smem[i], scale);
577#ifdef REAL_INPUT
578#ifdef COMPLEX_OUTPUT
579#ifdef IS_1D
580        for(int i=x+1; i < (dst_cols+1)/2; i+=block_size)
581        {
582            dst[dst_cols-i] = (CT)(SCALE_VAL(smem[i].x, scale), SCALE_VAL(-smem[i].y, scale));
583        }
584#endif
585#endif
586#endif
587#else
588        // pack row to CCS
589        __local FT* smem_1cn = (__local FT*) smem;
590        __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));
591        for (int i=x; i<dst_cols-1; i+=block_size)
592            dst[i+1] = SCALE_VAL(smem_1cn[i+2], scale);
593        if (x == 0)
594            dst[0] = SCALE_VAL(smem_1cn[0], scale);
595#endif
596    }
597    else
598    {
599        // fill with zero other rows
600#ifdef COMPLEX_OUTPUT
601        __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));
602#else
603        __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));
604#endif
605        #pragma unroll
606        for (int i=x; i<dst_cols; i+=block_size)
607            dst[i] = 0.f;
608    }
609}
610
611__kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
612                                   __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
613                                   __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)
614{
615    const int x = get_group_id(0);
616    const int y = get_global_id(1);
617
618    if (x < nz)
619    {
620        __local CT smem[LOCAL_SIZE];
621        __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset));
622        __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
623        const int ind = y;
624        const int block_size = LOCAL_SIZE/kercn;
625        FT scale = 1.f/(dst_rows*dst_cols);
626
627        #pragma unroll
628        for (int i=0; i<kercn; i++)
629            smem[y+i*block_size] = *((__global const CT*)(src + i*block_size*src_step));
630
631        barrier(CLK_LOCAL_MEM_FENCE);
632
633        RADIX_PROCESS;
634
635#ifdef COMPLEX_OUTPUT
636        __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));
637        #pragma unroll
638        for (int i=0; i<kercn; i++)
639            *((__global CT*)(dst + i*block_size*dst_step)) = SCALE_VAL(smem[y + i*block_size], scale);
640#else
641        if (x == 0)
642        {
643            // pack first column to CCS
644            __local FT* smem_1cn = (__local FT*) smem;
645            __global uchar* dst = dst_ptr + mad24(y+1, dst_step, dst_offset);
646            for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
647                *((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
648            if (y == 0)
649                *((__global FT*) (dst_ptr + dst_offset)) = SCALE_VAL(smem_1cn[0], scale);
650        }
651        else if (x == (dst_cols+1)/2)
652        {
653            // pack last column to CCS (if needed)
654            __local FT* smem_1cn = (__local FT*) smem;
655            __global uchar* dst = dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), mad24(y+1, dst_step, dst_offset));
656            for (int i=y; i<dst_rows-1; i+=block_size, dst+=dst_step*block_size)
657                *((__global FT*) dst) = SCALE_VAL(smem_1cn[i+2], scale);
658            if (y == 0)
659                *((__global FT*) (dst_ptr + mad24(dst_cols-1, (int)sizeof(FT), dst_offset))) = SCALE_VAL(smem_1cn[0], scale);
660        }
661        else
662        {
663            __global uchar* dst = dst_ptr + mad24(x, (int)sizeof(FT)*2, mad24(y, dst_step, dst_offset - (int)sizeof(FT)));
664            #pragma unroll
665            for (int i=y; i<dst_rows; i+=block_size, dst+=block_size*dst_step)
666                vstore2(SCALE_VAL(smem[i], scale), 0, (__global FT*) dst);
667        }
668#endif
669    }
670}
671
672__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
673                                    __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
674                                    __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)
675{
676    const int x = get_global_id(0);
677    const int y = get_group_id(1);
678    const int block_size = LOCAL_SIZE/kercn;
679#ifdef IS_1D
680    const FT scale = (FT) 1/dst_cols;
681#else
682    const FT scale = (FT) 1/(dst_cols*dst_rows);
683#endif
684
685    if (y < nz)
686    {
687        __local CT smem[LOCAL_SIZE];
688        __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
689        const int ind = x;
690
691#if defined(COMPLEX_INPUT) && !defined(NO_CONJUGATE)
692        __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset)));
693        #pragma unroll
694        for (int i=0; i<kercn; i++)
695        {
696            smem[x+i*block_size].x =  src[i*block_size].x;
697            smem[x+i*block_size].y = -src[i*block_size].y;
698        }
699#else
700
701    #if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
702        __global const CT* src = (__global const CT*)(src_ptr + mad24(y, src_step, mad24(2, (int)sizeof(FT), src_offset)));
703
704        #pragma unroll
705        for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
706        {
707            smem[i+1].x = src[i].x;
708            smem[i+1].y = -src[i].y;
709            smem[LOCAL_SIZE-i-1] = src[i];
710        }
711    #else
712
713        #pragma unroll
714        for (int i=x; i<(LOCAL_SIZE-1)/2; i+=block_size)
715        {
716            CT src = vload2(0, (__global const FT*)(src_ptr + mad24(y, src_step, mad24(2*i+1, (int)sizeof(FT), src_offset))));
717
718            smem[i+1].x = src.x;
719            smem[i+1].y = -src.y;
720            smem[LOCAL_SIZE-i-1] = src;
721        }
722
723    #endif
724
725        if (x==0)
726        {
727            smem[0].x = *(__global const FT*)(src_ptr + mad24(y, src_step, src_offset));
728            smem[0].y = 0.f;
729
730            if(LOCAL_SIZE % 2 ==0)
731            {
732                #if !defined(REAL_INPUT) && defined(NO_CONJUGATE)
733                smem[LOCAL_SIZE/2].x = src[LOCAL_SIZE/2-1].x;
734                #else
735                smem[LOCAL_SIZE/2].x = *(__global const FT*)(src_ptr + mad24(y, src_step, mad24(LOCAL_SIZE-1, (int)sizeof(FT), src_offset)));
736                #endif
737                smem[LOCAL_SIZE/2].y = 0.f;
738            }
739        }
740#endif
741
742        barrier(CLK_LOCAL_MEM_FENCE);
743
744        RADIX_PROCESS;
745
746        // copy data to dst
747#ifdef COMPLEX_OUTPUT
748        __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset)));
749        #pragma unroll
750        for (int i=0; i<kercn; i++)
751        {
752            dst[i*block_size].x = SCALE_VAL(smem[x + i*block_size].x, scale);
753            dst[i*block_size].y = SCALE_VAL(-smem[x + i*block_size].y, scale);
754        }
755#else
756        __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(FT)), dst_offset)));
757        #pragma unroll
758        for (int i=0; i<kercn; i++)
759        {
760            dst[i*block_size] = SCALE_VAL(smem[x + i*block_size].x, scale);
761        }
762#endif
763    }
764    else
765    {
766        // fill with zero other rows
767#ifdef COMPLEX_OUTPUT
768        __global CT* dst = (__global CT*)(dst_ptr + mad24(y, dst_step, dst_offset));
769#else
770        __global FT* dst = (__global FT*)(dst_ptr + mad24(y, dst_step, dst_offset));
771#endif
772        #pragma unroll
773        for (int i=x; i<dst_cols; i+=block_size)
774            dst[i] = 0.f;
775    }
776}
777
778__kernel void ifft_multi_radix_cols(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols,
779                              __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
780                              __global CT* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz)
781{
782    const int x = get_group_id(0);
783    const int y = get_global_id(1);
784
785#ifdef COMPLEX_INPUT
786    if (x < nz)
787    {
788        __local CT smem[LOCAL_SIZE];
789        __global const uchar* src = src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(CT)), src_offset));
790        __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));
791        __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
792        const int ind = y;
793        const int block_size = LOCAL_SIZE/kercn;
794
795        #pragma unroll
796        for (int i=0; i<kercn; i++)
797        {
798            CT temp = *((__global const CT*)(src + i*block_size*src_step));
799            smem[y+i*block_size].x =  temp.x;
800            smem[y+i*block_size].y =  -temp.y;
801        }
802
803        barrier(CLK_LOCAL_MEM_FENCE);
804
805        RADIX_PROCESS;
806
807        // copy data to dst
808        #pragma unroll
809        for (int i=0; i<kercn; i++)
810        {
811           __global CT* res = (__global CT*)(dst + i*block_size*dst_step);
812            res[0].x = smem[y + i*block_size].x;
813            res[0].y = -smem[y + i*block_size].y;
814        }
815    }
816#else
817    if (x < nz)
818    {
819        __global const CT* twiddles = (__global const CT*)(twiddles_ptr + twiddles_offset);
820        const int ind = y;
821        const int block_size = LOCAL_SIZE/kercn;
822
823        __local CT smem[LOCAL_SIZE];
824#ifdef EVEN
825        if (x!=0 && (x!=(nz-1)))
826#else
827        if (x!=0)
828#endif
829        {
830            __global const uchar* src = src_ptr + mad24(y, src_step, mad24(2*x-1, (int)sizeof(FT), src_offset));
831            #pragma unroll
832            for (int i=0; i<kercn; i++)
833            {
834                CT temp = vload2(0, (__global const FT*)(src + i*block_size*src_step));
835                smem[y+i*block_size].x = temp.x;
836                smem[y+i*block_size].y = -temp.y;
837            }
838        }
839        else
840        {
841            int ind = x==0 ? 0: 2*x-1;
842            __global const FT* src = (__global const FT*)(src_ptr + mad24(1, src_step, mad24(ind, (int)sizeof(FT), src_offset)));
843            int step = src_step/(int)sizeof(FT);
844
845            #pragma unroll
846            for (int i=y; i<(LOCAL_SIZE-1)/2; i+=block_size)
847            {
848                smem[i+1].x = src[2*i*step];
849                smem[i+1].y = -src[(2*i+1)*step];
850
851                smem[LOCAL_SIZE-i-1].x = src[2*i*step];;
852                smem[LOCAL_SIZE-i-1].y = src[(2*i+1)*step];
853            }
854            if (y==0)
855            {
856                smem[0].x = *(__global const FT*)(src_ptr + mad24(ind, (int)sizeof(FT), src_offset));
857                smem[0].y = 0.f;
858
859                if(LOCAL_SIZE % 2 ==0)
860                {
861                    smem[LOCAL_SIZE/2].x = src[(LOCAL_SIZE-2)*step];
862                    smem[LOCAL_SIZE/2].y = 0.f;
863                }
864            }
865        }
866        barrier(CLK_LOCAL_MEM_FENCE);
867
868        RADIX_PROCESS;
869
870        // copy data to dst
871        __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(CT)), dst_offset));
872
873        #pragma unroll
874        for (int i=0; i<kercn; i++)
875        {
876            __global CT* res = (__global CT*)(dst + i*block_size*dst_step);
877            res[0].x =  smem[y + i*block_size].x;
878            res[0].y = -smem[y + i*block_size].y;
879        }
880    }
881#endif
882}