1 /*
2 Fast Fourier/Cosine/Sine Transform
3     dimension   :three
4     data length :power of 2
5     decimation  :frequency
6     radix       :split-radix, row-column
7     data        :inplace
8     table       :use
9 functions
10     cdft3d: Complex Discrete Fourier Transform
11     rdft3d: Real Discrete Fourier Transform
12     ddct3d: Discrete Cosine Transform
13     ddst3d: Discrete Sine Transform
14 function prototypes
15     void cdft3d(int, int, int, int, double ***, double *, int *, double *);
16     void rdft3d(int, int, int, int, double ***, double *, int *, double *);
17     void rdft3dsort(int, int, int, int, double ***);
18     void ddct3d(int, int, int, int, double ***, double *, int *, double *);
19     void ddst3d(int, int, int, int, double ***, double *, int *, double *);
20 necessary package
21     fftsg.c  : 1D-FFT package
22 macro definitions
23     USE_FFT3D_PTHREADS : default=not defined
24         FFT3D_MAX_THREADS     : must be 2^N, default=4
25         FFT3D_THREADS_BEGIN_N : default=65536
26     USE_FFT3D_WINTHREADS : default=not defined
27         FFT3D_MAX_THREADS     : must be 2^N, default=4
28         FFT3D_THREADS_BEGIN_N : default=131072
29 
30 
31 -------- Complex DFT (Discrete Fourier Transform) --------
32     [definition]
33         <case1>
34             X[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
35                                 x[j1][j2][j3] *
36                                 exp(2*pi*i*j1*k1/n1) *
37                                 exp(2*pi*i*j2*k2/n2) *
38                                 exp(2*pi*i*j3*k3/n3),
39                                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
40         <case2>
41             X[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
42                                 x[j1][j2][j3] *
43                                 exp(-2*pi*i*j1*k1/n1) *
44                                 exp(-2*pi*i*j2*k2/n2) *
45                                 exp(-2*pi*i*j3*k3/n3),
46                                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
47         (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
48     [usage]
49         <case1>
50             ip[0] = 0; // first time only
51             cdft3d(n1, n2, 2*n3, 1, a, t, ip, w);
52         <case2>
53             ip[0] = 0; // first time only
54             cdft3d(n1, n2, 2*n3, -1, a, t, ip, w);
55     [parameters]
56         n1     :data length (int)
57                 n1 >= 1, n1 = power of 2
58         n2     :data length (int)
59                 n2 >= 1, n2 = power of 2
60         2*n3   :data length (int)
61                 n3 >= 1, n3 = power of 2
62         a[0...n1-1][0...n2-1][0...2*n3-1]
63                :input/output data (double ***)
64                 input data
65                     a[j1][j2][2*j3] = Re(x[j1][j2][j3]),
66                     a[j1][j2][2*j3+1] = Im(x[j1][j2][j3]),
67                     0<=j1<n1, 0<=j2<n2, 0<=j3<n3
68                 output data
69                     a[k1][k2][2*k3] = Re(X[k1][k2][k3]),
70                     a[k1][k2][2*k3+1] = Im(X[k1][k2][k3]),
71                     0<=k1<n1, 0<=k2<n2, 0<=k3<n3
72         t[0...*]
73                :work area (double *)
74                 length of t >= max(8*n1, 8*n2),        if single thread,
75                 length of t >= max(8*n1, 8*n2)*FFT3D_MAX_THREADS,
76                                                        if multi threads,
77                 t is dynamically allocated, if t == NULL.
78         ip[0...*]
79                :work area for bit reversal (int *)
80                 length of ip >= 2+sqrt(n)
81                 (n = max(n1, n2, n3))
82                 ip[0],ip[1] are pointers of the cos/sin table.
83         w[0...*]
84                :cos/sin table (double *)
85                 length of w >= max(n1/2, n2/2, n3/2)
86                 w[],ip[] are initialized if ip[0] == 0.
87     [remark]
88         Inverse of
89             cdft3d(n1, n2, 2*n3, -1, a, t, ip, w);
90         is
91             cdft3d(n1, n2, 2*n3, 1, a, t, ip, w);
92             for (j1 = 0; j1 <= n1 - 1; j1++) {
93                 for (j2 = 0; j2 <= n2 - 1; j2++) {
94                     for (j3 = 0; j3 <= 2 * n3 - 1; j3++) {
95                         a[j1][j2][j3] *= 1.0 / n1 / n2 / n3;
96                     }
97                 }
98             }
99         .
100 
101 
102 -------- Real DFT / Inverse of Real DFT --------
103     [definition]
104         <case1> RDFT
105             R[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
106                                 a[j1][j2][j3] *
107                                 cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
108                                     2*pi*j3*k3/n3),
109                                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
110             I[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
111                                 a[j1][j2][j3] *
112                                 sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
113                                     2*pi*j3*k3/n3),
114                                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
115         <case2> IRDFT (excluding scale)
116             a[k1][k2][k3] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
117                                 (R[j1][j2][j3] *
118                                 cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
119                                     2*pi*j3*k3/n3) +
120                                 I[j1][j2][j3] *
121                                 sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2 +
122                                     2*pi*j3*k3/n3)),
123                                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
124         (notes: R[(n1-k1)%n1][(n2-k2)%n2][(n3-k3)%n3] = R[k1][k2][k3],
125                 I[(n1-k1)%n1][(n2-k2)%n2][(n3-k3)%n3] = -I[k1][k2][k3],
126                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3)
127     [usage]
128         <case1>
129             ip[0] = 0; // first time only
130             rdft3d(n1, n2, n3, 1, a, t, ip, w);
131         <case2>
132             ip[0] = 0; // first time only
133             rdft3d(n1, n2, n3, -1, a, t, ip, w);
134     [parameters]
135         n1     :data length (int)
136                 n1 >= 2, n1 = power of 2
137         n2     :data length (int)
138                 n2 >= 2, n2 = power of 2
139         n3     :data length (int)
140                 n3 >= 2, n3 = power of 2
141         a[0...n1-1][0...n2-1][0...n3-1]
142                :input/output data (double ***)
143                 <case1>
144                     output data
145                         a[k1][k2][2*k3] = R[k1][k2][k3]
146                                         = R[(n1-k1)%n1][(n2-k2)%n2][n3-k3],
147                         a[k1][k2][2*k3+1] = I[k1][k2][k3]
148                                           = -I[(n1-k1)%n1][(n2-k2)%n2][n3-k3],
149                             0<=k1<n1, 0<=k2<n2, 0<k3<n3/2,
150                             (n%m : n mod m),
151                         a[k1][k2][0] = R[k1][k2][0]
152                                      = R[(n1-k1)%n1][n2-k2][0],
153                         a[k1][k2][1] = I[k1][k2][0]
154                                      = -I[(n1-k1)%n1][n2-k2][0],
155                         a[k1][n2-k2][1] = R[(n1-k1)%n1][k2][n3/2]
156                                         = R[k1][n2-k2][n3/2],
157                         a[k1][n2-k2][0] = -I[(n1-k1)%n1][k2][n3/2]
158                                         = I[k1][n2-k2][n3/2],
159                             0<=k1<n1, 0<k2<n2/2,
160                         a[k1][0][0] = R[k1][0][0]
161                                     = R[n1-k1][0][0],
162                         a[k1][0][1] = I[k1][0][0]
163                                     = -I[n1-k1][0][0],
164                         a[k1][n2/2][0] = R[k1][n2/2][0]
165                                        = R[n1-k1][n2/2][0],
166                         a[k1][n2/2][1] = I[k1][n2/2][0]
167                                        = -I[n1-k1][n2/2][0],
168                         a[n1-k1][0][1] = R[k1][0][n3/2]
169                                        = R[n1-k1][0][n3/2],
170                         a[n1-k1][0][0] = -I[k1][0][n3/2]
171                                        = I[n1-k1][0][n3/2],
172                         a[n1-k1][n2/2][1] = R[k1][n2/2][n3/2]
173                                           = R[n1-k1][n2/2][n3/2],
174                         a[n1-k1][n2/2][0] = -I[k1][n2/2][n3/2]
175                                           = I[n1-k1][n2/2][n3/2],
176                             0<k1<n1/2,
177                         a[0][0][0] = R[0][0][0],
178                         a[0][0][1] = R[0][0][n3/2],
179                         a[0][n2/2][0] = R[0][n2/2][0],
180                         a[0][n2/2][1] = R[0][n2/2][n3/2],
181                         a[n1/2][0][0] = R[n1/2][0][0],
182                         a[n1/2][0][1] = R[n1/2][0][n3/2],
183                         a[n1/2][n2/2][0] = R[n1/2][n2/2][0],
184                         a[n1/2][n2/2][1] = R[n1/2][n2/2][n3/2]
185                 <case2>
186                     input data
187                         a[j1][j2][2*j3] = R[j1][j2][j3]
188                                         = R[(n1-j1)%n1][(n2-j2)%n2][n3-j3],
189                         a[j1][j2][2*j3+1] = I[j1][j2][j3]
190                                           = -I[(n1-j1)%n1][(n2-j2)%n2][n3-j3],
191                             0<=j1<n1, 0<=j2<n2, 0<j3<n3/2,
192                         a[j1][j2][0] = R[j1][j2][0]
193                                      = R[(n1-j1)%n1][n2-j2][0],
194                         a[j1][j2][1] = I[j1][j2][0]
195                                      = -I[(n1-j1)%n1][n2-j2][0],
196                         a[j1][n2-j2][1] = R[(n1-j1)%n1][j2][n3/2]
197                                         = R[j1][n2-j2][n3/2],
198                         a[j1][n2-j2][0] = -I[(n1-j1)%n1][j2][n3/2]
199                                         = I[j1][n2-j2][n3/2],
200                             0<=j1<n1, 0<j2<n2/2,
201                         a[j1][0][0] = R[j1][0][0]
202                                     = R[n1-j1][0][0],
203                         a[j1][0][1] = I[j1][0][0]
204                                     = -I[n1-j1][0][0],
205                         a[j1][n2/2][0] = R[j1][n2/2][0]
206                                        = R[n1-j1][n2/2][0],
207                         a[j1][n2/2][1] = I[j1][n2/2][0]
208                                        = -I[n1-j1][n2/2][0],
209                         a[n1-j1][0][1] = R[j1][0][n3/2]
210                                        = R[n1-j1][0][n3/2],
211                         a[n1-j1][0][0] = -I[j1][0][n3/2]
212                                        = I[n1-j1][0][n3/2],
213                         a[n1-j1][n2/2][1] = R[j1][n2/2][n3/2]
214                                           = R[n1-j1][n2/2][n3/2],
215                         a[n1-j1][n2/2][0] = -I[j1][n2/2][n3/2]
216                                           = I[n1-j1][n2/2][n3/2],
217                             0<j1<n1/2,
218                         a[0][0][0] = R[0][0][0],
219                         a[0][0][1] = R[0][0][n3/2],
220                         a[0][n2/2][0] = R[0][n2/2][0],
221                         a[0][n2/2][1] = R[0][n2/2][n3/2],
222                         a[n1/2][0][0] = R[n1/2][0][0],
223                         a[n1/2][0][1] = R[n1/2][0][n3/2],
224                         a[n1/2][n2/2][0] = R[n1/2][n2/2][0],
225                         a[n1/2][n2/2][1] = R[n1/2][n2/2][n3/2]
226                 ---- output ordering ----
227                     rdft3d(n1, n2, n3, 1, a, t, ip, w);
228                     rdft3dsort(n1, n2, n3, 1, a);
229                     // stored data is a[0...n1-1][0...n2-1][0...n3+1]:
230                     // a[k1][k2][2*k3] = R[k1][k2][k3],
231                     // a[k1][k2][2*k3+1] = I[k1][k2][k3],
232                     // 0<=k1<n1, 0<=k2<n2, 0<=k3<=n3/2.
233                     // the stored data is larger than the input data!
234                 ---- input ordering ----
235                     rdft3dsort(n1, n2, n3, -1, a);
236                     rdft3d(n1, n2, n3, -1, a, t, ip, w);
237         t[0...*]
238                :work area (double *)
239                 length of t >= max(8*n1, 8*n2),        if single thread,
240                 length of t >= max(8*n1, 8*n2)*FFT3D_MAX_THREADS,
241                                                        if multi threads,
242                 t is dynamically allocated, if t == NULL.
243         ip[0...*]
244                :work area for bit reversal (int *)
245                 length of ip >= 2+sqrt(n)
246                 (n = max(n1, n2, n3/2))
247                 ip[0],ip[1] are pointers of the cos/sin table.
248         w[0...*]
249                :cos/sin table (double *)
250                 length of w >= max(n1/2, n2/2, n3/4) + n3/4
251                 w[],ip[] are initialized if ip[0] == 0.
252     [remark]
253         Inverse of
254             rdft3d(n1, n2, n3, 1, a, t, ip, w);
255         is
256             rdft3d(n1, n2, n3, -1, a, t, ip, w);
257             for (j1 = 0; j1 <= n1 - 1; j1++) {
258                 for (j2 = 0; j2 <= n2 - 1; j2++) {
259                     for (j3 = 0; j3 <= n3 - 1; j3++) {
260                         a[j1][j2][j3] *= 2.0 / n1 / n2 / n3;
261                     }
262                 }
263             }
264         .
265 
266 
267 -------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
268     [definition]
269         <case1> IDCT (excluding scale)
270             C[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
271                                 a[j1][j2][j3] *
272                                 cos(pi*j1*(k1+1/2)/n1) *
273                                 cos(pi*j2*(k2+1/2)/n2) *
274                                 cos(pi*j3*(k3+1/2)/n3),
275                                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
276         <case2> DCT
277             C[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
278                                 a[j1][j2][j3] *
279                                 cos(pi*(j1+1/2)*k1/n1) *
280                                 cos(pi*(j2+1/2)*k2/n2) *
281                                 cos(pi*(j3+1/2)*k3/n3),
282                                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
283     [usage]
284         <case1>
285             ip[0] = 0; // first time only
286             ddct3d(n1, n2, n3, 1, a, t, ip, w);
287         <case2>
288             ip[0] = 0; // first time only
289             ddct3d(n1, n2, n3, -1, a, t, ip, w);
290     [parameters]
291         n1     :data length (int)
292                 n1 >= 2, n1 = power of 2
293         n2     :data length (int)
294                 n2 >= 2, n2 = power of 2
295         n3     :data length (int)
296                 n3 >= 2, n3 = power of 2
297         a[0...n1-1][0...n2-1][0...n3-1]
298                :input/output data (double ***)
299                 output data
300                     a[k1][k2][k3] = C[k1][k2][k3],
301                         0<=k1<n1, 0<=k2<n2, 0<=k3<n3
302         t[0...*]
303                :work area (double *)
304                 length of t >= max(4*n1, 4*n2),        if single thread,
305                 length of t >= max(4*n1, 4*n2)*FFT3D_MAX_THREADS,
306                                                        if multi threads,
307                 t is dynamically allocated, if t == NULL.
308         ip[0...*]
309                :work area for bit reversal (int *)
310                 length of ip >= 2+sqrt(n)
311                 (n = max(n1/2, n2/2, n3/2))
312                 ip[0],ip[1] are pointers of the cos/sin table.
313         w[0...*]
314                :cos/sin table (double *)
315                 length of w >= max(n1*3/2, n2*3/2, n3*3/2)
316                 w[],ip[] are initialized if ip[0] == 0.
317     [remark]
318         Inverse of
319             ddct3d(n1, n2, n3, -1, a, t, ip, w);
320         is
321             for (j1 = 0; j1 <= n1 - 1; j1++) {
322                 for (j2 = 0; j2 <= n2 - 1; j2++) {
323                     a[j1][j2][0] *= 0.5;
324                 }
325                 for (j3 = 0; j3 <= n3 - 1; j3++) {
326                     a[j1][0][j3] *= 0.5;
327                 }
328             }
329             for (j2 = 0; j2 <= n2 - 1; j2++) {
330                 for (j3 = 0; j3 <= n3 - 1; j3++) {
331                     a[0][j2][j3] *= 0.5;
332                 }
333             }
334             ddct3d(n1, n2, n3, 1, a, t, ip, w);
335             for (j1 = 0; j1 <= n1 - 1; j1++) {
336                 for (j2 = 0; j2 <= n2 - 1; j2++) {
337                     for (j3 = 0; j3 <= n3 - 1; j3++) {
338                         a[j1][j2][j3] *= 8.0 / n1 / n2 / n3;
339                     }
340                 }
341             }
342         .
343 
344 
345 -------- DST (Discrete Sine Transform) / Inverse of DST --------
346     [definition]
347         <case1> IDST (excluding scale)
348             S[k1][k2][k3] = sum_j1=1^n1 sum_j2=1^n2 sum_j3=1^n3
349                                 A[j1][j2][j3] *
350                                 sin(pi*j1*(k1+1/2)/n1) *
351                                 sin(pi*j2*(k2+1/2)/n2) *
352                                 sin(pi*j3*(k3+1/2)/n3),
353                                 0<=k1<n1, 0<=k2<n2, 0<=k3<n3
354         <case2> DST
355             S[k1][k2][k3] = sum_j1=0^n1-1 sum_j2=0^n2-1 sum_j3=0^n3-1
356                                 a[j1][j2][j3] *
357                                 sin(pi*(j1+1/2)*k1/n1) *
358                                 sin(pi*(j2+1/2)*k2/n2) *
359                                 sin(pi*(j3+1/2)*k3/n3),
360                                 0<k1<=n1, 0<k2<=n2, 0<k3<=n3
361     [usage]
362         <case1>
363             ip[0] = 0; // first time only
364             ddst3d(n1, n2, n3, 1, a, t, ip, w);
365         <case2>
366             ip[0] = 0; // first time only
367             ddst3d(n1, n2, n3, -1, a, t, ip, w);
368     [parameters]
369         n1     :data length (int)
370                 n1 >= 2, n1 = power of 2
371         n2     :data length (int)
372                 n2 >= 2, n2 = power of 2
373         n3     :data length (int)
374                 n3 >= 2, n3 = power of 2
375         a[0...n1-1][0...n2-1][0...n3-1]
376                :input/output data (double ***)
377                 <case1>
378                     input data
379                         a[j1%n1][j2%n2][j3%n3] = A[j1][j2][j3],
380                             0<j1<=n1, 0<j2<=n2, 0<j3<=n3,
381                             (n%m : n mod m)
382                     output data
383                         a[k1][k2][k3] = S[k1][k2][k3],
384                             0<=k1<n1, 0<=k2<n2, 0<=k3<n3
385                 <case2>
386                     output data
387                         a[k1%n1][k2%n2][k3%n3] = S[k1][k2][k3],
388                             0<k1<=n1, 0<k2<=n2, 0<k3<=n3
389         t[0...*]
390                :work area (double *)
391                 length of t >= max(4*n1, 4*n2),        if single thread,
392                 length of t >= max(4*n1, 4*n2)*FFT3D_MAX_THREADS,
393                                                        if multi threads,
394                 t is dynamically allocated, if t == NULL.
395         ip[0...*]
396                :work area for bit reversal (int *)
397                 length of ip >= 2+sqrt(n)
398                 (n = max(n1/2, n2/2, n3/2))
399                 ip[0],ip[1] are pointers of the cos/sin table.
400         w[0...*]
401                :cos/sin table (double *)
402                 length of w >= max(n1*3/2, n2*3/2, n3*3/2)
403                 w[],ip[] are initialized if ip[0] == 0.
404     [remark]
405         Inverse of
406             ddst3d(n1, n2, n3, -1, a, t, ip, w);
407         is
408             for (j1 = 0; j1 <= n1 - 1; j1++) {
409                 for (j2 = 0; j2 <= n2 - 1; j2++) {
410                     a[j1][j2][0] *= 0.5;
411                 }
412                 for (j3 = 0; j3 <= n3 - 1; j3++) {
413                     a[j1][0][j3] *= 0.5;
414                 }
415             }
416             for (j2 = 0; j2 <= n2 - 1; j2++) {
417                 for (j3 = 0; j3 <= n3 - 1; j3++) {
418                     a[0][j2][j3] *= 0.5;
419                 }
420             }
421             ddst3d(n1, n2, n3, 1, a, t, ip, w);
422             for (j1 = 0; j1 <= n1 - 1; j1++) {
423                 for (j2 = 0; j2 <= n2 - 1; j2++) {
424                     for (j3 = 0; j3 <= n3 - 1; j3++) {
425                         a[j1][j2][j3] *= 8.0 / n1 / n2 / n3;
426                     }
427                 }
428             }
429         .
430 */
431 
432 
433 #include <stdio.h>
434 #include <stdlib.h>
435 #define fft3d_alloc_error_check(p) { \
436     if ((p) == NULL) { \
437         fprintf(stderr, "fft3d memory allocation error\n"); \
438         exit(1); \
439     } \
440 }
441 
442 
443 #ifdef USE_FFT3D_PTHREADS
444 #define USE_FFT3D_THREADS
445 #ifndef FFT3D_MAX_THREADS
446 #define FFT3D_MAX_THREADS 4
447 #endif
448 #ifndef FFT3D_THREADS_BEGIN_N
449 #define FFT3D_THREADS_BEGIN_N 65536
450 #endif
451 #include <pthread.h>
452 #define fft3d_thread_t pthread_t
453 #define fft3d_thread_create(thp,func,argp) { \
454     if (pthread_create(thp, NULL, func, (void *) (argp)) != 0) { \
455         fprintf(stderr, "fft3d thread error\n"); \
456         exit(1); \
457     } \
458 }
459 #define fft3d_thread_wait(th) { \
460     if (pthread_join(th, NULL) != 0) { \
461         fprintf(stderr, "fft3d thread error\n"); \
462         exit(1); \
463     } \
464 }
465 #endif /* USE_FFT3D_PTHREADS */
466 
467 
468 #ifdef USE_FFT3D_WINTHREADS
469 #define USE_FFT3D_THREADS
470 #ifndef FFT3D_MAX_THREADS
471 #define FFT3D_MAX_THREADS 4
472 #endif
473 #ifndef FFT3D_THREADS_BEGIN_N
474 #define FFT3D_THREADS_BEGIN_N 131072
475 #endif
476 #include <windows.h>
477 #define fft3d_thread_t HANDLE
478 #define fft3d_thread_create(thp,func,argp) { \
479     DWORD thid; \
480     *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) (func), (LPVOID) (argp), 0, &thid); \
481     if (*(thp) == 0) { \
482         fprintf(stderr, "fft3d thread error\n"); \
483         exit(1); \
484     } \
485 }
486 #define fft3d_thread_wait(th) { \
487     WaitForSingleObject(th, INFINITE); \
488     CloseHandle(th); \
489 }
490 #endif /* USE_FFT3D_WINTHREADS */
491 
492 
cdft3d(int n1,int n2,int n3,int isgn,double *** a,double * t,int * ip,double * w)493 void cdft3d(int n1, int n2, int n3, int isgn, double ***a,
494     double *t, int *ip, double *w)
495 {
496     void makewt(int nw, int *ip, double *w);
497     void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn,
498         double ***a, double *t, int *ip, double *w);
499     void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a,
500         double *t, int *ip, double *w);
501 #ifdef USE_FFT3D_THREADS
502     void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn,
503         double ***a, double *t, int *ip, double *w);
504     void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a,
505         double *t, int *ip, double *w);
506 #endif /* USE_FFT3D_THREADS */
507     int n, itnull, nt;
508 
509     n = n1;
510     if (n < n2) {
511         n = n2;
512     }
513     n <<= 1;
514     if (n < n3) {
515         n = n3;
516     }
517     if (n > (ip[0] << 2)) {
518         makewt(n >> 2, ip, w);
519     }
520     itnull = 0;
521     if (t == NULL) {
522         itnull = 1;
523         nt = n1;
524         if (nt < n2) {
525             nt = n2;
526         }
527         nt *= 8;
528 #ifdef USE_FFT3D_THREADS
529         nt *= FFT3D_MAX_THREADS;
530 #endif /* USE_FFT3D_THREADS */
531         if (n3 == 4) {
532             nt >>= 1;
533         } else if (n3 < 4) {
534             nt >>= 2;
535         }
536         t = (double *) malloc(sizeof(double) * nt);
537         fft3d_alloc_error_check(t);
538     }
539 #ifdef USE_FFT3D_THREADS
540     if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
541         xdft3da_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
542         cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
543     } else
544 #endif /* USE_FFT3D_THREADS */
545     {
546         xdft3da_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
547         cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
548     }
549     if (itnull != 0) {
550         free(t);
551     }
552 }
553 
554 
rdft3d(int n1,int n2,int n3,int isgn,double *** a,double * t,int * ip,double * w)555 void rdft3d(int n1, int n2, int n3, int isgn, double ***a,
556     double *t, int *ip, double *w)
557 {
558     void makewt(int nw, int *ip, double *w);
559     void makect(int nc, int *ip, double *c);
560     void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn,
561         double ***a, double *t, int *ip, double *w);
562     void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a,
563         double *t, int *ip, double *w);
564     void rdft3d_sub(int n1, int n2, int n3, int isgn, double ***a);
565 #ifdef USE_FFT3D_THREADS
566     void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn,
567         double ***a, double *t, int *ip, double *w);
568     void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a,
569         double *t, int *ip, double *w);
570 #endif /* USE_FFT3D_THREADS */
571     int n, nw, nc, itnull, nt;
572 
573     n = n1;
574     if (n < n2) {
575         n = n2;
576     }
577     n <<= 1;
578     if (n < n3) {
579         n = n3;
580     }
581     nw = ip[0];
582     if (n > (nw << 2)) {
583         nw = n >> 2;
584         makewt(nw, ip, w);
585     }
586     nc = ip[1];
587     if (n3 > (nc << 2)) {
588         nc = n3 >> 2;
589         makect(nc, ip, w + nw);
590     }
591     itnull = 0;
592     if (t == NULL) {
593         itnull = 1;
594         nt = n1;
595         if (nt < n2) {
596             nt = n2;
597         }
598         nt *= 8;
599 #ifdef USE_FFT3D_THREADS
600         nt *= FFT3D_MAX_THREADS;
601 #endif /* USE_FFT3D_THREADS */
602         if (n3 == 4) {
603             nt >>= 1;
604         } else if (n3 < 4) {
605             nt >>= 2;
606         }
607         t = (double *) malloc(sizeof(double) * nt);
608         fft3d_alloc_error_check(t);
609     }
610 #ifdef USE_FFT3D_THREADS
611     if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
612         if (isgn < 0) {
613             rdft3d_sub(n1, n2, n3, isgn, a);
614             cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
615         }
616         xdft3da_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
617         if (isgn >= 0) {
618             cdft3db_subth(n1, n2, n3, isgn, a, t, ip, w);
619             rdft3d_sub(n1, n2, n3, isgn, a);
620         }
621     } else
622 #endif /* USE_FFT3D_THREADS */
623     {
624         if (isgn < 0) {
625             rdft3d_sub(n1, n2, n3, isgn, a);
626             cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
627         }
628         xdft3da_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
629         if (isgn >= 0) {
630             cdft3db_sub(n1, n2, n3, isgn, a, t, ip, w);
631             rdft3d_sub(n1, n2, n3, isgn, a);
632         }
633     }
634     if (itnull != 0) {
635         free(t);
636     }
637 }
638 
639 
rdft3dsort(int n1,int n2,int n3,int isgn,double *** a)640 void rdft3dsort(int n1, int n2, int n3, int isgn, double ***a)
641 {
642     int n1h, n2h, i, j;
643     double x, y;
644 
645     n1h = n1 >> 1;
646     n2h = n2 >> 1;
647     if (isgn < 0) {
648         for (i = 0; i < n1; i++) {
649             for (j = n2h + 1; j < n2; j++) {
650                 a[i][j][0] = a[i][j][n3 + 1];
651                 a[i][j][1] = a[i][j][n3];
652             }
653         }
654         for (i = n1h + 1; i < n1; i++) {
655             a[i][0][0] = a[i][0][n3 + 1];
656             a[i][0][1] = a[i][0][n3];
657             a[i][n2h][0] = a[i][n2h][n3 + 1];
658             a[i][n2h][1] = a[i][n2h][n3];
659         }
660         a[0][0][1] = a[0][0][n3];
661         a[0][n2h][1] = a[0][n2h][n3];
662         a[n1h][0][1] = a[n1h][0][n3];
663         a[n1h][n2h][1] = a[n1h][n2h][n3];
664     } else {
665         for (j = n2h + 1; j < n2; j++) {
666             y = a[0][j][0];
667             x = a[0][j][1];
668             a[0][j][n3] = x;
669             a[0][j][n3 + 1] = y;
670             a[0][n2 - j][n3] = x;
671             a[0][n2 - j][n3 + 1] = -y;
672             a[0][j][0] = a[0][n2 - j][0];
673             a[0][j][1] = -a[0][n2 - j][1];
674         }
675         for (i = 1; i < n1; i++) {
676             for (j = n2h + 1; j < n2; j++) {
677                 y = a[i][j][0];
678                 x = a[i][j][1];
679                 a[i][j][n3] = x;
680                 a[i][j][n3 + 1] = y;
681                 a[n1 - i][n2 - j][n3] = x;
682                 a[n1 - i][n2 - j][n3 + 1] = -y;
683                 a[i][j][0] = a[n1 - i][n2 - j][0];
684                 a[i][j][1] = -a[n1 - i][n2 - j][1];
685             }
686         }
687         for (i = n1h + 1; i < n1; i++) {
688             y = a[i][0][0];
689             x = a[i][0][1];
690             a[i][0][n3] = x;
691             a[i][0][n3 + 1] = y;
692             a[n1 - i][0][n3] = x;
693             a[n1 - i][0][n3 + 1] = -y;
694             a[i][0][0] = a[n1 - i][0][0];
695             a[i][0][1] = -a[n1 - i][0][1];
696             y = a[i][n2h][0];
697             x = a[i][n2h][1];
698             a[i][n2h][n3] = x;
699             a[i][n2h][n3 + 1] = y;
700             a[n1 - i][n2h][n3] = x;
701             a[n1 - i][n2h][n3 + 1] = -y;
702             a[i][n2h][0] = a[n1 - i][n2h][0];
703             a[i][n2h][1] = -a[n1 - i][n2h][1];
704         }
705         a[0][0][n3] = a[0][0][1];
706         a[0][0][n3 + 1] = 0;
707         a[0][0][1] = 0;
708         a[0][n2h][n3] = a[0][n2h][1];
709         a[0][n2h][n3 + 1] = 0;
710         a[0][n2h][1] = 0;
711         a[n1h][0][n3] = a[n1h][0][1];
712         a[n1h][0][n3 + 1] = 0;
713         a[n1h][0][1] = 0;
714         a[n1h][n2h][n3] = a[n1h][n2h][1];
715         a[n1h][n2h][n3 + 1] = 0;
716         a[n1h][n2h][1] = 0;
717     }
718 }
719 
720 
ddct3d(int n1,int n2,int n3,int isgn,double *** a,double * t,int * ip,double * w)721 void ddct3d(int n1, int n2, int n3, int isgn, double ***a,
722     double *t, int *ip, double *w)
723 {
724     void makewt(int nw, int *ip, double *w);
725     void makect(int nc, int *ip, double *c);
726     void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn,
727         double ***a, double *t, int *ip, double *w);
728     void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn,
729         double ***a, double *t, int *ip, double *w);
730 #ifdef USE_FFT3D_THREADS
731     void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn,
732         double ***a, double *t, int *ip, double *w);
733     void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn,
734         double ***a, double *t, int *ip, double *w);
735 #endif /* USE_FFT3D_THREADS */
736     int n, nw, nc, itnull, nt;
737 
738     n = n1;
739     if (n < n2) {
740         n = n2;
741     }
742     if (n < n3) {
743         n = n3;
744     }
745     nw = ip[0];
746     if (n > (nw << 2)) {
747         nw = n >> 2;
748         makewt(nw, ip, w);
749     }
750     nc = ip[1];
751     if (n > nc) {
752         nc = n;
753         makect(nc, ip, w + nw);
754     }
755     itnull = 0;
756     if (t == NULL) {
757         itnull = 1;
758         nt = n1;
759         if (nt < n2) {
760             nt = n2;
761         }
762         nt *= 4;
763 #ifdef USE_FFT3D_THREADS
764         nt *= FFT3D_MAX_THREADS;
765 #endif /* USE_FFT3D_THREADS */
766         if (n3 == 2) {
767             nt >>= 1;
768         }
769         t = (double *) malloc(sizeof(double) * nt);
770         fft3d_alloc_error_check(t);
771     }
772 #ifdef USE_FFT3D_THREADS
773     if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
774         ddxt3da_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
775         ddxt3db_subth(n1, n2, n3, 0, isgn, a, t, ip, w);
776     } else
777 #endif /* USE_FFT3D_THREADS */
778     {
779         ddxt3da_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
780         ddxt3db_sub(n1, n2, n3, 0, isgn, a, t, ip, w);
781     }
782     if (itnull != 0) {
783         free(t);
784     }
785 }
786 
787 
ddst3d(int n1,int n2,int n3,int isgn,double *** a,double * t,int * ip,double * w)788 void ddst3d(int n1, int n2, int n3, int isgn, double ***a,
789     double *t, int *ip, double *w)
790 {
791     void makewt(int nw, int *ip, double *w);
792     void makect(int nc, int *ip, double *c);
793     void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn,
794         double ***a, double *t, int *ip, double *w);
795     void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn,
796         double ***a, double *t, int *ip, double *w);
797 #ifdef USE_FFT3D_THREADS
798     void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn,
799         double ***a, double *t, int *ip, double *w);
800     void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn,
801         double ***a, double *t, int *ip, double *w);
802 #endif /* USE_FFT3D_THREADS */
803     int n, nw, nc, itnull, nt;
804 
805     n = n1;
806     if (n < n2) {
807         n = n2;
808     }
809     if (n < n3) {
810         n = n3;
811     }
812     nw = ip[0];
813     if (n > (nw << 2)) {
814         nw = n >> 2;
815         makewt(nw, ip, w);
816     }
817     nc = ip[1];
818     if (n > nc) {
819         nc = n;
820         makect(nc, ip, w + nw);
821     }
822     itnull = 0;
823     if (t == NULL) {
824         itnull = 1;
825         nt = n1;
826         if (nt < n2) {
827             nt = n2;
828         }
829         nt *= 4;
830 #ifdef USE_FFT3D_THREADS
831         nt *= FFT3D_MAX_THREADS;
832 #endif /* USE_FFT3D_THREADS */
833         if (n3 == 2) {
834             nt >>= 1;
835         }
836         t = (double *) malloc(sizeof(double) * nt);
837         fft3d_alloc_error_check(t);
838     }
839 #ifdef USE_FFT3D_THREADS
840     if ((double) n1 * n2 * n3 >= (double) FFT3D_THREADS_BEGIN_N) {
841         ddxt3da_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
842         ddxt3db_subth(n1, n2, n3, 1, isgn, a, t, ip, w);
843     } else
844 #endif /* USE_FFT3D_THREADS */
845     {
846         ddxt3da_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
847         ddxt3db_sub(n1, n2, n3, 1, isgn, a, t, ip, w);
848     }
849     if (itnull != 0) {
850         free(t);
851     }
852 }
853 
854 
855 /* -------- child routines -------- */
856 
857 
xdft3da_sub(int n1,int n2,int n3,int icr,int isgn,double *** a,double * t,int * ip,double * w)858 void xdft3da_sub(int n1, int n2, int n3, int icr, int isgn,
859     double ***a, double *t, int *ip, double *w)
860 {
861     void cdft(int n, int isgn, double *a, int *ip, double *w);
862     void rdft(int n, int isgn, double *a, int *ip, double *w);
863     int i, j, k;
864 
865     for (i = 0; i < n1; i++) {
866         if (icr == 0) {
867             for (j = 0; j < n2; j++) {
868                 cdft(n3, isgn, a[i][j], ip, w);
869             }
870         } else if (isgn >= 0) {
871             for (j = 0; j < n2; j++) {
872                 rdft(n3, isgn, a[i][j], ip, w);
873             }
874         }
875         if (n3 > 4) {
876             for (k = 0; k < n3; k += 8) {
877                 for (j = 0; j < n2; j++) {
878                     t[2 * j] = a[i][j][k];
879                     t[2 * j + 1] = a[i][j][k + 1];
880                     t[2 * n2 + 2 * j] = a[i][j][k + 2];
881                     t[2 * n2 + 2 * j + 1] = a[i][j][k + 3];
882                     t[4 * n2 + 2 * j] = a[i][j][k + 4];
883                     t[4 * n2 + 2 * j + 1] = a[i][j][k + 5];
884                     t[6 * n2 + 2 * j] = a[i][j][k + 6];
885                     t[6 * n2 + 2 * j + 1] = a[i][j][k + 7];
886                 }
887                 cdft(2 * n2, isgn, t, ip, w);
888                 cdft(2 * n2, isgn, &t[2 * n2], ip, w);
889                 cdft(2 * n2, isgn, &t[4 * n2], ip, w);
890                 cdft(2 * n2, isgn, &t[6 * n2], ip, w);
891                 for (j = 0; j < n2; j++) {
892                     a[i][j][k] = t[2 * j];
893                     a[i][j][k + 1] = t[2 * j + 1];
894                     a[i][j][k + 2] = t[2 * n2 + 2 * j];
895                     a[i][j][k + 3] = t[2 * n2 + 2 * j + 1];
896                     a[i][j][k + 4] = t[4 * n2 + 2 * j];
897                     a[i][j][k + 5] = t[4 * n2 + 2 * j + 1];
898                     a[i][j][k + 6] = t[6 * n2 + 2 * j];
899                     a[i][j][k + 7] = t[6 * n2 + 2 * j + 1];
900                 }
901             }
902         } else if (n3 == 4) {
903             for (j = 0; j < n2; j++) {
904                 t[2 * j] = a[i][j][0];
905                 t[2 * j + 1] = a[i][j][1];
906                 t[2 * n2 + 2 * j] = a[i][j][2];
907                 t[2 * n2 + 2 * j + 1] = a[i][j][3];
908             }
909             cdft(2 * n2, isgn, t, ip, w);
910             cdft(2 * n2, isgn, &t[2 * n2], ip, w);
911             for (j = 0; j < n2; j++) {
912                 a[i][j][0] = t[2 * j];
913                 a[i][j][1] = t[2 * j + 1];
914                 a[i][j][2] = t[2 * n2 + 2 * j];
915                 a[i][j][3] = t[2 * n2 + 2 * j + 1];
916             }
917         } else if (n3 == 2) {
918             for (j = 0; j < n2; j++) {
919                 t[2 * j] = a[i][j][0];
920                 t[2 * j + 1] = a[i][j][1];
921             }
922             cdft(2 * n2, isgn, t, ip, w);
923             for (j = 0; j < n2; j++) {
924                 a[i][j][0] = t[2 * j];
925                 a[i][j][1] = t[2 * j + 1];
926             }
927         }
928         if (icr != 0 && isgn < 0) {
929             for (j = 0; j < n2; j++) {
930                 rdft(n3, isgn, a[i][j], ip, w);
931             }
932         }
933     }
934 }
935 
936 
cdft3db_sub(int n1,int n2,int n3,int isgn,double *** a,double * t,int * ip,double * w)937 void cdft3db_sub(int n1, int n2, int n3, int isgn, double ***a,
938     double *t, int *ip, double *w)
939 {
940     void cdft(int n, int isgn, double *a, int *ip, double *w);
941     int i, j, k;
942 
943     if (n3 > 4) {
944         for (j = 0; j < n2; j++) {
945             for (k = 0; k < n3; k += 8) {
946                 for (i = 0; i < n1; i++) {
947                     t[2 * i] = a[i][j][k];
948                     t[2 * i + 1] = a[i][j][k + 1];
949                     t[2 * n1 + 2 * i] = a[i][j][k + 2];
950                     t[2 * n1 + 2 * i + 1] = a[i][j][k + 3];
951                     t[4 * n1 + 2 * i] = a[i][j][k + 4];
952                     t[4 * n1 + 2 * i + 1] = a[i][j][k + 5];
953                     t[6 * n1 + 2 * i] = a[i][j][k + 6];
954                     t[6 * n1 + 2 * i + 1] = a[i][j][k + 7];
955                 }
956                 cdft(2 * n1, isgn, t, ip, w);
957                 cdft(2 * n1, isgn, &t[2 * n1], ip, w);
958                 cdft(2 * n1, isgn, &t[4 * n1], ip, w);
959                 cdft(2 * n1, isgn, &t[6 * n1], ip, w);
960                 for (i = 0; i < n1; i++) {
961                     a[i][j][k] = t[2 * i];
962                     a[i][j][k + 1] = t[2 * i + 1];
963                     a[i][j][k + 2] = t[2 * n1 + 2 * i];
964                     a[i][j][k + 3] = t[2 * n1 + 2 * i + 1];
965                     a[i][j][k + 4] = t[4 * n1 + 2 * i];
966                     a[i][j][k + 5] = t[4 * n1 + 2 * i + 1];
967                     a[i][j][k + 6] = t[6 * n1 + 2 * i];
968                     a[i][j][k + 7] = t[6 * n1 + 2 * i + 1];
969                 }
970             }
971         }
972     } else if (n3 == 4) {
973         for (j = 0; j < n2; j++) {
974             for (i = 0; i < n1; i++) {
975                 t[2 * i] = a[i][j][0];
976                 t[2 * i + 1] = a[i][j][1];
977                 t[2 * n1 + 2 * i] = a[i][j][2];
978                 t[2 * n1 + 2 * i + 1] = a[i][j][3];
979             }
980             cdft(2 * n1, isgn, t, ip, w);
981             cdft(2 * n1, isgn, &t[2 * n1], ip, w);
982             for (i = 0; i < n1; i++) {
983                 a[i][j][0] = t[2 * i];
984                 a[i][j][1] = t[2 * i + 1];
985                 a[i][j][2] = t[2 * n1 + 2 * i];
986                 a[i][j][3] = t[2 * n1 + 2 * i + 1];
987             }
988         }
989     } else if (n3 == 2) {
990         for (j = 0; j < n2; j++) {
991             for (i = 0; i < n1; i++) {
992                 t[2 * i] = a[i][j][0];
993                 t[2 * i + 1] = a[i][j][1];
994             }
995             cdft(2 * n1, isgn, t, ip, w);
996             for (i = 0; i < n1; i++) {
997                 a[i][j][0] = t[2 * i];
998                 a[i][j][1] = t[2 * i + 1];
999             }
1000         }
1001     }
1002 }
1003 
1004 
rdft3d_sub(int n1,int n2,int n3,int isgn,double *** a)1005 void rdft3d_sub(int n1, int n2, int n3, int isgn, double ***a)
1006 {
1007     int n1h, n2h, i, j, k, l;
1008     double xi;
1009 
1010     n1h = n1 >> 1;
1011     n2h = n2 >> 1;
1012     if (isgn < 0) {
1013         for (i = 1; i < n1h; i++) {
1014             j = n1 - i;
1015             xi = a[i][0][0] - a[j][0][0];
1016             a[i][0][0] += a[j][0][0];
1017             a[j][0][0] = xi;
1018             xi = a[j][0][1] - a[i][0][1];
1019             a[i][0][1] += a[j][0][1];
1020             a[j][0][1] = xi;
1021             xi = a[i][n2h][0] - a[j][n2h][0];
1022             a[i][n2h][0] += a[j][n2h][0];
1023             a[j][n2h][0] = xi;
1024             xi = a[j][n2h][1] - a[i][n2h][1];
1025             a[i][n2h][1] += a[j][n2h][1];
1026             a[j][n2h][1] = xi;
1027             for (k = 1; k < n2h; k++) {
1028                 l = n2 - k;
1029                 xi = a[i][k][0] - a[j][l][0];
1030                 a[i][k][0] += a[j][l][0];
1031                 a[j][l][0] = xi;
1032                 xi = a[j][l][1] - a[i][k][1];
1033                 a[i][k][1] += a[j][l][1];
1034                 a[j][l][1] = xi;
1035                 xi = a[j][k][0] - a[i][l][0];
1036                 a[j][k][0] += a[i][l][0];
1037                 a[i][l][0] = xi;
1038                 xi = a[i][l][1] - a[j][k][1];
1039                 a[j][k][1] += a[i][l][1];
1040                 a[i][l][1] = xi;
1041             }
1042         }
1043         for (k = 1; k < n2h; k++) {
1044             l = n2 - k;
1045             xi = a[0][k][0] - a[0][l][0];
1046             a[0][k][0] += a[0][l][0];
1047             a[0][l][0] = xi;
1048             xi = a[0][l][1] - a[0][k][1];
1049             a[0][k][1] += a[0][l][1];
1050             a[0][l][1] = xi;
1051             xi = a[n1h][k][0] - a[n1h][l][0];
1052             a[n1h][k][0] += a[n1h][l][0];
1053             a[n1h][l][0] = xi;
1054             xi = a[n1h][l][1] - a[n1h][k][1];
1055             a[n1h][k][1] += a[n1h][l][1];
1056             a[n1h][l][1] = xi;
1057         }
1058     } else {
1059         for (i = 1; i < n1h; i++) {
1060             j = n1 - i;
1061             a[j][0][0] = 0.5 * (a[i][0][0] - a[j][0][0]);
1062             a[i][0][0] -= a[j][0][0];
1063             a[j][0][1] = 0.5 * (a[i][0][1] + a[j][0][1]);
1064             a[i][0][1] -= a[j][0][1];
1065             a[j][n2h][0] = 0.5 * (a[i][n2h][0] - a[j][n2h][0]);
1066             a[i][n2h][0] -= a[j][n2h][0];
1067             a[j][n2h][1] = 0.5 * (a[i][n2h][1] + a[j][n2h][1]);
1068             a[i][n2h][1] -= a[j][n2h][1];
1069             for (k = 1; k < n2h; k++) {
1070                 l = n2 - k;
1071                 a[j][l][0] = 0.5 * (a[i][k][0] - a[j][l][0]);
1072                 a[i][k][0] -= a[j][l][0];
1073                 a[j][l][1] = 0.5 * (a[i][k][1] + a[j][l][1]);
1074                 a[i][k][1] -= a[j][l][1];
1075                 a[i][l][0] = 0.5 * (a[j][k][0] - a[i][l][0]);
1076                 a[j][k][0] -= a[i][l][0];
1077                 a[i][l][1] = 0.5 * (a[j][k][1] + a[i][l][1]);
1078                 a[j][k][1] -= a[i][l][1];
1079             }
1080         }
1081         for (k = 1; k < n2h; k++) {
1082             l = n2 - k;
1083             a[0][l][0] = 0.5 * (a[0][k][0] - a[0][l][0]);
1084             a[0][k][0] -= a[0][l][0];
1085             a[0][l][1] = 0.5 * (a[0][k][1] + a[0][l][1]);
1086             a[0][k][1] -= a[0][l][1];
1087             a[n1h][l][0] = 0.5 * (a[n1h][k][0] - a[n1h][l][0]);
1088             a[n1h][k][0] -= a[n1h][l][0];
1089             a[n1h][l][1] = 0.5 * (a[n1h][k][1] + a[n1h][l][1]);
1090             a[n1h][k][1] -= a[n1h][l][1];
1091         }
1092     }
1093 }
1094 
1095 
ddxt3da_sub(int n1,int n2,int n3,int ics,int isgn,double *** a,double * t,int * ip,double * w)1096 void ddxt3da_sub(int n1, int n2, int n3, int ics, int isgn,
1097     double ***a, double *t, int *ip, double *w)
1098 {
1099     void ddct(int n, int isgn, double *a, int *ip, double *w);
1100     void ddst(int n, int isgn, double *a, int *ip, double *w);
1101     int i, j, k;
1102 
1103     for (i = 0; i < n1; i++) {
1104         if (ics == 0) {
1105             for (j = 0; j < n2; j++) {
1106                 ddct(n3, isgn, a[i][j], ip, w);
1107             }
1108         } else {
1109             for (j = 0; j < n2; j++) {
1110                 ddst(n3, isgn, a[i][j], ip, w);
1111             }
1112         }
1113         if (n3 > 2) {
1114             for (k = 0; k < n3; k += 4) {
1115                 for (j = 0; j < n2; j++) {
1116                     t[j] = a[i][j][k];
1117                     t[n2 + j] = a[i][j][k + 1];
1118                     t[2 * n2 + j] = a[i][j][k + 2];
1119                     t[3 * n2 + j] = a[i][j][k + 3];
1120                 }
1121                 if (ics == 0) {
1122                     ddct(n2, isgn, t, ip, w);
1123                     ddct(n2, isgn, &t[n2], ip, w);
1124                     ddct(n2, isgn, &t[2 * n2], ip, w);
1125                     ddct(n2, isgn, &t[3 * n2], ip, w);
1126                 } else {
1127                     ddst(n2, isgn, t, ip, w);
1128                     ddst(n2, isgn, &t[n2], ip, w);
1129                     ddst(n2, isgn, &t[2 * n2], ip, w);
1130                     ddst(n2, isgn, &t[3 * n2], ip, w);
1131                 }
1132                 for (j = 0; j < n2; j++) {
1133                     a[i][j][k] = t[j];
1134                     a[i][j][k + 1] = t[n2 + j];
1135                     a[i][j][k + 2] = t[2 * n2 + j];
1136                     a[i][j][k + 3] = t[3 * n2 + j];
1137                 }
1138             }
1139         } else if (n3 == 2) {
1140             for (j = 0; j < n2; j++) {
1141                 t[j] = a[i][j][0];
1142                 t[n2 + j] = a[i][j][1];
1143             }
1144             if (ics == 0) {
1145                 ddct(n2, isgn, t, ip, w);
1146                 ddct(n2, isgn, &t[n2], ip, w);
1147             } else {
1148                 ddst(n2, isgn, t, ip, w);
1149                 ddst(n2, isgn, &t[n2], ip, w);
1150             }
1151             for (j = 0; j < n2; j++) {
1152                 a[i][j][0] = t[j];
1153                 a[i][j][1] = t[n2 + j];
1154             }
1155         }
1156     }
1157 }
1158 
1159 
ddxt3db_sub(int n1,int n2,int n3,int ics,int isgn,double *** a,double * t,int * ip,double * w)1160 void ddxt3db_sub(int n1, int n2, int n3, int ics, int isgn,
1161     double ***a, double *t, int *ip, double *w)
1162 {
1163     void ddct(int n, int isgn, double *a, int *ip, double *w);
1164     void ddst(int n, int isgn, double *a, int *ip, double *w);
1165     int i, j, k;
1166 
1167     if (n3 > 2) {
1168         for (j = 0; j < n2; j++) {
1169             for (k = 0; k < n3; k += 4) {
1170                 for (i = 0; i < n1; i++) {
1171                     t[i] = a[i][j][k];
1172                     t[n1 + i] = a[i][j][k + 1];
1173                     t[2 * n1 + i] = a[i][j][k + 2];
1174                     t[3 * n1 + i] = a[i][j][k + 3];
1175                 }
1176                 if (ics == 0) {
1177                     ddct(n1, isgn, t, ip, w);
1178                     ddct(n1, isgn, &t[n1], ip, w);
1179                     ddct(n1, isgn, &t[2 * n1], ip, w);
1180                     ddct(n1, isgn, &t[3 * n1], ip, w);
1181                 } else {
1182                     ddst(n1, isgn, t, ip, w);
1183                     ddst(n1, isgn, &t[n1], ip, w);
1184                     ddst(n1, isgn, &t[2 * n1], ip, w);
1185                     ddst(n1, isgn, &t[3 * n1], ip, w);
1186                 }
1187                 for (i = 0; i < n1; i++) {
1188                     a[i][j][k] = t[i];
1189                     a[i][j][k + 1] = t[n1 + i];
1190                     a[i][j][k + 2] = t[2 * n1 + i];
1191                     a[i][j][k + 3] = t[3 * n1 + i];
1192                 }
1193             }
1194         }
1195     } else if (n3 == 2) {
1196         for (j = 0; j < n2; j++) {
1197             for (i = 0; i < n1; i++) {
1198                 t[i] = a[i][j][0];
1199                 t[n1 + i] = a[i][j][1];
1200             }
1201             if (ics == 0) {
1202                 ddct(n1, isgn, t, ip, w);
1203                 ddct(n1, isgn, &t[n1], ip, w);
1204             } else {
1205                 ddst(n1, isgn, t, ip, w);
1206                 ddst(n1, isgn, &t[n1], ip, w);
1207             }
1208             for (i = 0; i < n1; i++) {
1209                 a[i][j][0] = t[i];
1210                 a[i][j][1] = t[n1 + i];
1211             }
1212         }
1213     }
1214 }
1215 
1216 
1217 #ifdef USE_FFT3D_THREADS
1218 struct fft3d_arg_st {
1219     int nthread;
1220     int n0;
1221     int n1;
1222     int n2;
1223     int n3;
1224     int ic;
1225     int isgn;
1226     double ***a;
1227     double *t;
1228     int *ip;
1229     double *w;
1230 };
1231 typedef struct fft3d_arg_st fft3d_arg_t;
1232 
1233 
xdft3da_subth(int n1,int n2,int n3,int icr,int isgn,double *** a,double * t,int * ip,double * w)1234 void xdft3da_subth(int n1, int n2, int n3, int icr, int isgn,
1235     double ***a, double *t, int *ip, double *w)
1236 {
1237     void *xdft3da_th(void *p);
1238     fft3d_thread_t th[FFT3D_MAX_THREADS];
1239     fft3d_arg_t ag[FFT3D_MAX_THREADS];
1240     int nthread, nt, i;
1241 
1242     nthread = FFT3D_MAX_THREADS;
1243     if (nthread > n1) {
1244         nthread = n1;
1245     }
1246     nt = 8 * n2;
1247     if (n3 == 4) {
1248         nt >>= 1;
1249     } else if (n3 < 4) {
1250         nt >>= 2;
1251     }
1252     for (i = 0; i < nthread; i++) {
1253         ag[i].nthread = nthread;
1254         ag[i].n0 = i;
1255         ag[i].n1 = n1;
1256         ag[i].n2 = n2;
1257         ag[i].n3 = n3;
1258         ag[i].ic = icr;
1259         ag[i].isgn = isgn;
1260         ag[i].a = a;
1261         ag[i].t = &t[nt * i];
1262         ag[i].ip = ip;
1263         ag[i].w = w;
1264         fft3d_thread_create(&th[i], xdft3da_th, &ag[i]);
1265     }
1266     for (i = 0; i < nthread; i++) {
1267         fft3d_thread_wait(th[i]);
1268     }
1269 }
1270 
1271 
cdft3db_subth(int n1,int n2,int n3,int isgn,double *** a,double * t,int * ip,double * w)1272 void cdft3db_subth(int n1, int n2, int n3, int isgn, double ***a,
1273     double *t, int *ip, double *w)
1274 {
1275     void *cdft3db_th(void *p);
1276     fft3d_thread_t th[FFT3D_MAX_THREADS];
1277     fft3d_arg_t ag[FFT3D_MAX_THREADS];
1278     int nthread, nt, i;
1279 
1280     nthread = FFT3D_MAX_THREADS;
1281     if (nthread > n2) {
1282         nthread = n2;
1283     }
1284     nt = 8 * n1;
1285     if (n3 == 4) {
1286         nt >>= 1;
1287     } else if (n3 < 4) {
1288         nt >>= 2;
1289     }
1290     for (i = 0; i < nthread; i++) {
1291         ag[i].nthread = nthread;
1292         ag[i].n0 = i;
1293         ag[i].n1 = n1;
1294         ag[i].n2 = n2;
1295         ag[i].n3 = n3;
1296         ag[i].isgn = isgn;
1297         ag[i].a = a;
1298         ag[i].t = &t[nt * i];
1299         ag[i].ip = ip;
1300         ag[i].w = w;
1301         fft3d_thread_create(&th[i], cdft3db_th, &ag[i]);
1302     }
1303     for (i = 0; i < nthread; i++) {
1304         fft3d_thread_wait(th[i]);
1305     }
1306 }
1307 
1308 
ddxt3da_subth(int n1,int n2,int n3,int ics,int isgn,double *** a,double * t,int * ip,double * w)1309 void ddxt3da_subth(int n1, int n2, int n3, int ics, int isgn,
1310     double ***a, double *t, int *ip, double *w)
1311 {
1312     void *ddxt3da_th(void *p);
1313     fft3d_thread_t th[FFT3D_MAX_THREADS];
1314     fft3d_arg_t ag[FFT3D_MAX_THREADS];
1315     int nthread, nt, i;
1316 
1317     nthread = FFT3D_MAX_THREADS;
1318     if (nthread > n1) {
1319         nthread = n1;
1320     }
1321     nt = 4 * n2;
1322     if (n3 == 2) {
1323         nt >>= 1;
1324     }
1325     for (i = 0; i < nthread; i++) {
1326         ag[i].nthread = nthread;
1327         ag[i].n0 = i;
1328         ag[i].n1 = n1;
1329         ag[i].n2 = n2;
1330         ag[i].n3 = n3;
1331         ag[i].ic = ics;
1332         ag[i].isgn = isgn;
1333         ag[i].a = a;
1334         ag[i].t = &t[nt * i];
1335         ag[i].ip = ip;
1336         ag[i].w = w;
1337         fft3d_thread_create(&th[i], ddxt3da_th, &ag[i]);
1338     }
1339     for (i = 0; i < nthread; i++) {
1340         fft3d_thread_wait(th[i]);
1341     }
1342 }
1343 
1344 
ddxt3db_subth(int n1,int n2,int n3,int ics,int isgn,double *** a,double * t,int * ip,double * w)1345 void ddxt3db_subth(int n1, int n2, int n3, int ics, int isgn,
1346     double ***a, double *t, int *ip, double *w)
1347 {
1348     void *ddxt3db_th(void *p);
1349     fft3d_thread_t th[FFT3D_MAX_THREADS];
1350     fft3d_arg_t ag[FFT3D_MAX_THREADS];
1351     int nthread, nt, i;
1352 
1353     nthread = FFT3D_MAX_THREADS;
1354     if (nthread > n2) {
1355         nthread = n2;
1356     }
1357     nt = 4 * n1;
1358     if (n3 == 2) {
1359         nt >>= 1;
1360     }
1361     for (i = 0; i < nthread; i++) {
1362         ag[i].nthread = nthread;
1363         ag[i].n0 = i;
1364         ag[i].n1 = n1;
1365         ag[i].n2 = n2;
1366         ag[i].n3 = n3;
1367         ag[i].ic = ics;
1368         ag[i].isgn = isgn;
1369         ag[i].a = a;
1370         ag[i].t = &t[nt * i];
1371         ag[i].ip = ip;
1372         ag[i].w = w;
1373         fft3d_thread_create(&th[i], ddxt3db_th, &ag[i]);
1374     }
1375     for (i = 0; i < nthread; i++) {
1376         fft3d_thread_wait(th[i]);
1377     }
1378 }
1379 
1380 
xdft3da_th(void * p)1381 void *xdft3da_th(void *p)
1382 {
1383     void cdft(int n, int isgn, double *a, int *ip, double *w);
1384     void rdft(int n, int isgn, double *a, int *ip, double *w);
1385     int nthread, n0, n1, n2, n3, icr, isgn, *ip, i, j, k;
1386     double ***a, *t, *w;
1387 
1388     nthread = ((fft3d_arg_t *) p)->nthread;
1389     n0 = ((fft3d_arg_t *) p)->n0;
1390     n1 = ((fft3d_arg_t *) p)->n1;
1391     n2 = ((fft3d_arg_t *) p)->n2;
1392     n3 = ((fft3d_arg_t *) p)->n3;
1393     icr = ((fft3d_arg_t *) p)->ic;
1394     isgn = ((fft3d_arg_t *) p)->isgn;
1395     a = ((fft3d_arg_t *) p)->a;
1396     t = ((fft3d_arg_t *) p)->t;
1397     ip = ((fft3d_arg_t *) p)->ip;
1398     w = ((fft3d_arg_t *) p)->w;
1399     for (i = n0; i < n1; i += nthread) {
1400         if (icr == 0) {
1401             for (j = 0; j < n2; j++) {
1402                 cdft(n3, isgn, a[i][j], ip, w);
1403             }
1404         } else if (isgn >= 0) {
1405             for (j = 0; j < n2; j++) {
1406                 rdft(n3, isgn, a[i][j], ip, w);
1407             }
1408         }
1409         if (n3 > 4) {
1410             for (k = 0; k < n3; k += 8) {
1411                 for (j = 0; j < n2; j++) {
1412                     t[2 * j] = a[i][j][k];
1413                     t[2 * j + 1] = a[i][j][k + 1];
1414                     t[2 * n2 + 2 * j] = a[i][j][k + 2];
1415                     t[2 * n2 + 2 * j + 1] = a[i][j][k + 3];
1416                     t[4 * n2 + 2 * j] = a[i][j][k + 4];
1417                     t[4 * n2 + 2 * j + 1] = a[i][j][k + 5];
1418                     t[6 * n2 + 2 * j] = a[i][j][k + 6];
1419                     t[6 * n2 + 2 * j + 1] = a[i][j][k + 7];
1420                 }
1421                 cdft(2 * n2, isgn, t, ip, w);
1422                 cdft(2 * n2, isgn, &t[2 * n2], ip, w);
1423                 cdft(2 * n2, isgn, &t[4 * n2], ip, w);
1424                 cdft(2 * n2, isgn, &t[6 * n2], ip, w);
1425                 for (j = 0; j < n2; j++) {
1426                     a[i][j][k] = t[2 * j];
1427                     a[i][j][k + 1] = t[2 * j + 1];
1428                     a[i][j][k + 2] = t[2 * n2 + 2 * j];
1429                     a[i][j][k + 3] = t[2 * n2 + 2 * j + 1];
1430                     a[i][j][k + 4] = t[4 * n2 + 2 * j];
1431                     a[i][j][k + 5] = t[4 * n2 + 2 * j + 1];
1432                     a[i][j][k + 6] = t[6 * n2 + 2 * j];
1433                     a[i][j][k + 7] = t[6 * n2 + 2 * j + 1];
1434                 }
1435             }
1436         } else if (n3 == 4) {
1437             for (j = 0; j < n2; j++) {
1438                 t[2 * j] = a[i][j][0];
1439                 t[2 * j + 1] = a[i][j][1];
1440                 t[2 * n2 + 2 * j] = a[i][j][2];
1441                 t[2 * n2 + 2 * j + 1] = a[i][j][3];
1442             }
1443             cdft(2 * n2, isgn, t, ip, w);
1444             cdft(2 * n2, isgn, &t[2 * n2], ip, w);
1445             for (j = 0; j < n2; j++) {
1446                 a[i][j][0] = t[2 * j];
1447                 a[i][j][1] = t[2 * j + 1];
1448                 a[i][j][2] = t[2 * n2 + 2 * j];
1449                 a[i][j][3] = t[2 * n2 + 2 * j + 1];
1450             }
1451         } else if (n3 == 2) {
1452             for (j = 0; j < n2; j++) {
1453                 t[2 * j] = a[i][j][0];
1454                 t[2 * j + 1] = a[i][j][1];
1455             }
1456             cdft(2 * n2, isgn, t, ip, w);
1457             for (j = 0; j < n2; j++) {
1458                 a[i][j][0] = t[2 * j];
1459                 a[i][j][1] = t[2 * j + 1];
1460             }
1461         }
1462         if (icr != 0 && isgn < 0) {
1463             for (j = 0; j < n2; j++) {
1464                 rdft(n3, isgn, a[i][j], ip, w);
1465             }
1466         }
1467     }
1468     return (void *) 0;
1469 }
1470 
1471 
cdft3db_th(void * p)1472 void *cdft3db_th(void *p)
1473 {
1474     void cdft(int n, int isgn, double *a, int *ip, double *w);
1475     int nthread, n0, n1, n2, n3, isgn, *ip, i, j, k;
1476     double ***a, *t, *w;
1477 
1478     nthread = ((fft3d_arg_t *) p)->nthread;
1479     n0 = ((fft3d_arg_t *) p)->n0;
1480     n1 = ((fft3d_arg_t *) p)->n1;
1481     n2 = ((fft3d_arg_t *) p)->n2;
1482     n3 = ((fft3d_arg_t *) p)->n3;
1483     isgn = ((fft3d_arg_t *) p)->isgn;
1484     a = ((fft3d_arg_t *) p)->a;
1485     t = ((fft3d_arg_t *) p)->t;
1486     ip = ((fft3d_arg_t *) p)->ip;
1487     w = ((fft3d_arg_t *) p)->w;
1488     if (n3 > 4) {
1489         for (j = n0; j < n2; j += nthread) {
1490             for (k = 0; k < n3; k += 8) {
1491                 for (i = 0; i < n1; i++) {
1492                     t[2 * i] = a[i][j][k];
1493                     t[2 * i + 1] = a[i][j][k + 1];
1494                     t[2 * n1 + 2 * i] = a[i][j][k + 2];
1495                     t[2 * n1 + 2 * i + 1] = a[i][j][k + 3];
1496                     t[4 * n1 + 2 * i] = a[i][j][k + 4];
1497                     t[4 * n1 + 2 * i + 1] = a[i][j][k + 5];
1498                     t[6 * n1 + 2 * i] = a[i][j][k + 6];
1499                     t[6 * n1 + 2 * i + 1] = a[i][j][k + 7];
1500                 }
1501                 cdft(2 * n1, isgn, t, ip, w);
1502                 cdft(2 * n1, isgn, &t[2 * n1], ip, w);
1503                 cdft(2 * n1, isgn, &t[4 * n1], ip, w);
1504                 cdft(2 * n1, isgn, &t[6 * n1], ip, w);
1505                 for (i = 0; i < n1; i++) {
1506                     a[i][j][k] = t[2 * i];
1507                     a[i][j][k + 1] = t[2 * i + 1];
1508                     a[i][j][k + 2] = t[2 * n1 + 2 * i];
1509                     a[i][j][k + 3] = t[2 * n1 + 2 * i + 1];
1510                     a[i][j][k + 4] = t[4 * n1 + 2 * i];
1511                     a[i][j][k + 5] = t[4 * n1 + 2 * i + 1];
1512                     a[i][j][k + 6] = t[6 * n1 + 2 * i];
1513                     a[i][j][k + 7] = t[6 * n1 + 2 * i + 1];
1514                 }
1515             }
1516         }
1517     } else if (n3 == 4) {
1518         for (j = n0; j < n2; j += nthread) {
1519             for (i = 0; i < n1; i++) {
1520                 t[2 * i] = a[i][j][0];
1521                 t[2 * i + 1] = a[i][j][1];
1522                 t[2 * n1 + 2 * i] = a[i][j][2];
1523                 t[2 * n1 + 2 * i + 1] = a[i][j][3];
1524             }
1525             cdft(2 * n1, isgn, t, ip, w);
1526             cdft(2 * n1, isgn, &t[2 * n1], ip, w);
1527             for (i = 0; i < n1; i++) {
1528                 a[i][j][0] = t[2 * i];
1529                 a[i][j][1] = t[2 * i + 1];
1530                 a[i][j][2] = t[2 * n1 + 2 * i];
1531                 a[i][j][3] = t[2 * n1 + 2 * i + 1];
1532             }
1533         }
1534     } else if (n3 == 2) {
1535         for (j = n0; j < n2; j += nthread) {
1536             for (i = 0; i < n1; i++) {
1537                 t[2 * i] = a[i][j][0];
1538                 t[2 * i + 1] = a[i][j][1];
1539             }
1540             cdft(2 * n1, isgn, t, ip, w);
1541             for (i = 0; i < n1; i++) {
1542                 a[i][j][0] = t[2 * i];
1543                 a[i][j][1] = t[2 * i + 1];
1544             }
1545         }
1546     }
1547     return (void *) 0;
1548 }
1549 
1550 
ddxt3da_th(void * p)1551 void *ddxt3da_th(void *p)
1552 {
1553     void ddct(int n, int isgn, double *a, int *ip, double *w);
1554     void ddst(int n, int isgn, double *a, int *ip, double *w);
1555     int nthread, n0, n1, n2, n3, ics, isgn, *ip, i, j, k;
1556     double ***a, *t, *w;
1557 
1558     nthread = ((fft3d_arg_t *) p)->nthread;
1559     n0 = ((fft3d_arg_t *) p)->n0;
1560     n1 = ((fft3d_arg_t *) p)->n1;
1561     n2 = ((fft3d_arg_t *) p)->n2;
1562     n3 = ((fft3d_arg_t *) p)->n3;
1563     ics = ((fft3d_arg_t *) p)->ic;
1564     isgn = ((fft3d_arg_t *) p)->isgn;
1565     a = ((fft3d_arg_t *) p)->a;
1566     t = ((fft3d_arg_t *) p)->t;
1567     ip = ((fft3d_arg_t *) p)->ip;
1568     w = ((fft3d_arg_t *) p)->w;
1569     for (i = n0; i < n1; i += nthread) {
1570         if (ics == 0) {
1571             for (j = 0; j < n2; j++) {
1572                 ddct(n3, isgn, a[i][j], ip, w);
1573             }
1574         } else {
1575             for (j = 0; j < n2; j++) {
1576                 ddst(n3, isgn, a[i][j], ip, w);
1577             }
1578         }
1579         if (n3 > 2) {
1580             for (k = 0; k < n3; k += 4) {
1581                 for (j = 0; j < n2; j++) {
1582                     t[j] = a[i][j][k];
1583                     t[n2 + j] = a[i][j][k + 1];
1584                     t[2 * n2 + j] = a[i][j][k + 2];
1585                     t[3 * n2 + j] = a[i][j][k + 3];
1586                 }
1587                 if (ics == 0) {
1588                     ddct(n2, isgn, t, ip, w);
1589                     ddct(n2, isgn, &t[n2], ip, w);
1590                     ddct(n2, isgn, &t[2 * n2], ip, w);
1591                     ddct(n2, isgn, &t[3 * n2], ip, w);
1592                 } else {
1593                     ddst(n2, isgn, t, ip, w);
1594                     ddst(n2, isgn, &t[n2], ip, w);
1595                     ddst(n2, isgn, &t[2 * n2], ip, w);
1596                     ddst(n2, isgn, &t[3 * n2], ip, w);
1597                 }
1598                 for (j = 0; j < n2; j++) {
1599                     a[i][j][k] = t[j];
1600                     a[i][j][k + 1] = t[n2 + j];
1601                     a[i][j][k + 2] = t[2 * n2 + j];
1602                     a[i][j][k + 3] = t[3 * n2 + j];
1603                 }
1604             }
1605         } else if (n3 == 2) {
1606             for (j = 0; j < n2; j++) {
1607                 t[j] = a[i][j][0];
1608                 t[n2 + j] = a[i][j][1];
1609             }
1610             if (ics == 0) {
1611                 ddct(n2, isgn, t, ip, w);
1612                 ddct(n2, isgn, &t[n2], ip, w);
1613             } else {
1614                 ddst(n2, isgn, t, ip, w);
1615                 ddst(n2, isgn, &t[n2], ip, w);
1616             }
1617             for (j = 0; j < n2; j++) {
1618                 a[i][j][0] = t[j];
1619                 a[i][j][1] = t[n2 + j];
1620             }
1621         }
1622     }
1623     return (void *) 0;
1624 }
1625 
1626 
ddxt3db_th(void * p)1627 void *ddxt3db_th(void *p)
1628 {
1629     void ddct(int n, int isgn, double *a, int *ip, double *w);
1630     void ddst(int n, int isgn, double *a, int *ip, double *w);
1631     int nthread, n0, n1, n2, n3, ics, isgn, *ip, i, j, k;
1632     double ***a, *t, *w;
1633 
1634     nthread = ((fft3d_arg_t *) p)->nthread;
1635     n0 = ((fft3d_arg_t *) p)->n0;
1636     n1 = ((fft3d_arg_t *) p)->n1;
1637     n2 = ((fft3d_arg_t *) p)->n2;
1638     n3 = ((fft3d_arg_t *) p)->n3;
1639     ics = ((fft3d_arg_t *) p)->ic;
1640     isgn = ((fft3d_arg_t *) p)->isgn;
1641     a = ((fft3d_arg_t *) p)->a;
1642     t = ((fft3d_arg_t *) p)->t;
1643     ip = ((fft3d_arg_t *) p)->ip;
1644     w = ((fft3d_arg_t *) p)->w;
1645     if (n3 > 2) {
1646         for (j = n0; j < n2; j += nthread) {
1647             for (k = 0; k < n3; k += 4) {
1648                 for (i = 0; i < n1; i++) {
1649                     t[i] = a[i][j][k];
1650                     t[n1 + i] = a[i][j][k + 1];
1651                     t[2 * n1 + i] = a[i][j][k + 2];
1652                     t[3 * n1 + i] = a[i][j][k + 3];
1653                 }
1654                 if (ics == 0) {
1655                     ddct(n1, isgn, t, ip, w);
1656                     ddct(n1, isgn, &t[n1], ip, w);
1657                     ddct(n1, isgn, &t[2 * n1], ip, w);
1658                     ddct(n1, isgn, &t[3 * n1], ip, w);
1659                 } else {
1660                     ddst(n1, isgn, t, ip, w);
1661                     ddst(n1, isgn, &t[n1], ip, w);
1662                     ddst(n1, isgn, &t[2 * n1], ip, w);
1663                     ddst(n1, isgn, &t[3 * n1], ip, w);
1664                 }
1665                 for (i = 0; i < n1; i++) {
1666                     a[i][j][k] = t[i];
1667                     a[i][j][k + 1] = t[n1 + i];
1668                     a[i][j][k + 2] = t[2 * n1 + i];
1669                     a[i][j][k + 3] = t[3 * n1 + i];
1670                 }
1671             }
1672         }
1673     } else if (n3 == 2) {
1674         for (j = n0; j < n2; j += nthread) {
1675             for (i = 0; i < n1; i++) {
1676                 t[i] = a[i][j][0];
1677                 t[n1 + i] = a[i][j][1];
1678             }
1679             if (ics == 0) {
1680                 ddct(n1, isgn, t, ip, w);
1681                 ddct(n1, isgn, &t[n1], ip, w);
1682             } else {
1683                 ddst(n1, isgn, t, ip, w);
1684                 ddst(n1, isgn, &t[n1], ip, w);
1685             }
1686             for (i = 0; i < n1; i++) {
1687                 a[i][j][0] = t[i];
1688                 a[i][j][1] = t[n1 + i];
1689             }
1690         }
1691     }
1692     return (void *) 0;
1693 }
1694 #endif /* USE_FFT3D_THREADS */
1695 
1696