1 // This small program computes a Fast Fourier Transform.  It tests
2 // Valgrind's handling of FP operations.  It is representative of all
3 // programs that do a lot of FP operations.
4 
5 // Licensing: This program is closely based on the one of the same name from
6 // http://www.fourmilab.ch/.  The front page of that site says:
7 //
8 //   "Except for a few clearly-marked exceptions, all the material on this
9 //   site is in the public domain and may be used in any manner without
10 //   permission, restriction, attribution, or compensation."
11 
12 /*
13 
14 	Two-dimensional FFT benchmark
15 
16 	Designed and implemented by John Walker in April of 1989.
17 
18 	This  benchmark executes a specified number of passes (default
19 	20) through a loop in which each  iteration  performs  a  fast
20 	Fourier transform of a square matrix (default size 256x256) of
21 	complex numbers (default precision double),  followed  by  the
22 	inverse  transform.   After  all loop iterations are performed
23 	the results are checked against known correct values.
24 
25 	This benchmark is intended for use on C implementations  which
26         define  "int"  as  32 bits or longer and permit allocation and
27 	direct addressing of arrays larger than one megabyte.
28 
29 	If CAPOUT is defined,  the  result  after  all	iterations  is
30 	written  as  a	CA  Lab  pattern  file.   This is intended for
31 	debugging in case horribly wrong results  are  obtained  on  a
32 	given machine.
33 
34 	Archival  timings  are	run  with the definitions below set as
35 	follows: Float = double, Asize = 256, Passes = 20, CAPOUT  not
36 	defined.
37 
38 	Time (seconds)		    System
39 
40             2393.93       Sun 3/260, SunOS 3.4, C, "-f68881 -O".
41 			  (John Walker).
42 
43             1928          Macintosh IIx, MPW C 3.0, "-mc68020
44                           -mc68881 -elems881 -m".  (Hugh Hoover).
45 
46             1636.1        Sun 4/110, "cc -O3 -lm".  (Michael McClary).
47 			  The suspicion is that this is software
48 			  floating point.
49 
50             1556.7        Macintosh II, A/UX, "cc -O -lm"
51 			  (Michael McClary).
52 
53 	    1388.8	  Sun 386i/250, SunOS 4.0.1 C
54                           "-O /usr/lib/trig.il".  (James Carrington).
55 
56 	    1331.93	  Sun 3/60, SunOS 4.0.1, C,
57                           "-O4 -f68881 /usr/lib/libm.il"
58 			  (Bob Elman).
59 
60             1204.0        Apollo Domain DN4000, C, "-cpu 3000 -opt 4".
61 			  (Sam Crupi).
62 
63 	    1174.66	  Compaq 386/25, SCO Xenix 386 C.
64 			  (Peter Shieh).
65 
66 	    1068	  Compaq 386/25, SCO Xenix 386,
67 			  Metaware High C.  (Robert Wenig).
68 
69 	    1064.0	  Sun 3/80, SunOS 4.0.3 Beta C
70                           "-O3 -f68881 /usr/lib/libm.il".  (James Carrington).
71 
72 	    1061.4	  Compaq 386/25, SCO Xenix, High C 1.4.
73 			  (James Carrington).
74 
75 	    1059.79	  Compaq 386/25, 387/25, High C 1.4,
76 			  DOS|Extender 2.2, 387 inline code
77 			  generation.  (Nathan Bender).
78 
79 	     777.14	  Compaq 386/25, IIT 3C87-25 (387 Compatible),
80 			  High C 1.5, DOS|Extender 2.2, 387 inline
81 			  code generation.  (Nathan Bender).
82 
83 	     751	  Compaq DeskPro 386/33, High C 1.5 + DOS|Extender,
84 			  387 code generation.	(James Carrington).
85 
86 	     431.44	  Compaq 386/25, Weitek 3167-25, DOS 3.31,
87 			  High C 1.4, DOS|Extender, Weitek code generation.
88 			  (Nathan Bender).
89 
90 	     344.9	  Compaq 486/25, Metaware High C 1.6, Phar Lap
91 			  DOS|Extender, in-line floating point.  (Nathan
92 			  Bender).
93 
94 	     324.2	  Data General Motorola 88000, 16 Mhz, Gnu C.
95 
96              323.1        Sun 4/280, C, "-O4".  (Eric Hill).
97 
98 	     254	  Compaq SystemPro 486/33, High C 1.5 + DOS|Extender,
99 			  387 code generation.	(James Carrington).
100 
101 	     242.8	  Silicon Graphics Personal IRIS, MIPS R2000A,
102                           12.5 Mhz, "-O3" (highest level optimisation).
103 			  (Mike Zentner).
104 
105              233.0        Sun SPARCStation 1, C, "-O4", SunOS 4.0.3.
106 			  (Nathan Bender).
107 
108 	     187.30	  DEC PMAX 3100, MIPS 2000 chip.
109 			  (Robert Wenig).
110 
111              120.46       Sun SparcStation 2, C, "-O4", SunOS 4.1.1.
112 			  (John Walker).
113 
114              120.21       DEC 3MAX, MIPS 3000, "-O4".
115 
116 	      98.0	  Intel i860 experimental environment,
117 			  OS/2, data caching disabled.	(Kern
118 			  Sibbald).
119 
120 	      34.9	  Silicon Graphics Indigo�, MIPS R4400,
121                           175 Mhz, IRIX 5.2, "-O".
122 
123 	      32.4	  Pentium 133, Windows NT, Microsoft Visual
124 			  C++ 4.0.
125 
126 	      17.25	  Silicon Graphics Indigo�, MIPS R4400,
127                           175 Mhz, IRIX 6.5, "-O3".
128 
129 	      14.10	  Dell Dimension XPS R100, Pentium II 400 MHz,
130 			  Windows 98, Microsoft Visual C 5.0.
131 
132 	      10.7	  Hewlett-Packard Kayak XU 450Mhz Pentium II,
133 			  Microsoft Visual C++ 6.0, Windows NT 4.0sp3.	(Nathan Bender).
134 
135 	       5.09	  Sun Ultra 2, UltraSPARC V9, 300 MHz, gcc -O3.
136 
137 	       0.846	  Dell Inspiron 9100, Pentium 4, 3.4 GHz, gcc -O3.
138 
139 */
140 
141 #include <stdio.h>
142 #include <stdlib.h>
143 #include <math.h>
144 #include <string.h>
145 
146 /*  The  program  may  be  run	with  Float defined as either float or
147     double.  With IEEE arithmetic, the same answers are generated  for
148     either floating point mode.  */
149 
150 #define Float	 double 	   /* Floating point type used in FFT */
151 
152 #define Asize	 256		   /* Array edge size */
153 #define Passes	 20		   /* Number of FFT/Inverse passes */
154 
155 #define max(a,b) ((a)>(b)?(a):(b))
156 #define min(a,b) ((a)<=(b)?(a):(b))
157 
158 /*
159 
160 	Multi-dimensional fast Fourier transform
161 
162         Adapted from Press et al., "Numerical Recipes in C".
163 
164 */
165 
166 #define SWAP(a,b) tempr=(a); (a)=(b); (b)=tempr
167 
fourn(data,nn,ndim,isign)168 static void fourn(data, nn, ndim, isign)
169   Float data[];
170   int nn[], ndim, isign;
171 {
172 	register int i1, i2, i3;
173 	int i2rev, i3rev, ip1, ip2, ip3, ifp1, ifp2;
174 	int ibit, idim, k1, k2, n, nprev, nrem, ntot;
175 	Float tempi, tempr;
176 	double theta, wi, wpi, wpr, wr, wtemp;
177 
178 	ntot = 1;
179 	for (idim = 1; idim <= ndim; idim++)
180 	   ntot *= nn[idim];
181 	nprev = 1;
182 	for (idim = ndim; idim >= 1; idim--) {
183 	   n = nn[idim];
184 	   nrem = ntot / (n * nprev);
185 	   ip1 = nprev << 1;
186 	   ip2 = ip1 * n;
187 	   ip3 = ip2 * nrem;
188 	   i2rev = 1;
189 	   for (i2 = 1; i2 <= ip2; i2 += ip1) {
190 	      if (i2 < i2rev) {
191 		 for (i1 = i2; i1 <= i2 + ip1 - 2; i1 += 2) {
192 		    for (i3 = i1; i3 <= ip3; i3 += ip2) {
193 		       i3rev = i2rev + i3 - i2;
194 		       SWAP(data[i3], data[i3rev]);
195 		       SWAP(data[i3 + 1], data[i3rev + 1]);
196 		    }
197 		 }
198 	      }
199 	      ibit = ip2 >> 1;
200 	      while (ibit >= ip1 && i2rev > ibit) {
201 		 i2rev -= ibit;
202 		 ibit >>= 1;
203 	      }
204 	      i2rev += ibit;
205 	   }
206 	   ifp1 = ip1;
207 	   while (ifp1 < ip2) {
208 	      ifp2 = ifp1 << 1;
209 	      theta = isign * 6.28318530717959 / (ifp2 / ip1);
210 	      wtemp = sin(0.5 * theta);
211 	      wpr = -2.0 * wtemp * wtemp;
212 	      wpi = sin(theta);
213 	      wr = 1.0;
214 	      wi = 0.0;
215 	      for (i3 = 1; i3 <= ifp1; i3 += ip1) {
216 		 for (i1 = i3; i1 <= i3 + ip1 - 2; i1 += 2) {
217 		    for (i2 = i1; i2 <= ip3; i2 += ifp2) {
218 		       k1 = i2;
219 		       k2 = k1 + ifp1;
220 		       tempr = wr * data[k2] - wi * data[k2 + 1];
221 		       tempi = wr * data[k2 + 1] + wi * data[k2];
222 		       data[k2] = data[k1] - tempr;
223 		       data[k2 + 1] = data[k1 + 1] - tempi;
224 		       data[k1] += tempr;
225 		       data[k1 + 1] += tempi;
226 		    }
227 		 }
228 		 wr = (wtemp = wr) * wpr - wi * wpi + wr;
229 		 wi = wi * wpr + wtemp * wpi + wi;
230 	      }
231 	      ifp1 = ifp2;
232 	   }
233 	   nprev *= n;
234 	}
235 }
236 #undef SWAP
237 
main()238 int main()
239 {
240 	int i, j, k, l, m, npasses = Passes, faedge;
241 	Float *fdata /* , *fd */ ;
242 	static int nsize[] = {0, 0, 0};
243 	long fanum, fasize;
244 	double mapbase, mapscale, /* x, */ rmin, rmax, imin, imax;
245 
246 	faedge = Asize; 	   /* FFT array edge size */
247 	fanum = faedge * faedge;   /* Elements in FFT array */
248 	fasize = ((fanum + 1) * 2 * sizeof(Float)); /* FFT array size */
249 	nsize[1] = nsize[2] = faedge;
250 
251 	fdata = (Float *) malloc(fasize);
252 	if (fdata == NULL) {
253            fprintf(stdout, "Can't allocate data array.\n");
254 	   exit(1);
255 	}
256 
257 	/*  Generate data array to process.  */
258 
259 #define Re(x,y) fdata[1 + (faedge * (x) + (y)) * 2]
260 #define Im(x,y) fdata[2 + (faedge * (x) + (y)) * 2]
261 
262 	memset(fdata, 0, fasize);
263 	for (i = 0; i < faedge; i++) {
264 	   for (j = 0; j < faedge; j++) {
265 	      if (((i & 15) == 8) || ((j & 15) == 8))
266 		 Re(i, j) = 128.0;
267 	   }
268 	}
269 
270 	for (i = 0; i < npasses; i++) {
271 /*printf("Pass %d\n", i);*/
272 	   /* Transform image to frequency domain. */
273 	   fourn(fdata, nsize, 2, 1);
274 
275 	   /*  Back-transform to image. */
276 	   fourn(fdata, nsize, 2, -1);
277 	}
278 
279 	{
280 	   double r, ij, ar, ai;
281 	   rmin = 1e10; rmax = -1e10;
282 	   imin = 1e10; imax = -1e10;
283 	   ar = 0;
284 	   ai = 0;
285 
286 	   for (i = 1; i <= fanum; i += 2) {
287 	      r = fdata[i];
288 	      ij = fdata[i + 1];
289 	      ar += r;
290 	      ai += ij;
291 	      rmin = min(r, rmin);
292 	      rmax = max(r, rmax);
293 	      imin = min(ij, imin);
294 	      imax = max(ij, imax);
295 	   }
296 #ifdef DEBUG
297            printf("Real min %.4g, max %.4g.  Imaginary min %.4g, max %.4g.\n",
298 	      rmin, rmax, imin, imax);
299            printf("Average real %.4g, imaginary %.4g.\n",
300 	      ar / fanum, ai / fanum);
301 #endif
302 	   mapbase = rmin;
303 	   mapscale = 255 / (rmax - rmin);
304 	}
305 
306 	/* See if we got the right answers. */
307 
308 	m = 0;
309 	for (i = 0; i < faedge; i++) {
310 	   for (j = 0; j < faedge; j++) {
311 	      k = (Re(i, j) - mapbase) * mapscale;
312 	      l = (((i & 15) == 8) || ((j & 15) == 8)) ? 255 : 0;
313 	      if (k != l) {
314 		 m++;
315 		 fprintf(stdout,
316                     "Wrong answer at (%d,%d)!  Expected %d, got %d.\n",
317 		    i, j, l, k);
318 	      }
319 	   }
320 	}
321 	if (m == 0) {
322            fprintf(stdout, "%d passes.  No errors in results.\n", npasses);
323 	} else {
324            fprintf(stdout, "%d passes.  %d errors in results.\n",
325 	      npasses, m);
326 	}
327 
328 #ifdef CAPOUT
329 
330 	/* Output the result of the transform as a CA Lab pattern
331 	   file for debugging. */
332 
333 	{
334 #define SCRX  322
335 #define SCRY  200
336 #define SCRN  (SCRX * SCRY)
337 	   unsigned char patarr[SCRY][SCRX];
338 	   FILE *fp;
339 
340 /*  Map user external state numbers to internal state index  */
341 
342 #define UtoI(x)     (((((x) >> 1) & 0x7F) | ((x) << 7)) & 0xFF)
343 
344 	   /* Copy data from FFT buffer to map. */
345 
346 	   memset(patarr, 0, sizeof patarr);
347 	   l = (SCRX - faedge) / 2;
348 	   m = (faedge > SCRY) ? 0 : ((SCRY - faedge) / 2);
349 	   for (i = 1; i < faedge; i++) {
350 	      for (j = 0; j < min(SCRY, faedge); j++) {
351 		 k = (Re(i, j) - mapbase) * mapscale;
352 		 patarr[j + m][i + l] = UtoI(k);
353 	      }
354 	   }
355 
356 	   /* Dump pattern map to file. */
357 
358            fp = fopen("fft.cap", "w");
359 	   if (fp == NULL) {
360               fprintf(stdout, "Cannot open output file.\n");
361 	      exit(0);
362 	   }
363            putc(':', fp);
364 	   putc(1, fp);
365 	   fwrite(patarr, SCRN, 1, fp);
366 	   putc(6, fp);
367 	   fclose(fp);
368 	}
369 #endif
370 
371 	return 0;
372 }
373