1// This small program computes a Fast Fourier Transform.  It tests
2// Valgrind's handling of FP operations.  It is representative of all
3// programs that do a lot of FP operations.
4
5// Licensing: This program is closely based on the one of the same name from
6// http://www.fourmilab.ch/.  The front page of that site says:
7//
8//   "Except for a few clearly-marked exceptions, all the material on this
9//   site is in the public domain and may be used in any manner without
10//   permission, restriction, attribution, or compensation."
11
12/*
13
14	Two-dimensional FFT benchmark
15
16	Designed and implemented by John Walker in April of 1989.
17
18	This  benchmark executes a specified number of passes (default
19	20) through a loop in which each  iteration  performs  a  fast
20	Fourier transform of a square matrix (default size 256x256) of
21	complex numbers (default precision double),  followed  by  the
22	inverse  transform.   After  all loop iterations are performed
23	the results are checked against known correct values.
24
25	This benchmark is intended for use on C implementations  which
26        define  "int"  as  32 bits or longer and permit allocation and
27	direct addressing of arrays larger than one megabyte.
28
29	If CAPOUT is defined,  the  result  after  all	iterations  is
30	written  as  a	CA  Lab  pattern  file.   This is intended for
31	debugging in case horribly wrong results  are  obtained  on  a
32	given machine.
33
34	Archival  timings  are	run  with the definitions below set as
35	follows: Float = double, Asize = 256, Passes = 20, CAPOUT  not
36	defined.
37
38	Time (seconds)		    System
39
40            2393.93       Sun 3/260, SunOS 3.4, C, "-f68881 -O".
41			  (John Walker).
42
43            1928          Macintosh IIx, MPW C 3.0, "-mc68020
44                          -mc68881 -elems881 -m".  (Hugh Hoover).
45
46            1636.1        Sun 4/110, "cc -O3 -lm".  (Michael McClary).
47			  The suspicion is that this is software
48			  floating point.
49
50            1556.7        Macintosh II, A/UX, "cc -O -lm"
51			  (Michael McClary).
52
53	    1388.8	  Sun 386i/250, SunOS 4.0.1 C
54                          "-O /usr/lib/trig.il".  (James Carrington).
55
56	    1331.93	  Sun 3/60, SunOS 4.0.1, C,
57                          "-O4 -f68881 /usr/lib/libm.il"
58			  (Bob Elman).
59
60            1204.0        Apollo Domain DN4000, C, "-cpu 3000 -opt 4".
61			  (Sam Crupi).
62
63	    1174.66	  Compaq 386/25, SCO Xenix 386 C.
64			  (Peter Shieh).
65
66	    1068	  Compaq 386/25, SCO Xenix 386,
67			  Metaware High C.  (Robert Wenig).
68
69	    1064.0	  Sun 3/80, SunOS 4.0.3 Beta C
70                          "-O3 -f68881 /usr/lib/libm.il".  (James Carrington).
71
72	    1061.4	  Compaq 386/25, SCO Xenix, High C 1.4.
73			  (James Carrington).
74
75	    1059.79	  Compaq 386/25, 387/25, High C 1.4,
76			  DOS|Extender 2.2, 387 inline code
77			  generation.  (Nathan Bender).
78
79	     777.14	  Compaq 386/25, IIT 3C87-25 (387 Compatible),
80			  High C 1.5, DOS|Extender 2.2, 387 inline
81			  code generation.  (Nathan Bender).
82
83	     751	  Compaq DeskPro 386/33, High C 1.5 + DOS|Extender,
84			  387 code generation.	(James Carrington).
85
86	     431.44	  Compaq 386/25, Weitek 3167-25, DOS 3.31,
87			  High C 1.4, DOS|Extender, Weitek code generation.
88			  (Nathan Bender).
89
90	     344.9	  Compaq 486/25, Metaware High C 1.6, Phar Lap
91			  DOS|Extender, in-line floating point.  (Nathan
92			  Bender).
93
94	     324.2	  Data General Motorola 88000, 16 Mhz, Gnu C.
95
96             323.1        Sun 4/280, C, "-O4".  (Eric Hill).
97
98	     254	  Compaq SystemPro 486/33, High C 1.5 + DOS|Extender,
99			  387 code generation.	(James Carrington).
100
101	     242.8	  Silicon Graphics Personal IRIS, MIPS R2000A,
102                          12.5 Mhz, "-O3" (highest level optimisation).
103			  (Mike Zentner).
104
105             233.0        Sun SPARCStation 1, C, "-O4", SunOS 4.0.3.
106			  (Nathan Bender).
107
108	     187.30	  DEC PMAX 3100, MIPS 2000 chip.
109			  (Robert Wenig).
110
111             120.46       Sun SparcStation 2, C, "-O4", SunOS 4.1.1.
112			  (John Walker).
113
114             120.21       DEC 3MAX, MIPS 3000, "-O4".
115
116	      98.0	  Intel i860 experimental environment,
117			  OS/2, data caching disabled.	(Kern
118			  Sibbald).
119
120	      34.9	  Silicon Graphics Indigo�, MIPS R4400,
121                          175 Mhz, IRIX 5.2, "-O".
122
123	      32.4	  Pentium 133, Windows NT, Microsoft Visual
124			  C++ 4.0.
125
126	      17.25	  Silicon Graphics Indigo�, MIPS R4400,
127                          175 Mhz, IRIX 6.5, "-O3".
128
129	      14.10	  Dell Dimension XPS R100, Pentium II 400 MHz,
130			  Windows 98, Microsoft Visual C 5.0.
131
132	      10.7	  Hewlett-Packard Kayak XU 450Mhz Pentium II,
133			  Microsoft Visual C++ 6.0, Windows NT 4.0sp3.	(Nathan Bender).
134
135	       5.09	  Sun Ultra 2, UltraSPARC V9, 300 MHz, gcc -O3.
136
137	       0.846	  Dell Inspiron 9100, Pentium 4, 3.4 GHz, gcc -O3.
138
139*/
140
141#include <stdio.h>
142#include <stdlib.h>
143#include <math.h>
144#include <string.h>
145
146/*  The  program  may  be  run	with  Float defined as either float or
147    double.  With IEEE arithmetic, the same answers are generated  for
148    either floating point mode.  */
149
150#define Float	 double 	   /* Floating point type used in FFT */
151
152#define Asize	 256		   /* Array edge size */
153#define Passes	 20		   /* Number of FFT/Inverse passes */
154
155#define max(a,b) ((a)>(b)?(a):(b))
156#define min(a,b) ((a)<=(b)?(a):(b))
157
158/*
159
160	Multi-dimensional fast Fourier transform
161
162        Adapted from Press et al., "Numerical Recipes in C".
163
164*/
165
166#define SWAP(a,b) tempr=(a); (a)=(b); (b)=tempr
167
168static void fourn(data, nn, ndim, isign)
169  Float data[];
170  int nn[], ndim, isign;
171{
172	register int i1, i2, i3;
173	int i2rev, i3rev, ip1, ip2, ip3, ifp1, ifp2;
174	int ibit, idim, k1, k2, n, nprev, nrem, ntot;
175	Float tempi, tempr;
176	double theta, wi, wpi, wpr, wr, wtemp;
177
178	ntot = 1;
179	for (idim = 1; idim <= ndim; idim++)
180	   ntot *= nn[idim];
181	nprev = 1;
182	for (idim = ndim; idim >= 1; idim--) {
183	   n = nn[idim];
184	   nrem = ntot / (n * nprev);
185	   ip1 = nprev << 1;
186	   ip2 = ip1 * n;
187	   ip3 = ip2 * nrem;
188	   i2rev = 1;
189	   for (i2 = 1; i2 <= ip2; i2 += ip1) {
190	      if (i2 < i2rev) {
191		 for (i1 = i2; i1 <= i2 + ip1 - 2; i1 += 2) {
192		    for (i3 = i1; i3 <= ip3; i3 += ip2) {
193		       i3rev = i2rev + i3 - i2;
194		       SWAP(data[i3], data[i3rev]);
195		       SWAP(data[i3 + 1], data[i3rev + 1]);
196		    }
197		 }
198	      }
199	      ibit = ip2 >> 1;
200	      while (ibit >= ip1 && i2rev > ibit) {
201		 i2rev -= ibit;
202		 ibit >>= 1;
203	      }
204	      i2rev += ibit;
205	   }
206	   ifp1 = ip1;
207	   while (ifp1 < ip2) {
208	      ifp2 = ifp1 << 1;
209	      theta = isign * 6.28318530717959 / (ifp2 / ip1);
210	      wtemp = sin(0.5 * theta);
211	      wpr = -2.0 * wtemp * wtemp;
212	      wpi = sin(theta);
213	      wr = 1.0;
214	      wi = 0.0;
215	      for (i3 = 1; i3 <= ifp1; i3 += ip1) {
216		 for (i1 = i3; i1 <= i3 + ip1 - 2; i1 += 2) {
217		    for (i2 = i1; i2 <= ip3; i2 += ifp2) {
218		       k1 = i2;
219		       k2 = k1 + ifp1;
220		       tempr = wr * data[k2] - wi * data[k2 + 1];
221		       tempi = wr * data[k2 + 1] + wi * data[k2];
222		       data[k2] = data[k1] - tempr;
223		       data[k2 + 1] = data[k1 + 1] - tempi;
224		       data[k1] += tempr;
225		       data[k1 + 1] += tempi;
226		    }
227		 }
228		 wr = (wtemp = wr) * wpr - wi * wpi + wr;
229		 wi = wi * wpr + wtemp * wpi + wi;
230	      }
231	      ifp1 = ifp2;
232	   }
233	   nprev *= n;
234	}
235}
236#undef SWAP
237
238int main()
239{
240	int i, j, k, l, m, npasses = Passes, faedge;
241	Float *fdata /* , *fd */ ;
242	static int nsize[] = {0, 0, 0};
243	long fanum, fasize;
244	double mapbase, mapscale, /* x, */ rmin, rmax, imin, imax;
245
246	faedge = Asize; 	   /* FFT array edge size */
247	fanum = faedge * faedge;   /* Elements in FFT array */
248	fasize = ((fanum + 1) * 2 * sizeof(Float)); /* FFT array size */
249	nsize[1] = nsize[2] = faedge;
250
251	fdata = (Float *) malloc(fasize);
252	if (fdata == NULL) {
253           fprintf(stdout, "Can't allocate data array.\n");
254	   exit(1);
255	}
256
257	/*  Generate data array to process.  */
258
259#define Re(x,y) fdata[1 + (faedge * (x) + (y)) * 2]
260#define Im(x,y) fdata[2 + (faedge * (x) + (y)) * 2]
261
262	memset(fdata, 0, fasize);
263	for (i = 0; i < faedge; i++) {
264	   for (j = 0; j < faedge; j++) {
265	      if (((i & 15) == 8) || ((j & 15) == 8))
266		 Re(i, j) = 128.0;
267	   }
268	}
269
270	for (i = 0; i < npasses; i++) {
271/*printf("Pass %d\n", i);*/
272	   /* Transform image to frequency domain. */
273	   fourn(fdata, nsize, 2, 1);
274
275	   /*  Back-transform to image. */
276	   fourn(fdata, nsize, 2, -1);
277	}
278
279	{
280	   double r, ij, ar, ai;
281	   rmin = 1e10; rmax = -1e10;
282	   imin = 1e10; imax = -1e10;
283	   ar = 0;
284	   ai = 0;
285
286	   for (i = 1; i <= fanum; i += 2) {
287	      r = fdata[i];
288	      ij = fdata[i + 1];
289	      ar += r;
290	      ai += ij;
291	      rmin = min(r, rmin);
292	      rmax = max(r, rmax);
293	      imin = min(ij, imin);
294	      imax = max(ij, imax);
295	   }
296#ifdef DEBUG
297           printf("Real min %.4g, max %.4g.  Imaginary min %.4g, max %.4g.\n",
298	      rmin, rmax, imin, imax);
299           printf("Average real %.4g, imaginary %.4g.\n",
300	      ar / fanum, ai / fanum);
301#endif
302	   mapbase = rmin;
303	   mapscale = 255 / (rmax - rmin);
304	}
305
306	/* See if we got the right answers. */
307
308	m = 0;
309	for (i = 0; i < faedge; i++) {
310	   for (j = 0; j < faedge; j++) {
311	      k = (Re(i, j) - mapbase) * mapscale;
312	      l = (((i & 15) == 8) || ((j & 15) == 8)) ? 255 : 0;
313	      if (k != l) {
314		 m++;
315		 fprintf(stdout,
316                    "Wrong answer at (%d,%d)!  Expected %d, got %d.\n",
317		    i, j, l, k);
318	      }
319	   }
320	}
321	if (m == 0) {
322           fprintf(stdout, "%d passes.  No errors in results.\n", npasses);
323	} else {
324           fprintf(stdout, "%d passes.  %d errors in results.\n",
325	      npasses, m);
326	}
327
328#ifdef CAPOUT
329
330	/* Output the result of the transform as a CA Lab pattern
331	   file for debugging. */
332
333	{
334#define SCRX  322
335#define SCRY  200
336#define SCRN  (SCRX * SCRY)
337	   unsigned char patarr[SCRY][SCRX];
338	   FILE *fp;
339
340/*  Map user external state numbers to internal state index  */
341
342#define UtoI(x)     (((((x) >> 1) & 0x7F) | ((x) << 7)) & 0xFF)
343
344	   /* Copy data from FFT buffer to map. */
345
346	   memset(patarr, 0, sizeof patarr);
347	   l = (SCRX - faedge) / 2;
348	   m = (faedge > SCRY) ? 0 : ((SCRY - faedge) / 2);
349	   for (i = 1; i < faedge; i++) {
350	      for (j = 0; j < min(SCRY, faedge); j++) {
351		 k = (Re(i, j) - mapbase) * mapscale;
352		 patarr[j + m][i + l] = UtoI(k);
353	      }
354	   }
355
356	   /* Dump pattern map to file. */
357
358           fp = fopen("fft.cap", "w");
359	   if (fp == NULL) {
360              fprintf(stdout, "Cannot open output file.\n");
361	      exit(0);
362	   }
363           putc(':', fp);
364	   putc(1, fp);
365	   fwrite(patarr, SCRN, 1, fp);
366	   putc(6, fp);
367	   fclose(fp);
368	}
369#endif
370
371	return 0;
372}
373