1#include <stdio.h>
2#include <string.h>
3
4#define N 64
5struct float_test {
6   float x[N], y[N], z[N], expected[N], res[N];
7} ft __attribute__((aligned (32)));
8
9struct double_test {
10   double x[N], y[N], z[N], expected[N], res[N];
11} dt __attribute__((aligned (32)));
12
/*
 * Special single-precision values with external linkage.
 * NOTE(review): neither their initialisation nor their use is visible in
 * this part of the file; judging by the names they hold +0, +inf, -inf and
 * a NaN -- confirm where they are assigned.
 */
float plus_zero;
float plus_infty;
float minus_infty;
float nan_value;
14
/*
 * Bit-level comparison of a computed float against a reference value.
 *
 *   x  value under test (callers pass the instruction result)
 *   y  reference value (callers pass the expected result)
 *
 * Returns 0 when the two are considered equal and 1 otherwise.
 *
 * When x has every bit of 0x7fc00000 set (on an IEEE-754 binary32 float
 * that is an all-ones exponent plus the top mantissa bit), y only has to
 * carry those same bits; sign and the remaining mantissa bits are ignored.
 * In every other case the two bit patterns must be identical, so +0.0 and
 * -0.0 are reported as different.
 *
 * Assumes float and unsigned int have the same size.
 */
static int testf( float x, float y )
{
   const unsigned int nan_bits = 0x7fc00000U;
   unsigned int got, want;

   /* Type-pun through memcpy to look at the raw encodings. */
   memcpy( &got, &x, sizeof (got) );
   memcpy( &want, &y, sizeof (want) );

   /* x does not carry the NaN pattern: demand an exact bit-for-bit match. */
   if ((got & nan_bits) != nan_bits)
      return got != want;

   /* x carries the NaN pattern: y merely has to carry it as well. */
   return (want & nan_bits) != nan_bits;
}
24
25static int test_fmaf( void )
26{
27   int res = 0, i, j;
28   float w;
29   for (i = 0; i < N; i++) {
30      int thisres = 0;
31      __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
32      thisres |= testf( w, ft.expected[i] );
33      __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
34      thisres |= testf( w, ft.expected[i] );
35      __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
36      thisres |= testf( w, ft.expected[i] );
37      __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
38      thisres |= testf( w, ft.expected[i] );
39      __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
40      thisres |= testf( w, ft.expected[i] );
41      __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
42      thisres |= testf( w, ft.expected[i] );
43      if (thisres)
44         printf( "Failure 1 %d %a %a\n", i, w, ft.expected[i] );
45      res |= thisres;
46      thisres = 0;
47      __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
48      thisres |= testf( -w, ft.expected[i] );
49      __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
50      thisres |= testf( -w, ft.expected[i] );
51      __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
52      thisres |= testf( -w, ft.expected[i] );
53      __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
54      thisres |= testf( -w, ft.expected[i] );
55      __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
56      thisres |= testf( -w, ft.expected[i] );
57      __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
58      thisres |= testf( -w, ft.expected[i] );
59      if (thisres)
60         printf( "Failure 2 %d %a %a\n", i, w, ft.expected[i] );
61      res |= thisres;
62   }
63   for (i = 0; i < N; i++)
64      ft.z[i] = -ft.z[i];
65   for (i = 0; i < N; i++) {
66      int thisres = 0;
67      __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
68      thisres |= testf( w, ft.expected[i] );
69      __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
70      thisres |= testf( w, ft.expected[i] );
71      __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
72      thisres |= testf( w, ft.expected[i] );
73      __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
74      thisres |= testf( w, ft.expected[i] );
75      __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
76      thisres |= testf( w, ft.expected[i] );
77      __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
78      thisres |= testf( w, ft.expected[i] );
79      if (thisres)
80         printf( "Failure 3 %d %a %a\n", i, w, ft.expected[i] );
81      res |= thisres;
82      thisres = 0;
83      __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
84      thisres |= testf( -w, ft.expected[i] );
85      __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
86      thisres |= testf( -w, ft.expected[i] );
87      __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
88      thisres |= testf( -w, ft.expected[i] );
89      __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
90      thisres |= testf( -w, ft.expected[i] );
91      __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
92      thisres |= testf( -w, ft.expected[i] );
93      __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
94      thisres |= testf( -w, ft.expected[i] );
95      if (thisres)
96         printf( "Failure 4 %d %a %a\n", i, w, ft.expected[i] );
97      res |= thisres;
98   }
99   for (i = 0; i < N; i++)
100      ft.z[i] = -ft.z[i];
101   for (i = 0; i < N; i += 4) {
102      int thisres = 0;
103      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
104                          "vfmadd132ps %%xmm7, %%xmm8, %%xmm9;"
105                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
106                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
107      for (j = 0; j < 4; j++)
108         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
109      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
110                          "vfmadd132ps (%2), %%xmm8, %%xmm9;"
111                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
112                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
113      for (j = 0; j < 4; j++)
114         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
115      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
116                          "vfmadd213ps %%xmm7, %%xmm8, %%xmm9;"
117                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
118                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
119      for (j = 0; j < 4; j++)
120         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
121      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
122                          "vfmadd213ps (%3), %%xmm8, %%xmm9;"
123                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
124                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
125      for (j = 0; j < 4; j++)
126         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
127      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
128                          "vfmadd231ps %%xmm7, %%xmm8, %%xmm9;"
129                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
130                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
131      for (j = 0; j < 4; j++)
132         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
133      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
134                          "vfmadd231ps (%2), %%xmm8, %%xmm9;"
135                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
136                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
137      for (j = 0; j < 4; j++)
138         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
139      if (thisres) {
140         printf( "Failure 5 %d", i );
141         for (j = 0; j < 4; j++)
142            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
143         printf( "\n" );
144      }
145      res |= thisres;
146      thisres = 0;
147      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
148                          "vfnmsub132ps %%xmm7, %%xmm8, %%xmm9;"
149                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
150                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
151      for (j = 0; j < 4; j++)
152         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
153      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
154                          "vfnmsub132ps (%2), %%xmm8, %%xmm9;"
155                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
156                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
157      for (j = 0; j < 4; j++)
158         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
159      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
160                          "vfnmsub213ps %%xmm7, %%xmm8, %%xmm9;"
161                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
162                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
163      for (j = 0; j < 4; j++)
164         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
165      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
166                          "vfnmsub213ps (%3), %%xmm8, %%xmm9;"
167                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
168                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
169      for (j = 0; j < 4; j++)
170         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
171      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
172                          "vfnmsub231ps %%xmm7, %%xmm8, %%xmm9;"
173                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
174                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
175      for (j = 0; j < 4; j++)
176         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
177      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
178                          "vfnmsub231ps (%2), %%xmm8, %%xmm9;"
179                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
180                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
181      for (j = 0; j < 4; j++)
182         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
183      if (thisres) {
184         printf( "Failure 6 %d", i );
185         for (j = 0; j < 4; j++)
186            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
187         printf( "\n" );
188      }
189      res |= thisres;
190   }
191   for (i = 0; i < N; i++)
192      ft.z[i] = -ft.z[i];
193   for (i = 0; i < N; i += 4) {
194      int thisres = 0;
195      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
196                          "vfmsub132ps %%xmm7, %%xmm8, %%xmm9;"
197                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
198                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
199      for (j = 0; j < 4; j++)
200         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
201      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
202                          "vfmsub132ps (%2), %%xmm8, %%xmm9;"
203                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
204                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
205      for (j = 0; j < 4; j++)
206         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
207      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
208                          "vfmsub213ps %%xmm7, %%xmm8, %%xmm9;"
209                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
210                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
211      for (j = 0; j < 4; j++)
212         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
213      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
214                          "vfmsub213ps (%3), %%xmm8, %%xmm9;"
215                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
216                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
217      for (j = 0; j < 4; j++)
218         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
219      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
220                          "vfmsub231ps %%xmm7, %%xmm8, %%xmm9;"
221                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
222                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
223      for (j = 0; j < 4; j++)
224         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
225      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
226                          "vfmsub231ps (%2), %%xmm8, %%xmm9;"
227                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
228                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
229      for (j = 0; j < 4; j++)
230         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
231      if (thisres) {
232         printf( "Failure 7 %d", i );
233         for (j = 0; j < 4; j++)
234            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
235         printf( "\n" );
236      }
237      res |= thisres;
238      thisres = 0;
239      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
240                          "vfnmadd132ps %%xmm7, %%xmm8, %%xmm9;"
241                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
242                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
243      for (j = 0; j < 4; j++)
244         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
245      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
246                          "vfnmadd132ps (%2), %%xmm8, %%xmm9;"
247                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
248                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
249      for (j = 0; j < 4; j++)
250         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
251      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
252                          "vfnmadd213ps %%xmm7, %%xmm8, %%xmm9;"
253                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
254                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
255      for (j = 0; j < 4; j++)
256         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
257      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
258                          "vfnmadd213ps (%3), %%xmm8, %%xmm9;"
259                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
260                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
261      for (j = 0; j < 4; j++)
262         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
263      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
264                          "vfnmadd231ps %%xmm7, %%xmm8, %%xmm9;"
265                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
266                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
267      for (j = 0; j < 4; j++)
268         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
269      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
270                          "vfnmadd231ps (%2), %%xmm8, %%xmm9;"
271                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
272                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
273      for (j = 0; j < 4; j++)
274         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
275      if (thisres) {
276         printf( "Failure 8 %d", i );
277         for (j = 0; j < 4; j++)
278            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
279         printf( "\n" );
280      }
281      res |= thisres;
282   }
283   for (i = 1; i < N; i += 2)
284      ft.z[i] = -ft.z[i];
285   for (i = 0; i < N; i += 4) {
286      int thisres = 0;
287      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
288                          "vfmaddsub132ps %%xmm7, %%xmm8, %%xmm9;"
289                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
290                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
291      for (j = 0; j < 4; j++)
292         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
293      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
294                          "vfmaddsub132ps (%2), %%xmm8, %%xmm9;"
295                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
296                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
297      for (j = 0; j < 4; j++)
298         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
299      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
300                          "vfmaddsub213ps %%xmm7, %%xmm8, %%xmm9;"
301                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
302                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
303      for (j = 0; j < 4; j++)
304         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
305      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
306                          "vfmaddsub213ps (%3), %%xmm8, %%xmm9;"
307                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
308                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
309      for (j = 0; j < 4; j++)
310         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
311      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
312                          "vfmaddsub231ps %%xmm7, %%xmm8, %%xmm9;"
313                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
314                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
315      for (j = 0; j < 4; j++)
316         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
317      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
318                          "vfmaddsub231ps (%2), %%xmm8, %%xmm9;"
319                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
320                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
321      for (j = 0; j < 4; j++)
322         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
323      if (thisres) {
324         printf( "Failure 9 %d", i );
325         for (j = 0; j < 4; j++)
326            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
327         printf( "\n" );
328      }
329      res |= thisres;
330   }
331   for (i = 0; i < N; i++)
332      ft.z[i] = -ft.z[i];
333   for (i = 0; i < N; i += 4) {
334      int thisres = 0;
335      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
336                          "vfmsubadd132ps %%xmm7, %%xmm8, %%xmm9;"
337                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
338                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
339      for (j = 0; j < 4; j++)
340         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
341      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
342                          "vfmsubadd132ps (%2), %%xmm8, %%xmm9;"
343                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
344                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
345      for (j = 0; j < 4; j++)
346         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
347      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
348                          "vfmsubadd213ps %%xmm7, %%xmm8, %%xmm9;"
349                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
350                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
351      for (j = 0; j < 4; j++)
352         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
353      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
354                          "vfmsubadd213ps (%3), %%xmm8, %%xmm9;"
355                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
356                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
357      for (j = 0; j < 4; j++)
358         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
359      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
360                          "vfmsubadd231ps %%xmm7, %%xmm8, %%xmm9;"
361                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
362                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
363      for (j = 0; j < 4; j++)
364         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
365      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
366                          "vfmsubadd231ps (%2), %%xmm8, %%xmm9;"
367                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
368                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
369      for (j = 0; j < 4; j++)
370         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
371      if (thisres) {
372         printf( "Failure 10 %d", i );
373         for (j = 0; j < 4; j++)
374            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
375         printf( "\n" );
376      }
377      res |= thisres;
378   }
379   for (i = 1; i < N; i += 2)
380      ft.z[i] = -ft.z[i];
381   for (i = 0; i < N; i += 8) {
382      int thisres = 0;
383      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
384                          "vfmadd132ps %%ymm7, %%ymm8, %%ymm9;"
385                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
386                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
387      for (j = 0; j < 8; j++)
388         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
389      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
390                          "vfmadd132ps (%2), %%ymm8, %%ymm9;"
391                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
392                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
393      for (j = 0; j < 8; j++)
394         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
395      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
396                          "vfmadd213ps %%ymm7, %%ymm8, %%ymm9;"
397                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
398                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
399      for (j = 0; j < 8; j++)
400         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
401      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
402                          "vfmadd213ps (%3), %%ymm8, %%ymm9;"
403                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
404                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
405      for (j = 0; j < 8; j++)
406         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
407      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
408                          "vfmadd231ps %%ymm7, %%ymm8, %%ymm9;"
409                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
410                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
411      for (j = 0; j < 8; j++)
412         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
413      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
414                          "vfmadd231ps (%2), %%ymm8, %%ymm9;"
415                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
416                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
417      for (j = 0; j < 8; j++)
418         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
419      if (thisres) {
420         printf( "Failure 11 %d", i );
421         for (j = 0; j < 8; j++)
422            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
423         printf( "\n" );
424      }
425      res |= thisres;
426      thisres = 0;
427      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
428                          "vfnmsub132ps %%ymm7, %%ymm8, %%ymm9;"
429                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
430                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
431      for (j = 0; j < 8; j++)
432         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
433      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
434                          "vfnmsub132ps (%2), %%ymm8, %%ymm9;"
435                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
436                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
437      for (j = 0; j < 8; j++)
438         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
439      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
440                          "vfnmsub213ps %%ymm7, %%ymm8, %%ymm9;"
441                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
442                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
443      for (j = 0; j < 8; j++)
444         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
445      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
446                          "vfnmsub213ps (%3), %%ymm8, %%ymm9;"
447                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
448                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
449      for (j = 0; j < 8; j++)
450         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
451      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
452                          "vfnmsub231ps %%ymm7, %%ymm8, %%ymm9;"
453                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
454                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
455      for (j = 0; j < 8; j++)
456         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
457      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
458                          "vfnmsub231ps (%2), %%ymm8, %%ymm9;"
459                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
460                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
461      for (j = 0; j < 8; j++)
462         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
463      if (thisres) {
464         printf( "Failure 12 %d", i );
465         for (j = 0; j < 8; j++)
466            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
467         printf( "\n" );
468      }
469      res |= thisres;
470   }
471   for (i = 0; i < N; i++)
472      ft.z[i] = -ft.z[i];
473   for (i = 0; i < N; i += 8) {
474      int thisres = 0;
475      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
476                          "vfmsub132ps %%ymm7, %%ymm8, %%ymm9;"
477                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
478                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
479      for (j = 0; j < 8; j++)
480         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
481      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
482                          "vfmsub132ps (%2), %%ymm8, %%ymm9;"
483                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
484                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
485      for (j = 0; j < 8; j++)
486         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
487      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
488                          "vfmsub213ps %%ymm7, %%ymm8, %%ymm9;"
489                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
490                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
491      for (j = 0; j < 8; j++)
492         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
493      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
494                          "vfmsub213ps (%3), %%ymm8, %%ymm9;"
495                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
496                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
497      for (j = 0; j < 8; j++)
498         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
499      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
500                          "vfmsub231ps %%ymm7, %%ymm8, %%ymm9;"
501                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
502                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
503      for (j = 0; j < 8; j++)
504         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
505      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
506                          "vfmsub231ps (%2), %%ymm8, %%ymm9;"
507                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
508                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
509      for (j = 0; j < 8; j++)
510         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
511      if (thisres) {
512         printf( "Failure 13 %d", i );
513         for (j = 0; j < 8; j++)
514            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
515         printf( "\n" );
516      }
517      res |= thisres;
518      thisres = 0;
519      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
520                          "vfnmadd132ps %%ymm7, %%ymm8, %%ymm9;"
521                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
522                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
523      for (j = 0; j < 8; j++)
524         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
525      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
526                          "vfnmadd132ps (%2), %%ymm8, %%ymm9;"
527                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
528                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
529      for (j = 0; j < 8; j++)
530         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
531      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
532                          "vfnmadd213ps %%ymm7, %%ymm8, %%ymm9;"
533                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
534                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
535      for (j = 0; j < 8; j++)
536         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
537      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
538                          "vfnmadd213ps (%3), %%ymm8, %%ymm9;"
539                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
540                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
541      for (j = 0; j < 8; j++)
542         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
543      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
544                          "vfnmadd231ps %%ymm7, %%ymm8, %%ymm9;"
545                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
546                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
547      for (j = 0; j < 8; j++)
548         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
549      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
550                          "vfnmadd231ps (%2), %%ymm8, %%ymm9;"
551                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
552                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
553      for (j = 0; j < 8; j++)
554         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
555      if (thisres) {
556         printf( "Failure 14 %d", i );
557         for (j = 0; j < 8; j++)
558            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
559         printf( "\n" );
560      }
561      res |= thisres;
562   }
563   for (i = 1; i < N; i += 2)
564      ft.z[i] = -ft.z[i];
565   for (i = 0; i < N; i += 8) {
566      int thisres = 0;
567      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
568                          "vfmaddsub132ps %%ymm7, %%ymm8, %%ymm9;"
569                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
570                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
571      for (j = 0; j < 8; j++)
572         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
573      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
574                          "vfmaddsub132ps (%2), %%ymm8, %%ymm9;"
575                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
576                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
577      for (j = 0; j < 8; j++)
578         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
579      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
580                          "vfmaddsub213ps %%ymm7, %%ymm8, %%ymm9;"
581                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
582                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
583      for (j = 0; j < 8; j++)
584         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
585      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
586                          "vfmaddsub213ps (%3), %%ymm8, %%ymm9;"
587                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
588                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
589      for (j = 0; j < 8; j++)
590         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
591      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
592                          "vfmaddsub231ps %%ymm7, %%ymm8, %%ymm9;"
593                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
594                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
595      for (j = 0; j < 8; j++)
596         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
597      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
598                          "vfmaddsub231ps (%2), %%ymm8, %%ymm9;"
599                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
600                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
601      for (j = 0; j < 8; j++)
602         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
603      if (thisres) {
604         printf( "Failure 15 %d", i );
605         for (j = 0; j < 8; j++)
606            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
607         printf( "\n" );
608      }
609      res |= thisres;
610   }
611   for (i = 0; i < N; i++)
612      ft.z[i] = -ft.z[i];
613   for (i = 0; i < N; i += 8) {
614      int thisres = 0;
615      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
616                          "vfmsubadd132ps %%ymm7, %%ymm8, %%ymm9;"
617                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
618                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
619      for (j = 0; j < 8; j++)
620         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
621      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
622                          "vfmsubadd132ps (%2), %%ymm8, %%ymm9;"
623                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
624                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
625      for (j = 0; j < 8; j++)
626         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
627      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
628                          "vfmsubadd213ps %%ymm7, %%ymm8, %%ymm9;"
629                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
630                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
631      for (j = 0; j < 8; j++)
632         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
633      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
634                          "vfmsubadd213ps (%3), %%ymm8, %%ymm9;"
635                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
636                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
637      for (j = 0; j < 8; j++)
638         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
639      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
640                          "vfmsubadd231ps %%ymm7, %%ymm8, %%ymm9;"
641                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
642                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
643      for (j = 0; j < 8; j++)
644         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
645      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
646                          "vfmsubadd231ps (%2), %%ymm8, %%ymm9;"
647                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
648                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
649      for (j = 0; j < 8; j++)
650         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
651      if (thisres) {
652         printf( "Failure 16 %d", i );
653         for (j = 0; j < 8; j++)
654            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
655         printf( "\n" );
656      }
657      res |= thisres;
658   }
659   for (i = 1; i < N; i += 2)
660      ft.z[i] = -ft.z[i];
661   return res;
662}
663
/* Compare two doubles by their raw bit patterns.
 *
 * Returns 0 when the values are considered equal and 1 otherwise.
 * If x has all exponent bits and the top mantissa bit set (the
 * quiet-NaN pattern selected by the mask below), y is accepted as
 * long as it has those same bits set; sign and remaining payload
 * bits are not compared in that case.  For every other x the two
 * bit patterns must be identical, so +0.0 and -0.0 differ.
 */
static int test( double x, double y )
{
   /* Exponent field plus the most significant mantissa bit. */
   const unsigned long long qnan_mask = 0x7ff8000000000000ULL;
   unsigned long long xbits, ybits;

   /* memcpy is used to read the representation without aliasing issues. */
   memcpy( &xbits, &x, sizeof (xbits) );
   memcpy( &ybits, &y, sizeof (ybits) );

   /* Ordinary value: require an exact bitwise match. */
   if ((xbits & qnan_mask) != qnan_mask)
      return xbits != ybits;

   /* x matches the NaN pattern: y only needs to match it as well. */
   return (ybits & qnan_mask) != qnan_mask;
}
673
674static int test_fma( void )
675{
676   int res = 0, i, j;
677   double w;
678   for (i = 0; i < N; i++) {
679      int thisres = 0;
680      __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
681      thisres |= test( w, dt.expected[i] );
682      __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
683      thisres |= test( w, dt.expected[i] );
684      __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
685      thisres |= test( w, dt.expected[i] );
686      __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
687      thisres |= test( w, dt.expected[i] );
688      __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
689      thisres |= test( w, dt.expected[i] );
690      __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
691      thisres |= test( w, dt.expected[i] );
692      if (thisres)
693         printf( "Failure 1 %d %a %a\n", i, w, dt.expected[i] );
694      res |= thisres;
695      thisres = 0;
696      __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
697      thisres |= test( -w, dt.expected[i] );
698      __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
699      thisres |= test( -w, dt.expected[i] );
700      __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
701      thisres |= test( -w, dt.expected[i] );
702      __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
703      thisres |= test( -w, dt.expected[i] );
704      __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
705      thisres |= test( -w, dt.expected[i] );
706      __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
707      thisres |= test( -w, dt.expected[i] );
708      if (thisres)
709         printf( "Failure 2 %d %a %a\n", i, w, dt.expected[i] );
710      res |= thisres;
711   }
712   for (i = 0; i < N; i++)
713      dt.z[i] = -dt.z[i];
714   for (i = 0; i < N; i++) {
715      int thisres = 0;
716      __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
717      thisres |= test( w, dt.expected[i] );
718      __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
719      thisres |= test( w, dt.expected[i] );
720      __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
721      thisres |= test( w, dt.expected[i] );
722      __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
723      thisres |= test( w, dt.expected[i] );
724      __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
725      thisres |= test( w, dt.expected[i] );
726      __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
727      thisres |= test( w, dt.expected[i] );
728      if (thisres)
729         printf( "Failure 3 %d %a %a\n", i, w, dt.expected[i] );
730      res |= thisres;
731      thisres = 0;
732      __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
733      thisres |= test( -w, dt.expected[i] );
734      __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
735      thisres |= test( -w, dt.expected[i] );
736      __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
737      thisres |= test( -w, dt.expected[i] );
738      __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
739      thisres |= test( -w, dt.expected[i] );
740      __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
741      thisres |= test( -w, dt.expected[i] );
742      __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
743      thisres |= test( -w, dt.expected[i] );
744      if (thisres)
745         printf( "Failure 4 %d %a %a\n", i, w, dt.expected[i] );
746      res |= thisres;
747   }
748   for (i = 0; i < N; i++)
749      dt.z[i] = -dt.z[i];
750   for (i = 0; i < N; i += 2) {
751      int thisres = 0;
752      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
753                          "vfmadd132pd %%xmm7, %%xmm8, %%xmm9;"
754                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
755                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
756      for (j = 0; j < 2; j++)
757         thisres |= test( dt.res[i+j], dt.expected[i+j] );
758      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
759                          "vfmadd132pd (%2), %%xmm8, %%xmm9;"
760                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
761                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
762      for (j = 0; j < 2; j++)
763         thisres |= test( dt.res[i+j], dt.expected[i+j] );
764      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
765                          "vfmadd213pd %%xmm7, %%xmm8, %%xmm9;"
766                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
767                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
768      for (j = 0; j < 2; j++)
769         thisres |= test( dt.res[i+j], dt.expected[i+j] );
770      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
771                          "vfmadd213pd (%3), %%xmm8, %%xmm9;"
772                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
773                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
774      for (j = 0; j < 2; j++)
775         thisres |= test( dt.res[i+j], dt.expected[i+j] );
776      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
777                          "vfmadd231pd %%xmm7, %%xmm8, %%xmm9;"
778                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
779                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
780      for (j = 0; j < 2; j++)
781         thisres |= test( dt.res[i+j], dt.expected[i+j] );
782      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
783                          "vfmadd231pd (%2), %%xmm8, %%xmm9;"
784                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
785                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
786      for (j = 0; j < 2; j++)
787         thisres |= test( dt.res[i+j], dt.expected[i+j] );
788      if (thisres) {
789         printf( "Failure 5 %d", i );
790         for (j = 0; j < 2; j++)
791            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
792         printf( "\n" );
793      }
794      res |= thisres;
795      thisres = 0;
796      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
797                          "vfnmsub132pd %%xmm7, %%xmm8, %%xmm9;"
798                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
799                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
800      for (j = 0; j < 2; j++)
801         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
802      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
803                          "vfnmsub132pd (%2), %%xmm8, %%xmm9;"
804                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
805                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
806      for (j = 0; j < 2; j++)
807         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
808      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
809                          "vfnmsub213pd %%xmm7, %%xmm8, %%xmm9;"
810                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
811                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
812      for (j = 0; j < 2; j++)
813         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
814      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
815                          "vfnmsub213pd (%3), %%xmm8, %%xmm9;"
816                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
817                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
818      for (j = 0; j < 2; j++)
819         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
820      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
821                          "vfnmsub231pd %%xmm7, %%xmm8, %%xmm9;"
822                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
823                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
824      for (j = 0; j < 2; j++)
825         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
826      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
827                          "vfnmsub231pd (%2), %%xmm8, %%xmm9;"
828                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
829                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
830      for (j = 0; j < 2; j++)
831         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
832      if (thisres) {
833         printf( "Failure 6 %d", i );
834         for (j = 0; j < 2; j++)
835            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
836         printf( "\n" );
837      }
838      res |= thisres;
839   }
840   for (i = 0; i < N; i++)
841      dt.z[i] = -dt.z[i];
842   for (i = 0; i < N; i += 2) {
843      int thisres = 0;
844      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
845                          "vfmsub132pd %%xmm7, %%xmm8, %%xmm9;"
846                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
847                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
848      for (j = 0; j < 2; j++)
849         thisres |= test( dt.res[i+j], dt.expected[i+j] );
850      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
851                          "vfmsub132pd (%2), %%xmm8, %%xmm9;"
852                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
853                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
854      for (j = 0; j < 2; j++)
855         thisres |= test( dt.res[i+j], dt.expected[i+j] );
856      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
857                          "vfmsub213pd %%xmm7, %%xmm8, %%xmm9;"
858                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
859                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
860      for (j = 0; j < 2; j++)
861         thisres |= test( dt.res[i+j], dt.expected[i+j] );
862      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
863                          "vfmsub213pd (%3), %%xmm8, %%xmm9;"
864                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
865                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
866      for (j = 0; j < 2; j++)
867         thisres |= test( dt.res[i+j], dt.expected[i+j] );
868      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
869                          "vfmsub231pd %%xmm7, %%xmm8, %%xmm9;"
870                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
871                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
872      for (j = 0; j < 2; j++)
873         thisres |= test( dt.res[i+j], dt.expected[i+j] );
874      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
875                          "vfmsub231pd (%2), %%xmm8, %%xmm9;"
876                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
877                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
878      for (j = 0; j < 2; j++)
879         thisres |= test( dt.res[i+j], dt.expected[i+j] );
880      if (thisres) {
881         printf( "Failure 7 %d", i );
882         for (j = 0; j < 2; j++)
883            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
884         printf( "\n" );
885      }
886      res |= thisres;
887      thisres = 0;
888      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
889                          "vfnmadd132pd %%xmm7, %%xmm8, %%xmm9;"
890                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
891                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
892      for (j = 0; j < 2; j++)
893         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
894      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
895                          "vfnmadd132pd (%2), %%xmm8, %%xmm9;"
896                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
897                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
898      for (j = 0; j < 2; j++)
899         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
900      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
901                          "vfnmadd213pd %%xmm7, %%xmm8, %%xmm9;"
902                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
903                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
904      for (j = 0; j < 2; j++)
905         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
906      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
907                          "vfnmadd213pd (%3), %%xmm8, %%xmm9;"
908                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
909                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
910      for (j = 0; j < 2; j++)
911         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
912      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
913                          "vfnmadd231pd %%xmm7, %%xmm8, %%xmm9;"
914                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
915                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
916      for (j = 0; j < 2; j++)
917         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
918      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
919                          "vfnmadd231pd (%2), %%xmm8, %%xmm9;"
920                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
921                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
922      for (j = 0; j < 2; j++)
923         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
924      if (thisres) {
925         printf( "Failure 8 %d", i );
926         for (j = 0; j < 2; j++)
927            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
928         printf( "\n" );
929      }
930      res |= thisres;
931   }
932   for (i = 1; i < N; i += 2)
933      dt.z[i] = -dt.z[i];
934   for (i = 0; i < N; i += 2) {
935      int thisres = 0;
936      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
937                          "vfmaddsub132pd %%xmm7, %%xmm8, %%xmm9;"
938                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
939                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
940      for (j = 0; j < 2; j++)
941         thisres |= test( dt.res[i+j], dt.expected[i+j] );
942      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
943                          "vfmaddsub132pd (%2), %%xmm8, %%xmm9;"
944                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
945                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
946      for (j = 0; j < 2; j++)
947         thisres |= test( dt.res[i+j], dt.expected[i+j] );
948      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
949                          "vfmaddsub213pd %%xmm7, %%xmm8, %%xmm9;"
950                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
951                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
952      for (j = 0; j < 2; j++)
953         thisres |= test( dt.res[i+j], dt.expected[i+j] );
954      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
955                          "vfmaddsub213pd (%3), %%xmm8, %%xmm9;"
956                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
957                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
958      for (j = 0; j < 2; j++)
959         thisres |= test( dt.res[i+j], dt.expected[i+j] );
960      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
961                          "vfmaddsub231pd %%xmm7, %%xmm8, %%xmm9;"
962                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
963                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
964      for (j = 0; j < 2; j++)
965         thisres |= test( dt.res[i+j], dt.expected[i+j] );
966      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
967                          "vfmaddsub231pd (%2), %%xmm8, %%xmm9;"
968                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
969                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
970      for (j = 0; j < 2; j++)
971         thisres |= test( dt.res[i+j], dt.expected[i+j] );
972      if (thisres) {
973         printf( "Failure 9 %d", i );
974         for (j = 0; j < 2; j++)
975            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
976         printf( "\n" );
977      }
978      res |= thisres;
979   }
980   for (i = 0; i < N; i++)
981      dt.z[i] = -dt.z[i];
982   for (i = 0; i < N; i += 2) {
983      int thisres = 0;
984      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
985                          "vfmsubadd132pd %%xmm7, %%xmm8, %%xmm9;"
986                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
987                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
988      for (j = 0; j < 2; j++)
989         thisres |= test( dt.res[i+j], dt.expected[i+j] );
990      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
991                          "vfmsubadd132pd (%2), %%xmm8, %%xmm9;"
992                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
993                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
994      for (j = 0; j < 2; j++)
995         thisres |= test( dt.res[i+j], dt.expected[i+j] );
996      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
997                          "vfmsubadd213pd %%xmm7, %%xmm8, %%xmm9;"
998                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
999                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1000      for (j = 0; j < 2; j++)
1001         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1002      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
1003                          "vfmsubadd213pd (%3), %%xmm8, %%xmm9;"
1004                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1005                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1006      for (j = 0; j < 2; j++)
1007         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1008      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
1009                          "vfmsubadd231pd %%xmm7, %%xmm8, %%xmm9;"
1010                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1011                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1012      for (j = 0; j < 2; j++)
1013         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1014      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
1015                          "vfmsubadd231pd (%2), %%xmm8, %%xmm9;"
1016                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1017                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1018      for (j = 0; j < 2; j++)
1019         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1020      if (thisres) {
1021         printf( "Failure 10 %d", i );
1022         for (j = 0; j < 2; j++)
1023            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1024         printf( "\n" );
1025      }
1026      res |= thisres;
1027   }
1028   for (i = 1; i < N; i += 2)
1029      dt.z[i] = -dt.z[i];
1030   for (i = 0; i < N; i += 4) {
1031      int thisres = 0;
1032      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1033                          "vfmadd132pd %%ymm7, %%ymm8, %%ymm9;"
1034                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1035                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1036      for (j = 0; j < 4; j++)
1037         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1038      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1039                          "vfmadd132pd (%2), %%ymm8, %%ymm9;"
1040                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1041                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1042      for (j = 0; j < 4; j++)
1043         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1044      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1045                          "vfmadd213pd %%ymm7, %%ymm8, %%ymm9;"
1046                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1047                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1048      for (j = 0; j < 4; j++)
1049         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1050      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1051                          "vfmadd213pd (%3), %%ymm8, %%ymm9;"
1052                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1053                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1054      for (j = 0; j < 4; j++)
1055         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1056      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1057                          "vfmadd231pd %%ymm7, %%ymm8, %%ymm9;"
1058                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1059                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1060      for (j = 0; j < 4; j++)
1061         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1062      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1063                          "vfmadd231pd (%2), %%ymm8, %%ymm9;"
1064                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1065                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1066      for (j = 0; j < 4; j++)
1067         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1068      if (thisres) {
1069         printf( "Failure 11 %d", i );
1070         for (j = 0; j < 4; j++)
1071            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1072         printf( "\n" );
1073      }
1074      res |= thisres;
1075      thisres = 0;
1076      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1077                          "vfnmsub132pd %%ymm7, %%ymm8, %%ymm9;"
1078                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1079                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1080      for (j = 0; j < 4; j++)
1081         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1082      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1083                          "vfnmsub132pd (%2), %%ymm8, %%ymm9;"
1084                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1085                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1086      for (j = 0; j < 4; j++)
1087         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1088      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1089                          "vfnmsub213pd %%ymm7, %%ymm8, %%ymm9;"
1090                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1091                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1092      for (j = 0; j < 4; j++)
1093         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1094      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1095                          "vfnmsub213pd (%3), %%ymm8, %%ymm9;"
1096                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1097                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1098      for (j = 0; j < 4; j++)
1099         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1100      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1101                          "vfnmsub231pd %%ymm7, %%ymm8, %%ymm9;"
1102                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1103                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1104      for (j = 0; j < 4; j++)
1105         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1106      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1107                          "vfnmsub231pd (%2), %%ymm8, %%ymm9;"
1108                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1109                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1110      for (j = 0; j < 4; j++)
1111         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1112      if (thisres) {
1113         printf( "Failure 12 %d", i );
1114         for (j = 0; j < 4; j++)
1115            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1116         printf( "\n" );
1117      }
1118      res |= thisres;
1119   }
1120   for (i = 0; i < N; i++)
1121      dt.z[i] = -dt.z[i];
1122   for (i = 0; i < N; i += 4) {
1123      int thisres = 0;
1124      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1125                          "vfmsub132pd %%ymm7, %%ymm8, %%ymm9;"
1126                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1127                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1128      for (j = 0; j < 4; j++)
1129         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1130      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1131                          "vfmsub132pd (%2), %%ymm8, %%ymm9;"
1132                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1133                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1134      for (j = 0; j < 4; j++)
1135         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1136      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1137                          "vfmsub213pd %%ymm7, %%ymm8, %%ymm9;"
1138                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1139                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1140      for (j = 0; j < 4; j++)
1141         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1142      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1143                          "vfmsub213pd (%3), %%ymm8, %%ymm9;"
1144                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1145                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1146      for (j = 0; j < 4; j++)
1147         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1148      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1149                          "vfmsub231pd %%ymm7, %%ymm8, %%ymm9;"
1150                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1151                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1152      for (j = 0; j < 4; j++)
1153         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1154      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1155                          "vfmsub231pd (%2), %%ymm8, %%ymm9;"
1156                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1157                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1158      for (j = 0; j < 4; j++)
1159         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1160      if (thisres) {
1161         printf( "Failure 13 %d", i );
1162         for (j = 0; j < 4; j++)
1163            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1164         printf( "\n" );
1165      }
1166      res |= thisres;
1167      thisres = 0;
1168      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1169                          "vfnmadd132pd %%ymm7, %%ymm8, %%ymm9;"
1170                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1171                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1172      for (j = 0; j < 4; j++)
1173         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1174      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1175                          "vfnmadd132pd (%2), %%ymm8, %%ymm9;"
1176                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1177                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1178      for (j = 0; j < 4; j++)
1179         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1180      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1181                          "vfnmadd213pd %%ymm7, %%ymm8, %%ymm9;"
1182                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1183                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1184      for (j = 0; j < 4; j++)
1185         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1186      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1187                          "vfnmadd213pd (%3), %%ymm8, %%ymm9;"
1188                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1189                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1190      for (j = 0; j < 4; j++)
1191         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1192      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1193                          "vfnmadd231pd %%ymm7, %%ymm8, %%ymm9;"
1194                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1195                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1196      for (j = 0; j < 4; j++)
1197         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1198      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1199                          "vfnmadd231pd (%2), %%ymm8, %%ymm9;"
1200                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1201                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1202      for (j = 0; j < 4; j++)
1203         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1204      if (thisres) {
1205         printf( "Failure 14 %d", i );
1206         for (j = 0; j < 4; j++)
1207            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1208         printf( "\n" );
1209      }
1210      res |= thisres;
1211   }
1212   for (i = 1; i < N; i += 2)
1213      dt.z[i] = -dt.z[i];
1214   for (i = 0; i < N; i += 4) {
1215      int thisres = 0;
1216      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1217                          "vfmaddsub132pd %%ymm7, %%ymm8, %%ymm9;"
1218                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1219                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1220      for (j = 0; j < 4; j++)
1221         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1222      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1223                          "vfmaddsub132pd (%2), %%ymm8, %%ymm9;"
1224                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1225                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1226      for (j = 0; j < 4; j++)
1227         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1228      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1229                          "vfmaddsub213pd %%ymm7, %%ymm8, %%ymm9;"
1230                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1231                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1232      for (j = 0; j < 4; j++)
1233         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1234      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1235                          "vfmaddsub213pd (%3), %%ymm8, %%ymm9;"
1236                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1237                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1238      for (j = 0; j < 4; j++)
1239         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1240      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1241                          "vfmaddsub231pd %%ymm7, %%ymm8, %%ymm9;"
1242                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1243                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1244      for (j = 0; j < 4; j++)
1245         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1246      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1247                          "vfmaddsub231pd (%2), %%ymm8, %%ymm9;"
1248                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1249                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1250      for (j = 0; j < 4; j++)
1251         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1252      if (thisres) {
1253         printf( "Failure 15 %d", i );
1254         for (j = 0; j < 4; j++)
1255            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1256         printf( "\n" );
1257      }
1258      res |= thisres;
1259   }
1260   for (i = 0; i < N; i++)
1261      dt.z[i] = -dt.z[i];
1262   for (i = 0; i < N; i += 4) {
1263      int thisres = 0;
1264      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1265                          "vfmsubadd132pd %%ymm7, %%ymm8, %%ymm9;"
1266                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1267                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1268      for (j = 0; j < 4; j++)
1269         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1270      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1271                          "vfmsubadd132pd (%2), %%ymm8, %%ymm9;"
1272                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1273                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1274      for (j = 0; j < 4; j++)
1275         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1276      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1277                          "vfmsubadd213pd %%ymm7, %%ymm8, %%ymm9;"
1278                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1279                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1280      for (j = 0; j < 4; j++)
1281         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1282      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1283                          "vfmsubadd213pd (%3), %%ymm8, %%ymm9;"
1284                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1285                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1286      for (j = 0; j < 4; j++)
1287         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1288      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1289                          "vfmsubadd231pd %%ymm7, %%ymm8, %%ymm9;"
1290                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1291                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1292      for (j = 0; j < 4; j++)
1293         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1294      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1295                          "vfmsubadd231pd (%2), %%ymm8, %%ymm9;"
1296                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1297                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1298      for (j = 0; j < 4; j++)
1299         thisres |= test( dt.res[i+j], dt.expected[i+j] );
1300      if (thisres) {
1301         printf( "Failure 16 %d", i );
1302         for (j = 0; j < 4; j++)
1303            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1304         printf( "\n" );
1305      }
1306      res |= thisres;
1307   }
1308   for (i = 1; i < N; i += 2)
1309      dt.z[i] = -dt.z[i];
1310   return res;
1311}
1312
1313int main( )
1314{
1315   int res = 0;
1316   int i = 0;
1317   plus_zero = 0.0;
1318   __asm __volatile__ ("" : : "r" (&plus_zero) : "memory");
1319   nan_value = plus_zero / plus_zero;
1320   plus_infty = 3.40282346638528859812e+38F * 16.0F;
1321   minus_infty = -plus_infty;
1322#define TEST_F( a, b, c, d ) \
1323   do {				\
1324      ft.x[i] = a;		\
1325      ft.y[i] = b;		\
1326      ft.z[i] = c;		\
1327      ft.expected[i] = d;	\
1328      i++;			\
1329   } while (0)
1330   TEST_F( 1.0, 2.0, 3.0, 5.0 );
1331   TEST_F( nan_value, 2.0, 3.0, nan_value );
1332   TEST_F( 1.0, nan_value, 3.0, nan_value );
1333   TEST_F( 1.0, 2.0, nan_value, nan_value );
1334   TEST_F( plus_infty, 0.0, nan_value, nan_value );
1335   TEST_F( minus_infty, 0.0, nan_value, nan_value );
1336   TEST_F( 0.0, plus_infty, nan_value, nan_value );
1337   TEST_F( 0.0, minus_infty, nan_value, nan_value );
1338   TEST_F( plus_infty, 0.0, 1.0, nan_value );
1339   TEST_F( minus_infty, 0.0, 1.0, nan_value );
1340   TEST_F( 0.0, plus_infty, 1.0, nan_value );
1341   TEST_F( 0.0, minus_infty, 1.0, nan_value );
1342   TEST_F( plus_infty, plus_infty, minus_infty, nan_value );
1343   TEST_F( minus_infty, plus_infty, plus_infty, nan_value );
1344   TEST_F( plus_infty, minus_infty, plus_infty, nan_value );
1345   TEST_F( minus_infty, minus_infty, minus_infty, nan_value );
1346   TEST_F( plus_infty, 3.5L, minus_infty, nan_value );
1347   TEST_F( minus_infty, -7.5L, minus_infty, nan_value );
1348   TEST_F( -13.5L, plus_infty, plus_infty, nan_value );
1349   TEST_F( minus_infty, 7.5L, plus_infty, nan_value );
1350   TEST_F( 1.25L, 0.75L, 0.0625L, 1.0L );
1351   TEST_F( -3.40282346638528859812e+38F, -3.40282346638528859812e+38F, minus_infty, minus_infty );
1352   TEST_F( 3.40282346638528859812e+38F / 2, 3.40282346638528859812e+38F / 2, minus_infty, minus_infty );
1353   TEST_F( -3.40282346638528859812e+38F, 3.40282346638528859812e+38F, plus_infty, plus_infty );
1354   TEST_F( 3.40282346638528859812e+38F / 2, -3.40282346638528859812e+38F / 4, plus_infty, plus_infty );
1355   TEST_F( plus_infty, 4, plus_infty, plus_infty );
1356   TEST_F( 2, minus_infty, minus_infty, minus_infty );
1357   TEST_F( minus_infty, minus_infty, plus_infty, plus_infty );
1358   TEST_F( plus_infty, minus_infty, minus_infty, minus_infty );
1359   TEST_F( 0x1.7ff8p+13, 0x1.000002p+0, 0x1.ffffp-24, 0x1.7ff802p+13 );
1360   TEST_F( 0x1.fffp+0, 0x1.00001p+0, -0x1.fffp+0, 0x1.fffp-20 );
1361   TEST_F( 0x1.9abcdep+127, 0x0.9abcdep-126, -0x1.f08948p+0, 0x1.bb421p-25 );
1362   TEST_F( 0x1.9abcdep+100, 0x0.9abcdep-126, -0x1.f08948p-27, 0x1.bb421p-52 );
1363   TEST_F( 0x1.fffffep+127, 0x1.001p+0, -0x1.fffffep+127, 0x1.fffffep+115 );
1364   TEST_F( -0x1.fffffep+127, 0x1.fffffep+0, 0x1.fffffep+127, -0x1.fffffap+127 );
1365   TEST_F( 0x1.fffffep+127, 2.0, -0x1.fffffep+127, 0x1.fffffep+127 );
1366
1367   res |= test_fmaf( );
1368   i = 0;
1369#define TEST( a, b, c, d ) \
1370   do {				\
1371      dt.x[i] = a;		\
1372      dt.y[i] = b;		\
1373      dt.z[i] = c;		\
1374      dt.expected[i] = d;	\
1375      i++;			\
1376   } while (0)
1377   TEST( 1.0, 2.0, 3.0, 5.0 );
1378   TEST( nan_value, 2.0, 3.0, nan_value );
1379   TEST( 1.0, nan_value, 3.0, nan_value );
1380   TEST( 1.0, 2.0, nan_value, nan_value );
1381   TEST( plus_infty, 0.0, nan_value, nan_value );
1382   TEST( minus_infty, 0.0, nan_value, nan_value );
1383   TEST( 0.0, plus_infty, nan_value, nan_value );
1384   TEST( 0.0, minus_infty, nan_value, nan_value );
1385   TEST( plus_infty, 0.0, 1.0, nan_value );
1386   TEST( minus_infty, 0.0, 1.0, nan_value );
1387   TEST( 0.0, plus_infty, 1.0, nan_value );
1388   TEST( 0.0, minus_infty, 1.0, nan_value );
1389   TEST( plus_infty, plus_infty, minus_infty, nan_value );
1390   TEST( minus_infty, plus_infty, plus_infty, nan_value );
1391   TEST( plus_infty, minus_infty, plus_infty, nan_value );
1392   TEST( minus_infty, minus_infty, minus_infty, nan_value );
1393   TEST( plus_infty, 3.5L, minus_infty, nan_value );
1394   TEST( minus_infty, -7.5L, minus_infty, nan_value );
1395   TEST( -13.5L, plus_infty, plus_infty, nan_value );
1396   TEST( minus_infty, 7.5L, plus_infty, nan_value );
1397   TEST( 1.25L, 0.75L, 0.0625L, 1.0L );
1398   TEST( -1.79769313486231570815e+308L, -1.79769313486231570815e+308L, minus_infty, minus_infty );
1399   TEST( 1.79769313486231570815e+308L / 2, 1.79769313486231570815e+308L / 2, minus_infty, minus_infty );
1400   TEST( -1.79769313486231570815e+308L, 1.79769313486231570815e+308L, plus_infty, plus_infty );
1401   TEST( 1.79769313486231570815e+308L / 2, -1.79769313486231570815e+308L / 4, plus_infty, plus_infty );
1402   TEST( plus_infty, 4, plus_infty, plus_infty );
1403   TEST( 2, minus_infty, minus_infty, minus_infty );
1404   TEST( minus_infty, minus_infty, plus_infty, plus_infty );
1405   TEST( plus_infty, minus_infty, minus_infty, minus_infty );
1406   TEST( 0x1.7fp+13, 0x1.0000000000001p+0, 0x1.ffep-48, 0x1.7f00000000001p+13 );
1407   TEST( 0x1.fffp+0, 0x1.0000000000001p+0, -0x1.fffp+0, 0x1.fffp-52 );
1408   TEST( 0x1.0000002p+0, 0x1.ffffffcp-1, 0x1p-300, 1.0 );
1409   TEST( 0x1.0000002p+0, 0x1.ffffffcp-1, -0x1p-300, 0x1.fffffffffffffp-1 );
1410   TEST( 0x1.deadbeef2feedp+1023, 0x0.deadbeef2feedp-1022, -0x1.a05f8c01a4bfbp+1, 0x1.0989687bc9da4p-53 );
1411   TEST( 0x1.deadbeef2feedp+900, 0x0.deadbeef2feedp-1022, -0x1.a05f8c01a4bfbp-122, 0x1.0989687bc9da4p-176 );
1412   TEST( 0x1.fffffffffffffp+1023, 0x1.001p+0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1011 );
1413   TEST( -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+0, 0x1.fffffffffffffp+1023, -0x1.ffffffffffffdp+1023 );
1414   TEST( 0x1.fffffffffffffp+1023, 2.0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1023 );
1415   TEST( 0x1.6a09e667f3bccp-538, 0x1.6a09e667f3bccp-538, 0.0, 0.0 );
1416   TEST( 0x1.deadbeef2feedp-495, 0x1.deadbeef2feedp-495, -0x1.bf86a5786a574p-989, 0x0.0000042625a1fp-1022 );
1417   TEST( 0x1.deadbeef2feedp-503, 0x1.deadbeef2feedp-503, -0x1.bf86a5786a574p-1005, 0x0.0000000004262p-1022 );
1418   TEST( 0x1p-537, 0x1p-538, 0x1p-1074, 0x0.0000000000002p-1022 );
1419   TEST( 0x1.7fffff8p-968, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000001p-1022 );
1420   TEST( 0x1.4000004p-967, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000003p-1022 );
1421   TEST( 0x1.4p-967, -0x1p-106, -0x0.000001p-1022, -0x0.0000010000002p-1022 );
1422   TEST( -0x1.19cab66d73e17p-959, 0x1.c7108a8c5ff51p-107, -0x0.80b0ad65d9b64p-1022, -0x0.80b0ad65d9d59p-1022 );
1423   TEST( -0x1.d2eaed6e8e9d3p-979, -0x1.4e066c62ac9ddp-63, -0x0.9245e6b003454p-1022, -0x0.9245c09c5fb5dp-1022 );
1424   TEST( 0x1.153d650bb9f06p-907, 0x1.2d01230d48407p-125, -0x0.b278d5acfc3cp-1022, -0x0.b22757123bbe9p-1022 );
1425   TEST( -0x1.fffffffffffffp-711, 0x1.fffffffffffffp-275, 0x1.fffffe00007ffp-983, 0x1.7ffffe00007ffp-983 );
1426
1427   res |= test_fma( );
1428   if (res == 0)
1429      printf( "Testing successful\n");
1430   return 0;
1431}
1432