1 #include <stdio.h>
2 #include <string.h>
3 
4 #define N 64
5 struct float_test {
6    float x[N], y[N], z[N], expected[N], res[N];
7 } ft __attribute__((aligned (32)));
8 
9 struct double_test {
10    double x[N], y[N], z[N], expected[N], res[N];
11 } dt __attribute__((aligned (32)));
12 
/* Special single-precision values shared across the file.  As file-scope
 * objects without initialisers they start out as 0.0f.
 * NOTE(review): presumably set to +0, +inf, -inf and NaN by setup code
 * that is not visible here - confirm. */
float plus_zero;
float plus_infty;
float minus_infty;
float nan_value;
14 
/* Compare a computed float against its expected value at the bit level.
 *
 * x - value produced by the instruction under test
 * y - reference value
 *
 * Returns 0 when the two match and 1 when they differ.
 *
 * When x has every bit of 0x7fc00000 set (exponent all ones plus the top
 * mantissa bit - the quiet-NaN pattern, assuming IEEE-754 binary32), y
 * only has to carry the same pattern; sign and remaining payload bits
 * are ignored.  In every other case the raw representations must be
 * identical, so +0.0 and -0.0 count as different.
 */
static int testf( float x, float y )
{
   const unsigned int qnan_mask = 0x7fc00000U;
   unsigned int xbits, ybits;

   /* Read the raw representations via memcpy to stay clear of aliasing
      problems. */
   memcpy( &xbits, &x, sizeof (xbits) );
   memcpy( &ybits, &y, sizeof (ybits) );

   /* Ordinary values (and anything lacking the full mask): exact match. */
   if ((xbits & qnan_mask) != qnan_mask)
      return xbits != ybits;

   /* x matches the NaN pattern: y merely has to match it as well. */
   return (ybits & qnan_mask) != qnan_mask;
}
24 
test_fmaf(void)25 static int test_fmaf( void )
26 {
27    int res = 0, i, j;
28    float w;
29    for (i = 0; i < N; i++) {
30       int thisres = 0;
31       __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
32       thisres |= testf( w, ft.expected[i] );
33       __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
34       thisres |= testf( w, ft.expected[i] );
35       __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
36       thisres |= testf( w, ft.expected[i] );
37       __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
38       thisres |= testf( w, ft.expected[i] );
39       __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
40       thisres |= testf( w, ft.expected[i] );
41       __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
42       thisres |= testf( w, ft.expected[i] );
43       if (thisres)
44          printf( "Failure 1 %d %a %a\n", i, w, ft.expected[i] );
45       res |= thisres;
46       thisres = 0;
47       __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
48       thisres |= testf( -w, ft.expected[i] );
49       __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
50       thisres |= testf( -w, ft.expected[i] );
51       __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
52       thisres |= testf( -w, ft.expected[i] );
53       __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
54       thisres |= testf( -w, ft.expected[i] );
55       __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
56       thisres |= testf( -w, ft.expected[i] );
57       __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
58       thisres |= testf( -w, ft.expected[i] );
59       if (thisres)
60          printf( "Failure 2 %d %a %a\n", i, w, ft.expected[i] );
61       res |= thisres;
62    }
63    for (i = 0; i < N; i++)
64       ft.z[i] = -ft.z[i];
65    for (i = 0; i < N; i++) {
66       int thisres = 0;
67       __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
68       thisres |= testf( w, ft.expected[i] );
69       __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
70       thisres |= testf( w, ft.expected[i] );
71       __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
72       thisres |= testf( w, ft.expected[i] );
73       __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
74       thisres |= testf( w, ft.expected[i] );
75       __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
76       thisres |= testf( w, ft.expected[i] );
77       __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
78       thisres |= testf( w, ft.expected[i] );
79       if (thisres)
80          printf( "Failure 3 %d %a %a\n", i, w, ft.expected[i] );
81       res |= thisres;
82       thisres = 0;
83       __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
84       thisres |= testf( -w, ft.expected[i] );
85       __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
86       thisres |= testf( -w, ft.expected[i] );
87       __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
88       thisres |= testf( -w, ft.expected[i] );
89       __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
90       thisres |= testf( -w, ft.expected[i] );
91       __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
92       thisres |= testf( -w, ft.expected[i] );
93       __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
94       thisres |= testf( -w, ft.expected[i] );
95       if (thisres)
96          printf( "Failure 4 %d %a %a\n", i, w, ft.expected[i] );
97       res |= thisres;
98    }
99    for (i = 0; i < N; i++)
100       ft.z[i] = -ft.z[i];
101    for (i = 0; i < N; i += 4) {
102       int thisres = 0;
103       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
104                           "vfmadd132ps %%xmm7, %%xmm8, %%xmm9;"
105                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
106                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
107       for (j = 0; j < 4; j++)
108          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
109       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
110                           "vfmadd132ps (%2), %%xmm8, %%xmm9;"
111                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
112                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
113       for (j = 0; j < 4; j++)
114          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
115       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
116                           "vfmadd213ps %%xmm7, %%xmm8, %%xmm9;"
117                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
118                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
119       for (j = 0; j < 4; j++)
120          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
121       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
122                           "vfmadd213ps (%3), %%xmm8, %%xmm9;"
123                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
124                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
125       for (j = 0; j < 4; j++)
126          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
127       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
128                           "vfmadd231ps %%xmm7, %%xmm8, %%xmm9;"
129                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
130                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
131       for (j = 0; j < 4; j++)
132          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
133       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
134                           "vfmadd231ps (%2), %%xmm8, %%xmm9;"
135                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
136                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
137       for (j = 0; j < 4; j++)
138          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
139       if (thisres) {
140          printf( "Failure 5 %d", i );
141          for (j = 0; j < 4; j++)
142             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
143          printf( "\n" );
144       }
145       res |= thisres;
146       thisres = 0;
147       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
148                           "vfnmsub132ps %%xmm7, %%xmm8, %%xmm9;"
149                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
150                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
151       for (j = 0; j < 4; j++)
152          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
153       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
154                           "vfnmsub132ps (%2), %%xmm8, %%xmm9;"
155                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
156                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
157       for (j = 0; j < 4; j++)
158          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
159       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
160                           "vfnmsub213ps %%xmm7, %%xmm8, %%xmm9;"
161                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
162                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
163       for (j = 0; j < 4; j++)
164          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
165       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
166                           "vfnmsub213ps (%3), %%xmm8, %%xmm9;"
167                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
168                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
169       for (j = 0; j < 4; j++)
170          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
171       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
172                           "vfnmsub231ps %%xmm7, %%xmm8, %%xmm9;"
173                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
174                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
175       for (j = 0; j < 4; j++)
176          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
177       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
178                           "vfnmsub231ps (%2), %%xmm8, %%xmm9;"
179                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
180                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
181       for (j = 0; j < 4; j++)
182          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
183       if (thisres) {
184          printf( "Failure 6 %d", i );
185          for (j = 0; j < 4; j++)
186             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
187          printf( "\n" );
188       }
189       res |= thisres;
190    }
191    for (i = 0; i < N; i++)
192       ft.z[i] = -ft.z[i];
193    for (i = 0; i < N; i += 4) {
194       int thisres = 0;
195       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
196                           "vfmsub132ps %%xmm7, %%xmm8, %%xmm9;"
197                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
198                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
199       for (j = 0; j < 4; j++)
200          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
201       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
202                           "vfmsub132ps (%2), %%xmm8, %%xmm9;"
203                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
204                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
205       for (j = 0; j < 4; j++)
206          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
207       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
208                           "vfmsub213ps %%xmm7, %%xmm8, %%xmm9;"
209                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
210                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
211       for (j = 0; j < 4; j++)
212          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
213       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
214                           "vfmsub213ps (%3), %%xmm8, %%xmm9;"
215                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
216                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
217       for (j = 0; j < 4; j++)
218          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
219       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
220                           "vfmsub231ps %%xmm7, %%xmm8, %%xmm9;"
221                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
222                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
223       for (j = 0; j < 4; j++)
224          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
225       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
226                           "vfmsub231ps (%2), %%xmm8, %%xmm9;"
227                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
228                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
229       for (j = 0; j < 4; j++)
230          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
231       if (thisres) {
232          printf( "Failure 7 %d", i );
233          for (j = 0; j < 4; j++)
234             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
235          printf( "\n" );
236       }
237       res |= thisres;
238       thisres = 0;
239       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
240                           "vfnmadd132ps %%xmm7, %%xmm8, %%xmm9;"
241                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
242                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
243       for (j = 0; j < 4; j++)
244          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
245       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
246                           "vfnmadd132ps (%2), %%xmm8, %%xmm9;"
247                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
248                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
249       for (j = 0; j < 4; j++)
250          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
251       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
252                           "vfnmadd213ps %%xmm7, %%xmm8, %%xmm9;"
253                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
254                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
255       for (j = 0; j < 4; j++)
256          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
257       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
258                           "vfnmadd213ps (%3), %%xmm8, %%xmm9;"
259                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
260                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
261       for (j = 0; j < 4; j++)
262          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
263       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
264                           "vfnmadd231ps %%xmm7, %%xmm8, %%xmm9;"
265                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
266                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
267       for (j = 0; j < 4; j++)
268          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
269       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
270                           "vfnmadd231ps (%2), %%xmm8, %%xmm9;"
271                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
272                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
273       for (j = 0; j < 4; j++)
274          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
275       if (thisres) {
276          printf( "Failure 8 %d", i );
277          for (j = 0; j < 4; j++)
278             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
279          printf( "\n" );
280       }
281       res |= thisres;
282    }
283    for (i = 1; i < N; i += 2)
284       ft.z[i] = -ft.z[i];
285    for (i = 0; i < N; i += 4) {
286       int thisres = 0;
287       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
288                           "vfmaddsub132ps %%xmm7, %%xmm8, %%xmm9;"
289                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
290                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
291       for (j = 0; j < 4; j++)
292          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
293       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
294                           "vfmaddsub132ps (%2), %%xmm8, %%xmm9;"
295                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
296                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
297       for (j = 0; j < 4; j++)
298          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
299       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
300                           "vfmaddsub213ps %%xmm7, %%xmm8, %%xmm9;"
301                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
302                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
303       for (j = 0; j < 4; j++)
304          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
305       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
306                           "vfmaddsub213ps (%3), %%xmm8, %%xmm9;"
307                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
308                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
309       for (j = 0; j < 4; j++)
310          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
311       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
312                           "vfmaddsub231ps %%xmm7, %%xmm8, %%xmm9;"
313                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
314                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
315       for (j = 0; j < 4; j++)
316          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
317       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
318                           "vfmaddsub231ps (%2), %%xmm8, %%xmm9;"
319                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
320                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
321       for (j = 0; j < 4; j++)
322          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
323       if (thisres) {
324          printf( "Failure 9 %d", i );
325          for (j = 0; j < 4; j++)
326             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
327          printf( "\n" );
328       }
329       res |= thisres;
330    }
331    for (i = 0; i < N; i++)
332       ft.z[i] = -ft.z[i];
333    for (i = 0; i < N; i += 4) {
334       int thisres = 0;
335       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
336                           "vfmsubadd132ps %%xmm7, %%xmm8, %%xmm9;"
337                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
338                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
339       for (j = 0; j < 4; j++)
340          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
341       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
342                           "vfmsubadd132ps (%2), %%xmm8, %%xmm9;"
343                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
344                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
345       for (j = 0; j < 4; j++)
346          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
347       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
348                           "vfmsubadd213ps %%xmm7, %%xmm8, %%xmm9;"
349                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
350                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
351       for (j = 0; j < 4; j++)
352          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
353       __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
354                           "vfmsubadd213ps (%3), %%xmm8, %%xmm9;"
355                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
356                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
357       for (j = 0; j < 4; j++)
358          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
359       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
360                           "vfmsubadd231ps %%xmm7, %%xmm8, %%xmm9;"
361                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
362                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
363       for (j = 0; j < 4; j++)
364          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
365       __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
366                           "vfmsubadd231ps (%2), %%xmm8, %%xmm9;"
367                           "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
368                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
369       for (j = 0; j < 4; j++)
370          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
371       if (thisres) {
372          printf( "Failure 10 %d", i );
373          for (j = 0; j < 4; j++)
374             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
375          printf( "\n" );
376       }
377       res |= thisres;
378    }
379    for (i = 1; i < N; i += 2)
380       ft.z[i] = -ft.z[i];
381    for (i = 0; i < N; i += 8) {
382       int thisres = 0;
383       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
384                           "vfmadd132ps %%ymm7, %%ymm8, %%ymm9;"
385                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
386                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
387       for (j = 0; j < 8; j++)
388          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
389       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
390                           "vfmadd132ps (%2), %%ymm8, %%ymm9;"
391                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
392                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
393       for (j = 0; j < 8; j++)
394          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
395       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
396                           "vfmadd213ps %%ymm7, %%ymm8, %%ymm9;"
397                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
398                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
399       for (j = 0; j < 8; j++)
400          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
401       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
402                           "vfmadd213ps (%3), %%ymm8, %%ymm9;"
403                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
404                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
405       for (j = 0; j < 8; j++)
406          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
407       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
408                           "vfmadd231ps %%ymm7, %%ymm8, %%ymm9;"
409                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
410                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
411       for (j = 0; j < 8; j++)
412          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
413       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
414                           "vfmadd231ps (%2), %%ymm8, %%ymm9;"
415                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
416                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
417       for (j = 0; j < 8; j++)
418          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
419       if (thisres) {
420          printf( "Failure 11 %d", i );
421          for (j = 0; j < 8; j++)
422             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
423          printf( "\n" );
424       }
425       res |= thisres;
426       thisres = 0;
427       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
428                           "vfnmsub132ps %%ymm7, %%ymm8, %%ymm9;"
429                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
430                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
431       for (j = 0; j < 8; j++)
432          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
433       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
434                           "vfnmsub132ps (%2), %%ymm8, %%ymm9;"
435                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
436                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
437       for (j = 0; j < 8; j++)
438          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
439       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
440                           "vfnmsub213ps %%ymm7, %%ymm8, %%ymm9;"
441                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
442                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
443       for (j = 0; j < 8; j++)
444          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
445       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
446                           "vfnmsub213ps (%3), %%ymm8, %%ymm9;"
447                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
448                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
449       for (j = 0; j < 8; j++)
450          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
451       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
452                           "vfnmsub231ps %%ymm7, %%ymm8, %%ymm9;"
453                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
454                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
455       for (j = 0; j < 8; j++)
456          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
457       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
458                           "vfnmsub231ps (%2), %%ymm8, %%ymm9;"
459                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
460                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
461       for (j = 0; j < 8; j++)
462          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
463       if (thisres) {
464          printf( "Failure 12 %d", i );
465          for (j = 0; j < 8; j++)
466             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
467          printf( "\n" );
468       }
469       res |= thisres;
470    }
471    for (i = 0; i < N; i++)
472       ft.z[i] = -ft.z[i];
473    for (i = 0; i < N; i += 8) {
474       int thisres = 0;
475       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
476                           "vfmsub132ps %%ymm7, %%ymm8, %%ymm9;"
477                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
478                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
479       for (j = 0; j < 8; j++)
480          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
481       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
482                           "vfmsub132ps (%2), %%ymm8, %%ymm9;"
483                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
484                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
485       for (j = 0; j < 8; j++)
486          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
487       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
488                           "vfmsub213ps %%ymm7, %%ymm8, %%ymm9;"
489                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
490                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
491       for (j = 0; j < 8; j++)
492          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
493       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
494                           "vfmsub213ps (%3), %%ymm8, %%ymm9;"
495                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
496                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
497       for (j = 0; j < 8; j++)
498          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
499       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
500                           "vfmsub231ps %%ymm7, %%ymm8, %%ymm9;"
501                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
502                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
503       for (j = 0; j < 8; j++)
504          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
505       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
506                           "vfmsub231ps (%2), %%ymm8, %%ymm9;"
507                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
508                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
509       for (j = 0; j < 8; j++)
510          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
511       if (thisres) {
512          printf( "Failure 13 %d", i );
513          for (j = 0; j < 8; j++)
514             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
515          printf( "\n" );
516       }
517       res |= thisres;
518       thisres = 0;
519       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
520                           "vfnmadd132ps %%ymm7, %%ymm8, %%ymm9;"
521                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
522                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
523       for (j = 0; j < 8; j++)
524          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
525       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
526                           "vfnmadd132ps (%2), %%ymm8, %%ymm9;"
527                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
528                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
529       for (j = 0; j < 8; j++)
530          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
531       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
532                           "vfnmadd213ps %%ymm7, %%ymm8, %%ymm9;"
533                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
534                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
535       for (j = 0; j < 8; j++)
536          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
537       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
538                           "vfnmadd213ps (%3), %%ymm8, %%ymm9;"
539                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
540                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
541       for (j = 0; j < 8; j++)
542          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
543       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
544                           "vfnmadd231ps %%ymm7, %%ymm8, %%ymm9;"
545                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
546                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
547       for (j = 0; j < 8; j++)
548          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
549       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
550                           "vfnmadd231ps (%2), %%ymm8, %%ymm9;"
551                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
552                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
553       for (j = 0; j < 8; j++)
554          thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
555       if (thisres) {
556          printf( "Failure 14 %d", i );
557          for (j = 0; j < 8; j++)
558             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
559          printf( "\n" );
560       }
561       res |= thisres;
562    }
563    for (i = 1; i < N; i += 2)
564       ft.z[i] = -ft.z[i];
565    for (i = 0; i < N; i += 8) {
566       int thisres = 0;
567       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
568                           "vfmaddsub132ps %%ymm7, %%ymm8, %%ymm9;"
569                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
570                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
571       for (j = 0; j < 8; j++)
572          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
573       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
574                           "vfmaddsub132ps (%2), %%ymm8, %%ymm9;"
575                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
576                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
577       for (j = 0; j < 8; j++)
578          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
579       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
580                           "vfmaddsub213ps %%ymm7, %%ymm8, %%ymm9;"
581                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
582                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
583       for (j = 0; j < 8; j++)
584          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
585       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
586                           "vfmaddsub213ps (%3), %%ymm8, %%ymm9;"
587                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
588                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
589       for (j = 0; j < 8; j++)
590          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
591       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
592                           "vfmaddsub231ps %%ymm7, %%ymm8, %%ymm9;"
593                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
594                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
595       for (j = 0; j < 8; j++)
596          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
597       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
598                           "vfmaddsub231ps (%2), %%ymm8, %%ymm9;"
599                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
600                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
601       for (j = 0; j < 8; j++)
602          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
603       if (thisres) {
604          printf( "Failure 15 %d", i );
605          for (j = 0; j < 8; j++)
606             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
607          printf( "\n" );
608       }
609       res |= thisres;
610    }
611    for (i = 0; i < N; i++)
612       ft.z[i] = -ft.z[i];
613    for (i = 0; i < N; i += 8) {
614       int thisres = 0;
615       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
616                           "vfmsubadd132ps %%ymm7, %%ymm8, %%ymm9;"
617                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
618                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
619       for (j = 0; j < 8; j++)
620          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
621       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
622                           "vfmsubadd132ps (%2), %%ymm8, %%ymm9;"
623                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
624                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
625       for (j = 0; j < 8; j++)
626          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
627       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
628                           "vfmsubadd213ps %%ymm7, %%ymm8, %%ymm9;"
629                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
630                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
631       for (j = 0; j < 8; j++)
632          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
633       __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
634                           "vfmsubadd213ps (%3), %%ymm8, %%ymm9;"
635                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
636                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
637       for (j = 0; j < 8; j++)
638          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
639       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
640                           "vfmsubadd231ps %%ymm7, %%ymm8, %%ymm9;"
641                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
642                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
643       for (j = 0; j < 8; j++)
644          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
645       __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
646                           "vfmsubadd231ps (%2), %%ymm8, %%ymm9;"
647                           "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
648                                                      "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
649       for (j = 0; j < 8; j++)
650          thisres |= testf( ft.res[i+j], ft.expected[i+j] );
651       if (thisres) {
652          printf( "Failure 16 %d", i );
653          for (j = 0; j < 8; j++)
654             printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
655          printf( "\n" );
656       }
657       res |= thisres;
658    }
659    for (i = 1; i < N; i += 2)
660       ft.z[i] = -ft.z[i];
661    return res;
662 }
663 
/*
 * Compare a computed double against a reference value by bit pattern.
 *
 * x: first value (callers in this file pass the computed result).
 * y: second value (callers in this file pass the expected result).
 *
 * Returns 0 when the values are considered equal and 1 otherwise.
 *
 * When x has every bit of 0x7ff8000000000000 set (the quiet-NaN pattern,
 * assuming double is IEEE-754 binary64), y only needs to have those same
 * bits set: sign and payload are ignored.  In every other case the two
 * bit patterns must be identical, so +0.0 and -0.0 are distinct.
 */
static int test( double x, double y )
{
   const unsigned long long qnan_mask = 0x7ff8000000000000ULL;
   unsigned long long xbits, ybits;

   /* Copy the raw representations; memcpy avoids aliasing problems. */
   memcpy( &xbits, &x, sizeof (xbits) );
   memcpy( &ybits, &y, sizeof (ybits) );

   /* Ordinary case: x is not a quiet NaN, so demand an exact bit match. */
   if ((xbits & qnan_mask) != qnan_mask)
      return xbits != ybits;

   /* x is a quiet NaN: pass only if y is a quiet NaN as well. */
   return (ybits & qnan_mask) != qnan_mask;
}
673 
test_fma(void)674 static int test_fma( void )
675 {
676    int res = 0, i, j;
677    double w;
678    for (i = 0; i < N; i++) {
679       int thisres = 0;
680       __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
681       thisres |= test( w, dt.expected[i] );
682       __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
683       thisres |= test( w, dt.expected[i] );
684       __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
685       thisres |= test( w, dt.expected[i] );
686       __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
687       thisres |= test( w, dt.expected[i] );
688       __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
689       thisres |= test( w, dt.expected[i] );
690       __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
691       thisres |= test( w, dt.expected[i] );
692       if (thisres)
693          printf( "Failure 1 %d %a %a\n", i, w, dt.expected[i] );
694       res |= thisres;
695       thisres = 0;
696       __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
697       thisres |= test( -w, dt.expected[i] );
698       __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
699       thisres |= test( -w, dt.expected[i] );
700       __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
701       thisres |= test( -w, dt.expected[i] );
702       __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
703       thisres |= test( -w, dt.expected[i] );
704       __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
705       thisres |= test( -w, dt.expected[i] );
706       __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
707       thisres |= test( -w, dt.expected[i] );
708       if (thisres)
709          printf( "Failure 2 %d %a %a\n", i, w, dt.expected[i] );
710       res |= thisres;
711    }
712    for (i = 0; i < N; i++)
713       dt.z[i] = -dt.z[i];
714    for (i = 0; i < N; i++) {
715       int thisres = 0;
716       __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
717       thisres |= test( w, dt.expected[i] );
718       __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
719       thisres |= test( w, dt.expected[i] );
720       __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
721       thisres |= test( w, dt.expected[i] );
722       __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
723       thisres |= test( w, dt.expected[i] );
724       __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
725       thisres |= test( w, dt.expected[i] );
726       __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
727       thisres |= test( w, dt.expected[i] );
728       if (thisres)
729          printf( "Failure 3 %d %a %a\n", i, w, dt.expected[i] );
730       res |= thisres;
731       thisres = 0;
732       __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
733       thisres |= test( -w, dt.expected[i] );
734       __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
735       thisres |= test( -w, dt.expected[i] );
736       __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
737       thisres |= test( -w, dt.expected[i] );
738       __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
739       thisres |= test( -w, dt.expected[i] );
740       __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
741       thisres |= test( -w, dt.expected[i] );
742       __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
743       thisres |= test( -w, dt.expected[i] );
744       if (thisres)
745          printf( "Failure 4 %d %a %a\n", i, w, dt.expected[i] );
746       res |= thisres;
747    }
748    for (i = 0; i < N; i++)
749       dt.z[i] = -dt.z[i];
750    for (i = 0; i < N; i += 2) {
751       int thisres = 0;
752       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
753                           "vfmadd132pd %%xmm7, %%xmm8, %%xmm9;"
754                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
755                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
756       for (j = 0; j < 2; j++)
757          thisres |= test( dt.res[i+j], dt.expected[i+j] );
758       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
759                           "vfmadd132pd (%2), %%xmm8, %%xmm9;"
760                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
761                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
762       for (j = 0; j < 2; j++)
763          thisres |= test( dt.res[i+j], dt.expected[i+j] );
764       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
765                           "vfmadd213pd %%xmm7, %%xmm8, %%xmm9;"
766                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
767                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
768       for (j = 0; j < 2; j++)
769          thisres |= test( dt.res[i+j], dt.expected[i+j] );
770       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
771                           "vfmadd213pd (%3), %%xmm8, %%xmm9;"
772                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
773                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
774       for (j = 0; j < 2; j++)
775          thisres |= test( dt.res[i+j], dt.expected[i+j] );
776       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
777                           "vfmadd231pd %%xmm7, %%xmm8, %%xmm9;"
778                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
779                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
780       for (j = 0; j < 2; j++)
781          thisres |= test( dt.res[i+j], dt.expected[i+j] );
782       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
783                           "vfmadd231pd (%2), %%xmm8, %%xmm9;"
784                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
785                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
786       for (j = 0; j < 2; j++)
787          thisres |= test( dt.res[i+j], dt.expected[i+j] );
788       if (thisres) {
789          printf( "Failure 5 %d", i );
790          for (j = 0; j < 2; j++)
791             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
792          printf( "\n" );
793       }
794       res |= thisres;
795       thisres = 0;
796       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
797                           "vfnmsub132pd %%xmm7, %%xmm8, %%xmm9;"
798                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
799                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
800       for (j = 0; j < 2; j++)
801          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
802       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
803                           "vfnmsub132pd (%2), %%xmm8, %%xmm9;"
804                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
805                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
806       for (j = 0; j < 2; j++)
807          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
808       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
809                           "vfnmsub213pd %%xmm7, %%xmm8, %%xmm9;"
810                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
811                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
812       for (j = 0; j < 2; j++)
813          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
814       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
815                           "vfnmsub213pd (%3), %%xmm8, %%xmm9;"
816                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
817                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
818       for (j = 0; j < 2; j++)
819          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
820       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
821                           "vfnmsub231pd %%xmm7, %%xmm8, %%xmm9;"
822                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
823                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
824       for (j = 0; j < 2; j++)
825          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
826       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
827                           "vfnmsub231pd (%2), %%xmm8, %%xmm9;"
828                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
829                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
830       for (j = 0; j < 2; j++)
831          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
832       if (thisres) {
833          printf( "Failure 6 %d", i );
834          for (j = 0; j < 2; j++)
835             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
836          printf( "\n" );
837       }
838       res |= thisres;
839    }
840    for (i = 0; i < N; i++)
841       dt.z[i] = -dt.z[i];
842    for (i = 0; i < N; i += 2) {
843       int thisres = 0;
844       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
845                           "vfmsub132pd %%xmm7, %%xmm8, %%xmm9;"
846                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
847                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
848       for (j = 0; j < 2; j++)
849          thisres |= test( dt.res[i+j], dt.expected[i+j] );
850       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
851                           "vfmsub132pd (%2), %%xmm8, %%xmm9;"
852                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
853                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
854       for (j = 0; j < 2; j++)
855          thisres |= test( dt.res[i+j], dt.expected[i+j] );
856       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
857                           "vfmsub213pd %%xmm7, %%xmm8, %%xmm9;"
858                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
859                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
860       for (j = 0; j < 2; j++)
861          thisres |= test( dt.res[i+j], dt.expected[i+j] );
862       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
863                           "vfmsub213pd (%3), %%xmm8, %%xmm9;"
864                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
865                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
866       for (j = 0; j < 2; j++)
867          thisres |= test( dt.res[i+j], dt.expected[i+j] );
868       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
869                           "vfmsub231pd %%xmm7, %%xmm8, %%xmm9;"
870                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
871                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
872       for (j = 0; j < 2; j++)
873          thisres |= test( dt.res[i+j], dt.expected[i+j] );
874       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
875                           "vfmsub231pd (%2), %%xmm8, %%xmm9;"
876                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
877                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
878       for (j = 0; j < 2; j++)
879          thisres |= test( dt.res[i+j], dt.expected[i+j] );
880       if (thisres) {
881          printf( "Failure 7 %d", i );
882          for (j = 0; j < 2; j++)
883             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
884          printf( "\n" );
885       }
886       res |= thisres;
887       thisres = 0;
888       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
889                           "vfnmadd132pd %%xmm7, %%xmm8, %%xmm9;"
890                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
891                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
892       for (j = 0; j < 2; j++)
893          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
894       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
895                           "vfnmadd132pd (%2), %%xmm8, %%xmm9;"
896                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
897                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
898       for (j = 0; j < 2; j++)
899          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
900       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
901                           "vfnmadd213pd %%xmm7, %%xmm8, %%xmm9;"
902                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
903                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
904       for (j = 0; j < 2; j++)
905          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
906       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
907                           "vfnmadd213pd (%3), %%xmm8, %%xmm9;"
908                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
909                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
910       for (j = 0; j < 2; j++)
911          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
912       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
913                           "vfnmadd231pd %%xmm7, %%xmm8, %%xmm9;"
914                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
915                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
916       for (j = 0; j < 2; j++)
917          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
918       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
919                           "vfnmadd231pd (%2), %%xmm8, %%xmm9;"
920                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
921                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
922       for (j = 0; j < 2; j++)
923          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
924       if (thisres) {
925          printf( "Failure 8 %d", i );
926          for (j = 0; j < 2; j++)
927             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
928          printf( "\n" );
929       }
930       res |= thisres;
931    }
932    for (i = 1; i < N; i += 2)
933       dt.z[i] = -dt.z[i];
934    for (i = 0; i < N; i += 2) {
935       int thisres = 0;
936       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
937                           "vfmaddsub132pd %%xmm7, %%xmm8, %%xmm9;"
938                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
939                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
940       for (j = 0; j < 2; j++)
941          thisres |= test( dt.res[i+j], dt.expected[i+j] );
942       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
943                           "vfmaddsub132pd (%2), %%xmm8, %%xmm9;"
944                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
945                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
946       for (j = 0; j < 2; j++)
947          thisres |= test( dt.res[i+j], dt.expected[i+j] );
948       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
949                           "vfmaddsub213pd %%xmm7, %%xmm8, %%xmm9;"
950                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
951                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
952       for (j = 0; j < 2; j++)
953          thisres |= test( dt.res[i+j], dt.expected[i+j] );
954       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
955                           "vfmaddsub213pd (%3), %%xmm8, %%xmm9;"
956                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
957                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
958       for (j = 0; j < 2; j++)
959          thisres |= test( dt.res[i+j], dt.expected[i+j] );
960       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
961                           "vfmaddsub231pd %%xmm7, %%xmm8, %%xmm9;"
962                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
963                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
964       for (j = 0; j < 2; j++)
965          thisres |= test( dt.res[i+j], dt.expected[i+j] );
966       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
967                           "vfmaddsub231pd (%2), %%xmm8, %%xmm9;"
968                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
969                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
970       for (j = 0; j < 2; j++)
971          thisres |= test( dt.res[i+j], dt.expected[i+j] );
972       if (thisres) {
973          printf( "Failure 9 %d", i );
974          for (j = 0; j < 2; j++)
975             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
976          printf( "\n" );
977       }
978       res |= thisres;
979    }
980    for (i = 0; i < N; i++)
981       dt.z[i] = -dt.z[i];
982    for (i = 0; i < N; i += 2) {
983       int thisres = 0;
984       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
985                           "vfmsubadd132pd %%xmm7, %%xmm8, %%xmm9;"
986                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
987                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
988       for (j = 0; j < 2; j++)
989          thisres |= test( dt.res[i+j], dt.expected[i+j] );
990       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
991                           "vfmsubadd132pd (%2), %%xmm8, %%xmm9;"
992                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
993                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
994       for (j = 0; j < 2; j++)
995          thisres |= test( dt.res[i+j], dt.expected[i+j] );
996       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
997                           "vfmsubadd213pd %%xmm7, %%xmm8, %%xmm9;"
998                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
999                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1000       for (j = 0; j < 2; j++)
1001          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1002       __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
1003                           "vfmsubadd213pd (%3), %%xmm8, %%xmm9;"
1004                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1005                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1006       for (j = 0; j < 2; j++)
1007          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1008       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
1009                           "vfmsubadd231pd %%xmm7, %%xmm8, %%xmm9;"
1010                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1011                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1012       for (j = 0; j < 2; j++)
1013          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1014       __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
1015                           "vfmsubadd231pd (%2), %%xmm8, %%xmm9;"
1016                           "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1017                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1018       for (j = 0; j < 2; j++)
1019          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1020       if (thisres) {
1021          printf( "Failure 10 %d", i );
1022          for (j = 0; j < 2; j++)
1023             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1024          printf( "\n" );
1025       }
1026       res |= thisres;
1027    }
1028    for (i = 1; i < N; i += 2)
1029       dt.z[i] = -dt.z[i];
1030    for (i = 0; i < N; i += 4) {
1031       int thisres = 0;
1032       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1033                           "vfmadd132pd %%ymm7, %%ymm8, %%ymm9;"
1034                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1035                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1036       for (j = 0; j < 4; j++)
1037          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1038       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1039                           "vfmadd132pd (%2), %%ymm8, %%ymm9;"
1040                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1041                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1042       for (j = 0; j < 4; j++)
1043          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1044       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1045                           "vfmadd213pd %%ymm7, %%ymm8, %%ymm9;"
1046                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1047                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1048       for (j = 0; j < 4; j++)
1049          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1050       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1051                           "vfmadd213pd (%3), %%ymm8, %%ymm9;"
1052                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1053                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1054       for (j = 0; j < 4; j++)
1055          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1056       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1057                           "vfmadd231pd %%ymm7, %%ymm8, %%ymm9;"
1058                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1059                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1060       for (j = 0; j < 4; j++)
1061          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1062       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1063                           "vfmadd231pd (%2), %%ymm8, %%ymm9;"
1064                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1065                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1066       for (j = 0; j < 4; j++)
1067          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1068       if (thisres) {
1069          printf( "Failure 11 %d", i );
1070          for (j = 0; j < 4; j++)
1071             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1072          printf( "\n" );
1073       }
1074       res |= thisres;
1075       thisres = 0;
1076       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1077                           "vfnmsub132pd %%ymm7, %%ymm8, %%ymm9;"
1078                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1079                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1080       for (j = 0; j < 4; j++)
1081          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1082       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1083                           "vfnmsub132pd (%2), %%ymm8, %%ymm9;"
1084                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1085                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1086       for (j = 0; j < 4; j++)
1087          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1088       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1089                           "vfnmsub213pd %%ymm7, %%ymm8, %%ymm9;"
1090                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1091                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1092       for (j = 0; j < 4; j++)
1093          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1094       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1095                           "vfnmsub213pd (%3), %%ymm8, %%ymm9;"
1096                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1097                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1098       for (j = 0; j < 4; j++)
1099          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1100       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1101                           "vfnmsub231pd %%ymm7, %%ymm8, %%ymm9;"
1102                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1103                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1104       for (j = 0; j < 4; j++)
1105          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1106       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1107                           "vfnmsub231pd (%2), %%ymm8, %%ymm9;"
1108                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1109                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1110       for (j = 0; j < 4; j++)
1111          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1112       if (thisres) {
1113          printf( "Failure 12 %d", i );
1114          for (j = 0; j < 4; j++)
1115             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1116          printf( "\n" );
1117       }
1118       res |= thisres;
1119    }
1120    for (i = 0; i < N; i++)
1121       dt.z[i] = -dt.z[i];
1122    for (i = 0; i < N; i += 4) {
1123       int thisres = 0;
1124       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1125                           "vfmsub132pd %%ymm7, %%ymm8, %%ymm9;"
1126                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1127                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1128       for (j = 0; j < 4; j++)
1129          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1130       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1131                           "vfmsub132pd (%2), %%ymm8, %%ymm9;"
1132                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1133                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1134       for (j = 0; j < 4; j++)
1135          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1136       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1137                           "vfmsub213pd %%ymm7, %%ymm8, %%ymm9;"
1138                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1139                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1140       for (j = 0; j < 4; j++)
1141          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1142       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1143                           "vfmsub213pd (%3), %%ymm8, %%ymm9;"
1144                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1145                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1146       for (j = 0; j < 4; j++)
1147          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1148       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1149                           "vfmsub231pd %%ymm7, %%ymm8, %%ymm9;"
1150                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1151                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1152       for (j = 0; j < 4; j++)
1153          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1154       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1155                           "vfmsub231pd (%2), %%ymm8, %%ymm9;"
1156                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1157                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1158       for (j = 0; j < 4; j++)
1159          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1160       if (thisres) {
1161          printf( "Failure 13 %d", i );
1162          for (j = 0; j < 4; j++)
1163             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1164          printf( "\n" );
1165       }
1166       res |= thisres;
1167       thisres = 0;
1168       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1169                           "vfnmadd132pd %%ymm7, %%ymm8, %%ymm9;"
1170                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1171                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1172       for (j = 0; j < 4; j++)
1173          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1174       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1175                           "vfnmadd132pd (%2), %%ymm8, %%ymm9;"
1176                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1177                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1178       for (j = 0; j < 4; j++)
1179          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1180       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1181                           "vfnmadd213pd %%ymm7, %%ymm8, %%ymm9;"
1182                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1183                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1184       for (j = 0; j < 4; j++)
1185          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1186       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1187                           "vfnmadd213pd (%3), %%ymm8, %%ymm9;"
1188                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1189                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1190       for (j = 0; j < 4; j++)
1191          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1192       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1193                           "vfnmadd231pd %%ymm7, %%ymm8, %%ymm9;"
1194                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1195                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1196       for (j = 0; j < 4; j++)
1197          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1198       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1199                           "vfnmadd231pd (%2), %%ymm8, %%ymm9;"
1200                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1201                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1202       for (j = 0; j < 4; j++)
1203          thisres |= test( -dt.res[i+j], dt.expected[i+j] );
1204       if (thisres) {
1205          printf( "Failure 14 %d", i );
1206          for (j = 0; j < 4; j++)
1207             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1208          printf( "\n" );
1209       }
1210       res |= thisres;
1211    }
1212    for (i = 1; i < N; i += 2)
1213       dt.z[i] = -dt.z[i];
1214    for (i = 0; i < N; i += 4) {
1215       int thisres = 0;
1216       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1217                           "vfmaddsub132pd %%ymm7, %%ymm8, %%ymm9;"
1218                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1219                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1220       for (j = 0; j < 4; j++)
1221          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1222       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1223                           "vfmaddsub132pd (%2), %%ymm8, %%ymm9;"
1224                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1225                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1226       for (j = 0; j < 4; j++)
1227          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1228       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1229                           "vfmaddsub213pd %%ymm7, %%ymm8, %%ymm9;"
1230                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1231                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1232       for (j = 0; j < 4; j++)
1233          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1234       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1235                           "vfmaddsub213pd (%3), %%ymm8, %%ymm9;"
1236                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1237                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1238       for (j = 0; j < 4; j++)
1239          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1240       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1241                           "vfmaddsub231pd %%ymm7, %%ymm8, %%ymm9;"
1242                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1243                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1244       for (j = 0; j < 4; j++)
1245          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1246       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1247                           "vfmaddsub231pd (%2), %%ymm8, %%ymm9;"
1248                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1249                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1250       for (j = 0; j < 4; j++)
1251          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1252       if (thisres) {
1253          printf( "Failure 15 %d", i );
1254          for (j = 0; j < 4; j++)
1255             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1256          printf( "\n" );
1257       }
1258       res |= thisres;
1259    }
1260    for (i = 0; i < N; i++)
1261       dt.z[i] = -dt.z[i];
1262    for (i = 0; i < N; i += 4) {
1263       int thisres = 0;
1264       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
1265                           "vfmsubadd132pd %%ymm7, %%ymm8, %%ymm9;"
1266                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1267                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1268       for (j = 0; j < 4; j++)
1269          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1270       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
1271                           "vfmsubadd132pd (%2), %%ymm8, %%ymm9;"
1272                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1273                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1274       for (j = 0; j < 4; j++)
1275          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1276       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
1277                           "vfmsubadd213pd %%ymm7, %%ymm8, %%ymm9;"
1278                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1279                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1280       for (j = 0; j < 4; j++)
1281          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1282       __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
1283                           "vfmsubadd213pd (%3), %%ymm8, %%ymm9;"
1284                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1285                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1286       for (j = 0; j < 4; j++)
1287          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1288       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
1289                           "vfmsubadd231pd %%ymm7, %%ymm8, %%ymm9;"
1290                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1291                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1292       for (j = 0; j < 4; j++)
1293          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1294       __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
1295                           "vfmsubadd231pd (%2), %%ymm8, %%ymm9;"
1296                           "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
1297                                                      "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
1298       for (j = 0; j < 4; j++)
1299          thisres |= test( dt.res[i+j], dt.expected[i+j] );
1300       if (thisres) {
1301          printf( "Failure 16 %d", i );
1302          for (j = 0; j < 4; j++)
1303             printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
1304          printf( "\n" );
1305       }
1306       res |= thisres;
1307    }
1308    for (i = 1; i < N; i += 2)
1309       dt.z[i] = -dt.z[i];
1310    return res;
1311 }
1312 
main()1313 int main( )
1314 {
1315    int res = 0;
1316    int i = 0;
1317    plus_zero = 0.0;
1318    __asm __volatile__ ("" : : "r" (&plus_zero) : "memory");
1319    nan_value = plus_zero / plus_zero;
1320    plus_infty = 3.40282346638528859812e+38F * 16.0F;
1321    minus_infty = -plus_infty;
1322 #define TEST_F( a, b, c, d ) \
1323    do {				\
1324       ft.x[i] = a;		\
1325       ft.y[i] = b;		\
1326       ft.z[i] = c;		\
1327       ft.expected[i] = d;	\
1328       i++;			\
1329    } while (0)
1330    TEST_F( 1.0, 2.0, 3.0, 5.0 );
1331    TEST_F( nan_value, 2.0, 3.0, nan_value );
1332    TEST_F( 1.0, nan_value, 3.0, nan_value );
1333    TEST_F( 1.0, 2.0, nan_value, nan_value );
1334    TEST_F( plus_infty, 0.0, nan_value, nan_value );
1335    TEST_F( minus_infty, 0.0, nan_value, nan_value );
1336    TEST_F( 0.0, plus_infty, nan_value, nan_value );
1337    TEST_F( 0.0, minus_infty, nan_value, nan_value );
1338    TEST_F( plus_infty, 0.0, 1.0, nan_value );
1339    TEST_F( minus_infty, 0.0, 1.0, nan_value );
1340    TEST_F( 0.0, plus_infty, 1.0, nan_value );
1341    TEST_F( 0.0, minus_infty, 1.0, nan_value );
1342    TEST_F( plus_infty, plus_infty, minus_infty, nan_value );
1343    TEST_F( minus_infty, plus_infty, plus_infty, nan_value );
1344    TEST_F( plus_infty, minus_infty, plus_infty, nan_value );
1345    TEST_F( minus_infty, minus_infty, minus_infty, nan_value );
1346    TEST_F( plus_infty, 3.5L, minus_infty, nan_value );
1347    TEST_F( minus_infty, -7.5L, minus_infty, nan_value );
1348    TEST_F( -13.5L, plus_infty, plus_infty, nan_value );
1349    TEST_F( minus_infty, 7.5L, plus_infty, nan_value );
1350    TEST_F( 1.25L, 0.75L, 0.0625L, 1.0L );
1351    TEST_F( -3.40282346638528859812e+38F, -3.40282346638528859812e+38F, minus_infty, minus_infty );
1352    TEST_F( 3.40282346638528859812e+38F / 2, 3.40282346638528859812e+38F / 2, minus_infty, minus_infty );
1353    TEST_F( -3.40282346638528859812e+38F, 3.40282346638528859812e+38F, plus_infty, plus_infty );
1354    TEST_F( 3.40282346638528859812e+38F / 2, -3.40282346638528859812e+38F / 4, plus_infty, plus_infty );
1355    TEST_F( plus_infty, 4, plus_infty, plus_infty );
1356    TEST_F( 2, minus_infty, minus_infty, minus_infty );
1357    TEST_F( minus_infty, minus_infty, plus_infty, plus_infty );
1358    TEST_F( plus_infty, minus_infty, minus_infty, minus_infty );
1359    TEST_F( 0x1.7ff8p+13, 0x1.000002p+0, 0x1.ffffp-24, 0x1.7ff802p+13 );
1360    TEST_F( 0x1.fffp+0, 0x1.00001p+0, -0x1.fffp+0, 0x1.fffp-20 );
1361    TEST_F( 0x1.9abcdep+127, 0x0.9abcdep-126, -0x1.f08948p+0, 0x1.bb421p-25 );
1362    TEST_F( 0x1.9abcdep+100, 0x0.9abcdep-126, -0x1.f08948p-27, 0x1.bb421p-52 );
1363    TEST_F( 0x1.fffffep+127, 0x1.001p+0, -0x1.fffffep+127, 0x1.fffffep+115 );
1364    TEST_F( -0x1.fffffep+127, 0x1.fffffep+0, 0x1.fffffep+127, -0x1.fffffap+127 );
1365    TEST_F( 0x1.fffffep+127, 2.0, -0x1.fffffep+127, 0x1.fffffep+127 );
1366 
1367    res |= test_fmaf( );
1368    i = 0;
1369 #define TEST( a, b, c, d ) \
1370    do {				\
1371       dt.x[i] = a;		\
1372       dt.y[i] = b;		\
1373       dt.z[i] = c;		\
1374       dt.expected[i] = d;	\
1375       i++;			\
1376    } while (0)
1377    TEST( 1.0, 2.0, 3.0, 5.0 );
1378    TEST( nan_value, 2.0, 3.0, nan_value );
1379    TEST( 1.0, nan_value, 3.0, nan_value );
1380    TEST( 1.0, 2.0, nan_value, nan_value );
1381    TEST( plus_infty, 0.0, nan_value, nan_value );
1382    TEST( minus_infty, 0.0, nan_value, nan_value );
1383    TEST( 0.0, plus_infty, nan_value, nan_value );
1384    TEST( 0.0, minus_infty, nan_value, nan_value );
1385    TEST( plus_infty, 0.0, 1.0, nan_value );
1386    TEST( minus_infty, 0.0, 1.0, nan_value );
1387    TEST( 0.0, plus_infty, 1.0, nan_value );
1388    TEST( 0.0, minus_infty, 1.0, nan_value );
1389    TEST( plus_infty, plus_infty, minus_infty, nan_value );
1390    TEST( minus_infty, plus_infty, plus_infty, nan_value );
1391    TEST( plus_infty, minus_infty, plus_infty, nan_value );
1392    TEST( minus_infty, minus_infty, minus_infty, nan_value );
1393    TEST( plus_infty, 3.5L, minus_infty, nan_value );
1394    TEST( minus_infty, -7.5L, minus_infty, nan_value );
1395    TEST( -13.5L, plus_infty, plus_infty, nan_value );
1396    TEST( minus_infty, 7.5L, plus_infty, nan_value );
1397    TEST( 1.25L, 0.75L, 0.0625L, 1.0L );
1398    TEST( -1.79769313486231570815e+308L, -1.79769313486231570815e+308L, minus_infty, minus_infty );
1399    TEST( 1.79769313486231570815e+308L / 2, 1.79769313486231570815e+308L / 2, minus_infty, minus_infty );
1400    TEST( -1.79769313486231570815e+308L, 1.79769313486231570815e+308L, plus_infty, plus_infty );
1401    TEST( 1.79769313486231570815e+308L / 2, -1.79769313486231570815e+308L / 4, plus_infty, plus_infty );
1402    TEST( plus_infty, 4, plus_infty, plus_infty );
1403    TEST( 2, minus_infty, minus_infty, minus_infty );
1404    TEST( minus_infty, minus_infty, plus_infty, plus_infty );
1405    TEST( plus_infty, minus_infty, minus_infty, minus_infty );
1406    TEST( 0x1.7fp+13, 0x1.0000000000001p+0, 0x1.ffep-48, 0x1.7f00000000001p+13 );
1407    TEST( 0x1.fffp+0, 0x1.0000000000001p+0, -0x1.fffp+0, 0x1.fffp-52 );
1408    TEST( 0x1.0000002p+0, 0x1.ffffffcp-1, 0x1p-300, 1.0 );
1409    TEST( 0x1.0000002p+0, 0x1.ffffffcp-1, -0x1p-300, 0x1.fffffffffffffp-1 );
1410    TEST( 0x1.deadbeef2feedp+1023, 0x0.deadbeef2feedp-1022, -0x1.a05f8c01a4bfbp+1, 0x1.0989687bc9da4p-53 );
1411    TEST( 0x1.deadbeef2feedp+900, 0x0.deadbeef2feedp-1022, -0x1.a05f8c01a4bfbp-122, 0x1.0989687bc9da4p-176 );
1412    TEST( 0x1.fffffffffffffp+1023, 0x1.001p+0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1011 );
1413    TEST( -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+0, 0x1.fffffffffffffp+1023, -0x1.ffffffffffffdp+1023 );
1414    TEST( 0x1.fffffffffffffp+1023, 2.0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1023 );
1415    TEST( 0x1.6a09e667f3bccp-538, 0x1.6a09e667f3bccp-538, 0.0, 0.0 );
1416    TEST( 0x1.deadbeef2feedp-495, 0x1.deadbeef2feedp-495, -0x1.bf86a5786a574p-989, 0x0.0000042625a1fp-1022 );
1417    TEST( 0x1.deadbeef2feedp-503, 0x1.deadbeef2feedp-503, -0x1.bf86a5786a574p-1005, 0x0.0000000004262p-1022 );
1418    TEST( 0x1p-537, 0x1p-538, 0x1p-1074, 0x0.0000000000002p-1022 );
1419    TEST( 0x1.7fffff8p-968, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000001p-1022 );
1420    TEST( 0x1.4000004p-967, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000003p-1022 );
1421    TEST( 0x1.4p-967, -0x1p-106, -0x0.000001p-1022, -0x0.0000010000002p-1022 );
1422    TEST( -0x1.19cab66d73e17p-959, 0x1.c7108a8c5ff51p-107, -0x0.80b0ad65d9b64p-1022, -0x0.80b0ad65d9d59p-1022 );
1423    TEST( -0x1.d2eaed6e8e9d3p-979, -0x1.4e066c62ac9ddp-63, -0x0.9245e6b003454p-1022, -0x0.9245c09c5fb5dp-1022 );
1424    TEST( 0x1.153d650bb9f06p-907, 0x1.2d01230d48407p-125, -0x0.b278d5acfc3cp-1022, -0x0.b22757123bbe9p-1022 );
1425    TEST( -0x1.fffffffffffffp-711, 0x1.fffffffffffffp-275, 0x1.fffffe00007ffp-983, 0x1.7ffffe00007ffp-983 );
1426 
1427    res |= test_fma( );
1428    if (res == 0)
1429       printf( "Testing successful\n");
1430    return 0;
1431 }
1432