1; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s
2
3; CHECK-LABEL: fmaddsubpd_loop_128:
4; CHECK:   vfmaddsub231pd %xmm1, %xmm0, %xmm2
5; CHECK:   vmovaps %xmm2, %xmm0
6; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmaddsub.pd as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmaddsub231pd on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
7define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
8entry:
9  br label %for.cond
10
11for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
12  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
13  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
14  %cmp = icmp slt i32 %i.0, %iter
15  br i1 %cmp, label %for.body, label %for.end
16
17for.body:
  ; Empty block; control passes straight to for.inc.
18  br label %for.inc
19
20for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
21  %0 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
22  %inc = add nsw i32 %i.0, 1
23  br label %for.cond
24
25for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
26  ret <2 x double> %c.addr.0
27}
28
29; CHECK-LABEL: fmsubaddpd_loop_128:
30; CHECK:   vfmsubadd231pd %xmm1, %xmm0, %xmm2
31; CHECK:   vmovaps %xmm2, %xmm0
32; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmsubadd.pd as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmsubadd231pd on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
33define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
34entry:
35  br label %for.cond
36
37for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
38  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
39  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
40  %cmp = icmp slt i32 %i.0, %iter
41  br i1 %cmp, label %for.body, label %for.end
42
43for.body:
  ; Empty block; control passes straight to for.inc.
44  br label %for.inc
45
46for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
47  %0 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
48  %inc = add nsw i32 %i.0, 1
49  br label %for.cond
50
51for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
52  ret <2 x double> %c.addr.0
53}
54
55; CHECK-LABEL: fmaddpd_loop_128:
56; CHECK:   vfmadd231pd %xmm1, %xmm0, %xmm2
57; CHECK:   vmovaps %xmm2, %xmm0
58; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmadd.pd as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmadd231pd on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
59define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
60entry:
61  br label %for.cond
62
63for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
64  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
65  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
66  %cmp = icmp slt i32 %i.0, %iter
67  br i1 %cmp, label %for.body, label %for.end
68
69for.body:
  ; Empty block; control passes straight to for.inc.
70  br label %for.inc
71
72for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
73  %0 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
74  %inc = add nsw i32 %i.0, 1
75  br label %for.cond
76
77for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
78  ret <2 x double> %c.addr.0
79}
80
81; CHECK-LABEL: fmsubpd_loop_128:
82; CHECK:   vfmsub231pd %xmm1, %xmm0, %xmm2
83; CHECK:   vmovaps %xmm2, %xmm0
84; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmsub.pd as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmsub231pd on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
85define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
86entry:
87  br label %for.cond
88
89for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
90  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
91  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
92  %cmp = icmp slt i32 %i.0, %iter
93  br i1 %cmp, label %for.body, label %for.end
94
95for.body:
  ; Empty block; control passes straight to for.inc.
96  br label %for.inc
97
98for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
99  %0 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
100  %inc = add nsw i32 %i.0, 1
101  br label %for.cond
102
103for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
104  ret <2 x double> %c.addr.0
105}
106
107; CHECK-LABEL: fnmaddpd_loop_128:
108; CHECK:   vfnmadd231pd %xmm1, %xmm0, %xmm2
109; CHECK:   vmovaps %xmm2, %xmm0
110; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfnmadd.pd as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfnmadd231pd on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
111define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
112entry:
113  br label %for.cond
114
115for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
116  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
117  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
118  %cmp = icmp slt i32 %i.0, %iter
119  br i1 %cmp, label %for.body, label %for.end
120
121for.body:
  ; Empty block; control passes straight to for.inc.
122  br label %for.inc
123
124for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
125  %0 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
126  %inc = add nsw i32 %i.0, 1
127  br label %for.cond
128
129for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
130  ret <2 x double> %c.addr.0
131}
132
133; CHECK-LABEL: fnmsubpd_loop_128:
134; CHECK:   vfnmsub231pd %xmm1, %xmm0, %xmm2
135; CHECK:   vmovaps %xmm2, %xmm0
136; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfnmsub.pd as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfnmsub231pd on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
137define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
138entry:
139  br label %for.cond
140
141for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
142  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
143  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
144  %cmp = icmp slt i32 %i.0, %iter
145  br i1 %cmp, label %for.body, label %for.end
146
147for.body:
  ; Empty block; control passes straight to for.inc.
148  br label %for.inc
149
150for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
151  %0 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
152  %inc = add nsw i32 %i.0, 1
153  br label %for.cond
154
155for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
156  ret <2 x double> %c.addr.0
157}
158
; Declarations of the six x86 FMA intrinsics called by the <2 x double> test
; functions in this file. Each takes three <2 x double> operands and returns
; a <2 x double>.
159declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
160declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
161declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
162declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
163declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
164declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
165
166
167; CHECK-LABEL: fmaddsubps_loop_128:
168; CHECK:   vfmaddsub231ps %xmm1, %xmm0, %xmm2
169; CHECK:   vmovaps %xmm2, %xmm0
170; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmaddsub.ps as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmaddsub231ps on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
171define <4 x float> @fmaddsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
172entry:
173  br label %for.cond
174
175for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
176  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
177  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
178  %cmp = icmp slt i32 %i.0, %iter
179  br i1 %cmp, label %for.body, label %for.end
180
181for.body:
  ; Empty block; control passes straight to for.inc.
182  br label %for.inc
183
184for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
185  %0 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
186  %inc = add nsw i32 %i.0, 1
187  br label %for.cond
188
189for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
190  ret <4 x float> %c.addr.0
191}
192
193; CHECK-LABEL: fmsubaddps_loop_128:
194; CHECK:   vfmsubadd231ps %xmm1, %xmm0, %xmm2
195; CHECK:   vmovaps %xmm2, %xmm0
196; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmsubadd.ps as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmsubadd231ps on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
197define <4 x float> @fmsubaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
198entry:
199  br label %for.cond
200
201for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
202  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
203  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
204  %cmp = icmp slt i32 %i.0, %iter
205  br i1 %cmp, label %for.body, label %for.end
206
207for.body:
  ; Empty block; control passes straight to for.inc.
208  br label %for.inc
209
210for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
211  %0 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
212  %inc = add nsw i32 %i.0, 1
213  br label %for.cond
214
215for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
216  ret <4 x float> %c.addr.0
217}
218
219; CHECK-LABEL: fmaddps_loop_128:
220; CHECK:   vfmadd231ps %xmm1, %xmm0, %xmm2
221; CHECK:   vmovaps %xmm2, %xmm0
222; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmadd.ps as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmadd231ps on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
223define <4 x float> @fmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
224entry:
225  br label %for.cond
226
227for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
228  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
229  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
230  %cmp = icmp slt i32 %i.0, %iter
231  br i1 %cmp, label %for.body, label %for.end
232
233for.body:
  ; Empty block; control passes straight to for.inc.
234  br label %for.inc
235
236for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
237  %0 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
238  %inc = add nsw i32 %i.0, 1
239  br label %for.cond
240
241for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
242  ret <4 x float> %c.addr.0
243}
244
245; CHECK-LABEL: fmsubps_loop_128:
246; CHECK:   vfmsub231ps %xmm1, %xmm0, %xmm2
247; CHECK:   vmovaps %xmm2, %xmm0
248; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmsub.ps as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmsub231ps on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
249define <4 x float> @fmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
250entry:
251  br label %for.cond
252
253for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
254  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
255  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
256  %cmp = icmp slt i32 %i.0, %iter
257  br i1 %cmp, label %for.body, label %for.end
258
259for.body:
  ; Empty block; control passes straight to for.inc.
260  br label %for.inc
261
262for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
263  %0 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
264  %inc = add nsw i32 %i.0, 1
265  br label %for.cond
266
267for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
268  ret <4 x float> %c.addr.0
269}
270
271; CHECK-LABEL: fnmaddps_loop_128:
272; CHECK:   vfnmadd231ps %xmm1, %xmm0, %xmm2
273; CHECK:   vmovaps %xmm2, %xmm0
274; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfnmadd.ps as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfnmadd231ps on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
275define <4 x float> @fnmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
276entry:
277  br label %for.cond
278
279for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
280  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
281  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
282  %cmp = icmp slt i32 %i.0, %iter
283  br i1 %cmp, label %for.body, label %for.end
284
285for.body:
  ; Empty block; control passes straight to for.inc.
286  br label %for.inc
287
288for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
289  %0 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
290  %inc = add nsw i32 %i.0, 1
291  br label %for.cond
292
293for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
294  ret <4 x float> %c.addr.0
295}
296
297; CHECK-LABEL: fnmsubps_loop_128:
298; CHECK:   vfnmsub231ps %xmm1, %xmm0, %xmm2
299; CHECK:   vmovaps %xmm2, %xmm0
300; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfnmsub.ps as the third operand, while %a and %b stay
; loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfnmsub231ps on xmm registers,
; then a vmovaps into %xmm0 immediately before retq.
301define <4 x float> @fnmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
302entry:
303  br label %for.cond
304
305for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
306  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
307  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
308  %cmp = icmp slt i32 %i.0, %iter
309  br i1 %cmp, label %for.body, label %for.end
310
311for.body:
  ; Empty block; control passes straight to for.inc.
312  br label %for.inc
313
314for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
315  %0 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
316  %inc = add nsw i32 %i.0, 1
317  br label %for.cond
318
319for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
320  ret <4 x float> %c.addr.0
321}
322
; Declarations of the six x86 FMA intrinsics called by the <4 x float> test
; functions in this file. Each takes three <4 x float> operands and returns
; a <4 x float>.
323declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
324declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
325declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
326declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
327declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
328declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
329
330; CHECK-LABEL: fmaddsubpd_loop_256:
331; CHECK:   vfmaddsub231pd %ymm1, %ymm0, %ymm2
332; CHECK:   vmovaps %ymm2, %ymm0
333; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmaddsub.pd.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmaddsub231pd on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
334define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
335entry:
336  br label %for.cond
337
338for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
339  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
340  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
341  %cmp = icmp slt i32 %i.0, %iter
342  br i1 %cmp, label %for.body, label %for.end
343
344for.body:
  ; Empty block; control passes straight to for.inc.
345  br label %for.inc
346
347for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
348  %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
349  %inc = add nsw i32 %i.0, 1
350  br label %for.cond
351
352for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
353  ret <4 x double> %c.addr.0
354}
355
356; CHECK-LABEL: fmsubaddpd_loop_256:
357; CHECK:   vfmsubadd231pd %ymm1, %ymm0, %ymm2
358; CHECK:   vmovaps %ymm2, %ymm0
359; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmsubadd.pd.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmsubadd231pd on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
360define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
361entry:
362  br label %for.cond
363
364for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
365  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
366  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
367  %cmp = icmp slt i32 %i.0, %iter
368  br i1 %cmp, label %for.body, label %for.end
369
370for.body:
  ; Empty block; control passes straight to for.inc.
371  br label %for.inc
372
373for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
374  %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
375  %inc = add nsw i32 %i.0, 1
376  br label %for.cond
377
378for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
379  ret <4 x double> %c.addr.0
380}
381
382; CHECK-LABEL: fmaddpd_loop_256:
383; CHECK:   vfmadd231pd %ymm1, %ymm0, %ymm2
384; CHECK:   vmovaps %ymm2, %ymm0
385; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmadd.pd.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmadd231pd on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
386define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
387entry:
388  br label %for.cond
389
390for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
391  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
392  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
393  %cmp = icmp slt i32 %i.0, %iter
394  br i1 %cmp, label %for.body, label %for.end
395
396for.body:
  ; Empty block; control passes straight to for.inc.
397  br label %for.inc
398
399for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
400  %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
401  %inc = add nsw i32 %i.0, 1
402  br label %for.cond
403
404for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
405  ret <4 x double> %c.addr.0
406}
407
408; CHECK-LABEL: fmsubpd_loop_256:
409; CHECK:   vfmsub231pd %ymm1, %ymm0, %ymm2
410; CHECK:   vmovaps %ymm2, %ymm0
411; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmsub.pd.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmsub231pd on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
412define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
413entry:
414  br label %for.cond
415
416for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
417  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
418  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
419  %cmp = icmp slt i32 %i.0, %iter
420  br i1 %cmp, label %for.body, label %for.end
421
422for.body:
  ; Empty block; control passes straight to for.inc.
423  br label %for.inc
424
425for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
426  %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
427  %inc = add nsw i32 %i.0, 1
428  br label %for.cond
429
430for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
431  ret <4 x double> %c.addr.0
432}
433
434; CHECK-LABEL: fnmaddpd_loop_256:
435; CHECK:   vfnmadd231pd %ymm1, %ymm0, %ymm2
436; CHECK:   vmovaps %ymm2, %ymm0
437; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfnmadd.pd.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfnmadd231pd on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
438define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
439entry:
440  br label %for.cond
441
442for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
443  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
444  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
445  %cmp = icmp slt i32 %i.0, %iter
446  br i1 %cmp, label %for.body, label %for.end
447
448for.body:
  ; Empty block; control passes straight to for.inc.
449  br label %for.inc
450
451for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
452  %0 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
453  %inc = add nsw i32 %i.0, 1
454  br label %for.cond
455
456for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
457  ret <4 x double> %c.addr.0
458}
459
460; CHECK-LABEL: fnmsubpd_loop_256:
461; CHECK:   vfnmsub231pd %ymm1, %ymm0, %ymm2
462; CHECK:   vmovaps %ymm2, %ymm0
463; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfnmsub.pd.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfnmsub231pd on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
464define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
465entry:
466  br label %for.cond
467
468for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
469  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
470  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
471  %cmp = icmp slt i32 %i.0, %iter
472  br i1 %cmp, label %for.body, label %for.end
473
474for.body:
  ; Empty block; control passes straight to for.inc.
475  br label %for.inc
476
477for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
478  %0 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
479  %inc = add nsw i32 %i.0, 1
480  br label %for.cond
481
482for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
483  ret <4 x double> %c.addr.0
484}
485
; Declarations of the six x86 FMA intrinsics called by the <4 x double> test
; functions in this file. Each takes three <4 x double> operands and returns
; a <4 x double>.
486declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
487declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
488declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
489declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
490declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
491declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
492
493
494; CHECK-LABEL: fmaddsubps_loop_256:
495; CHECK:   vfmaddsub231ps %ymm1, %ymm0, %ymm2
496; CHECK:   vmovaps %ymm2, %ymm0
497; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmaddsub.ps.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmaddsub231ps on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
498define <8 x float> @fmaddsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
499entry:
500  br label %for.cond
501
502for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
503  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
504  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
505  %cmp = icmp slt i32 %i.0, %iter
506  br i1 %cmp, label %for.body, label %for.end
507
508for.body:
  ; Empty block; control passes straight to for.inc.
509  br label %for.inc
510
511for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
512  %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
513  %inc = add nsw i32 %i.0, 1
514  br label %for.cond
515
516for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
517  ret <8 x float> %c.addr.0
518}
519
520; CHECK-LABEL: fmsubaddps_loop_256:
521; CHECK:   vfmsubadd231ps %ymm1, %ymm0, %ymm2
522; CHECK:   vmovaps %ymm2, %ymm0
523; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmsubadd.ps.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmsubadd231ps on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
524define <8 x float> @fmsubaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
525entry:
526  br label %for.cond
527
528for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
529  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
530  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
531  %cmp = icmp slt i32 %i.0, %iter
532  br i1 %cmp, label %for.body, label %for.end
533
534for.body:
  ; Empty block; control passes straight to for.inc.
535  br label %for.inc
536
537for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
538  %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
539  %inc = add nsw i32 %i.0, 1
540  br label %for.cond
541
542for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
543  ret <8 x float> %c.addr.0
544}
545
546; CHECK-LABEL: fmaddps_loop_256:
547; CHECK:   vfmadd231ps %ymm1, %ymm0, %ymm2
548; CHECK:   vmovaps %ymm2, %ymm0
549; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmadd.ps.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmadd231ps on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
550define <8 x float> @fmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
551entry:
552  br label %for.cond
553
554for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
555  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
556  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
557  %cmp = icmp slt i32 %i.0, %iter
558  br i1 %cmp, label %for.body, label %for.end
559
560for.body:
  ; Empty block; control passes straight to for.inc.
561  br label %for.inc
562
563for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
564  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
565  %inc = add nsw i32 %i.0, 1
566  br label %for.cond
567
568for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
569  ret <8 x float> %c.addr.0
570}
571
572; CHECK-LABEL: fmsubps_loop_256:
573; CHECK:   vfmsub231ps %ymm1, %ymm0, %ymm2
574; CHECK:   vmovaps %ymm2, %ymm0
575; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfmsub.ps.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfmsub231ps on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
576define <8 x float> @fmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
577entry:
578  br label %for.cond
579
580for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
581  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
582  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
583  %cmp = icmp slt i32 %i.0, %iter
584  br i1 %cmp, label %for.body, label %for.end
585
586for.body:
  ; Empty block; control passes straight to for.inc.
587  br label %for.inc
588
589for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
590  %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
591  %inc = add nsw i32 %i.0, 1
592  br label %for.cond
593
594for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
595  ret <8 x float> %c.addr.0
596}
597
598; CHECK-LABEL: fnmaddps_loop_256:
599; CHECK:   vfnmadd231ps %ymm1, %ymm0, %ymm2
600; CHECK:   vmovaps %ymm2, %ymm0
601; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfnmadd.ps.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfnmadd231ps on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
602define <8 x float> @fnmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
603entry:
604  br label %for.cond
605
606for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
607  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
608  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
609  %cmp = icmp slt i32 %i.0, %iter
610  br i1 %cmp, label %for.body, label %for.end
611
612for.body:
  ; Empty block; control passes straight to for.inc.
613  br label %for.inc
614
615for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
616  %0 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
617  %inc = add nsw i32 %i.0, 1
618  br label %for.cond
619
620for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
621  ret <8 x float> %c.addr.0
622}
623
624; CHECK-LABEL: fnmsubps_loop_256:
625; CHECK:   vfnmsub231ps %ymm1, %ymm0, %ymm2
626; CHECK:   vmovaps %ymm2, %ymm0
627; CHECK-NEXT: retq
; Loop-carried FMA test: each iteration feeds the accumulator %c.addr.0 back
; into @llvm.x86.fma.vfnmsub.ps.256 as the third operand, while %a and %b
; stay loop-invariant. The value returned is the accumulator at loop exit; when
; %iter <= 0 the intrinsic is never called and %c comes back unchanged.
; The FileCheck directives just above expect vfnmsub231ps on ymm registers,
; then a vmovaps into %ymm0 immediately before retq.
628define <8 x float> @fnmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
629entry:
630  br label %for.cond
631
632for.cond:
  ; Accumulator: %c on first entry, the previous intrinsic result on the back edge.
633  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
  ; Iteration counter: 0 on first entry, %inc on the back edge.
634  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
  ; Signed comparison: stay in the loop while %i.0 < %iter.
635  %cmp = icmp slt i32 %i.0, %iter
636  br i1 %cmp, label %for.body, label %for.end
637
638for.body:
  ; Empty block; control passes straight to for.inc.
639  br label %for.inc
640
641for.inc:
  ; The result %0 becomes the next iteration's accumulator via the phi above.
642  %0 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
643  %inc = add nsw i32 %i.0, 1
644  br label %for.cond
645
646for.end:
  ; Reached once %cmp is false; returns the accumulator from the loop header.
647  ret <8 x float> %c.addr.0
648}
649
; Declarations of the six x86 FMA intrinsics called by the <8 x float> test
; functions in this file. Each takes three <8 x float> operands and returns
; an <8 x float>.
650declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
651declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
652declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
653declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
654declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
655declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
656