1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s
3
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfmaddsub.pd intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("(xmm0 * xmm1) +/- xmm2") and copies it to xmm0 only
; after the loop.
4define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
5; CHECK-LABEL: fmaddsubpd_loop_128:
6; CHECK:       # %bb.0: # %entry
7; CHECK-NEXT:    xorl %eax, %eax
8; CHECK-NEXT:    cmpl %edi, %eax
9; CHECK-NEXT:    jge .LBB0_3
10; CHECK-NEXT:    .p2align 4, 0x90
11; CHECK-NEXT:  .LBB0_2: # %for.body
12; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
13; CHECK-NEXT:    vfmaddsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
14; CHECK-NEXT:    incl %eax
15; CHECK-NEXT:    cmpl %edi, %eax
16; CHECK-NEXT:    jl .LBB0_2
17; CHECK-NEXT:  .LBB0_3: # %for.end
18; CHECK-NEXT:    vmovapd %xmm2, %xmm0
19; CHECK-NEXT:    retq
20entry:
21  br label %for.cond
22
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
23for.cond:
24  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
25  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
26  %cmp = icmp slt i32 %i.0, %iter
27  br i1 %cmp, label %for.body, label %for.end
28
29for.body:
30  br label %for.inc
31
32for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
33  %0 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
34  %inc = add nsw i32 %i.0, 1
35  br label %for.cond
36
37for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
38  ret <2 x double> %c.addr.0
39}
40
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfmsubadd.pd intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("(xmm0 * xmm1) -/+ xmm2") and copies it to xmm0 only
; after the loop.
41define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
42; CHECK-LABEL: fmsubaddpd_loop_128:
43; CHECK:       # %bb.0: # %entry
44; CHECK-NEXT:    xorl %eax, %eax
45; CHECK-NEXT:    cmpl %edi, %eax
46; CHECK-NEXT:    jge .LBB1_3
47; CHECK-NEXT:    .p2align 4, 0x90
48; CHECK-NEXT:  .LBB1_2: # %for.body
49; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
50; CHECK-NEXT:    vfmsubadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
51; CHECK-NEXT:    incl %eax
52; CHECK-NEXT:    cmpl %edi, %eax
53; CHECK-NEXT:    jl .LBB1_2
54; CHECK-NEXT:  .LBB1_3: # %for.end
55; CHECK-NEXT:    vmovapd %xmm2, %xmm0
56; CHECK-NEXT:    retq
57entry:
58  br label %for.cond
59
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
60for.cond:
61  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
62  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
63  %cmp = icmp slt i32 %i.0, %iter
64  br i1 %cmp, label %for.body, label %for.end
65
66for.body:
67  br label %for.inc
68
69for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
70  %0 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
71  %inc = add nsw i32 %i.0, 1
72  br label %for.cond
73
74for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
75  ret <2 x double> %c.addr.0
76}
77
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfmadd.pd intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("(xmm0 * xmm1) + xmm2") and copies it to xmm0 only
; after the loop.
78define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
79; CHECK-LABEL: fmaddpd_loop_128:
80; CHECK:       # %bb.0: # %entry
81; CHECK-NEXT:    xorl %eax, %eax
82; CHECK-NEXT:    cmpl %edi, %eax
83; CHECK-NEXT:    jge .LBB2_3
84; CHECK-NEXT:    .p2align 4, 0x90
85; CHECK-NEXT:  .LBB2_2: # %for.body
86; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
87; CHECK-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
88; CHECK-NEXT:    incl %eax
89; CHECK-NEXT:    cmpl %edi, %eax
90; CHECK-NEXT:    jl .LBB2_2
91; CHECK-NEXT:  .LBB2_3: # %for.end
92; CHECK-NEXT:    vmovapd %xmm2, %xmm0
93; CHECK-NEXT:    retq
94entry:
95  br label %for.cond
96
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
97for.cond:
98  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
99  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
100  %cmp = icmp slt i32 %i.0, %iter
101  br i1 %cmp, label %for.body, label %for.end
102
103for.body:
104  br label %for.inc
105
106for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
107  %0 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
108  %inc = add nsw i32 %i.0, 1
109  br label %for.cond
110
111for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
112  ret <2 x double> %c.addr.0
113}
114
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfmsub.pd intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("(xmm0 * xmm1) - xmm2") and copies it to xmm0 only
; after the loop.
115define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
116; CHECK-LABEL: fmsubpd_loop_128:
117; CHECK:       # %bb.0: # %entry
118; CHECK-NEXT:    xorl %eax, %eax
119; CHECK-NEXT:    cmpl %edi, %eax
120; CHECK-NEXT:    jge .LBB3_3
121; CHECK-NEXT:    .p2align 4, 0x90
122; CHECK-NEXT:  .LBB3_2: # %for.body
123; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
124; CHECK-NEXT:    vfmsub231pd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
125; CHECK-NEXT:    incl %eax
126; CHECK-NEXT:    cmpl %edi, %eax
127; CHECK-NEXT:    jl .LBB3_2
128; CHECK-NEXT:  .LBB3_3: # %for.end
129; CHECK-NEXT:    vmovapd %xmm2, %xmm0
130; CHECK-NEXT:    retq
131entry:
132  br label %for.cond
133
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
134for.cond:
135  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
136  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
137  %cmp = icmp slt i32 %i.0, %iter
138  br i1 %cmp, label %for.body, label %for.end
139
140for.body:
141  br label %for.inc
142
143for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
144  %0 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
145  %inc = add nsw i32 %i.0, 1
146  br label %for.cond
147
148for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
149  ret <2 x double> %c.addr.0
150}
151
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfnmadd.pd intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("-(xmm0 * xmm1) + xmm2") and copies it to xmm0 only
; after the loop.
152define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
153; CHECK-LABEL: fnmaddpd_loop_128:
154; CHECK:       # %bb.0: # %entry
155; CHECK-NEXT:    xorl %eax, %eax
156; CHECK-NEXT:    cmpl %edi, %eax
157; CHECK-NEXT:    jge .LBB4_3
158; CHECK-NEXT:    .p2align 4, 0x90
159; CHECK-NEXT:  .LBB4_2: # %for.body
160; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
161; CHECK-NEXT:    vfnmadd231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
162; CHECK-NEXT:    incl %eax
163; CHECK-NEXT:    cmpl %edi, %eax
164; CHECK-NEXT:    jl .LBB4_2
165; CHECK-NEXT:  .LBB4_3: # %for.end
166; CHECK-NEXT:    vmovapd %xmm2, %xmm0
167; CHECK-NEXT:    retq
168entry:
169  br label %for.cond
170
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
171for.cond:
172  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
173  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
174  %cmp = icmp slt i32 %i.0, %iter
175  br i1 %cmp, label %for.body, label %for.end
176
177for.body:
178  br label %for.inc
179
180for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
181  %0 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
182  %inc = add nsw i32 %i.0, 1
183  br label %for.cond
184
185for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
186  ret <2 x double> %c.addr.0
187}
188
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfnmsub.pd intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("-(xmm0 * xmm1) - xmm2") and copies it to xmm0 only
; after the loop.
189define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
190; CHECK-LABEL: fnmsubpd_loop_128:
191; CHECK:       # %bb.0: # %entry
192; CHECK-NEXT:    xorl %eax, %eax
193; CHECK-NEXT:    cmpl %edi, %eax
194; CHECK-NEXT:    jge .LBB5_3
195; CHECK-NEXT:    .p2align 4, 0x90
196; CHECK-NEXT:  .LBB5_2: # %for.body
197; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
198; CHECK-NEXT:    vfnmsub231pd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
199; CHECK-NEXT:    incl %eax
200; CHECK-NEXT:    cmpl %edi, %eax
201; CHECK-NEXT:    jl .LBB5_2
202; CHECK-NEXT:  .LBB5_3: # %for.end
203; CHECK-NEXT:    vmovapd %xmm2, %xmm0
204; CHECK-NEXT:    retq
205entry:
206  br label %for.cond
207
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
208for.cond:
209  %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ]
210  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
211  %cmp = icmp slt i32 %i.0, %iter
212  br i1 %cmp, label %for.body, label %for.end
213
214for.body:
215  br label %for.inc
216
217for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
218  %0 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0)
219  %inc = add nsw i32 %i.0, 1
220  br label %for.cond
221
222for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
223  ret <2 x double> %c.addr.0
224}
225
; Declarations of the 128-bit double-precision FMA-family intrinsics called by
; the functions above; each takes three <2 x double> operands and returns
; <2 x double>.
226declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
227declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
228declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
229declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
230declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
231declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
232
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfmaddsub.ps intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("(xmm0 * xmm1) +/- xmm2") and copies it to xmm0 only
; after the loop.
233define <4 x float> @fmaddsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
234; CHECK-LABEL: fmaddsubps_loop_128:
235; CHECK:       # %bb.0: # %entry
236; CHECK-NEXT:    xorl %eax, %eax
237; CHECK-NEXT:    cmpl %edi, %eax
238; CHECK-NEXT:    jge .LBB6_3
239; CHECK-NEXT:    .p2align 4, 0x90
240; CHECK-NEXT:  .LBB6_2: # %for.body
241; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
242; CHECK-NEXT:    vfmaddsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) +/- xmm2
243; CHECK-NEXT:    incl %eax
244; CHECK-NEXT:    cmpl %edi, %eax
245; CHECK-NEXT:    jl .LBB6_2
246; CHECK-NEXT:  .LBB6_3: # %for.end
247; CHECK-NEXT:    vmovaps %xmm2, %xmm0
248; CHECK-NEXT:    retq
249entry:
250  br label %for.cond
251
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
252for.cond:
253  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
254  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
255  %cmp = icmp slt i32 %i.0, %iter
256  br i1 %cmp, label %for.body, label %for.end
257
258for.body:
259  br label %for.inc
260
261for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
262  %0 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
263  %inc = add nsw i32 %i.0, 1
264  br label %for.cond
265
266for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
267  ret <4 x float> %c.addr.0
268}
269
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfmsubadd.ps intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("(xmm0 * xmm1) -/+ xmm2") and copies it to xmm0 only
; after the loop.
270define <4 x float> @fmsubaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
271; CHECK-LABEL: fmsubaddps_loop_128:
272; CHECK:       # %bb.0: # %entry
273; CHECK-NEXT:    xorl %eax, %eax
274; CHECK-NEXT:    cmpl %edi, %eax
275; CHECK-NEXT:    jge .LBB7_3
276; CHECK-NEXT:    .p2align 4, 0x90
277; CHECK-NEXT:  .LBB7_2: # %for.body
278; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
279; CHECK-NEXT:    vfmsubadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) -/+ xmm2
280; CHECK-NEXT:    incl %eax
281; CHECK-NEXT:    cmpl %edi, %eax
282; CHECK-NEXT:    jl .LBB7_2
283; CHECK-NEXT:  .LBB7_3: # %for.end
284; CHECK-NEXT:    vmovaps %xmm2, %xmm0
285; CHECK-NEXT:    retq
286entry:
287  br label %for.cond
288
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
289for.cond:
290  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
291  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
292  %cmp = icmp slt i32 %i.0, %iter
293  br i1 %cmp, label %for.body, label %for.end
294
295for.body:
296  br label %for.inc
297
298for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
299  %0 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
300  %inc = add nsw i32 %i.0, 1
301  br label %for.cond
302
303for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
304  ret <4 x float> %c.addr.0
305}
306
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfmadd.ps intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("(xmm0 * xmm1) + xmm2") and copies it to xmm0 only
; after the loop.
307define <4 x float> @fmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
308; CHECK-LABEL: fmaddps_loop_128:
309; CHECK:       # %bb.0: # %entry
310; CHECK-NEXT:    xorl %eax, %eax
311; CHECK-NEXT:    cmpl %edi, %eax
312; CHECK-NEXT:    jge .LBB8_3
313; CHECK-NEXT:    .p2align 4, 0x90
314; CHECK-NEXT:  .LBB8_2: # %for.body
315; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
316; CHECK-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
317; CHECK-NEXT:    incl %eax
318; CHECK-NEXT:    cmpl %edi, %eax
319; CHECK-NEXT:    jl .LBB8_2
320; CHECK-NEXT:  .LBB8_3: # %for.end
321; CHECK-NEXT:    vmovaps %xmm2, %xmm0
322; CHECK-NEXT:    retq
323entry:
324  br label %for.cond
325
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
326for.cond:
327  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
328  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
329  %cmp = icmp slt i32 %i.0, %iter
330  br i1 %cmp, label %for.body, label %for.end
331
332for.body:
333  br label %for.inc
334
335for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
336  %0 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
337  %inc = add nsw i32 %i.0, 1
338  br label %for.cond
339
340for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
341  ret <4 x float> %c.addr.0
342}
343
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfmsub.ps intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("(xmm0 * xmm1) - xmm2") and copies it to xmm0 only
; after the loop.
344define <4 x float> @fmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
345; CHECK-LABEL: fmsubps_loop_128:
346; CHECK:       # %bb.0: # %entry
347; CHECK-NEXT:    xorl %eax, %eax
348; CHECK-NEXT:    cmpl %edi, %eax
349; CHECK-NEXT:    jge .LBB9_3
350; CHECK-NEXT:    .p2align 4, 0x90
351; CHECK-NEXT:  .LBB9_2: # %for.body
352; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
353; CHECK-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
354; CHECK-NEXT:    incl %eax
355; CHECK-NEXT:    cmpl %edi, %eax
356; CHECK-NEXT:    jl .LBB9_2
357; CHECK-NEXT:  .LBB9_3: # %for.end
358; CHECK-NEXT:    vmovaps %xmm2, %xmm0
359; CHECK-NEXT:    retq
360entry:
361  br label %for.cond
362
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
363for.cond:
364  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
365  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
366  %cmp = icmp slt i32 %i.0, %iter
367  br i1 %cmp, label %for.body, label %for.end
368
369for.body:
370  br label %for.inc
371
372for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
373  %0 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
374  %inc = add nsw i32 %i.0, 1
375  br label %for.cond
376
377for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
378  ret <4 x float> %c.addr.0
379}
380
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfnmadd.ps intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("-(xmm0 * xmm1) + xmm2") and copies it to xmm0 only
; after the loop.
381define <4 x float> @fnmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
382; CHECK-LABEL: fnmaddps_loop_128:
383; CHECK:       # %bb.0: # %entry
384; CHECK-NEXT:    xorl %eax, %eax
385; CHECK-NEXT:    cmpl %edi, %eax
386; CHECK-NEXT:    jge .LBB10_3
387; CHECK-NEXT:    .p2align 4, 0x90
388; CHECK-NEXT:  .LBB10_2: # %for.body
389; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
390; CHECK-NEXT:    vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
391; CHECK-NEXT:    incl %eax
392; CHECK-NEXT:    cmpl %edi, %eax
393; CHECK-NEXT:    jl .LBB10_2
394; CHECK-NEXT:  .LBB10_3: # %for.end
395; CHECK-NEXT:    vmovaps %xmm2, %xmm0
396; CHECK-NEXT:    retq
397entry:
398  br label %for.cond
399
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
400for.cond:
401  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
402  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
403  %cmp = icmp slt i32 %i.0, %iter
404  br i1 %cmp, label %for.body, label %for.end
405
406for.body:
407  br label %for.inc
408
409for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
410  %0 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
411  %inc = add nsw i32 %i.0, 1
412  br label %for.cond
413
414for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
415  ret <4 x float> %c.addr.0
416}
417
; Loop-carried accumulator test for the 128-bit llvm.x86.fma.vfnmsub.ps intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; xmm2 using the 231 form ("-(xmm0 * xmm1) - xmm2") and copies it to xmm0 only
; after the loop.
418define <4 x float> @fnmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) {
419; CHECK-LABEL: fnmsubps_loop_128:
420; CHECK:       # %bb.0: # %entry
421; CHECK-NEXT:    xorl %eax, %eax
422; CHECK-NEXT:    cmpl %edi, %eax
423; CHECK-NEXT:    jge .LBB11_3
424; CHECK-NEXT:    .p2align 4, 0x90
425; CHECK-NEXT:  .LBB11_2: # %for.body
426; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
427; CHECK-NEXT:    vfnmsub231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
428; CHECK-NEXT:    incl %eax
429; CHECK-NEXT:    cmpl %edi, %eax
430; CHECK-NEXT:    jl .LBB11_2
431; CHECK-NEXT:  .LBB11_3: # %for.end
432; CHECK-NEXT:    vmovaps %xmm2, %xmm0
433; CHECK-NEXT:    retq
434entry:
435  br label %for.cond
436
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
437for.cond:
438  %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ]
439  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
440  %cmp = icmp slt i32 %i.0, %iter
441  br i1 %cmp, label %for.body, label %for.end
442
443for.body:
444  br label %for.inc
445
446for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
447  %0 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0)
448  %inc = add nsw i32 %i.0, 1
449  br label %for.cond
450
451for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
452  ret <4 x float> %c.addr.0
453}
454
; Declarations of the 128-bit single-precision FMA-family intrinsics called by
; the functions above; each takes three <4 x float> operands and returns
; <4 x float>.
455declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
456declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
457declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
458declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
459declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
460declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
461
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfmaddsub.pd.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("(ymm0 * ymm1) +/- ymm2") and copies it to ymm0 only
; after the loop.
462define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
463; CHECK-LABEL: fmaddsubpd_loop_256:
464; CHECK:       # %bb.0: # %entry
465; CHECK-NEXT:    xorl %eax, %eax
466; CHECK-NEXT:    cmpl %edi, %eax
467; CHECK-NEXT:    jge .LBB12_3
468; CHECK-NEXT:    .p2align 4, 0x90
469; CHECK-NEXT:  .LBB12_2: # %for.body
470; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
471; CHECK-NEXT:    vfmaddsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
472; CHECK-NEXT:    incl %eax
473; CHECK-NEXT:    cmpl %edi, %eax
474; CHECK-NEXT:    jl .LBB12_2
475; CHECK-NEXT:  .LBB12_3: # %for.end
476; CHECK-NEXT:    vmovapd %ymm2, %ymm0
477; CHECK-NEXT:    retq
478entry:
479  br label %for.cond
480
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
481for.cond:
482  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
483  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
484  %cmp = icmp slt i32 %i.0, %iter
485  br i1 %cmp, label %for.body, label %for.end
486
487for.body:
488  br label %for.inc
489
490for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
491  %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
492  %inc = add nsw i32 %i.0, 1
493  br label %for.cond
494
495for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
496  ret <4 x double> %c.addr.0
497}
498
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfmsubadd.pd.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("(ymm0 * ymm1) -/+ ymm2") and copies it to ymm0 only
; after the loop.
499define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
500; CHECK-LABEL: fmsubaddpd_loop_256:
501; CHECK:       # %bb.0: # %entry
502; CHECK-NEXT:    xorl %eax, %eax
503; CHECK-NEXT:    cmpl %edi, %eax
504; CHECK-NEXT:    jge .LBB13_3
505; CHECK-NEXT:    .p2align 4, 0x90
506; CHECK-NEXT:  .LBB13_2: # %for.body
507; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
508; CHECK-NEXT:    vfmsubadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
509; CHECK-NEXT:    incl %eax
510; CHECK-NEXT:    cmpl %edi, %eax
511; CHECK-NEXT:    jl .LBB13_2
512; CHECK-NEXT:  .LBB13_3: # %for.end
513; CHECK-NEXT:    vmovapd %ymm2, %ymm0
514; CHECK-NEXT:    retq
515entry:
516  br label %for.cond
517
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
518for.cond:
519  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
520  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
521  %cmp = icmp slt i32 %i.0, %iter
522  br i1 %cmp, label %for.body, label %for.end
523
524for.body:
525  br label %for.inc
526
527for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
528  %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
529  %inc = add nsw i32 %i.0, 1
530  br label %for.cond
531
532for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
533  ret <4 x double> %c.addr.0
534}
535
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfmadd.pd.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("(ymm0 * ymm1) + ymm2") and copies it to ymm0 only
; after the loop.
536define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
537; CHECK-LABEL: fmaddpd_loop_256:
538; CHECK:       # %bb.0: # %entry
539; CHECK-NEXT:    xorl %eax, %eax
540; CHECK-NEXT:    cmpl %edi, %eax
541; CHECK-NEXT:    jge .LBB14_3
542; CHECK-NEXT:    .p2align 4, 0x90
543; CHECK-NEXT:  .LBB14_2: # %for.body
544; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
545; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
546; CHECK-NEXT:    incl %eax
547; CHECK-NEXT:    cmpl %edi, %eax
548; CHECK-NEXT:    jl .LBB14_2
549; CHECK-NEXT:  .LBB14_3: # %for.end
550; CHECK-NEXT:    vmovapd %ymm2, %ymm0
551; CHECK-NEXT:    retq
552entry:
553  br label %for.cond
554
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
555for.cond:
556  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
557  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
558  %cmp = icmp slt i32 %i.0, %iter
559  br i1 %cmp, label %for.body, label %for.end
560
561for.body:
562  br label %for.inc
563
564for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
565  %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
566  %inc = add nsw i32 %i.0, 1
567  br label %for.cond
568
569for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
570  ret <4 x double> %c.addr.0
571}
572
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfmsub.pd.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("(ymm0 * ymm1) - ymm2") and copies it to ymm0 only
; after the loop.
573define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
574; CHECK-LABEL: fmsubpd_loop_256:
575; CHECK:       # %bb.0: # %entry
576; CHECK-NEXT:    xorl %eax, %eax
577; CHECK-NEXT:    cmpl %edi, %eax
578; CHECK-NEXT:    jge .LBB15_3
579; CHECK-NEXT:    .p2align 4, 0x90
580; CHECK-NEXT:  .LBB15_2: # %for.body
581; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
582; CHECK-NEXT:    vfmsub231pd {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
583; CHECK-NEXT:    incl %eax
584; CHECK-NEXT:    cmpl %edi, %eax
585; CHECK-NEXT:    jl .LBB15_2
586; CHECK-NEXT:  .LBB15_3: # %for.end
587; CHECK-NEXT:    vmovapd %ymm2, %ymm0
588; CHECK-NEXT:    retq
589entry:
590  br label %for.cond
591
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
592for.cond:
593  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
594  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
595  %cmp = icmp slt i32 %i.0, %iter
596  br i1 %cmp, label %for.body, label %for.end
597
598for.body:
599  br label %for.inc
600
601for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
602  %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
603  %inc = add nsw i32 %i.0, 1
604  br label %for.cond
605
606for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
607  ret <4 x double> %c.addr.0
608}
609
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfnmadd.pd.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("-(ymm0 * ymm1) + ymm2") and copies it to ymm0 only
; after the loop.
610define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
611; CHECK-LABEL: fnmaddpd_loop_256:
612; CHECK:       # %bb.0: # %entry
613; CHECK-NEXT:    xorl %eax, %eax
614; CHECK-NEXT:    cmpl %edi, %eax
615; CHECK-NEXT:    jge .LBB16_3
616; CHECK-NEXT:    .p2align 4, 0x90
617; CHECK-NEXT:  .LBB16_2: # %for.body
618; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
619; CHECK-NEXT:    vfnmadd231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
620; CHECK-NEXT:    incl %eax
621; CHECK-NEXT:    cmpl %edi, %eax
622; CHECK-NEXT:    jl .LBB16_2
623; CHECK-NEXT:  .LBB16_3: # %for.end
624; CHECK-NEXT:    vmovapd %ymm2, %ymm0
625; CHECK-NEXT:    retq
626entry:
627  br label %for.cond
628
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
629for.cond:
630  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
631  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
632  %cmp = icmp slt i32 %i.0, %iter
633  br i1 %cmp, label %for.body, label %for.end
634
635for.body:
636  br label %for.inc
637
638for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
639  %0 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
640  %inc = add nsw i32 %i.0, 1
641  br label %for.cond
642
643for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
644  ret <4 x double> %c.addr.0
645}
646
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfnmsub.pd.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("-(ymm0 * ymm1) - ymm2") and copies it to ymm0 only
; after the loop.
647define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
648; CHECK-LABEL: fnmsubpd_loop_256:
649; CHECK:       # %bb.0: # %entry
650; CHECK-NEXT:    xorl %eax, %eax
651; CHECK-NEXT:    cmpl %edi, %eax
652; CHECK-NEXT:    jge .LBB17_3
653; CHECK-NEXT:    .p2align 4, 0x90
654; CHECK-NEXT:  .LBB17_2: # %for.body
655; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
656; CHECK-NEXT:    vfnmsub231pd {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
657; CHECK-NEXT:    incl %eax
658; CHECK-NEXT:    cmpl %edi, %eax
659; CHECK-NEXT:    jl .LBB17_2
660; CHECK-NEXT:  .LBB17_3: # %for.end
661; CHECK-NEXT:    vmovapd %ymm2, %ymm0
662; CHECK-NEXT:    retq
663entry:
664  br label %for.cond
665
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
666for.cond:
667  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]
668  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
669  %cmp = icmp slt i32 %i.0, %iter
670  br i1 %cmp, label %for.body, label %for.end
671
672for.body:
673  br label %for.inc
674
675for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
676  %0 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)
677  %inc = add nsw i32 %i.0, 1
678  br label %for.cond
679
680for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
681  ret <4 x double> %c.addr.0
682}
683
; Declarations of the 256-bit double-precision FMA-family intrinsics called by
; the functions above; each takes three <4 x double> operands and returns
; <4 x double>.
684declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
685declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
686declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
687declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
688declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
689declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
690
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfmaddsub.ps.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("(ymm0 * ymm1) +/- ymm2") and copies it to ymm0 only
; after the loop.
691define <8 x float> @fmaddsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
692; CHECK-LABEL: fmaddsubps_loop_256:
693; CHECK:       # %bb.0: # %entry
694; CHECK-NEXT:    xorl %eax, %eax
695; CHECK-NEXT:    cmpl %edi, %eax
696; CHECK-NEXT:    jge .LBB18_3
697; CHECK-NEXT:    .p2align 4, 0x90
698; CHECK-NEXT:  .LBB18_2: # %for.body
699; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
700; CHECK-NEXT:    vfmaddsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) +/- ymm2
701; CHECK-NEXT:    incl %eax
702; CHECK-NEXT:    cmpl %edi, %eax
703; CHECK-NEXT:    jl .LBB18_2
704; CHECK-NEXT:  .LBB18_3: # %for.end
705; CHECK-NEXT:    vmovaps %ymm2, %ymm0
706; CHECK-NEXT:    retq
707entry:
708  br label %for.cond
709
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
710for.cond:
711  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
712  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
713  %cmp = icmp slt i32 %i.0, %iter
714  br i1 %cmp, label %for.body, label %for.end
715
716for.body:
717  br label %for.inc
718
719for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
720  %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
721  %inc = add nsw i32 %i.0, 1
722  br label %for.cond
723
724for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
725  ret <8 x float> %c.addr.0
726}
727
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfmsubadd.ps.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("(ymm0 * ymm1) -/+ ymm2") and copies it to ymm0 only
; after the loop.
728define <8 x float> @fmsubaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
729; CHECK-LABEL: fmsubaddps_loop_256:
730; CHECK:       # %bb.0: # %entry
731; CHECK-NEXT:    xorl %eax, %eax
732; CHECK-NEXT:    cmpl %edi, %eax
733; CHECK-NEXT:    jge .LBB19_3
734; CHECK-NEXT:    .p2align 4, 0x90
735; CHECK-NEXT:  .LBB19_2: # %for.body
736; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
737; CHECK-NEXT:    vfmsubadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) -/+ ymm2
738; CHECK-NEXT:    incl %eax
739; CHECK-NEXT:    cmpl %edi, %eax
740; CHECK-NEXT:    jl .LBB19_2
741; CHECK-NEXT:  .LBB19_3: # %for.end
742; CHECK-NEXT:    vmovaps %ymm2, %ymm0
743; CHECK-NEXT:    retq
744entry:
745  br label %for.cond
746
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
747for.cond:
748  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
749  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
750  %cmp = icmp slt i32 %i.0, %iter
751  br i1 %cmp, label %for.body, label %for.end
752
753for.body:
754  br label %for.inc
755
756for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
757  %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
758  %inc = add nsw i32 %i.0, 1
759  br label %for.cond
760
761for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
762  ret <8 x float> %c.addr.0
763}
764
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfmadd.ps.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("(ymm0 * ymm1) + ymm2") and copies it to ymm0 only
; after the loop.
765define <8 x float> @fmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
766; CHECK-LABEL: fmaddps_loop_256:
767; CHECK:       # %bb.0: # %entry
768; CHECK-NEXT:    xorl %eax, %eax
769; CHECK-NEXT:    cmpl %edi, %eax
770; CHECK-NEXT:    jge .LBB20_3
771; CHECK-NEXT:    .p2align 4, 0x90
772; CHECK-NEXT:  .LBB20_2: # %for.body
773; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
774; CHECK-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2
775; CHECK-NEXT:    incl %eax
776; CHECK-NEXT:    cmpl %edi, %eax
777; CHECK-NEXT:    jl .LBB20_2
778; CHECK-NEXT:  .LBB20_3: # %for.end
779; CHECK-NEXT:    vmovaps %ymm2, %ymm0
780; CHECK-NEXT:    retq
781entry:
782  br label %for.cond
783
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
784for.cond:
785  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
786  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
787  %cmp = icmp slt i32 %i.0, %iter
788  br i1 %cmp, label %for.body, label %for.end
789
790for.body:
791  br label %for.inc
792
793for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
794  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
795  %inc = add nsw i32 %i.0, 1
796  br label %for.cond
797
798for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
799  ret <8 x float> %c.addr.0
800}
801
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfmsub.ps.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("(ymm0 * ymm1) - ymm2") and copies it to ymm0 only
; after the loop.
802define <8 x float> @fmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
803; CHECK-LABEL: fmsubps_loop_256:
804; CHECK:       # %bb.0: # %entry
805; CHECK-NEXT:    xorl %eax, %eax
806; CHECK-NEXT:    cmpl %edi, %eax
807; CHECK-NEXT:    jge .LBB21_3
808; CHECK-NEXT:    .p2align 4, 0x90
809; CHECK-NEXT:  .LBB21_2: # %for.body
810; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
811; CHECK-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
812; CHECK-NEXT:    incl %eax
813; CHECK-NEXT:    cmpl %edi, %eax
814; CHECK-NEXT:    jl .LBB21_2
815; CHECK-NEXT:  .LBB21_3: # %for.end
816; CHECK-NEXT:    vmovaps %ymm2, %ymm0
817; CHECK-NEXT:    retq
818entry:
819  br label %for.cond
820
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
821for.cond:
822  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
823  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
824  %cmp = icmp slt i32 %i.0, %iter
825  br i1 %cmp, label %for.body, label %for.end
826
827for.body:
828  br label %for.inc
829
830for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
831  %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
832  %inc = add nsw i32 %i.0, 1
833  br label %for.cond
834
835for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
836  ret <8 x float> %c.addr.0
837}
838
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfnmadd.ps.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("-(ymm0 * ymm1) + ymm2") and copies it to ymm0 only
; after the loop.
839define <8 x float> @fnmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
840; CHECK-LABEL: fnmaddps_loop_256:
841; CHECK:       # %bb.0: # %entry
842; CHECK-NEXT:    xorl %eax, %eax
843; CHECK-NEXT:    cmpl %edi, %eax
844; CHECK-NEXT:    jge .LBB22_3
845; CHECK-NEXT:    .p2align 4, 0x90
846; CHECK-NEXT:  .LBB22_2: # %for.body
847; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
848; CHECK-NEXT:    vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2
849; CHECK-NEXT:    incl %eax
850; CHECK-NEXT:    cmpl %edi, %eax
851; CHECK-NEXT:    jl .LBB22_2
852; CHECK-NEXT:  .LBB22_3: # %for.end
853; CHECK-NEXT:    vmovaps %ymm2, %ymm0
854; CHECK-NEXT:    retq
855entry:
856  br label %for.cond
857
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
858for.cond:
859  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
860  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
861  %cmp = icmp slt i32 %i.0, %iter
862  br i1 %cmp, label %for.body, label %for.end
863
864for.body:
865  br label %for.inc
866
867for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
868  %0 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
869  %inc = add nsw i32 %i.0, 1
870  br label %for.cond
871
872for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
873  ret <8 x float> %c.addr.0
874}
875
; Loop-carried accumulator test for the 256-bit llvm.x86.fma.vfnmsub.ps.256 intrinsic.
; %a and %b never change inside the loop; each call's result is fed back as the
; third operand of the next call. The expected asm keeps the running value in
; ymm2 using the 231 form ("-(ymm0 * ymm1) - ymm2") and copies it to ymm0 only
; after the loop.
876define <8 x float> @fnmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
877; CHECK-LABEL: fnmsubps_loop_256:
878; CHECK:       # %bb.0: # %entry
879; CHECK-NEXT:    xorl %eax, %eax
880; CHECK-NEXT:    cmpl %edi, %eax
881; CHECK-NEXT:    jge .LBB23_3
882; CHECK-NEXT:    .p2align 4, 0x90
883; CHECK-NEXT:  .LBB23_2: # %for.body
884; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
885; CHECK-NEXT:    vfnmsub231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) - ymm2
886; CHECK-NEXT:    incl %eax
887; CHECK-NEXT:    cmpl %edi, %eax
888; CHECK-NEXT:    jl .LBB23_2
889; CHECK-NEXT:  .LBB23_3: # %for.end
890; CHECK-NEXT:    vmovaps %ymm2, %ymm0
891; CHECK-NEXT:    retq
892entry:
893  br label %for.cond
894
; Loop header: %c.addr.0 is %c on entry and the previous call's result on the
; back edge; the loop runs while %i.0 < %iter (signed compare).
895for.cond:
896  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]
897  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
898  %cmp = icmp slt i32 %i.0, %iter
899  br i1 %cmp, label %for.body, label %for.end
900
901for.body:
902  br label %for.inc
903
904for.inc:
  ; The running value goes in as the third operand; %0 becomes the next %c.addr.0.
905  %0 = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)
906  %inc = add nsw i32 %i.0, 1
907  br label %for.cond
908
909for.end:
  ; Result is the header's value at exit, i.e. %c unchanged if the loop never runs.
910  ret <8 x float> %c.addr.0
911}
912
; Declarations of the 256-bit single-precision FMA-family intrinsics called by
; the functions above; each takes three <8 x float> operands and returns
; <8 x float>.
913declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
914declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
915declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
916declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
917declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
918declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
919