1; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s
2target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
3target triple = "x86_64-apple-macosx10.10.0"
4
5; CHECK-LABEL: fmaddsubpd_loop
6; CHECK: [[BODYLBL:LBB.+]]:
7; CHECK:   vfmaddsub231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
8; CHECK: [[INCLBL:LBB.+]]:
9; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
10; CHECK:   cmpl  {{%.+}}, [[INDREG]]
11; CHECK:   jl    [[BODYLBL]]
; Counted loop around llvm.x86.fma.vfmaddsub.pd.256. %a and %b are passed
; unchanged on every trip; the third operand is the loop-carried accumulator,
; which is seeded with %c and replaced by the intrinsic's result each trip.
; The accumulator is returned once the signed counter is no longer below %iter.
define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %loop.header

loop.header:
  ; Loop-carried state: accumulator (initially %c) and trip counter (initially 0).
  %acc = phi <4 x double> [ %c, %entry ], [ %acc.next, %loop.latch ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop.latch ]
  %in.range = icmp slt i32 %idx, %iter
  br i1 %in.range, label %loop.body, label %loop.exit

loop.body:
  ; Empty block: it only branches on to the latch.
  br label %loop.latch

loop.latch:
  ; Produce the next accumulator value, then bump the counter.
  %acc.next = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %acc)
  %idx.next = add nsw i32 %idx, 1
  br label %loop.header

loop.exit:
  ret <4 x double> %acc
}
33
34; CHECK-LABEL: fmsubaddpd_loop
35; CHECK: [[BODYLBL:LBB.+]]:
36; CHECK:   vfmsubadd231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
37; CHECK: [[INCLBL:LBB.+]]:
38; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
39; CHECK:   cmpl  {{%.+}}, [[INDREG]]
40; CHECK:   jl    [[BODYLBL]]
; Counted loop around llvm.x86.fma.vfmsubadd.pd.256. %a and %b are passed
; unchanged on every trip; the third operand is the loop-carried accumulator,
; which is seeded with %c and replaced by the intrinsic's result each trip.
; The accumulator is returned once the signed counter is no longer below %iter.
define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %loop.header

loop.header:
  ; Loop-carried state: accumulator (initially %c) and trip counter (initially 0).
  %acc = phi <4 x double> [ %c, %entry ], [ %acc.next, %loop.latch ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop.latch ]
  %in.range = icmp slt i32 %idx, %iter
  br i1 %in.range, label %loop.body, label %loop.exit

loop.body:
  ; Empty block: it only branches on to the latch.
  br label %loop.latch

loop.latch:
  ; Produce the next accumulator value, then bump the counter.
  %acc.next = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %acc)
  %idx.next = add nsw i32 %idx, 1
  br label %loop.header

loop.exit:
  ret <4 x double> %acc
}
62
63; CHECK-LABEL: fmaddpd_loop
64; CHECK: [[BODYLBL:LBB.+]]:
65; CHECK:   vfmadd231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
66; CHECK: [[INCLBL:LBB.+]]:
67; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
68; CHECK:   cmpl  {{%.+}}, [[INDREG]]
69; CHECK:   jl    [[BODYLBL]]
; Counted loop around llvm.x86.fma.vfmadd.pd.256. %a and %b are passed
; unchanged on every trip; the third operand is the loop-carried accumulator,
; which is seeded with %c and replaced by the intrinsic's result each trip.
; The accumulator is returned once the signed counter is no longer below %iter.
define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %loop.header

loop.header:
  ; Loop-carried state: accumulator (initially %c) and trip counter (initially 0).
  %acc = phi <4 x double> [ %c, %entry ], [ %acc.next, %loop.latch ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop.latch ]
  %in.range = icmp slt i32 %idx, %iter
  br i1 %in.range, label %loop.body, label %loop.exit

loop.body:
  ; Empty block: it only branches on to the latch.
  br label %loop.latch

loop.latch:
  ; Produce the next accumulator value, then bump the counter.
  %acc.next = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %acc)
  %idx.next = add nsw i32 %idx, 1
  br label %loop.header

loop.exit:
  ret <4 x double> %acc
}
91
92; CHECK-LABEL: fmsubpd_loop
93; CHECK: [[BODYLBL:LBB.+]]:
94; CHECK:   vfmsub231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
95; CHECK: [[INCLBL:LBB.+]]:
96; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
97; CHECK:   cmpl  {{%.+}}, [[INDREG]]
98; CHECK:   jl    [[BODYLBL]]
; Counted loop around llvm.x86.fma.vfmsub.pd.256. %a and %b are passed
; unchanged on every trip; the third operand is the loop-carried accumulator,
; which is seeded with %c and replaced by the intrinsic's result each trip.
; The accumulator is returned once the signed counter is no longer below %iter.
define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {
entry:
  br label %loop.header

loop.header:
  ; Loop-carried state: accumulator (initially %c) and trip counter (initially 0).
  %acc = phi <4 x double> [ %c, %entry ], [ %acc.next, %loop.latch ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop.latch ]
  %in.range = icmp slt i32 %idx, %iter
  br i1 %in.range, label %loop.body, label %loop.exit

loop.body:
  ; Empty block: it only branches on to the latch.
  br label %loop.latch

loop.latch:
  ; Produce the next accumulator value, then bump the counter.
  %acc.next = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %acc)
  %idx.next = add nsw i32 %idx, 1
  br label %loop.header

loop.exit:
  ret <4 x double> %acc
}
120
; 256-bit double-precision FMA intrinsics called by the *pd_loop functions in
; this file. Each takes three <4 x double> operands and returns <4 x double>.

; Plain multiply-add / multiply-subtract.
declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)

; Alternating add/subtract variants.
declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
125
126
127; CHECK-LABEL: fmaddsubps_loop
128; CHECK: [[BODYLBL:LBB.+]]:
129; CHECK:   vfmaddsub231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
130; CHECK: [[INCLBL:LBB.+]]:
131; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
132; CHECK:   cmpl  {{%.+}}, [[INDREG]]
133; CHECK:   jl    [[BODYLBL]]
; Counted loop around llvm.x86.fma.vfmaddsub.ps.256. %a and %b are passed
; unchanged on every trip; the third operand is the loop-carried accumulator,
; which is seeded with %c and replaced by the intrinsic's result each trip.
; The accumulator is returned once the signed counter is no longer below %iter.
define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %loop.header

loop.header:
  ; Loop-carried state: accumulator (initially %c) and trip counter (initially 0).
  %acc = phi <8 x float> [ %c, %entry ], [ %acc.next, %loop.latch ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop.latch ]
  %in.range = icmp slt i32 %idx, %iter
  br i1 %in.range, label %loop.body, label %loop.exit

loop.body:
  ; Empty block: it only branches on to the latch.
  br label %loop.latch

loop.latch:
  ; Produce the next accumulator value, then bump the counter.
  %acc.next = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %acc)
  %idx.next = add nsw i32 %idx, 1
  br label %loop.header

loop.exit:
  ret <8 x float> %acc
}
155
156; CHECK-LABEL: fmsubaddps_loop
157; CHECK: [[BODYLBL:LBB.+]]:
158; CHECK:   vfmsubadd231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
159; CHECK: [[INCLBL:LBB.+]]:
160; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
161; CHECK:   cmpl  {{%.+}}, [[INDREG]]
162; CHECK:   jl    [[BODYLBL]]
; Counted loop around llvm.x86.fma.vfmsubadd.ps.256. %a and %b are passed
; unchanged on every trip; the third operand is the loop-carried accumulator,
; which is seeded with %c and replaced by the intrinsic's result each trip.
; The accumulator is returned once the signed counter is no longer below %iter.
define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %loop.header

loop.header:
  ; Loop-carried state: accumulator (initially %c) and trip counter (initially 0).
  %acc = phi <8 x float> [ %c, %entry ], [ %acc.next, %loop.latch ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop.latch ]
  %in.range = icmp slt i32 %idx, %iter
  br i1 %in.range, label %loop.body, label %loop.exit

loop.body:
  ; Empty block: it only branches on to the latch.
  br label %loop.latch

loop.latch:
  ; Produce the next accumulator value, then bump the counter.
  %acc.next = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %acc)
  %idx.next = add nsw i32 %idx, 1
  br label %loop.header

loop.exit:
  ret <8 x float> %acc
}
184
185; CHECK-LABEL: fmaddps_loop
186; CHECK: [[BODYLBL:LBB.+]]:
187; CHECK:   vfmadd231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
188; CHECK: [[INCLBL:LBB.+]]:
189; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
190; CHECK:   cmpl  {{%.+}}, [[INDREG]]
191; CHECK:   jl    [[BODYLBL]]
; Counted loop around llvm.x86.fma.vfmadd.ps.256. %a and %b are passed
; unchanged on every trip; the third operand is the loop-carried accumulator,
; which is seeded with %c and replaced by the intrinsic's result each trip.
; The accumulator is returned once the signed counter is no longer below %iter.
define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %loop.header

loop.header:
  ; Loop-carried state: accumulator (initially %c) and trip counter (initially 0).
  %acc = phi <8 x float> [ %c, %entry ], [ %acc.next, %loop.latch ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop.latch ]
  %in.range = icmp slt i32 %idx, %iter
  br i1 %in.range, label %loop.body, label %loop.exit

loop.body:
  ; Empty block: it only branches on to the latch.
  br label %loop.latch

loop.latch:
  ; Produce the next accumulator value, then bump the counter.
  %acc.next = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %acc)
  %idx.next = add nsw i32 %idx, 1
  br label %loop.header

loop.exit:
  ret <8 x float> %acc
}
213
214; CHECK-LABEL: fmsubps_loop
215; CHECK: [[BODYLBL:LBB.+]]:
216; CHECK:   vfmsub231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
217; CHECK: [[INCLBL:LBB.+]]:
218; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
219; CHECK:   cmpl  {{%.+}}, [[INDREG]]
220; CHECK:   jl    [[BODYLBL]]
; Counted loop around llvm.x86.fma.vfmsub.ps.256. %a and %b are passed
; unchanged on every trip; the third operand is the loop-carried accumulator,
; which is seeded with %c and replaced by the intrinsic's result each trip.
; The accumulator is returned once the signed counter is no longer below %iter.
define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {
entry:
  br label %loop.header

loop.header:
  ; Loop-carried state: accumulator (initially %c) and trip counter (initially 0).
  %acc = phi <8 x float> [ %c, %entry ], [ %acc.next, %loop.latch ]
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop.latch ]
  %in.range = icmp slt i32 %idx, %iter
  br i1 %in.range, label %loop.body, label %loop.exit

loop.body:
  ; Empty block: it only branches on to the latch.
  br label %loop.latch

loop.latch:
  ; Produce the next accumulator value, then bump the counter.
  %acc.next = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %acc)
  %idx.next = add nsw i32 %idx, 1
  br label %loop.header

loop.exit:
  ret <8 x float> %acc
}
242
; 256-bit single-precision FMA intrinsics called by the *ps_loop functions in
; this file. Each takes three <8 x float> operands and returns <8 x float>.

; Plain multiply-add / multiply-subtract.
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)

; Alternating add/subtract variants.
declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
247