1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
2
3; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
4
5; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
6
7; Check that we do not copy agprs to vgprs and back inside the loop.
8
9; GCN: [[LOOP:BB[0-9_]+]]:
10; GCN-NOT: v_accvgpr
11; GCN: v_mfma_f32_32x32x1f32
12; GCN-NOT: v_accvgpr
13; GCN: s_cbranch_scc1 [[LOOP]]
14
15; Final result should be read only once after the loop.
16
17; GCN-COUNT-32: v_accvgpr_read_b32
18
; Kernel: runs a <32 x float> accumulator through the MFMA intrinsic in a
; single-block loop and stores the final value to %arg. The accumulator enters
; the loop as zeroinitializer.
19define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
20entry:
21  br label %for.cond.preheader
22
23for.cond.preheader:
  ; Loop-carried accumulator: all zeros from %entry, the previous MFMA result
  ; on the back edge.
24  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starting at 0.
25  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  ; Constant float operands 1.0 and 2.0; the three trailing i32 operands are all 0.
26  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
27  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop when the incremented counter equals 16, i.e. after 16 MFMA calls.
28  %cc = icmp eq i32 %inc, 16
29  br i1 %cc, label %exit, label %for.cond.preheader
30
31exit:
  ; Only the value produced by the last iteration is written out.
32  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
33  ret void
34}
35
36; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
37
38; Check that we do not use 32 temp vgprs: the splat constant is moved into a
39; single vgpr once and all 32 accvgpr writes read that same register.
40; Check that we do not use 32 temp sgprs as well.
41
42; GCN:         v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
43; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
44
45; GCN: [[LOOP:BB[0-9_]+]]:
46; GCN-NOT: v_accvgpr
47; GCN: v_mfma_f32_32x32x1f32
48; GCN-NOT: v_accvgpr
49; GCN: s_cbranch_scc1 [[LOOP]]
50
51; GCN-COUNT-32: v_accvgpr_read_b32
52
53define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) {
54entry:
55  br label %for.cond.preheader
56
57for.cond.preheader:
58  %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ]
59  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
60  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
61  %inc = add nuw nsw i32 %c, 1
62  %cc = icmp eq i32 %inc, 16
63  br i1 %cc, label %exit, label %for.cond.preheader
64
65exit:
66  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
67  ret void
68}
69
70; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
71
72; GCN:         v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
73; GCN:         v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
74; GCN-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
75
76; GCN: [[LOOP:BB[0-9_]+]]:
77; GCN-NOT: v_accvgpr
78; GCN: v_mfma_f32_32x32x1f32
79; GCN-NOT: v_accvgpr
80; GCN: s_cbranch_scc1 [[LOOP]]
81
82; GCN-COUNT-32: v_accvgpr_read_b32
83
84define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) {
85entry:
86  br label %for.cond.preheader
87
88for.cond.preheader:
89  %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ]
90  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
91  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
92  %inc = add nuw nsw i32 %c, 1
93  %cc = icmp eq i32 %inc, 16
94  br i1 %cc, label %exit, label %for.cond.preheader
95
96exit:
97  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
98  ret void
99}
100
101; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
102
103; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
104; 3 vgprs are needed to avoid wait states between writes.
105
106; GCN: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
107; GCN: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
108; GCN: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
109; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
110; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
111; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
112; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
113; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
114; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
115; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
116; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
117; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
118; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
119; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
120; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
121; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
122; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
123; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
124; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
125; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
126; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
127; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
128; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
129; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
130; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
131; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
132; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
133; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
134; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
135; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
136; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
137; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
138; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
139; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
140; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
141; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
142; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
143; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
144; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
145; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
146; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
147; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
148; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
149; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
150; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
151; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
152; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
153; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
154; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
155; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
156; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
157; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
158; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
159; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
160; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
161; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
162; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
163; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
164; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
165; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
166; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
167; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
168; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
169; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
170
171; GCN: [[LOOP:BB[0-9_]+]]:
172; GCN-NOT: v_accvgpr
173; GCN: v_mfma_f32_32x32x1f32
174; GCN-NOT: v_accvgpr
175; GCN: s_cbranch_scc1 [[LOOP]]
176
177; GCN-COUNT-32: v_accvgpr_read_b32
178
; Kernel: same 16-iteration MFMA accumulation loop, with a constant initial
; accumulator whose 32 lanes hold the distinct values 123.0, 124.0, ..., 154.0.
179define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) {
180entry:
181  br label %for.cond.preheader
182
183for.cond.preheader:
  ; Loop-carried accumulator: the ascending constant vector from %entry, the
  ; previous MFMA result on the back edge.
184  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starting at 0.
185  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
186  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
187  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop when the incremented counter equals 16.
188  %cc = icmp eq i32 %inc, 16
189  br i1 %cc, label %exit, label %for.cond.preheader
190
191exit:
  ; Store the result of the last MFMA call.
192  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
193  ret void
194}
195
196; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
197
198; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
199
200; GCN: [[LOOP:BB[0-9_]+]]:
201; GCN-NOT: v_accvgpr
202; GCN: v_mfma_f32_32x32x1f32
203; GCN-NOT: v_accvgpr
204; GCN: s_cbranch_scc1 [[LOOP]]
205
206; GCN-COUNT-32: v_accvgpr_read_b32
207
; Kernel: same 16-iteration MFMA accumulation loop, with an initial accumulator
; that is not a compile-time constant: the workitem id (x), reinterpreted as a
; float, is inserted into every one of the 32 lanes.
208define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) {
209entry:
210  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Bit-for-bit reinterpretation of the id as float; no numeric conversion.
211  %init = bitcast i32 %tid to float
  ; Build the splat one lane at a time. The chain starts from undef, but lanes
  ; 0 through 31 are all overwritten with %init.
212  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
213  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
214  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
215  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
216  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
217  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
218  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
219  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
220  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
221  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
222  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
223  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
224  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
225  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
226  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
227  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
228  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
229  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
230  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
231  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
232  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
233  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
234  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
235  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
236  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
237  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
238  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
239  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
240  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
241  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
242  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
243  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
244
245  br label %for.cond.preheader
246
247for.cond.preheader:
  ; Loop-carried accumulator: the splat built in %entry, then the previous
  ; MFMA result on the back edge.
248  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starting at 0.
249  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
250  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
251  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop when the incremented counter equals 16.
252  %cc = icmp eq i32 %inc, 16
253  br i1 %cc, label %exit, label %for.cond.preheader
254
255exit:
  ; Store the result of the last MFMA call.
256  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
257  ret void
258}
259
260; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
261
262; GCN:         v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
263; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
264
265; GCN: [[LOOP:BB[0-9_]+]]:
266; GCN-NOT: v_accvgpr
267; GCN: v_mfma_f32_32x32x1f32
268; GCN-NOT: v_accvgpr
269; GCN: s_cbranch_scc1 [[LOOP]]
270
271; GCN-COUNT-32: v_accvgpr_read_b32
272
; Kernel: same 16-iteration MFMA accumulation loop, with the initial
; accumulator built by inserting the float kernel argument %init into every one
; of the 32 lanes.
; NOTE(review): the test name and the checks preceding this function suggest
; %init is expected to be held in an SGPR - confirm against llc output.
273define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) {
274entry:
  ; Build the splat one lane at a time. The chain starts from undef, but lanes
  ; 0 through 31 are all overwritten with %init.
275  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
276  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
277  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
278  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
279  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
280  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
281  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
282  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
283  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
284  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
285  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
286  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
287  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
288  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
289  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
290  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
291  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
292  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
293  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
294  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
295  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
296  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
297  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
298  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
299  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
300  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
301  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
302  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
303  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
304  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
305  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
306  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
307
308  br label %for.cond.preheader
309
310for.cond.preheader:
  ; Loop-carried accumulator: the splat built in %entry, then the previous
  ; MFMA result on the back edge.
311  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starting at 0.
312  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
313  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
314  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop when the incremented counter equals 16.
315  %cc = icmp eq i32 %inc, 16
316  br i1 %cc, label %exit, label %for.cond.preheader
317
318exit:
  ; Store the result of the last MFMA call.
319  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
320  ret void
321}
322
323; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
324
325; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
326; GCN-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
327; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
328; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
329; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
330; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
331; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
332; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
333; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
334; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
335; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
336; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
337; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
338; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
339; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
340; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
341; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
342; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
343; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
344; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
345; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
346; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
347; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
348; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
349; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
350; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
351; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
352; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
353; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
354; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
355; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
356; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
357; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
358
359; GCN: [[LOOP:BB[0-9_]+]]:
360; GCN-NOT: v_accvgpr
361; GCN: v_mfma_f32_32x32x1f32
362; GCN-NOT: v_accvgpr
363; GCN: s_cbranch_scc1 [[LOOP]]
364
365; GCN-COUNT-32: v_accvgpr_read_b32
366
; Kernel: same 16-iteration MFMA accumulation loop, with an initial accumulator
; that mixes three kinds of values: lane 0 is the workitem id (x) reinterpreted
; as float, lane 1 is the float kernel argument %x, and lanes 2 through 31 stay
; zero.
367define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) {
368entry:
369  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Bit-for-bit reinterpretation of the id as float; no numeric conversion.
370  %init = bitcast i32 %tid to float
  ; Start from an all-zero vector and overwrite only lanes 0 and 1.
371  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
372  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1
373
374  br label %for.cond.preheader
375
376for.cond.preheader:
  ; Loop-carried accumulator: the mixed vector from %entry, then the previous
  ; MFMA result on the back edge.
377  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starting at 0.
378  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
379  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
380  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop when the incremented counter equals 16.
381  %cc = icmp eq i32 %inc, 16
382  br i1 %cc, label %exit, label %for.cond.preheader
383
384exit:
  ; Store the result of the last MFMA call.
385  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
386  ret void
387}
388
389; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
390
391; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
392; GCN: v_mfma_f32_32x32x1f32
393; GCN-NOT: v_accvgpr
394
395; GCN: [[LOOP:BB[0-9_]+]]:
396; GCN-NOT: v_accvgpr
397; GCN: v_mfma_f32_32x32x1f32
398; GCN-NOT: v_accvgpr
399; GCN: s_cbranch_scc1 [[LOOP]]
400
401; GCN-COUNT-32: v_accvgpr_read_b32
; Kernel: same 16-iteration MFMA accumulation loop, but the initial accumulator
; is itself the result of an MFMA call made in %entry, so the whole vector is
; forwarded from one MFMA into the loop's MFMA.
402define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
403entry:
  ; Pre-loop MFMA with an all-zero accumulator; its full result seeds the loop.
404  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
405
406  br label %for.cond.preheader
407
408for.cond.preheader:
  ; Loop-carried accumulator: %mai.0 from %entry, then the previous in-loop
  ; MFMA result on the back edge.
409  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starting at 0.
410  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
411  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
412  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop when the incremented counter equals 16.
413  %cc = icmp eq i32 %inc, 16
414  br i1 %cc, label %exit, label %for.cond.preheader
415
416exit:
  ; Store the result of the last in-loop MFMA call.
417  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
418  ret void
419}
420
421; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
422
423; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
424; GCN: v_mfma_f32_32x32x1f32
425
426; Check that we are using only one tmp VGPR.
427
428; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
429; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
430
431; GCN: [[LOOP:BB[0-9_]+]]:
432; GCN-NOT: v_accvgpr
433; GCN: v_mfma_f32_32x32x1f32
434; GCN-NOT: v_accvgpr
435; GCN: s_cbranch_scc1 [[LOOP]]
436
437; GCN-COUNT-32: v_accvgpr_read_b32
; Kernel: same 16-iteration MFMA accumulation loop, with the initial
; accumulator built from a single element of an earlier MFMA result: lane 0 of
; %mai.0 is extracted and inserted into every one of the 32 lanes.
438define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) {
439entry:
  ; Pre-loop MFMA with an all-zero accumulator.
440  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  ; Only element 0 of that result is used from here on.
441  %init = extractelement <32 x float> %mai.0, i32 0
  ; Build the splat one lane at a time. The chain starts from undef, but lanes
  ; 0 through 31 are all overwritten with %init.
442  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
443  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
444  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
445  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
446  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
447  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
448  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
449  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
450  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
451  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
452  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
453  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
454  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
455  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
456  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
457  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
458  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
459  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
460  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
461  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
462  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
463  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
464  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
465  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
466  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
467  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
468  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
469  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
470  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
471  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
472  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
473  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
474
475  br label %for.cond.preheader
476
477for.cond.preheader:
  ; Loop-carried accumulator: the splat built in %entry, then the previous
  ; in-loop MFMA result on the back edge.
478  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  ; Trip counter, starting at 0.
479  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
480  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
481  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop when the incremented counter equals 16.
482  %cc = icmp eq i32 %inc, 16
483  br i1 %cc, label %exit, label %for.cond.preheader
484
485exit:
  ; Store the result of the last in-loop MFMA call.
486  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
487  ret void
488}
489
490; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
491
492; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
493
494; Check that we do not copy agprs to vgprs and back in an outer loop.
495
496; GCN: [[OUTER_LOOP:BB[0-9_]+]]:
497; GCN-NOT: v_accvgpr
498; GCN: [[INNER_LOOP:BB[0-9_]+]]:
499; GCN-NOT: v_accvgpr
500; GCN: v_mfma_f32_32x32x1f32
501; GCN-NOT: v_accvgpr
502; GCN: s_cbranch_scc1 [[INNER_LOOP]]
503; GCN-NOT: v_accvgpr
504; GCN: s_cbranch_scc1 [[OUTER_LOOP]]
505
506; Final result should be read only once after the loop.
507
508; GCN-COUNT-32: v_accvgpr_read_b32
509
; Kernel: two nested counted loops around one MFMA call. The accumulator starts
; as zeroinitializer, is carried through the inner loop, and the inner loop's
; last result is carried around the outer loop as well. Each loop exits when
; its own counter reaches 16. The final value is stored to %arg.
510define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
511entry:
512  br label %for.cond.preheader
513
  ; Outer loop header.
514for.cond.preheader:
  ; Outer accumulator: zeros on first entry, otherwise the MFMA result from
  ; the inner loop pass that just finished.
515  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
  ; Outer trip counter, starting at 0.
516  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
517  br label %inner.for.cond.preheader
518
  ; Inner loop: a single block that branches back to itself.
519inner.for.cond.preheader:
  ; Inner accumulator: seeded from the outer accumulator, then the previous
  ; MFMA result on the inner back edge.
520  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  ; Inner trip counter, reset to 0 on every entry from the outer header.
521  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
522  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
523  %inc = add nuw nsw i32 %c, 1
  ; Leave the inner loop when its incremented counter equals 16.
524  %cc = icmp eq i32 %inc, 16
525  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader
526
  ; Outer loop latch: bump the outer counter and decide whether to go again.
527inner.exit:
528  %inc.0 = add nuw nsw i32 %c.0, 1
  ; Leave the outer loop when its incremented counter equals 16.
529  %cc.0 = icmp eq i32 %inc.0, 16
530  br i1 %cc.0, label %exit, label %for.cond.preheader
531
532exit:
  ; Store the result of the last MFMA call.
533  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
534  ret void
535}
536
; Intrinsic declarations used by the kernels above.
; MFMA intrinsic: two float operands, a <32 x float> accumulator operand and
; three i32 operands; returns a <32 x float>.
537declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
; Workitem id (x) query: no arguments, returns an i32.
538declare i32 @llvm.amdgcn.workitem.id.x()
539