1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC %s
2; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC %s
3
4declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
5declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
6declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
7declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
8declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
9declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
10declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
11declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
12declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
13declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
14declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32)
15declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32)
16declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
17declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
18declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
19declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
20declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
21declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
22declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
23declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
24declare i32 @llvm.amdgcn.workitem.id.x()
25
26; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32:
27; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
28; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
29; GCN-DAG: s_load_dwordx16
30; GCN-DAG: s_load_dwordx16
31; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
32; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
33; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
34; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
35; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
36; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
37; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
38; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
39; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
40; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
41; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
42; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
43; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
44; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
45; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
46; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
47; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
48; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
49; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
50; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
51; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
52; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
53; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
54; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
55; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
56; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
57; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
58; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
59; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
60; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
61; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
62; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
63; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
64; GCN-DAG: v_accvgpr_read_b32
65; GCN-DAG: v_accvgpr_read_b32
66; GCN-DAG: v_accvgpr_read_b32
67; GCN-DAG: v_accvgpr_read_b32
68; GCN-DAG: v_accvgpr_read_b32
69; GCN-DAG: v_accvgpr_read_b32
70; GCN-DAG: v_accvgpr_read_b32
71; GCN-DAG: v_accvgpr_read_b32
72; GCN-DAG: v_accvgpr_read_b32
73; GCN-DAG: v_accvgpr_read_b32
74; GCN-DAG: v_accvgpr_read_b32
75; GCN-DAG: v_accvgpr_read_b32
76; GCN-DAG: v_accvgpr_read_b32
77; GCN-DAG: v_accvgpr_read_b32
78; GCN-DAG: v_accvgpr_read_b32
79; GCN-DAG: v_accvgpr_read_b32
80; GCN-DAG: v_accvgpr_read_b32
81; GCN-DAG: v_accvgpr_read_b32
82; GCN-DAG: v_accvgpr_read_b32
83; GCN-DAG: v_accvgpr_read_b32
84; GCN-DAG: v_accvgpr_read_b32
85; GCN-DAG: v_accvgpr_read_b32
86; GCN-DAG: v_accvgpr_read_b32
87; GCN-DAG: v_accvgpr_read_b32
88; GCN-DAG: v_accvgpr_read_b32
89; GCN-DAG: v_accvgpr_read_b32
90; GCN-DAG: v_accvgpr_read_b32
91; GCN-DAG: v_accvgpr_read_b32
92; GCN-DAG: v_accvgpr_read_b32
93; GCN-DAG: v_accvgpr_read_b32
94; GCN-DAG: v_accvgpr_read_b32
95; GCN-DAG: v_accvgpr_read_b32
96; GCN-DAG: global_store_dwordx4
97; GCN-DAG: global_store_dwordx4
98; GCN-DAG: global_store_dwordx4
99; GCN-DAG: global_store_dwordx4
100; GCN-DAG: global_store_dwordx4
101; GCN-DAG: global_store_dwordx4
102; GCN-DAG: global_store_dwordx4
103; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <32 x float> accumulator from %arg, passes it to
; the 32x32x1f32 MFMA intrinsic together with the constant operands 1.0 and
; 2.0, and writes the result back over %arg. The trailing i32 immediates
; (1, 2, 3) are the values expected after cbsz, abid and blgp in the checked
; instruction.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) {
entry:
  %acc = load <32 x float>, <32 x float> addrspace(1)* %arg
  %res = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc, i32 1, i32 2, i32 3)
  store <32 x float> %res, <32 x float> addrspace(1)* %arg
  ret void
}
111
112; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
113; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
114; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
115; GCN: s_load_dwordx16
116; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
117; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
118; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
119; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
120; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
121; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
122; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
123; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
124; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
125; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
126; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
127; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
128; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
129; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
130; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
131; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
132; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
133; GCN-DAG: v_accvgpr_read_b32
134; GCN-DAG: v_accvgpr_read_b32
135; GCN-DAG: v_accvgpr_read_b32
136; GCN-DAG: v_accvgpr_read_b32
137; GCN-DAG: v_accvgpr_read_b32
138; GCN-DAG: v_accvgpr_read_b32
139; GCN-DAG: v_accvgpr_read_b32
140; GCN-DAG: v_accvgpr_read_b32
141; GCN-DAG: v_accvgpr_read_b32
142; GCN-DAG: v_accvgpr_read_b32
143; GCN-DAG: v_accvgpr_read_b32
144; GCN-DAG: v_accvgpr_read_b32
145; GCN-DAG: v_accvgpr_read_b32
146; GCN-DAG: v_accvgpr_read_b32
147; GCN-DAG: v_accvgpr_read_b32
148; GCN-DAG: v_accvgpr_read_b32
149; GCN-DAG: global_store_dwordx4
150; GCN-DAG: global_store_dwordx4
151; GCN-DAG: global_store_dwordx4
152; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <16 x float> accumulator from %arg, passes it to
; the 16x16x1f32 MFMA intrinsic with the constant operands 1.0 and 2.0, and
; writes the result back over %arg. The trailing i32 immediates (1, 2, 3) are
; the values expected after cbsz, abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) {
entry:
  %acc = load <16 x float>, <16 x float> addrspace(1)* %arg
  %res = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %acc, i32 1, i32 2, i32 3)
  store <16 x float> %res, <16 x float> addrspace(1)* %arg
  ret void
}
160
161; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32:
162; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
163; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
164; GCN: s_load_dwordx4
165; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
166; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
167; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
168; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
169; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
170; GCN: v_accvgpr_read_b32
171; GCN: v_accvgpr_read_b32
172; GCN: v_accvgpr_read_b32
173; GCN: v_accvgpr_read_b32
174; GCN: global_store_dwordx4
; Kernel under test: reads a <4 x float> accumulator from %arg, passes it to
; the 4x4x1f32 MFMA intrinsic with the constant operands 1.0 and 2.0, and
; writes the result back over %arg. The trailing i32 immediates (1, 2, 3) are
; the values expected after cbsz, abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) {
entry:
  %acc = load <4 x float>, <4 x float> addrspace(1)* %arg
  %res = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %acc, i32 1, i32 2, i32 3)
  store <4 x float> %res, <4 x float> addrspace(1)* %arg
  ret void
}
182
183; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32:
184; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
185; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
186; GCN: s_load_dwordx16
187; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
188; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
189; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
190; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
191; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
192; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
193; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
194; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
195; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
196; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
197; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
198; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
199; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
200; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
201; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
202; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
203; GCN: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
204; GCN-DAG: v_accvgpr_read_b32
205; GCN-DAG: v_accvgpr_read_b32
206; GCN-DAG: v_accvgpr_read_b32
207; GCN-DAG: v_accvgpr_read_b32
208; GCN-DAG: v_accvgpr_read_b32
209; GCN-DAG: v_accvgpr_read_b32
210; GCN-DAG: v_accvgpr_read_b32
211; GCN-DAG: v_accvgpr_read_b32
212; GCN-DAG: v_accvgpr_read_b32
213; GCN-DAG: v_accvgpr_read_b32
214; GCN-DAG: v_accvgpr_read_b32
215; GCN-DAG: v_accvgpr_read_b32
216; GCN-DAG: v_accvgpr_read_b32
217; GCN-DAG: v_accvgpr_read_b32
218; GCN-DAG: v_accvgpr_read_b32
219; GCN-DAG: v_accvgpr_read_b32
220; GCN-DAG: global_store_dwordx4
221; GCN-DAG: global_store_dwordx4
222; GCN-DAG: global_store_dwordx4
223; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <16 x float> accumulator from %arg, passes it to
; the 32x32x2f32 MFMA intrinsic with the constant operands 1.0 and 2.0, and
; writes the result back over %arg. The trailing i32 immediates (1, 2, 3) are
; the values expected after cbsz, abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_32x32x2f32(<16 x float> addrspace(1)* %arg) {
entry:
  %acc = load <16 x float>, <16 x float> addrspace(1)* %arg
  %res = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %acc, i32 1, i32 2, i32 3)
  store <16 x float> %res, <16 x float> addrspace(1)* %arg
  ret void
}
231
232; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32:
233; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
234; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
235; GCN: s_load_dwordx4
236; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
237; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
238; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
239; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
240; GCN: v_mfma_f32_16x16x4f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
241; GCN-DAG: v_accvgpr_read_b32
242; GCN-DAG: v_accvgpr_read_b32
243; GCN-DAG: v_accvgpr_read_b32
244; GCN-DAG: v_accvgpr_read_b32
245; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <4 x float> accumulator from %arg, passes it to
; the 16x16x4f32 MFMA intrinsic with the constant operands 1.0 and 2.0, and
; writes the result back over %arg. The trailing i32 immediates (1, 2, 3) are
; the values expected after cbsz, abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_16x16x4f32(<4 x float> addrspace(1)* %arg) {
entry:
  %acc = load <4 x float>, <4 x float> addrspace(1)* %arg
  %res = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %acc, i32 1, i32 2, i32 3)
  store <4 x float> %res, <4 x float> addrspace(1)* %arg
  ret void
}
253
254; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16:
255; GCN-DAG: s_load_dwordx16
256; GCN-DAG: s_load_dwordx16
257; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
258; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
259; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
260; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
261; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
262; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
263; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
264; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
265; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
266; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
267; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
268; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
269; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
270; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
271; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
272; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
273; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
274; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
275; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
276; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
277; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
278; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
279; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
280; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
281; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
282; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
283; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
284; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
285; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
286; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
287; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
288; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
289; GCN: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
290; GCN-DAG: v_accvgpr_read_b32
291; GCN-DAG: v_accvgpr_read_b32
292; GCN-DAG: v_accvgpr_read_b32
293; GCN-DAG: v_accvgpr_read_b32
294; GCN-DAG: v_accvgpr_read_b32
295; GCN-DAG: v_accvgpr_read_b32
296; GCN-DAG: v_accvgpr_read_b32
297; GCN-DAG: v_accvgpr_read_b32
298; GCN-DAG: v_accvgpr_read_b32
299; GCN-DAG: v_accvgpr_read_b32
300; GCN-DAG: v_accvgpr_read_b32
301; GCN-DAG: v_accvgpr_read_b32
302; GCN-DAG: v_accvgpr_read_b32
303; GCN-DAG: v_accvgpr_read_b32
304; GCN-DAG: v_accvgpr_read_b32
305; GCN-DAG: v_accvgpr_read_b32
306; GCN-DAG: v_accvgpr_read_b32
307; GCN-DAG: v_accvgpr_read_b32
308; GCN-DAG: v_accvgpr_read_b32
309; GCN-DAG: v_accvgpr_read_b32
310; GCN-DAG: v_accvgpr_read_b32
311; GCN-DAG: v_accvgpr_read_b32
312; GCN-DAG: v_accvgpr_read_b32
313; GCN-DAG: v_accvgpr_read_b32
314; GCN-DAG: v_accvgpr_read_b32
315; GCN-DAG: v_accvgpr_read_b32
316; GCN-DAG: v_accvgpr_read_b32
317; GCN-DAG: v_accvgpr_read_b32
318; GCN-DAG: v_accvgpr_read_b32
319; GCN-DAG: v_accvgpr_read_b32
320; GCN-DAG: v_accvgpr_read_b32
321; GCN-DAG: v_accvgpr_read_b32
322; GCN-DAG: global_store_dwordx4
323; GCN-DAG: global_store_dwordx4
324; GCN-DAG: global_store_dwordx4
325; GCN-DAG: global_store_dwordx4
326; GCN-DAG: global_store_dwordx4
327; GCN-DAG: global_store_dwordx4
328; GCN-DAG: global_store_dwordx4
329; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <32 x float> accumulator from %arg and two
; consecutive <4 x half> operands from %c (element 0 and element 1), passes
; them to the 32x32x4f16 MFMA intrinsic, and writes the result back over %arg.
; The trailing i32 immediates (1, 2, 3) are the values expected after cbsz,
; abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
entry:
  %acc = load <32 x float>, <32 x float> addrspace(1)* %arg
  %a = load <4 x half>, <4 x half> addrspace(1)* %c
  ; Second multiplicand lives one <4 x half> element past %c.
  %b.ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %res = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %acc, i32 1, i32 2, i32 3)
  store <32 x float> %res, <32 x float> addrspace(1)* %arg
  ret void
}
340
341; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16:
342; GCN: s_load_dwordx16
343; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
344; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
345; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
346; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
347; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
348; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
349; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
350; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
351; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
352; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
353; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
354; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
355; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
356; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
357; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
358; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
359; GCN: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
360; GCN-DAG: v_accvgpr_read_b32
361; GCN-DAG: v_accvgpr_read_b32
362; GCN-DAG: v_accvgpr_read_b32
363; GCN-DAG: v_accvgpr_read_b32
364; GCN-DAG: v_accvgpr_read_b32
365; GCN-DAG: v_accvgpr_read_b32
366; GCN-DAG: v_accvgpr_read_b32
367; GCN-DAG: v_accvgpr_read_b32
368; GCN-DAG: v_accvgpr_read_b32
369; GCN-DAG: v_accvgpr_read_b32
370; GCN-DAG: v_accvgpr_read_b32
371; GCN-DAG: v_accvgpr_read_b32
372; GCN-DAG: v_accvgpr_read_b32
373; GCN-DAG: v_accvgpr_read_b32
374; GCN-DAG: v_accvgpr_read_b32
375; GCN-DAG: v_accvgpr_read_b32
376; GCN-DAG: global_store_dwordx4
377; GCN-DAG: global_store_dwordx4
378; GCN-DAG: global_store_dwordx4
379; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <16 x float> accumulator from %arg and two
; consecutive <4 x half> operands from %c (element 0 and element 1), passes
; them to the 16x16x4f16 MFMA intrinsic, and writes the result back over %arg.
; The trailing i32 immediates (1, 2, 3) are the values expected after cbsz,
; abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_16x16x4f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
entry:
  %acc = load <16 x float>, <16 x float> addrspace(1)* %arg
  %a = load <4 x half>, <4 x half> addrspace(1)* %c
  ; Second multiplicand lives one <4 x half> element past %c.
  %b.ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %res = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %acc, i32 1, i32 2, i32 3)
  store <16 x float> %res, <16 x float> addrspace(1)* %arg
  ret void
}
390
391; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16:
392; GCN: s_load_dwordx4
393; GCN: s_load_dwordx2
394; GCN: s_load_dwordx2
395; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
396; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
397; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
398; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
399; GCN: v_mfma_f32_4x4x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
400; GCN-DAG: v_accvgpr_read_b32
401; GCN-DAG: v_accvgpr_read_b32
402; GCN-DAG: v_accvgpr_read_b32
403; GCN-DAG: v_accvgpr_read_b32
404; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <4 x float> accumulator from %arg and two
; consecutive <4 x half> operands from %c (element 0 and element 1), passes
; them to the 4x4x4f16 MFMA intrinsic, and writes the result back over %arg.
; The trailing i32 immediates (1, 2, 3) are the values expected after cbsz,
; abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_4x4x4f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
entry:
  %acc = load <4 x float>, <4 x float> addrspace(1)* %arg
  %a = load <4 x half>, <4 x half> addrspace(1)* %c
  ; Second multiplicand lives one <4 x half> element past %c.
  %b.ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %res = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %acc, i32 1, i32 2, i32 3)
  store <4 x float> %res, <4 x float> addrspace(1)* %arg
  ret void
}
415
416; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16:
417; GCN: s_load_dwordx16
418; GCN: s_waitcnt lgkmcnt(0)
419; GCN: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
420; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
421; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
422; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
423; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
424; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
425; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
426; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
427; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
428; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
429; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
430; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
431; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
432; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
433; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
434; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
435; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
436; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
437; GCN-DAG: v_accvgpr_read_b32
438; GCN-DAG: v_accvgpr_read_b32
439; GCN-DAG: v_accvgpr_read_b32
440; GCN-DAG: v_accvgpr_read_b32
441; GCN-DAG: v_accvgpr_read_b32
442; GCN-DAG: v_accvgpr_read_b32
443; GCN-DAG: v_accvgpr_read_b32
444; GCN-DAG: v_accvgpr_read_b32
445; GCN-DAG: v_accvgpr_read_b32
446; GCN-DAG: v_accvgpr_read_b32
447; GCN-DAG: v_accvgpr_read_b32
448; GCN-DAG: v_accvgpr_read_b32
449; GCN-DAG: v_accvgpr_read_b32
450; GCN-DAG: v_accvgpr_read_b32
451; GCN-DAG: v_accvgpr_read_b32
452; GCN-DAG: v_accvgpr_read_b32
453; GCN-DAG: global_store_dwordx4
454; GCN-DAG: global_store_dwordx4
455; GCN-DAG: global_store_dwordx4
456; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <16 x float> accumulator from %arg and two
; consecutive <4 x half> operands from %c (element 0 and element 1), passes
; them to the 32x32x8f16 MFMA intrinsic, and writes the result back over %arg.
; The trailing i32 immediates (1, 2, 3) are the values expected after cbsz,
; abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_32x32x8f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
entry:
  %acc = load <16 x float>, <16 x float> addrspace(1)* %arg
  %a = load <4 x half>, <4 x half> addrspace(1)* %c
  ; Second multiplicand lives one <4 x half> element past %c.
  %b.ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %res = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %acc, i32 1, i32 2, i32 3)
  store <16 x float> %res, <16 x float> addrspace(1)* %arg
  ret void
}
467
468; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16:
469; GCN: s_load_dwordx4
470; GCN: s_load_dwordx4
471; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
472; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
473; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
474; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
475; GCN: v_mfma_f32_16x16x16f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
476; GCN-DAG: v_accvgpr_read_b32
477; GCN-DAG: v_accvgpr_read_b32
478; GCN-DAG: v_accvgpr_read_b32
479; GCN-DAG: v_accvgpr_read_b32
480; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <4 x float> accumulator from %arg and two
; consecutive <4 x half> operands from %c (element 0 and element 1), passes
; them to the 16x16x16f16 MFMA intrinsic, and writes the result back over
; %arg. The trailing i32 immediates (1, 2, 3) are the values expected after
; cbsz, abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_f32_16x16x16f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
entry:
  %acc = load <4 x float>, <4 x float> addrspace(1)* %arg
  %a = load <4 x half>, <4 x half> addrspace(1)* %c
  ; Second multiplicand lives one <4 x half> element past %c.
  %b.ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %res = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %acc, i32 1, i32 2, i32 3)
  store <4 x float> %res, <4 x float> addrspace(1)* %arg
  ret void
}
491
492; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8:
493; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
494; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
495; GCN-DAG: s_load_dwordx16
496; GCN-DAG: s_load_dwordx16
497; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
498; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
499; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
500; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
501; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
502; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
503; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
504; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
505; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
506; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
507; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
508; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
509; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
510; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
511; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
512; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
513; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
514; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
515; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
516; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
517; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
518; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
519; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
520; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
521; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
522; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
523; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
524; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
525; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
526; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
527; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
528; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
529; GCN: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
530; GCN-DAG: v_accvgpr_read_b32
531; GCN-DAG: v_accvgpr_read_b32
532; GCN-DAG: v_accvgpr_read_b32
533; GCN-DAG: v_accvgpr_read_b32
534; GCN-DAG: v_accvgpr_read_b32
535; GCN-DAG: v_accvgpr_read_b32
536; GCN-DAG: v_accvgpr_read_b32
537; GCN-DAG: v_accvgpr_read_b32
538; GCN-DAG: v_accvgpr_read_b32
539; GCN-DAG: v_accvgpr_read_b32
540; GCN-DAG: v_accvgpr_read_b32
541; GCN-DAG: v_accvgpr_read_b32
542; GCN-DAG: v_accvgpr_read_b32
543; GCN-DAG: v_accvgpr_read_b32
544; GCN-DAG: v_accvgpr_read_b32
545; GCN-DAG: v_accvgpr_read_b32
546; GCN-DAG: v_accvgpr_read_b32
547; GCN-DAG: v_accvgpr_read_b32
548; GCN-DAG: v_accvgpr_read_b32
549; GCN-DAG: v_accvgpr_read_b32
550; GCN-DAG: v_accvgpr_read_b32
551; GCN-DAG: v_accvgpr_read_b32
552; GCN-DAG: v_accvgpr_read_b32
553; GCN-DAG: v_accvgpr_read_b32
554; GCN-DAG: v_accvgpr_read_b32
555; GCN-DAG: v_accvgpr_read_b32
556; GCN-DAG: v_accvgpr_read_b32
557; GCN-DAG: v_accvgpr_read_b32
558; GCN-DAG: v_accvgpr_read_b32
559; GCN-DAG: v_accvgpr_read_b32
560; GCN-DAG: v_accvgpr_read_b32
561; GCN-DAG: v_accvgpr_read_b32
562; GCN-DAG: global_store_dwordx4
563; GCN-DAG: global_store_dwordx4
564; GCN-DAG: global_store_dwordx4
565; GCN-DAG: global_store_dwordx4
566; GCN-DAG: global_store_dwordx4
567; GCN-DAG: global_store_dwordx4
568; GCN-DAG: global_store_dwordx4
569; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <32 x i32> accumulator from %arg, passes it to
; the 32x32x4i8 MFMA intrinsic with the constant i32 operands 1 and 2, and
; writes the result back over %arg. The trailing i32 immediates (1, 2, 3) are
; the values expected after cbsz, abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_i32_32x32x4i8(<32 x i32> addrspace(1)* %arg) {
entry:
  %acc = load <32 x i32>, <32 x i32> addrspace(1)* %arg
  %res = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %acc, i32 1, i32 2, i32 3)
  store <32 x i32> %res, <32 x i32> addrspace(1)* %arg
  ret void
}
577
578; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8:
579; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
580; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
581; GCN: s_load_dwordx16
582; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
583; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
584; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
585; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
586; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
587; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
588; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
589; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
590; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
591; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
592; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
593; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
594; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
595; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
596; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
597; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
598; GCN: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
599; GCN-DAG: v_accvgpr_read_b32
600; GCN-DAG: v_accvgpr_read_b32
601; GCN-DAG: v_accvgpr_read_b32
602; GCN-DAG: v_accvgpr_read_b32
603; GCN-DAG: v_accvgpr_read_b32
604; GCN-DAG: v_accvgpr_read_b32
605; GCN-DAG: v_accvgpr_read_b32
606; GCN-DAG: v_accvgpr_read_b32
607; GCN-DAG: v_accvgpr_read_b32
608; GCN-DAG: v_accvgpr_read_b32
609; GCN-DAG: v_accvgpr_read_b32
610; GCN-DAG: v_accvgpr_read_b32
611; GCN-DAG: v_accvgpr_read_b32
612; GCN-DAG: v_accvgpr_read_b32
613; GCN-DAG: v_accvgpr_read_b32
614; GCN-DAG: v_accvgpr_read_b32
615; GCN-DAG: global_store_dwordx4
616; GCN-DAG: global_store_dwordx4
617; GCN-DAG: global_store_dwordx4
618; GCN-DAG: global_store_dwordx4
; Kernel under test: reads a <16 x i32> accumulator from %arg, passes it to
; the 16x16x4i8 MFMA intrinsic with the constant i32 operands 1 and 2, and
; writes the result back over %arg. The trailing i32 immediates (1, 2, 3) are
; the values expected after cbsz, abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_i32_16x16x4i8(<16 x i32> addrspace(1)* %arg) {
entry:
  %acc = load <16 x i32>, <16 x i32> addrspace(1)* %arg
  %res = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %acc, i32 1, i32 2, i32 3)
  store <16 x i32> %res, <16 x i32> addrspace(1)* %arg
  ret void
}
626
627; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8:
628; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
629; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
630; GCN: s_load_dwordx4
631; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
632; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
633; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
634; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
635; GCN: v_mfma_i32_4x4x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
636; GCN: v_accvgpr_read_b32
637; GCN: v_accvgpr_read_b32
638; GCN: v_accvgpr_read_b32
639; GCN: v_accvgpr_read_b32
640; GCN: global_store_dwordx4
; Kernel under test: reads a <4 x i32> accumulator from %arg, passes it to
; the 4x4x4i8 MFMA intrinsic with the constant i32 operands 1 and 2, and
; writes the result back over %arg. The trailing i32 immediates (1, 2, 3) are
; the values expected after cbsz, abid and blgp in the checked instruction.
define amdgpu_kernel void @test_mfma_i32_4x4x4i8(<4 x i32> addrspace(1)* %arg) {
entry:
  %acc = load <4 x i32>, <4 x i32> addrspace(1)* %arg
  %res = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %acc, i32 1, i32 2, i32 3)
  store <4 x i32> %res, <4 x i32> addrspace(1)* %arg
  ret void
}
648
649; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
650; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
651; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
652; GCN: s_load_dwordx16
653; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
654; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
655; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
656; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
657; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
658; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
659; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
660; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
661; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
662; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
663; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
664; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
665; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
666; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
667; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
668; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
669; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
670; GCN-DAG: v_accvgpr_read_b32
671; GCN-DAG: v_accvgpr_read_b32
672; GCN-DAG: v_accvgpr_read_b32
673; GCN-DAG: v_accvgpr_read_b32
674; GCN-DAG: v_accvgpr_read_b32
675; GCN-DAG: v_accvgpr_read_b32
676; GCN-DAG: v_accvgpr_read_b32
677; GCN-DAG: v_accvgpr_read_b32
678; GCN-DAG: v_accvgpr_read_b32
679; GCN-DAG: v_accvgpr_read_b32
680; GCN-DAG: v_accvgpr_read_b32
681; GCN-DAG: v_accvgpr_read_b32
682; GCN-DAG: v_accvgpr_read_b32
683; GCN-DAG: v_accvgpr_read_b32
684; GCN-DAG: v_accvgpr_read_b32
685; GCN-DAG: v_accvgpr_read_b32
686; GCN-DAG: global_store_dwordx4
687; GCN-DAG: global_store_dwordx4
688; GCN-DAG: global_store_dwordx4
689; GCN-DAG: global_store_dwordx4
; Kernel under test: read a <16 x i32> accumulator from %arg, run the 32x32x8
; i8 MFMA on it with the constant sources 1 and 2, and write the result back
; to %arg. The trailing immediates 1, 2, 3 are what the checks above expect to
; surface as the cbsz, abid and blgp modifiers.
define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) {
bb:
  %acc.in = load <16 x i32>, <16 x i32> addrspace(1)* %arg
  %acc.out = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %acc.in, i32 1, i32 2, i32 3)
  store <16 x i32> %acc.out, <16 x i32> addrspace(1)* %arg
  ret void
}
697
698; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
699; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
700; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
701; GCN: s_load_dwordx4
702; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
703; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
704; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
705; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
706; GCN: v_mfma_i32_16x16x16i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
707; GCN-DAG: v_accvgpr_read_b32
708; GCN-DAG: v_accvgpr_read_b32
709; GCN-DAG: v_accvgpr_read_b32
710; GCN-DAG: v_accvgpr_read_b32
711; GCN-DAG: global_store_dwordx4
; Kernel under test: read a <4 x i32> accumulator from %arg, run the 16x16x16
; i8 MFMA on it with the constant sources 1 and 2, and write the result back
; to %arg. The trailing immediates 1, 2, 3 are what the checks above expect to
; surface as the cbsz, abid and blgp modifiers.
define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) {
bb:
  %acc.in = load <4 x i32>, <4 x i32> addrspace(1)* %arg
  %acc.out = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %acc.in, i32 1, i32 2, i32 3)
  store <4 x i32> %acc.out, <4 x i32> addrspace(1)* %arg
  ret void
}
719
720; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
721; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
722; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
723; GCN-DAG: s_load_dwordx16
724; GCN-DAG: s_load_dwordx16
725; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
726; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
727; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
728; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
729; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
730; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
731; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
732; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
733; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
734; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
735; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
736; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
737; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
738; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
739; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
740; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
741; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
742; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
743; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
744; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
745; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
746; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
747; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
748; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
749; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
750; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
751; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
752; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
753; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
754; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
755; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
756; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
757; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
758; GCN-DAG: v_accvgpr_read_b32
759; GCN-DAG: v_accvgpr_read_b32
760; GCN-DAG: v_accvgpr_read_b32
761; GCN-DAG: v_accvgpr_read_b32
762; GCN-DAG: v_accvgpr_read_b32
763; GCN-DAG: v_accvgpr_read_b32
764; GCN-DAG: v_accvgpr_read_b32
765; GCN-DAG: v_accvgpr_read_b32
766; GCN-DAG: v_accvgpr_read_b32
767; GCN-DAG: v_accvgpr_read_b32
768; GCN-DAG: v_accvgpr_read_b32
769; GCN-DAG: v_accvgpr_read_b32
770; GCN-DAG: v_accvgpr_read_b32
771; GCN-DAG: v_accvgpr_read_b32
772; GCN-DAG: v_accvgpr_read_b32
773; GCN-DAG: v_accvgpr_read_b32
774; GCN-DAG: v_accvgpr_read_b32
775; GCN-DAG: v_accvgpr_read_b32
776; GCN-DAG: v_accvgpr_read_b32
777; GCN-DAG: v_accvgpr_read_b32
778; GCN-DAG: v_accvgpr_read_b32
779; GCN-DAG: v_accvgpr_read_b32
780; GCN-DAG: v_accvgpr_read_b32
781; GCN-DAG: v_accvgpr_read_b32
782; GCN-DAG: v_accvgpr_read_b32
783; GCN-DAG: v_accvgpr_read_b32
784; GCN-DAG: v_accvgpr_read_b32
785; GCN-DAG: v_accvgpr_read_b32
786; GCN-DAG: v_accvgpr_read_b32
787; GCN-DAG: v_accvgpr_read_b32
788; GCN-DAG: v_accvgpr_read_b32
789; GCN-DAG: v_accvgpr_read_b32
790; GCN-DAG: global_store_dwordx4
791; GCN-DAG: global_store_dwordx4
792; GCN-DAG: global_store_dwordx4
793; GCN-DAG: global_store_dwordx4
794; GCN-DAG: global_store_dwordx4
795; GCN-DAG: global_store_dwordx4
796; GCN-DAG: global_store_dwordx4
797; GCN-DAG: global_store_dwordx4
; Kernel under test: read a <32 x float> accumulator from %arg and run the
; 32x32x2 bf16 MFMA on it, then write the result back to %arg. The two
; <2 x i16> sources are the integer constants 1 and 2 reinterpreted via
; bitcast; the checks above expect them as plain v_mov_b32 of 1 and 2.
; The trailing immediates 1, 2, 3 are expected as cbsz, abid and blgp.
define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) {
bb:
  %acc.in = load <32 x float>, <32 x float> addrspace(1)* %arg
  %src.a = bitcast i32 1 to <2 x i16>
  %src.b = bitcast i32 2 to <2 x i16>
  %acc.out = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %src.a, <2 x i16> %src.b, <32 x float> %acc.in, i32 1, i32 2, i32 3)
  store <32 x float> %acc.out, <32 x float> addrspace(1)* %arg
  ret void
}
807
808; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16:
809; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
810; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
811; GCN: s_load_dwordx16
812; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
813; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
814; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
815; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
816; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
817; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
818; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
819; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
820; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
821; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
822; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
823; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
824; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
825; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
826; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
827; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
828; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
829; GCN-DAG: v_accvgpr_read_b32
830; GCN-DAG: v_accvgpr_read_b32
831; GCN-DAG: v_accvgpr_read_b32
832; GCN-DAG: v_accvgpr_read_b32
833; GCN-DAG: v_accvgpr_read_b32
834; GCN-DAG: v_accvgpr_read_b32
835; GCN-DAG: v_accvgpr_read_b32
836; GCN-DAG: v_accvgpr_read_b32
837; GCN-DAG: v_accvgpr_read_b32
838; GCN-DAG: v_accvgpr_read_b32
839; GCN-DAG: v_accvgpr_read_b32
840; GCN-DAG: v_accvgpr_read_b32
841; GCN-DAG: v_accvgpr_read_b32
842; GCN-DAG: v_accvgpr_read_b32
843; GCN-DAG: v_accvgpr_read_b32
844; GCN-DAG: v_accvgpr_read_b32
845; GCN-DAG: global_store_dwordx4
846; GCN-DAG: global_store_dwordx4
847; GCN-DAG: global_store_dwordx4
848; GCN-DAG: global_store_dwordx4
; Kernel under test: read a <16 x float> accumulator from %arg and run the
; 16x16x2 bf16 MFMA on it, then write the result back to %arg. The two
; <2 x i16> sources are the integer constants 1 and 2 reinterpreted via
; bitcast; the checks above expect them as plain v_mov_b32 of 1 and 2.
; The trailing immediates 1, 2, 3 are expected as cbsz, abid and blgp.
define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) {
bb:
  %acc.in = load <16 x float>, <16 x float> addrspace(1)* %arg
  %src.a = bitcast i32 1 to <2 x i16>
  %src.b = bitcast i32 2 to <2 x i16>
  %acc.out = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %src.a, <2 x i16> %src.b, <16 x float> %acc.in, i32 1, i32 2, i32 3)
  store <16 x float> %acc.out, <16 x float> addrspace(1)* %arg
  ret void
}
858
859; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16:
860; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
861; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
862; GCN: s_load_dwordx4
863; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
864; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
865; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
866; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
867; GCN: v_mfma_f32_4x4x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
868; GCN-DAG: v_accvgpr_read_b32
869; GCN-DAG: v_accvgpr_read_b32
870; GCN-DAG: v_accvgpr_read_b32
871; GCN-DAG: v_accvgpr_read_b32
872; GCN-DAG: global_store_dwordx4
; Kernel under test: read a <4 x float> accumulator from %arg and run the
; 4x4x2 bf16 MFMA on it, then write the result back to %arg. The two
; <2 x i16> sources are the integer constants 1 and 2 reinterpreted via
; bitcast; the checks above expect them as plain v_mov_b32 of 1 and 2.
; The trailing immediates 1, 2, 3 are expected as cbsz, abid and blgp.
define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) {
bb:
  %acc.in = load <4 x float>, <4 x float> addrspace(1)* %arg
  %src.a = bitcast i32 1 to <2 x i16>
  %src.b = bitcast i32 2 to <2 x i16>
  %acc.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %src.a, <2 x i16> %src.b, <4 x float> %acc.in, i32 1, i32 2, i32 3)
  store <4 x float> %acc.out, <4 x float> addrspace(1)* %arg
  ret void
}
882
883; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16:
884; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
885; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
886; GCN: s_load_dwordx16
887; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
888; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
889; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
890; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
891; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
892; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
893; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
894; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
895; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
896; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
897; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
898; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
899; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
900; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
901; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
902; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
903; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
904; GCN-DAG: v_accvgpr_read_b32
905; GCN-DAG: v_accvgpr_read_b32
906; GCN-DAG: v_accvgpr_read_b32
907; GCN-DAG: v_accvgpr_read_b32
908; GCN-DAG: v_accvgpr_read_b32
909; GCN-DAG: v_accvgpr_read_b32
910; GCN-DAG: v_accvgpr_read_b32
911; GCN-DAG: v_accvgpr_read_b32
912; GCN-DAG: v_accvgpr_read_b32
913; GCN-DAG: v_accvgpr_read_b32
914; GCN-DAG: v_accvgpr_read_b32
915; GCN-DAG: v_accvgpr_read_b32
916; GCN-DAG: v_accvgpr_read_b32
917; GCN-DAG: v_accvgpr_read_b32
918; GCN-DAG: v_accvgpr_read_b32
919; GCN-DAG: v_accvgpr_read_b32
920; GCN-DAG: global_store_dwordx4
921; GCN-DAG: global_store_dwordx4
922; GCN-DAG: global_store_dwordx4
923; GCN-DAG: global_store_dwordx4
; Kernel under test: read a <16 x float> accumulator from %arg and run the
; 32x32x4 bf16 MFMA on it, then write the result back to %arg. The two
; <2 x i16> sources are the integer constants 1 and 2 reinterpreted via
; bitcast; the checks above expect them as plain v_mov_b32 of 1 and 2.
; The trailing immediates 1, 2, 3 are expected as cbsz, abid and blgp.
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) {
bb:
  %acc.in = load <16 x float>, <16 x float> addrspace(1)* %arg
  %src.a = bitcast i32 1 to <2 x i16>
  %src.b = bitcast i32 2 to <2 x i16>
  %acc.out = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %src.a, <2 x i16> %src.b, <16 x float> %acc.in, i32 1, i32 2, i32 3)
  store <16 x float> %acc.out, <16 x float> addrspace(1)* %arg
  ret void
}
933
934; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16:
935; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
936; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
937; GCN: s_load_dwordx4
938; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
939; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
940; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
941; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
942; GCN: v_mfma_f32_16x16x8bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
943; GCN-DAG: v_accvgpr_read_b32
944; GCN-DAG: v_accvgpr_read_b32
945; GCN-DAG: v_accvgpr_read_b32
946; GCN-DAG: v_accvgpr_read_b32
947; GCN-DAG: global_store_dwordx4
; Kernel under test: read a <4 x float> accumulator from %arg and run the
; 16x16x8 bf16 MFMA on it, then write the result back to %arg. The two
; <2 x i16> sources are the integer constants 1 and 2 reinterpreted via
; bitcast; the checks above expect them as plain v_mov_b32 of 1 and 2.
; The trailing immediates 1, 2, 3 are expected as cbsz, abid and blgp.
define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) {
bb:
  %acc.in = load <4 x float>, <4 x float> addrspace(1)* %arg
  %src.a = bitcast i32 1 to <2 x i16>
  %src.b = bitcast i32 2 to <2 x i16>
  %acc.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %src.a, <2 x i16> %src.b, <4 x float> %acc.in, i32 1, i32 2, i32 3)
  store <4 x float> %acc.out, <4 x float> addrspace(1)* %arg
  ret void
}
957
958; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc:
959; GCN:      v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
960; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
; Accumulator forwarding, 32x32x1 f32: two back-to-back MFMAs where the result
; of the first is the accumulator input of the second. The checks above expect
; the second instruction to immediately follow the first and to consume the
; first one's destination register range directly.
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) {
bb:
  %acc.in = load <32 x float>, <32 x float> addrspace(1)* %arg
  %acc.mid = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc.in, i32 0, i32 0, i32 0)
  %acc.out = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc.mid, i32 0, i32 0, i32 0)
  store <32 x float> %acc.out, <32 x float> addrspace(1)* %arg
  ret void
}
969
970; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc:
971; GCN:      v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
972; GCN-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
; Accumulator forwarding, 16x16x1 f32: two back-to-back MFMAs where the result
; of the first is the accumulator input of the second. The checks above expect
; the second instruction to immediately follow the first and to consume the
; first one's destination register range directly.
define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(<16 x float> addrspace(1)* %arg) {
bb:
  %acc.in = load <16 x float>, <16 x float> addrspace(1)* %arg
  %acc.mid = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %acc.in, i32 0, i32 0, i32 0)
  %acc.out = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %acc.mid, i32 0, i32 0, i32 0)
  store <16 x float> %acc.out, <16 x float> addrspace(1)* %arg
  ret void
}
981
982; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc:
983; GCN:      v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
984; GCN-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]]
; Accumulator forwarding, 4x4x1 f32: two back-to-back MFMAs where the result
; of the first is the accumulator input of the second. The checks above expect
; the second instruction to immediately follow the first and to consume the
; first one's destination register range directly.
define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(<4 x float> addrspace(1)* %arg) {
bb:
  %acc.in = load <4 x float>, <4 x float> addrspace(1)* %arg
  %acc.mid = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %acc.in, i32 0, i32 0, i32 0)
  %acc.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %acc.mid, i32 0, i32 0, i32 0)
  store <4 x float> %acc.out, <4 x float> addrspace(1)* %arg
  ret void
}
993
994; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat:
995; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
996; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
997; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
998; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
999; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1000; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1001; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
1002; LIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
1003; GCN: v_accvgpr_read_b32
1004; GCN: v_accvgpr_read_b32
1005; GCN: v_accvgpr_read_b32
1006; GCN: v_accvgpr_read_b32
1007; GCN: global_store_dwordx4
; Splat-constant accumulator, 4x4x1 f32: the accumulator operand is a vector
; whose four elements are all 1.0. Per the checks above, the default gfx908 run
; expects the splat to be written into AGPRs before the MFMA, while the run
; with the mfma-inline-literal-bug feature disabled expects 1.0 to appear
; inline as the accumulator source.
define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(<4 x float> addrspace(1)* %arg) {
bb:
  %acc.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
  store <4 x float> %acc.out, <4 x float> addrspace(1)* %arg
  ret void
}
1014
1015; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat:
1016; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
1017; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
1018; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1019; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
1020; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0
1021; GCN-DAG: v_accvgpr_read_b32
1022; GCN-DAG: v_accvgpr_read_b32
1023; GCN-DAG: v_accvgpr_read_b32
1024; GCN-DAG: v_accvgpr_read_b32
1025; GCN-DAG: v_accvgpr_read_b32
1026; GCN-DAG: v_accvgpr_read_b32
1027; GCN-DAG: v_accvgpr_read_b32
1028; GCN-DAG: v_accvgpr_read_b32
1029; GCN-DAG: v_accvgpr_read_b32
1030; GCN-DAG: v_accvgpr_read_b32
1031; GCN-DAG: v_accvgpr_read_b32
1032; GCN-DAG: v_accvgpr_read_b32
1033; GCN-DAG: v_accvgpr_read_b32
1034; GCN-DAG: v_accvgpr_read_b32
1035; GCN-DAG: v_accvgpr_read_b32
1036; GCN-DAG: v_accvgpr_read_b32
1037; GCN-DAG: global_store_dwordx4
1038; GCN-DAG: global_store_dwordx4
1039; GCN-DAG: global_store_dwordx4
1040; GCN-DAG: global_store_dwordx4
; Splat-constant accumulator, 16x16x1 f32: the accumulator operand is a vector
; whose sixteen elements are all 1.0. Per the checks above, the default gfx908
; run expects the splat to be written into AGPRs before the MFMA, while the run
; with the mfma-inline-literal-bug feature disabled expects 1.0 to appear
; inline as the accumulator source.
define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(<16 x float> addrspace(1)* %arg) {
bb:
  %acc.out = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
  store <16 x float> %acc.out, <16 x float> addrspace(1)* %arg
  ret void
}
1047
1048; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat:
1049; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000
1050; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00
1051; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1052; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}]
1053; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0
1054; GCN-DAG: v_accvgpr_read_b32
1055; GCN-DAG: v_accvgpr_read_b32
1056; GCN-DAG: v_accvgpr_read_b32
1057; GCN-DAG: v_accvgpr_read_b32
1058; GCN-DAG: v_accvgpr_read_b32
1059; GCN-DAG: v_accvgpr_read_b32
1060; GCN-DAG: v_accvgpr_read_b32
1061; GCN-DAG: v_accvgpr_read_b32
1062; GCN-DAG: v_accvgpr_read_b32
1063; GCN-DAG: v_accvgpr_read_b32
1064; GCN-DAG: v_accvgpr_read_b32
1065; GCN-DAG: v_accvgpr_read_b32
1066; GCN-DAG: v_accvgpr_read_b32
1067; GCN-DAG: v_accvgpr_read_b32
1068; GCN-DAG: v_accvgpr_read_b32
1069; GCN-DAG: v_accvgpr_read_b32
1070; GCN-DAG: global_store_dwordx4
1071; GCN-DAG: global_store_dwordx4
1072; GCN-DAG: global_store_dwordx4
1073; GCN-DAG: global_store_dwordx4
; Splat-constant accumulator, 32x32x8 f16: both <4 x half> sources are constant
; splats (1.0 and 2.0), which the checks above expect as v_mov_b32 of
; 0x3c003c00 and 0x40004000. The accumulator is a sixteen-element splat of
; float 1.0: the default gfx908 run expects it written into AGPRs first, while
; the run with the mfma-inline-literal-bug feature disabled expects 1.0 inline
; as the accumulator source.
define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(<16 x float> addrspace(1)* %arg) {
bb:
  %acc.out = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
  store <16 x float> %acc.out, <16 x float> addrspace(1)* %arg
  ret void
}
1080
1081; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat:
1082; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
1083; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
1084; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1085; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}]
1086; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0
1087; GCN-DAG: v_accvgpr_read_b32
1088; GCN-DAG: v_accvgpr_read_b32
1089; GCN-DAG: v_accvgpr_read_b32
1090; GCN-DAG: v_accvgpr_read_b32
1091; GCN-DAG: v_accvgpr_read_b32
1092; GCN-DAG: v_accvgpr_read_b32
1093; GCN-DAG: v_accvgpr_read_b32
1094; GCN-DAG: v_accvgpr_read_b32
1095; GCN-DAG: v_accvgpr_read_b32
1096; GCN-DAG: v_accvgpr_read_b32
1097; GCN-DAG: v_accvgpr_read_b32
1098; GCN-DAG: v_accvgpr_read_b32
1099; GCN-DAG: v_accvgpr_read_b32
1100; GCN-DAG: v_accvgpr_read_b32
1101; GCN-DAG: v_accvgpr_read_b32
1102; GCN-DAG: v_accvgpr_read_b32
1103; GCN-DAG: v_accvgpr_read_b32
1104; GCN-DAG: v_accvgpr_read_b32
1105; GCN-DAG: v_accvgpr_read_b32
1106; GCN-DAG: v_accvgpr_read_b32
1107; GCN-DAG: v_accvgpr_read_b32
1108; GCN-DAG: v_accvgpr_read_b32
1109; GCN-DAG: v_accvgpr_read_b32
1110; GCN-DAG: v_accvgpr_read_b32
1111; GCN-DAG: v_accvgpr_read_b32
1112; GCN-DAG: v_accvgpr_read_b32
1113; GCN-DAG: v_accvgpr_read_b32
1114; GCN-DAG: v_accvgpr_read_b32
1115; GCN-DAG: v_accvgpr_read_b32
1116; GCN-DAG: v_accvgpr_read_b32
1117; GCN-DAG: v_accvgpr_read_b32
1118; GCN-DAG: v_accvgpr_read_b32
1119; GCN-DAG: global_store_dwordx4
1120; GCN-DAG: global_store_dwordx4
1121; GCN-DAG: global_store_dwordx4
1122; GCN-DAG: global_store_dwordx4
1123; GCN-DAG: global_store_dwordx4
1124; GCN-DAG: global_store_dwordx4
1125; GCN-DAG: global_store_dwordx4
1126; GCN-DAG: global_store_dwordx4
1127define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) {
1128bb:
1129  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
1130  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
1131  ret void
1132}
1133
1134; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm:
1135; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1136; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1137; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1138; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
1139; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
1140; GCN: v_accvgpr_read_b32
1141; GCN: v_accvgpr_read_b32
1142; GCN: v_accvgpr_read_b32
1143; GCN: v_accvgpr_read_b32
1144; GCN: global_store_dwordx4
; Non-splat constant accumulator, 4x4x1 f32: the accumulator is <1.0, 2.0, 1.0,
; 1.0>, so it is not a single repeated value. The checks above expect, for both
; run configurations, one v_accvgpr_write_b32 per element with the constant
; given inline (three of 1.0 and one of 2.0) and an AGPR accumulator source.
define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(<4 x float> addrspace(1)* %arg) {
bb:
  %acc.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
  store <4 x float> %acc.out, <4 x float> addrspace(1)* %arg
  ret void
}
1151
1152; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm:
1153; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1154; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0
1155; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1156; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1157; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1158; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1159; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1160; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1161; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1162; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1163; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1164; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1165; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1166; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1167; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1168; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1169; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
1170; GCN-DAG: v_accvgpr_read_b32
1171; GCN-DAG: v_accvgpr_read_b32
1172; GCN-DAG: v_accvgpr_read_b32
1173; GCN-DAG: v_accvgpr_read_b32
1174; GCN-DAG: v_accvgpr_read_b32
1175; GCN-DAG: v_accvgpr_read_b32
1176; GCN-DAG: v_accvgpr_read_b32
1177; GCN-DAG: v_accvgpr_read_b32
1178; GCN-DAG: v_accvgpr_read_b32
1179; GCN-DAG: v_accvgpr_read_b32
1180; GCN-DAG: v_accvgpr_read_b32
1181; GCN-DAG: v_accvgpr_read_b32
1182; GCN-DAG: v_accvgpr_read_b32
1183; GCN-DAG: v_accvgpr_read_b32
1184; GCN-DAG: v_accvgpr_read_b32
1185; GCN-DAG: v_accvgpr_read_b32
1186; GCN-DAG: global_store_dwordx4
1187; GCN-DAG: global_store_dwordx4
1188; GCN-DAG: global_store_dwordx4
1189; GCN-DAG: global_store_dwordx4
; Non-splat constant accumulator, 16x16x1 f32: the accumulator is fifteen
; elements of 1.0 followed by a final 2.0, so it is not a single repeated
; value. The checks above expect, for both run configurations, one
; v_accvgpr_write_b32 per element with the constant given inline and an AGPR
; accumulator source.
define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(<16 x float> addrspace(1)* %arg) {
bb:
  %acc.out = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
  store <16 x float> %acc.out, <16 x float> addrspace(1)* %arg
  ret void
}
1196
1197; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm:
1198; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1199; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0
1200; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1201; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1202; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1203; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1204; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1205; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1206; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1207; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1208; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1209; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1210; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1211; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1212; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1213; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1214; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1215; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1216; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1217; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1218; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1219; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1220; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1221; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1222; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1223; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1224; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1225; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1226; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1227; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1228; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1229; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0
1230; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
1231; GCN-DAG: v_accvgpr_read_b32
1232; GCN-DAG: v_accvgpr_read_b32
1233; GCN-DAG: v_accvgpr_read_b32
1234; GCN-DAG: v_accvgpr_read_b32
1235; GCN-DAG: v_accvgpr_read_b32
1236; GCN-DAG: v_accvgpr_read_b32
1237; GCN-DAG: v_accvgpr_read_b32
1238; GCN-DAG: v_accvgpr_read_b32
1239; GCN-DAG: v_accvgpr_read_b32
1240; GCN-DAG: v_accvgpr_read_b32
1241; GCN-DAG: v_accvgpr_read_b32
1242; GCN-DAG: v_accvgpr_read_b32
1243; GCN-DAG: v_accvgpr_read_b32
1244; GCN-DAG: v_accvgpr_read_b32
1245; GCN-DAG: v_accvgpr_read_b32
1246; GCN-DAG: v_accvgpr_read_b32
1247; GCN-DAG: v_accvgpr_read_b32
1248; GCN-DAG: v_accvgpr_read_b32
1249; GCN-DAG: v_accvgpr_read_b32
1250; GCN-DAG: v_accvgpr_read_b32
1251; GCN-DAG: v_accvgpr_read_b32
1252; GCN-DAG: v_accvgpr_read_b32
1253; GCN-DAG: v_accvgpr_read_b32
1254; GCN-DAG: v_accvgpr_read_b32
1255; GCN-DAG: v_accvgpr_read_b32
1256; GCN-DAG: v_accvgpr_read_b32
1257; GCN-DAG: v_accvgpr_read_b32
1258; GCN-DAG: v_accvgpr_read_b32
1259; GCN-DAG: v_accvgpr_read_b32
1260; GCN-DAG: v_accvgpr_read_b32
1261; GCN-DAG: v_accvgpr_read_b32
1262; GCN-DAG: v_accvgpr_read_b32
1263; GCN-DAG: global_store_dwordx4
1264; GCN-DAG: global_store_dwordx4
1265; GCN-DAG: global_store_dwordx4
1266; GCN-DAG: global_store_dwordx4
1267; GCN-DAG: global_store_dwordx4
1268; GCN-DAG: global_store_dwordx4
1269; GCN-DAG: global_store_dwordx4
1270; GCN-DAG: global_store_dwordx4
1271define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) {
1272bb:
1273  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
1274  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
1275  ret void
1276}
1277
1278; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat:
1279; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
1280; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
1281; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
1282; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
1283; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
1284; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
1285; GCN: v_accvgpr_read_b32
1286; GCN: v_accvgpr_read_b32
1287; GCN: v_accvgpr_read_b32
1288; GCN: v_accvgpr_read_b32
1289; GCN: global_store_dwordx4
; Literal splat accumulator, 4x4x1 f32: the accumulator is a four-element splat
; of 123.0, which the checks above expect to be materialized once with
; v_mov_b32 (0x42f60000) and then written to each AGPR from that one VGPR.
; The result is stored through a pointer indexed by the workitem id.
; NOTE(review): the %idx parameter is not referenced in the body.
define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg, i64 %idx) {
bb:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %out.ptr = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %lane
  %acc.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
  ; Store through the per-workitem pointer rather than directly to %arg; the
  ; direct-store form is @test_mfma_f32_4x4x1f32_lit_splat_bad_code.
  store <4 x float> %acc.out, <4 x float> addrspace(1)* %out.ptr
  ret void
}
1299
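; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 ends up placed
; in the middle of the expanded AGPR reg_sequence: the constant is written to
; a single AGPR and then broadcast to the other components by reading that
; AGPR back into VGPRs and writing those to the remaining AGPRs. The broadcast
; of the individual AGPR->AGPR components should avoid the intermediate AGPR
; case.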
1303; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
1304; GCN: v_mov_b32_e32 [[VTMP0:v[0-9]+]], 0x42f60000
1305; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[VTMP0]]
1306; GCN: s_nop 0
1307; GCN: v_accvgpr_read_b32 [[VTMP1:v[0-9]+]], [[AGPR]]
1308; GCN: v_accvgpr_read_b32 [[VTMP2:v[0-9]+]], [[AGPR]]
1309; GCN: v_accvgpr_read_b32 [[VTMP3:v[0-9]+]], [[AGPR]]
1310; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP1]]
1311; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP2]]
1312; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP3]]
1313; GCN: s_nop 0
1314; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
1315; GCN: v_accvgpr_read_b32
1316; GCN: v_accvgpr_read_b32
1317; GCN: v_accvgpr_read_b32
1318; GCN: v_accvgpr_read_b32
1319; GCN: global_store_dwordx4
; Literal splat accumulator, 4x4x1 f32, stored directly to %arg: same MFMA call
; and 123.0 splat as @test_mfma_f32_4x4x1f32_lit_splat, but the result is
; written to %arg itself. The checks above pin the currently generated
; sequence, including the AGPR read-back and the s_nop instructions.
define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(<4 x float> addrspace(1)* %arg) {
bb:
  ; NOTE(review): %lane and %unused.ptr have no users in this function. They
  ; are left in place so the test input is unchanged.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %unused.ptr = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %lane

  %acc.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
  store <4 x float> %acc.out, <4 x float> addrspace(1)* %arg
  ret void
}
1329
1330; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg:
1331; GCN-DAG:         v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
1332; GCN-DAG:         v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
1333; GCN-DAG:         global_load_dwordx4
1334; GCN-DAG:         global_load_dwordx4
1335; GCN-DAG:         global_load_dwordx4
1336; GCN-DAG:         global_load_dwordx4
1337; GCN-DAG:         global_load_dwordx4
1338; GCN-DAG:         global_load_dwordx4
1339; GCN-DAG:         global_load_dwordx4
1340; GCN-DAG:         global_load_dwordx4
1341; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1342; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1343; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1344; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1345; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1346; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1347; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1348; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1349; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1350; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1351; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1352; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1353; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1354; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1355; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1356; GCN-DAG:         v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
1357; GCN:             v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
1358; GCN-COUNT-32:    v_accvgpr_read_b32
1359; GCN-COUNT-8:     global_store_dwordx4
; Kernel for the vector-argument case: the <32 x float> accumulator is loaded
; from memory instead of being a constant, and the three trailing i32 operands
; are non-zero (1, 2, 3); the checks above expect them on the MFMA as
; cbsz:1 abid:2 blgp:3.
1360define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) {
1361bb:
  ; Index the <32 x float> array at %arg by the workitem id in x.
1362  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1363  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  ; The loaded vector is the accumulator input; the checks above expect it to
  ; arrive as eight dwordx4 loads followed by v_accvgpr_write_b32 copies.
1364  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
1365  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  ; The result is written back to the same element that was loaded.
1366  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
1367  ret void
1368}
1369