; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC %s
; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC %s

; Codegen checks for the llvm.amdgcn.mfma.* intrinsics on gfx908.
;
; Every kernel below follows the same shape: load the accumulator vector from
; %arg, call one MFMA intrinsic with the trailing immediates (1, 2, 3), and
; store the result back to %arg. The checks expect
;   - the accumulator to be copied into AGPRs with v_accvgpr_write_b32,
;   - a single v_mfma_* instruction carrying "cbsz:1 abid:2 blgp:3",
;   - the result to be copied back with v_accvgpr_read_b32 and written with
;     global_store_dwordx4.
;
; Both llc runs share the common check prefix; the second run additionally
; disables the mfma-inline-literal-bug subtarget feature. The two per-run
; prefixes (NOLIT-SRCC / LIT-SRCC) are not referenced by any check in this
; part of the file - presumably later tests use them; TODO confirm.

; Intrinsic declarations. Operand order: A, B, accumulator, then three i32
; immediates.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32)
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32)
declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

; f32 variants: A and B are the float constants 1.0 and 2.0, which the checks
; expect to be moved into VGPRs before the MFMA.

; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_load_dwordx16
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN: s_load_dwordx16
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN: s_load_dwordx4
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN: v_accvgpr_read_b32
; GCN: v_accvgpr_read_b32
; GCN: v_accvgpr_read_b32
; GCN: v_accvgpr_read_b32
; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN: s_load_dwordx16
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_32x32x2f32(<16 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN: s_load_dwordx4
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_16x16x4f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_16x16x4f32(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; f16 variants: A and B are <4 x half> values loaded from %c and the element
; after it; the checks expect them as VGPR ranges on the MFMA.

; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16:
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_load_dwordx16
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
bb:
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
  %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
  %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16:
; GCN: s_load_dwordx16
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_16x16x4f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
  %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_4x4x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_4x4x4f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
  %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16:
; GCN: s_load_dwordx16
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_32x32x8f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
  %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx4
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_f32_16x16x16f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_f32_16x16x16f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %c.1 = load <4 x half>, <4 x half> addrspace(1)* %c
  %c2p = getelementptr <4 x half>, <4 x half> addrspace(1)* %c, i64 1
  %c.2 = load <4 x half>, <4 x half> addrspace(1)* %c2p
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; i8 variants: A and B are the i32 constants 1 and 2, which the checks expect
; to be moved into VGPRs before the MFMA.

; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_load_dwordx16
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_i32_32x32x4i8(<32 x i32> addrspace(1)* %arg) {
bb:
  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3)
  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: s_load_dwordx16
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_i32_16x16x4i8(<16 x i32> addrspace(1)* %arg) {
bb:
  %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
  %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: s_load_dwordx4
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_4x4x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN: v_accvgpr_read_b32
; GCN: v_accvgpr_read_b32
; GCN: v_accvgpr_read_b32
; GCN: v_accvgpr_read_b32
; GCN: global_store_dwordx4
define amdgpu_kernel void @test_mfma_i32_4x4x4i8(<4 x i32> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: s_load_dwordx16
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) {
bb:
  %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg
  %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
  store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: s_load_dwordx4
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_16x16x16i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: v_accvgpr_read_b32
; GCN-DAG: global_store_dwordx4
define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_load_dwordx16
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
;
GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 750; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 751; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 752; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 753; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 754; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 755; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 756; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 757; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 758; GCN-DAG: v_accvgpr_read_b32 759; GCN-DAG: v_accvgpr_read_b32 760; GCN-DAG: v_accvgpr_read_b32 761; GCN-DAG: v_accvgpr_read_b32 762; GCN-DAG: v_accvgpr_read_b32 763; GCN-DAG: v_accvgpr_read_b32 764; GCN-DAG: v_accvgpr_read_b32 765; GCN-DAG: v_accvgpr_read_b32 766; GCN-DAG: v_accvgpr_read_b32 767; GCN-DAG: v_accvgpr_read_b32 768; GCN-DAG: v_accvgpr_read_b32 769; GCN-DAG: v_accvgpr_read_b32 770; GCN-DAG: v_accvgpr_read_b32 771; GCN-DAG: v_accvgpr_read_b32 772; GCN-DAG: v_accvgpr_read_b32 773; GCN-DAG: v_accvgpr_read_b32 774; GCN-DAG: v_accvgpr_read_b32 775; GCN-DAG: v_accvgpr_read_b32 776; GCN-DAG: v_accvgpr_read_b32 777; GCN-DAG: v_accvgpr_read_b32 778; GCN-DAG: v_accvgpr_read_b32 779; GCN-DAG: v_accvgpr_read_b32 780; GCN-DAG: v_accvgpr_read_b32 781; GCN-DAG: v_accvgpr_read_b32 782; GCN-DAG: v_accvgpr_read_b32 783; GCN-DAG: v_accvgpr_read_b32 784; GCN-DAG: v_accvgpr_read_b32 785; GCN-DAG: v_accvgpr_read_b32 786; GCN-DAG: v_accvgpr_read_b32 787; GCN-DAG: v_accvgpr_read_b32 788; GCN-DAG: v_accvgpr_read_b32 789; GCN-DAG: v_accvgpr_read_b32 790; GCN-DAG: global_store_dwordx4 791; GCN-DAG: global_store_dwordx4 792; GCN-DAG: global_store_dwordx4 793; GCN-DAG: global_store_dwordx4 794; GCN-DAG: global_store_dwordx4 795; GCN-DAG: global_store_dwordx4 796; GCN-DAG: global_store_dwordx4 797; GCN-DAG: global_store_dwordx4 798define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> 
addrspace(1)* %arg) { 799bb: 800 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg 801 %a = bitcast i32 1 to <2 x i16> 802 %b = bitcast i32 2 to <2 x i16> 803 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3) 804 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 805 ret void 806} 807 808; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16: 809; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 810; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 811; GCN: s_load_dwordx16 812; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 813; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 814; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 815; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 816; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 817; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 818; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 819; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 820; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 821; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 822; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 823; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 824; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 825; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 826; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 827; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 828; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 829; GCN-DAG: v_accvgpr_read_b32 830; GCN-DAG: v_accvgpr_read_b32 831; GCN-DAG: v_accvgpr_read_b32 832; GCN-DAG: v_accvgpr_read_b32 833; GCN-DAG: v_accvgpr_read_b32 834; GCN-DAG: v_accvgpr_read_b32 835; GCN-DAG: v_accvgpr_read_b32 836; GCN-DAG: v_accvgpr_read_b32 837; GCN-DAG: v_accvgpr_read_b32 838; GCN-DAG: v_accvgpr_read_b32 839; GCN-DAG: v_accvgpr_read_b32 840; GCN-DAG: v_accvgpr_read_b32 841; GCN-DAG: v_accvgpr_read_b32 842; GCN-DAG: 
v_accvgpr_read_b32 843; GCN-DAG: v_accvgpr_read_b32 844; GCN-DAG: v_accvgpr_read_b32 845; GCN-DAG: global_store_dwordx4 846; GCN-DAG: global_store_dwordx4 847; GCN-DAG: global_store_dwordx4 848; GCN-DAG: global_store_dwordx4 849define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) { 850bb: 851 %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg 852 %a = bitcast i32 1 to <2 x i16> 853 %b = bitcast i32 2 to <2 x i16> 854 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) 855 store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg 856 ret void 857} 858 859; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16: 860; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 861; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 862; GCN: s_load_dwordx4 863; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 864; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 865; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 866; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 867; GCN: v_mfma_f32_4x4x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 868; GCN-DAG: v_accvgpr_read_b32 869; GCN-DAG: v_accvgpr_read_b32 870; GCN-DAG: v_accvgpr_read_b32 871; GCN-DAG: v_accvgpr_read_b32 872; GCN-DAG: global_store_dwordx4 873define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) { 874bb: 875 %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg 876 %a = bitcast i32 1 to <2 x i16> 877 %b = bitcast i32 2 to <2 x i16> 878 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) 879 store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg 880 ret void 881} 882 883; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16: 884; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 885; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 886; GCN: s_load_dwordx16 887; GCN: v_accvgpr_write_b32 
a{{[0-9]+}}, v{{[0-9]+}} 888; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 889; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 890; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 891; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 892; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 893; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 894; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 895; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 896; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 897; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 898; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 899; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 900; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 901; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 902; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 903; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 904; GCN-DAG: v_accvgpr_read_b32 905; GCN-DAG: v_accvgpr_read_b32 906; GCN-DAG: v_accvgpr_read_b32 907; GCN-DAG: v_accvgpr_read_b32 908; GCN-DAG: v_accvgpr_read_b32 909; GCN-DAG: v_accvgpr_read_b32 910; GCN-DAG: v_accvgpr_read_b32 911; GCN-DAG: v_accvgpr_read_b32 912; GCN-DAG: v_accvgpr_read_b32 913; GCN-DAG: v_accvgpr_read_b32 914; GCN-DAG: v_accvgpr_read_b32 915; GCN-DAG: v_accvgpr_read_b32 916; GCN-DAG: v_accvgpr_read_b32 917; GCN-DAG: v_accvgpr_read_b32 918; GCN-DAG: v_accvgpr_read_b32 919; GCN-DAG: v_accvgpr_read_b32 920; GCN-DAG: global_store_dwordx4 921; GCN-DAG: global_store_dwordx4 922; GCN-DAG: global_store_dwordx4 923; GCN-DAG: global_store_dwordx4 924define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) { 925bb: 926 %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg 927 %a = bitcast i32 1 to <2 x i16> 928 %b = bitcast i32 2 to <2 x i16> 929 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, 
i32 3) 930 store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg 931 ret void 932} 933 934; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16: 935; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 936; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 937; GCN: s_load_dwordx4 938; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 939; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 940; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 941; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 942; GCN: v_mfma_f32_16x16x8bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 943; GCN-DAG: v_accvgpr_read_b32 944; GCN-DAG: v_accvgpr_read_b32 945; GCN-DAG: v_accvgpr_read_b32 946; GCN-DAG: v_accvgpr_read_b32 947; GCN-DAG: global_store_dwordx4 948define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) { 949bb: 950 %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg 951 %a = bitcast i32 1 to <2 x i16> 952 %b = bitcast i32 2 to <2 x i16> 953 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) 954 store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg 955 ret void 956} 957 958; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc: 959; GCN: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 960; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 961define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) { 962bb: 963 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg 964 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) 965 %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) 966 store <32 x float> %mai.2, <32 x float> addrspace(1)* %arg 967 ret void 968} 969 970; 
GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc: 971; GCN: v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 972; GCN-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 973define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(<16 x float> addrspace(1)* %arg) { 974bb: 975 %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg 976 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) 977 %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) 978 store <16 x float> %mai.2, <16 x float> addrspace(1)* %arg 979 ret void 980} 981 982; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc: 983; GCN: v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 984; GCN-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] 985define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(<4 x float> addrspace(1)* %arg) { 986bb: 987 %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg 988 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0) 989 %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0) 990 store <4 x float> %mai.2, <4 x float> addrspace(1)* %arg 991 ret void 992} 993 994; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat: 995; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 996; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 997; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 998; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 999; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1000; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1001; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], 
[[TWO]], a[{{[0-9:]+}}] 1002; LIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 1003; GCN: v_accvgpr_read_b32 1004; GCN: v_accvgpr_read_b32 1005; GCN: v_accvgpr_read_b32 1006; GCN: v_accvgpr_read_b32 1007; GCN: global_store_dwordx4 1008define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(<4 x float> addrspace(1)* %arg) { 1009bb: 1010 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) 1011 store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg 1012 ret void 1013} 1014 1015; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat: 1016; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 1017; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 1018; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1019; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] 1020; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 1021; GCN-DAG: v_accvgpr_read_b32 1022; GCN-DAG: v_accvgpr_read_b32 1023; GCN-DAG: v_accvgpr_read_b32 1024; GCN-DAG: v_accvgpr_read_b32 1025; GCN-DAG: v_accvgpr_read_b32 1026; GCN-DAG: v_accvgpr_read_b32 1027; GCN-DAG: v_accvgpr_read_b32 1028; GCN-DAG: v_accvgpr_read_b32 1029; GCN-DAG: v_accvgpr_read_b32 1030; GCN-DAG: v_accvgpr_read_b32 1031; GCN-DAG: v_accvgpr_read_b32 1032; GCN-DAG: v_accvgpr_read_b32 1033; GCN-DAG: v_accvgpr_read_b32 1034; GCN-DAG: v_accvgpr_read_b32 1035; GCN-DAG: v_accvgpr_read_b32 1036; GCN-DAG: v_accvgpr_read_b32 1037; GCN-DAG: global_store_dwordx4 1038; GCN-DAG: global_store_dwordx4 1039; GCN-DAG: global_store_dwordx4 1040; GCN-DAG: global_store_dwordx4 1041define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(<16 x float> addrspace(1)* %arg) { 1042bb: 1043 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 
1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) 1044 store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg 1045 ret void 1046} 1047 1048; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat: 1049; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 1050; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 1051; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1052; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}] 1053; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 1054; GCN-DAG: v_accvgpr_read_b32 1055; GCN-DAG: v_accvgpr_read_b32 1056; GCN-DAG: v_accvgpr_read_b32 1057; GCN-DAG: v_accvgpr_read_b32 1058; GCN-DAG: v_accvgpr_read_b32 1059; GCN-DAG: v_accvgpr_read_b32 1060; GCN-DAG: v_accvgpr_read_b32 1061; GCN-DAG: v_accvgpr_read_b32 1062; GCN-DAG: v_accvgpr_read_b32 1063; GCN-DAG: v_accvgpr_read_b32 1064; GCN-DAG: v_accvgpr_read_b32 1065; GCN-DAG: v_accvgpr_read_b32 1066; GCN-DAG: v_accvgpr_read_b32 1067; GCN-DAG: v_accvgpr_read_b32 1068; GCN-DAG: v_accvgpr_read_b32 1069; GCN-DAG: v_accvgpr_read_b32 1070; GCN-DAG: global_store_dwordx4 1071; GCN-DAG: global_store_dwordx4 1072; GCN-DAG: global_store_dwordx4 1073; GCN-DAG: global_store_dwordx4 1074define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(<16 x float> addrspace(1)* %arg) { 1075bb: 1076 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) 1077 store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg 1078 ret void 1079} 1080 1081; GCN-LABEL: 
{{^}}test_mfma_f32_32x32x1f32_imm_splat: 1082; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 1083; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 1084; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1085; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] 1086; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 1087; GCN-DAG: v_accvgpr_read_b32 1088; GCN-DAG: v_accvgpr_read_b32 1089; GCN-DAG: v_accvgpr_read_b32 1090; GCN-DAG: v_accvgpr_read_b32 1091; GCN-DAG: v_accvgpr_read_b32 1092; GCN-DAG: v_accvgpr_read_b32 1093; GCN-DAG: v_accvgpr_read_b32 1094; GCN-DAG: v_accvgpr_read_b32 1095; GCN-DAG: v_accvgpr_read_b32 1096; GCN-DAG: v_accvgpr_read_b32 1097; GCN-DAG: v_accvgpr_read_b32 1098; GCN-DAG: v_accvgpr_read_b32 1099; GCN-DAG: v_accvgpr_read_b32 1100; GCN-DAG: v_accvgpr_read_b32 1101; GCN-DAG: v_accvgpr_read_b32 1102; GCN-DAG: v_accvgpr_read_b32 1103; GCN-DAG: v_accvgpr_read_b32 1104; GCN-DAG: v_accvgpr_read_b32 1105; GCN-DAG: v_accvgpr_read_b32 1106; GCN-DAG: v_accvgpr_read_b32 1107; GCN-DAG: v_accvgpr_read_b32 1108; GCN-DAG: v_accvgpr_read_b32 1109; GCN-DAG: v_accvgpr_read_b32 1110; GCN-DAG: v_accvgpr_read_b32 1111; GCN-DAG: v_accvgpr_read_b32 1112; GCN-DAG: v_accvgpr_read_b32 1113; GCN-DAG: v_accvgpr_read_b32 1114; GCN-DAG: v_accvgpr_read_b32 1115; GCN-DAG: v_accvgpr_read_b32 1116; GCN-DAG: v_accvgpr_read_b32 1117; GCN-DAG: v_accvgpr_read_b32 1118; GCN-DAG: v_accvgpr_read_b32 1119; GCN-DAG: global_store_dwordx4 1120; GCN-DAG: global_store_dwordx4 1121; GCN-DAG: global_store_dwordx4 1122; GCN-DAG: global_store_dwordx4 1123; GCN-DAG: global_store_dwordx4 1124; GCN-DAG: global_store_dwordx4 1125; GCN-DAG: global_store_dwordx4 1126; GCN-DAG: global_store_dwordx4 1127define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) { 1128bb: 1129 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, 
float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0) 1130 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 1131 ret void 1132} 1133 1134; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm: 1135; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1136; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1137; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1138; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 1139; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 1140; GCN: v_accvgpr_read_b32 1141; GCN: v_accvgpr_read_b32 1142; GCN: v_accvgpr_read_b32 1143; GCN: v_accvgpr_read_b32 1144; GCN: global_store_dwordx4 1145define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(<4 x float> addrspace(1)* %arg) { 1146bb: 1147 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0) 1148 store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg 1149 ret void 1150} 1151 1152; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm: 1153; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1154; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 1155; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1156; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1157; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1158; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1159; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1160; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1161; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1162; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1163; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1164; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1165; GCN-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1166; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1167; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1168; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1169; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 1170; GCN-DAG: v_accvgpr_read_b32 1171; GCN-DAG: v_accvgpr_read_b32 1172; GCN-DAG: v_accvgpr_read_b32 1173; GCN-DAG: v_accvgpr_read_b32 1174; GCN-DAG: v_accvgpr_read_b32 1175; GCN-DAG: v_accvgpr_read_b32 1176; GCN-DAG: v_accvgpr_read_b32 1177; GCN-DAG: v_accvgpr_read_b32 1178; GCN-DAG: v_accvgpr_read_b32 1179; GCN-DAG: v_accvgpr_read_b32 1180; GCN-DAG: v_accvgpr_read_b32 1181; GCN-DAG: v_accvgpr_read_b32 1182; GCN-DAG: v_accvgpr_read_b32 1183; GCN-DAG: v_accvgpr_read_b32 1184; GCN-DAG: v_accvgpr_read_b32 1185; GCN-DAG: v_accvgpr_read_b32 1186; GCN-DAG: global_store_dwordx4 1187; GCN-DAG: global_store_dwordx4 1188; GCN-DAG: global_store_dwordx4 1189; GCN-DAG: global_store_dwordx4 1190define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(<16 x float> addrspace(1)* %arg) { 1191bb: 1192 %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0) 1193 store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg 1194 ret void 1195} 1196 1197; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm: 1198; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1199; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 1200; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1201; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1202; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1203; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1204; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1205; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1206; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1207; GCN-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, 0 1208; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1209; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1210; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1211; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1212; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1213; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1214; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1215; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1216; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1217; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1218; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1219; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1220; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1221; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1222; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1223; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1224; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1225; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1226; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1227; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1228; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1229; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 1230; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 1231; GCN-DAG: v_accvgpr_read_b32 1232; GCN-DAG: v_accvgpr_read_b32 1233; GCN-DAG: v_accvgpr_read_b32 1234; GCN-DAG: v_accvgpr_read_b32 1235; GCN-DAG: v_accvgpr_read_b32 1236; GCN-DAG: v_accvgpr_read_b32 1237; GCN-DAG: v_accvgpr_read_b32 1238; GCN-DAG: v_accvgpr_read_b32 1239; GCN-DAG: v_accvgpr_read_b32 1240; GCN-DAG: v_accvgpr_read_b32 1241; GCN-DAG: v_accvgpr_read_b32 1242; GCN-DAG: v_accvgpr_read_b32 1243; GCN-DAG: v_accvgpr_read_b32 1244; GCN-DAG: v_accvgpr_read_b32 1245; GCN-DAG: v_accvgpr_read_b32 1246; GCN-DAG: v_accvgpr_read_b32 1247; GCN-DAG: v_accvgpr_read_b32 1248; GCN-DAG: v_accvgpr_read_b32 1249; GCN-DAG: v_accvgpr_read_b32 1250; GCN-DAG: v_accvgpr_read_b32 1251; GCN-DAG: v_accvgpr_read_b32 1252; GCN-DAG: v_accvgpr_read_b32 1253; GCN-DAG: 
v_accvgpr_read_b32 1254; GCN-DAG: v_accvgpr_read_b32 1255; GCN-DAG: v_accvgpr_read_b32 1256; GCN-DAG: v_accvgpr_read_b32 1257; GCN-DAG: v_accvgpr_read_b32 1258; GCN-DAG: v_accvgpr_read_b32 1259; GCN-DAG: v_accvgpr_read_b32 1260; GCN-DAG: v_accvgpr_read_b32 1261; GCN-DAG: v_accvgpr_read_b32 1262; GCN-DAG: v_accvgpr_read_b32 1263; GCN-DAG: global_store_dwordx4 1264; GCN-DAG: global_store_dwordx4 1265; GCN-DAG: global_store_dwordx4 1266; GCN-DAG: global_store_dwordx4 1267; GCN-DAG: global_store_dwordx4 1268; GCN-DAG: global_store_dwordx4 1269; GCN-DAG: global_store_dwordx4 1270; GCN-DAG: global_store_dwordx4 1271define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) { 1272bb: 1273 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0) 1274 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 1275 ret void 1276} 1277 1278; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat: 1279; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 1280; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 1281; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 1282; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 1283; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 1284; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 1285; GCN: v_accvgpr_read_b32 1286; GCN: v_accvgpr_read_b32 1287; GCN: v_accvgpr_read_b32 1288; GCN: v_accvgpr_read_b32 1289; GCN: global_store_dwordx4 1290define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg, i64 %idx) { 1291bb: 1292 %tid = call i32 
@llvm.amdgcn.workitem.id.x() 1293 %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid 1294 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0) 1295 ;store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg 1296 store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep 1297 ret void 1298} 1299 1300; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 is moved 1301; in the middle of the expanded agpr reg_sequence. The broadcast of 1302; the individual AGPR->AGPR components should avoid the intermediate AGPR case. 1303; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code: 1304; GCN: v_mov_b32_e32 [[VTMP0:v[0-9]+]], 0x42f60000 1305; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[VTMP0]] 1306; GCN: s_nop 0 1307; GCN: v_accvgpr_read_b32 [[VTMP1:v[0-9]+]], [[AGPR]] 1308; GCN: v_accvgpr_read_b32 [[VTMP2:v[0-9]+]], [[AGPR]] 1309; GCN: v_accvgpr_read_b32 [[VTMP3:v[0-9]+]], [[AGPR]] 1310; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP1]] 1311; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP2]] 1312; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP3]] 1313; GCN: s_nop 0 1314; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] 1315; GCN: v_accvgpr_read_b32 1316; GCN: v_accvgpr_read_b32 1317; GCN: v_accvgpr_read_b32 1318; GCN: v_accvgpr_read_b32 1319; GCN: global_store_dwordx4 1320define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(<4 x float> addrspace(1)* %arg) { 1321bb: 1322 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1323 %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid 1324 1325 %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0) 1326 store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg 1327 ret void 1328} 
1329 1330; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: 1331; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 1332; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 1333; GCN-DAG: global_load_dwordx4 1334; GCN-DAG: global_load_dwordx4 1335; GCN-DAG: global_load_dwordx4 1336; GCN-DAG: global_load_dwordx4 1337; GCN-DAG: global_load_dwordx4 1338; GCN-DAG: global_load_dwordx4 1339; GCN-DAG: global_load_dwordx4 1340; GCN-DAG: global_load_dwordx4 1341; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1342; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1343; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1344; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1345; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1346; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1347; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1348; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1349; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1350; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1351; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1352; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1353; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1354; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1355; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1356; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 1357; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 1358; GCN-COUNT-32: v_accvgpr_read_b32 1359; GCN-COUNT-8: global_store_dwordx4 1360define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) { 1361bb: 1362 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1363 %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid 1364 %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep 1365 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, 
<32 x float> %in.1, i32 1, i32 2, i32 3) 1366 store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep 1367 ret void 1368} 1369