; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; Check that we do not copy agprs to vgprs and back inside the loop.

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; Final result should be read only once after the loop.

; GCN-COUNT-32: v_accvgpr_read_b32

; The accumulator phi starts at zeroinitializer. The single-block loop feeds
; the phi into the MFMA intrinsic and repeats until the counter reaches 16;
; the last MFMA result is then stored to %arg.
define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:

; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
; 3 vgprs are needed to avoid wait states between writes.
; Check that we do not use 32 temp sgprs as well.
41 42; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 43; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 44 45; GCN: [[LOOP:BB[0-9_]+]]: 46; GCN-NOT: v_accvgpr 47; GCN: v_mfma_f32_32x32x1f32 48; GCN-NOT: v_accvgpr 49; GCN: s_cbranch_scc1 [[LOOP]] 50 51; GCN-COUNT-32: v_accvgpr_read_b32 52 53define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) { 54entry: 55 br label %for.cond.preheader 56 57for.cond.preheader: 58 %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ] 59 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 60 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 61 %inc = add nuw nsw i32 %c, 1 62 %cc = icmp eq i32 %inc, 16 63 br i1 %cc, label %exit, label %for.cond.preheader 64 65exit: 66 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 67 ret void 68} 69 70; GCN-LABEL: {{^}}test_mfma_loop_non_splat: 71 72; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 73; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} 74; GCN-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 75 76; GCN: [[LOOP:BB[0-9_]+]]: 77; GCN-NOT: v_accvgpr 78; GCN: v_mfma_f32_32x32x1f32 79; GCN-NOT: v_accvgpr 80; GCN: s_cbranch_scc1 [[LOOP]] 81 82; GCN-COUNT-32: v_accvgpr_read_b32 83 84define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) { 85entry: 86 br label %for.cond.preheader 87 88for.cond.preheader: 89 %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 
0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ] 90 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 91 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 92 %inc = add nuw nsw i32 %c, 1 93 %cc = icmp eq i32 %inc, 16 94 br i1 %cc, label %exit, label %for.cond.preheader 95 96exit: 97 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 98 ret void 99} 100 101; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq: 102 103; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. 104; 3 vgprs are needed to avoid wait states between writes. 105 106; GCN: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000 107; GCN: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000 108; GCN: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000 109; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 110; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 111; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 112; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 113; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 114; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 115; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 116; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 117; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 118; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 119; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 120; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 121; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 122; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 123; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 124; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 125; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 126; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GCN-COUNT-32: v_accvgpr_read_b32

; The initial accumulator holds 32 distinct constants, 123.0 through 154.0 in
; steps of 1.0. The loop applies the MFMA intrinsic until the counter reaches
; 16 and the result is stored to %arg.
define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GCN-COUNT-32: v_accvgpr_read_b32

; Every element of the initial accumulator is the workitem id (x) bitcast to
; float, inserted one element at a time before the 16-iteration MFMA loop.
define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:

; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GCN-COUNT-32: v_accvgpr_read_b32

; Every element of the initial accumulator is the kernel argument %init,
; inserted one element at a time before the 16-iteration MFMA loop.
define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) {
entry:
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:

; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
; GCN-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GCN-COUNT-32: v_accvgpr_read_b32

; The initial accumulator is zeroinitializer with element 0 replaced by the
; workitem id (x) bitcast to float and element 1 replaced by the kernel
; argument %x; the remaining 30 elements stay zero.
define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GCN-COUNT-32: v_accvgpr_read_b32
; The initial accumulator is the result of an MFMA call on zeroinitializer
; issued in the entry block; that result feeds the loop phi directly.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
entry:
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GCN: v_mfma_f32_32x32x1f32

; Check that we are using only one tmp VGPR.

; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}

; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GCN-COUNT-32: v_accvgpr_read_b32
; Element 0 of a pre-loop MFMA result is extracted and inserted into all 32
; elements of the initial accumulator before the 16-iteration MFMA loop.
define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) {
entry:
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  %init = extractelement <32 x float> %mai.0, i32 0
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; Check that we do not copy agprs to vgprs and back in an outer loop.

; GCN: [[OUTER_LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: [[INNER_LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[INNER_LOOP]]
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[OUTER_LOOP]]

; Final result should be read only once after the loop.

; GCN-COUNT-32: v_accvgpr_read_b32

; Two nested loops, each exiting when its counter reaches 16. The outer phi
; starts at zeroinitializer and is fed by the inner loop's MFMA result; the
; inner phi starts from the outer phi. The final MFMA result is stored to %arg.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
  br label %inner.for.cond.preheader

inner.for.cond.preheader:
  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader

inner.exit:
  %inc.0 = add nuw nsw i32 %c.0, 1
  %cc.0 = icmp eq i32 %inc.0, 16
  br i1 %cc.0, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; Intrinsics used by the kernels in this file.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()