; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s

; The softwqm intrinsic on its own must not switch the shader into WQM:
; no s_wqm_b64 is allowed ahead of the first load.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %lhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  %result = call float @llvm.amdgcn.softwqm.f32(float %sum)
  ret float %result
}

; Same expectation for the integer form of the intrinsic: the float sum is
; bitcast to i32, passed through softwqm.i32, and bitcast back to float.
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %lhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  %sum.bits = bitcast float %sum to i32
  %soft.bits = call i32 @llvm.amdgcn.softwqm.i32(i32 %sum.bits)
  %result = bitcast i32 %soft.bits to float
  ret float %result
}

; Make sure the transition from WQM to Exact to softwqm does not trigger WQM.
;
; A buffer store sits between the loads and the add that feeds softwqm.
; s_wqm_b64 must be absent both before the first load and between the store
; and the final add.
;
;CHECK-LABEL: {{^}}test_softwqm1:
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: buffer_store_dword
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: v_add_f32_e32
define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src0, %src1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %out = fadd float %temp, %temp
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}

; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
;
; Here the stored value goes through the wqm intrinsic, so the expected output
; saves exec, issues s_wqm_b64 before the loads, and masks exec back with the
; saved value before the store.
;
;CHECK-LABEL: {{^}}test_softwqm2:
;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: v_add_f32_e32
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: buffer_store_dword
define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src0, %src1
  %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %out = fadd float %temp, %temp
  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  ret float %out.0
}

; Make sure the transition from Exact to WWM then softwqm does not trigger WQM.
;
; The expected output saves exec with s_or_saveexec_b64 ..., -1 ahead of each
; load and restores it afterwards; s_wqm_b64 must not appear after the final
; restore.
;
;CHECK-LABEL: {{^}}test_wwm1:
;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: s_mov_b64 exec, [[ORIG0]]
;CHECK: buffer_store_dword
;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1
;CHECK: buffer_load_dword
;CHECK: v_add_f32_e32
;CHECK: s_mov_b64 exec, [[ORIG1]]
;CHECK-NOT: s_wqm_b64
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %first = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %first, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %second = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %first, %second
  %sum.wwm = call float @llvm.amdgcn.wwm.f32(float %sum)
  %doubled = fadd float %sum.wwm, %sum.wwm
  %result = call float @llvm.amdgcn.softwqm.f32(float %doubled)
  ret float %result
}

; Check that softwqm on one case of branch does not trigger WQM for shader.
;
; Only the IF arm uses softwqm; the ELSE arm just stores %data. The expected
; output has no s_wqm_b64 ahead of %ELSE, and lists %ELSE (with the store)
; before %IF (with both loads).
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NOT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: store
;CHECK: %IF
;CHECK: buffer_load
;CHECK: buffer_load
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
main_body:
  %z.is.zero = icmp eq i32 %z, 0
  br i1 %z.is.zero, label %IF, label %ELSE

IF:
  %lhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  %if.val = call float @llvm.amdgcn.softwqm.f32(float %sum)
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %merged = phi float [ %if.val, %IF ], [ %data, %ELSE ]
  ret float %merged
}

; Check that softwqm on one case of branch is treated as WQM in WQM shader.
;
; Two dependent image samples run in main_body before the branch. The expected
; output saves exec and issues s_wqm_b64 at the very top of main_body, runs the
; ELSE store under s_and_saveexec_b64 with the saved mask, and performs no exec
; masking in %IF ahead of its loads.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK-NOT: s_and_saveexec_b64
;CHECK-NOT: s_and_b64 exec
;CHECK: buffer_load
;CHECK: buffer_load
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
main_body:
  %coord = bitcast i32 %c to float
  %sample0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %sample0.x = extractelement <4 x float> %sample0, i32 0
  %sample1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %sample0.x, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
  %sample1.x = extractelement <4 x float> %sample1, i32 0

  %z.is.zero = icmp eq i32 %z, 0
  br i1 %z.is.zero, label %IF, label %ELSE

IF:
  %lhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  %if.val = call float @llvm.amdgcn.softwqm.f32(float %sum)
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %sample1.x, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %merged = phi float [ %if.val, %IF ], [ %data, %ELSE ]
  ret float %merged
}

; Intrinsic declarations referenced by the tests in this file.
declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare float @llvm.amdgcn.softwqm.f32(float) #3
declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3

; Attribute groups used by the declarations above.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }