;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI

; These tests pin down where WQM (whole quad mode, entered with s_wqm_b64) is
; switched on and off in amdgpu_ps functions.
;
; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
main_body:
  ; Load a texel with the image load intrinsic and write it straight back with
  ; the image store intrinsic; no sample instruction is involved.
  %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by image samples and left untouched for loads...
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: _load_dword v0,
define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
main_body:
  ; Component 0 of the sample result, reinterpreted as an i32, indexes the
  ; global load whose value is returned.
  %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
  %c.3 = extractelement <4 x i32> %c.2, i32 0
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
  %data = load float, float addrspace(1)* %gep
  ret float %data
}

; ... but disabled for stores (and, in this simple case, not re-enabled).
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
main_body:
  ; Component 0 of the sample (as an i32) is the store index and component 1
  ; is the value stored; the full sample result is also returned.
  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
  %wr = extractelement <4 x float> %tex, i32 1
  store float %wr, float addrspace(1)* %gep
  ret <4 x float> %tex
}

; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  ; The product %c.1 is both the store index and the sample coordinate, and
  ; the store sits between the multiply and the sample.
  %c.1 = mul i32 %c, %d
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
  store float %data, float addrspace(1)* %gep
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
main_body:
  ; %z == 0 selects IF (image sample); any other value selects ELSE (store).
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %data.if = extractelement <4 x float> %tex, i32 0
  br label %END

ELSE:
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
  store float %data, float addrspace(1)* %gep
  br label %END

END:
  ; Return the sampled component from IF, or the unmodified %data from ELSE.
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: ; BB#3: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
main_body:
  ; %z == 0 selects ELSE (store); any other value selects IF (image sample).
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %data.if = extractelement <4 x float> %tex, i32 0
  br label %END

ELSE:
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
  store float %data, float addrspace(1)* %gep
  br label %END

END:
  ; Return the sampled component from IF, or the unmodified %data from ELSE.
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  ; First store: element 0 of %data written at index element 0 of %idx.
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
  %data.1 = extractelement <2 x float> %data, i32 0
  store float %data.1, float addrspace(1)* %gep.1

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
  %z = load float, float addrspace(1)* %gep.2

  ; Second store: element 1 of %data written at index element 2 of %idx.
  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
  %data.3 = extractelement <2 x float> %data, i32 1
  store float %data.3, float addrspace(1)* %gep.3

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  ; The sample coordinate is chosen by the branch, so the branch condition
  ; feeds the image sample.
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: load
;CHECK: store
;CHECK: v_cmp
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  ; The only sample happens up front, before any store, load or branch.
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = extractelement <4 x float> %tex, i32 0

  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
  %data.1 = extractelement <2 x float> %data, i32 0
  store float %data.1, float addrspace(1)* %gep.1

  ; This load still decides the branch below, but no sample follows it.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
  %z = load float, float addrspace(1)* %gep.2

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
  %data.3 = extractelement <2 x float> %data, i32 1
  store float %data.3, float addrspace(1)* %gep.3

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %tex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %tex.1, 4.0
  br label %END

END:
  ; The branch only picks the multiplier applied to the already-sampled value.
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: load
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
main_body:
  ; %y == 0 runs the IF block first; otherwise control goes straight to the
  ; sample in the END block.
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  ; Copy the float at %ptr[0] to %ptr[1]; the store is the last instruction
  ; before the block terminator.
  %data = load float, float addrspace(1)* %ptr
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
  store float %data, float addrspace(1)* %gep
  br label %END

END:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;SI: buffer_store_dword
;VI: flat_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
main_body:
  ; Sequence under test: sample, store, kill, store, sample.
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
  %data.0 = extractelement <2 x float> %data, i32 0
  store float %data.0, float addrspace(1)* %gep.0

  ; The kill sits between the two stores and is followed by another sample.
  call void @llvm.AMDGPU.kill(float %z)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
  %data.1 = extractelement <2 x float> %data, i32 1
  store float %data.1, float addrspace(1)* %gep.1

  ; The second sample's result is added to the first and the sum is returned.
  %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %out = fadd <4 x float> %tex, %tex2

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; SI: buffer_store_dword
; VI: flat_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
main_body:
  ; Sequence under test: sample, store, kill. Nothing after the kill samples
  ; again, and %coord2 is unused.
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
  store float %data, float addrspace(1)* %gep

  call void @llvm.AMDGPU.kill(float %z)

  ret <4 x float> %tex
}

; Check prolog shaders.
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
main_body:
  ; NOTE(review): there is no sample here; WQM is presumably requested by
  ; attribute group #4 ("amdgpu-ps-wqm-outputs") - confirm against the backend.
  %s = fadd float %a, %b
  ret float %s
}

; Image store/load intrinsics called by test1.
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1

declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2

; Image sample intrinsics, with a scalar i32 or a <4 x i32> coordinate operand.
declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3

declare void @llvm.AMDGPU.kill(float)
; llvm.SI.export is declared but not called by any function visible in this
; file.
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
; Attribute group #4 is used only by test_prolog_1.
attributes #4 = { "amdgpu-ps-wqm-outputs" }