1;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
2;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
3
4; Check that WQM isn't triggered by image load/store intrinsics.
5;
6;CHECK-LABEL: {{^}}test1:
7;CHECK-NOT: s_wqm
; Fixture: a pixel shader that uses only the amdgcn image load/store
; intrinsics (no sample intrinsic). It reads four floats at coordinate %c,
; writes the same value back at the same coordinate through the same
; resource descriptor, and returns the loaded value.
8define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
9main_body:
  ; Image load at %c from the resource described by %rsrc.
10  %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  ; Image store of the value just loaded, same coordinate and resource.
11  call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
12  ret <4 x float> %tex
13}
14
15; Check that WQM is triggered by image samples and left untouched for loads...
16;
17;CHECK-LABEL: {{^}}test2:
18;CHECK-NEXT: ; %main_body
19;CHECK-NEXT: s_wqm_b64 exec, exec
20;CHECK: image_sample
21;CHECK-NOT: exec
22;CHECK: _load_dword v0,
; Fixture: an image sample whose result is used to address a plain global
; load. Component 0 of the sampled vector (reinterpreted as an integer)
; indexes %ptr, and the float loaded from that address is returned.
; The function contains no store.
23define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
24main_body:
25  %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ; Reinterpret the sampled floats as integers and use lane 0 as the index.
26  %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
27  %c.3 = extractelement <4 x i32> %c.2, i32 0
28  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
  ; Global load from the sample-derived address; this is the return value.
29  %data = load float, float addrspace(1)* %gep
30  ret float %data
31}
32
33; ... but disabled for stores (and, in this simple case, not re-enabled).
34;
35;CHECK-LABEL: {{^}}test3:
36;CHECK-NEXT: ; %main_body
37;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
38;CHECK-NEXT: s_wqm_b64 exec, exec
39;CHECK: image_sample
40;CHECK: s_and_b64 exec, exec, [[ORIG]]
41;CHECK: store
42;CHECK-NOT: exec
43;CHECK: .size test3
; Fixture: an image sample followed by a global store that depends on it.
; Component 0 of the sample (reinterpreted as an integer) indexes %ptr,
; component 1 of the sample is the value stored there, and the whole
; sampled vector is returned. Nothing after the store is a sample.
44define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
45main_body:
46  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ; Store address: %ptr indexed by sample component 0 viewed as i32.
47  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
48  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
49  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
  ; Store data: sample component 1.
50  %wr = extractelement <4 x float> %tex, i32 1
51  store float %wr, float addrspace(1)* %gep
52  ret <4 x float> %tex
53}
54
55; Check that WQM is re-enabled when required.
56;
57;CHECK-LABEL: {{^}}test4:
58;CHECK-NEXT: ; %main_body
59;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
60;CHECK-NEXT: s_wqm_b64 exec, exec
61;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
62;CHECK: s_and_b64 exec, exec, [[ORIG]]
63;CHECK: store
64;CHECK: s_wqm_b64 exec, exec
65;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
; Fixture: a global store placed between a computation and the image sample
; that consumes it. The product %c * %d is used both as the index of the
; store into %ptr and as the sample coordinate; the sampled vector is
; returned.
66define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
67main_body:
  ; Shared value: feeds the store address below and the sample coordinate.
68  %c.1 = mul i32 %c, %d
69  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
70  store float %data, float addrspace(1)* %gep
  ; The sample comes after the store in program order.
71  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
72  ret <4 x float> %tex
73}
74
75; Check a case of one branch of an if-else requiring WQM, the other requiring
76; exact.
77;
78; Note: In this particular case, the save-and-restore could be avoided if the
79; analysis understood that the two branches of the if-else are mutually
80; exclusive.
81;
82;CHECK-LABEL: {{^}}test_control_flow_0:
83;CHECK-NEXT: ; %main_body
84;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
85;CHECK-NEXT: s_wqm_b64 exec, exec
86;CHECK: %ELSE
87;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
88;CHECK: store
89;CHECK: s_mov_b64 exec, [[SAVED]]
90;CHECK: %IF
91;CHECK: image_sample
; Fixture: an if/else where one arm samples and the other arm stores.
; When %z == 0 the IF arm runs and yields component 0 of an image sample at
; %c; otherwise the ELSE arm stores %data to %ptr[%c] and yields %data.
; The selected float is returned from END.
92define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
93main_body:
94  %cmp = icmp eq i32 %z, 0
  ; True edge goes to IF (sample), false edge goes to ELSE (store).
95  br i1 %cmp, label %IF, label %ELSE
96
97IF:
98  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
99  %data.if = extractelement <4 x float> %tex, i32 0
100  br label %END
101
102ELSE:
103  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
104  store float %data, float addrspace(1)* %gep
105  br label %END
106
107END:
  ; Merge: sampled component from IF, or the unmodified %data from ELSE.
108  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
109  ret float %r
110}
111
112; Reverse branch order compared to the previous test.
113;
114;CHECK-LABEL: {{^}}test_control_flow_1:
115;CHECK-NEXT: ; %main_body
116;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
117;CHECK-NEXT: s_wqm_b64 exec, exec
118;CHECK: %IF
119;CHECK: image_sample
120;CHECK: %Flow
121;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
122;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
123;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
124;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
125;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
126;CHECK-NEXT: ; BB#3: ; %ELSE
127;CHECK: store_dword
128;CHECK: [[END_BB]]: ; %END
129;CHECK: s_or_b64 exec, exec,
130;CHECK: v_mov_b32_e32 v0
131;CHECK: ; return
; Fixture: same IF (sample) and ELSE (store) arms as test_control_flow_0, but
; with the branch targets swapped: %z == 0 now selects the ELSE arm and any
; other value selects the IF arm. The selected float is returned from END.
132define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
133main_body:
134  %cmp = icmp eq i32 %z, 0
  ; True edge goes to ELSE (store), false edge goes to IF (sample).
135  br i1 %cmp, label %ELSE, label %IF
136
137IF:
138  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
139  %data.if = extractelement <4 x float> %tex, i32 0
140  br label %END
141
142ELSE:
143  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
144  store float %data, float addrspace(1)* %gep
145  br label %END
146
147END:
  ; Merge: sampled component from IF, or the unmodified %data from ELSE.
148  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
149  ret float %r
150}
151
152; Check that branch conditions are properly marked as needing WQM...
153;
154;CHECK-LABEL: {{^}}test_control_flow_2:
155;CHECK-NEXT: ; %main_body
156;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
157;CHECK-NEXT: s_wqm_b64 exec, exec
158;CHECK: s_and_b64 exec, exec, [[ORIG]]
159;CHECK: store
160;CHECK: s_wqm_b64 exec, exec
161;CHECK: load
162;CHECK: s_and_b64 exec, exec, [[ORIG]]
163;CHECK: store
164;CHECK: s_wqm_b64 exec, exec
165;CHECK: v_cmp
; Fixture: store, load, store in sequence, then a branch on the loaded value
; whose two arms compute different coordinates for a single image sample in
; END. The three elements of %idx index %ptr for the store, the load and
; the second store respectively; %data supplies the two stored values.
166define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
167main_body:
  ; First store: %data[0] to %ptr[%idx[0]].
168  %idx.1 = extractelement <3 x i32> %idx, i32 0
169  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
170  %data.1 = extractelement <2 x float> %data, i32 0
171  store float %data.1, float addrspace(1)* %gep.1
172
173  ; The load that determines the branch (and should therefore be WQM) is
174  ; surrounded by stores that require disabled WQM.
175  %idx.2 = extractelement <3 x i32> %idx, i32 1
176  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
177  %z = load float, float addrspace(1)* %gep.2
178
  ; Second store: %data[1] to %ptr[%idx[2]].
179  %idx.3 = extractelement <3 x i32> %idx, i32 2
180  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
181  %data.3 = extractelement <2 x float> %data, i32 1
182  store float %data.3, float addrspace(1)* %gep.3
183
  ; Branch condition derived from the loaded value %z.
184  %cc = fcmp ogt float %z, 0.0
185  br i1 %cc, label %IF, label %ELSE
186
187IF:
188  %coord.IF = mul i32 %coord, 3
189  br label %END
190
191ELSE:
192  %coord.ELSE = mul i32 %coord, 4
193  br label %END
194
195END:
  ; The sample coordinate depends on which arm ran, so the sample depends on
  ; the branch condition above.
196  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
197  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
198  ret <4 x float> %tex
199}
200
201; ... but only if they really do need it.
202;
203;CHECK-LABEL: {{^}}test_control_flow_3:
204;CHECK-NEXT: ; %main_body
205;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
206;CHECK-NEXT: s_wqm_b64 exec, exec
207;CHECK: image_sample
208;CHECK: s_and_b64 exec, exec, [[ORIG]]
209;CHECK: store
210;CHECK: load
211;CHECK: store
212;CHECK: v_cmp
; Fixture: counterpart to test_control_flow_2 in which the only image sample
; happens first, before the store / load / store sequence and the branch.
; The branch on the loaded value only selects a scale factor (3.0 or 4.0)
; applied to component 0 of the earlier sample; no sample follows the
; branch.
213define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
214main_body:
215  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
216  %tex.1 = extractelement <4 x float> %tex, i32 0
217
  ; First store: %data[0] to %ptr[%idx[0]].
218  %idx.1 = extractelement <3 x i32> %idx, i32 0
219  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
220  %data.1 = extractelement <2 x float> %data, i32 0
221  store float %data.1, float addrspace(1)* %gep.1
222
  ; Load of the value that later decides the branch.
223  %idx.2 = extractelement <3 x i32> %idx, i32 1
224  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
225  %z = load float, float addrspace(1)* %gep.2
226
  ; Second store: %data[1] to %ptr[%idx[2]].
227  %idx.3 = extractelement <3 x i32> %idx, i32 2
228  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
229  %data.3 = extractelement <2 x float> %data, i32 1
230  store float %data.3, float addrspace(1)* %gep.3
231
232  %cc = fcmp ogt float %z, 0.0
233  br i1 %cc, label %IF, label %ELSE
234
235IF:
236  %tex.IF = fmul float %tex.1, 3.0
237  br label %END
238
239ELSE:
240  %tex.ELSE = fmul float %tex.1, 4.0
241  br label %END
242
243END:
244  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
245  ret float %tex.END
246}
247
248; Another test that failed at some point because of terminator handling.
249;
250;CHECK-LABEL: {{^}}test_control_flow_4:
251;CHECK-NEXT: ; %main_body
252;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
253;CHECK-NEXT: s_wqm_b64 exec, exec
254;CHECK: %IF
255;CHECK: load
256;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]],  [[ORIG]]
257;CHECK: store
258;CHECK: s_mov_b64 exec, [[SAVE]]
259;CHECK: %END
260;CHECK: image_sample
; Fixture: a conditional block containing a load and a store, followed by an
; unconditional image sample. When %y == 0 the IF block copies the float at
; %ptr to %ptr + 1 element; END then samples at %coord and returns the
; result.
; NOTE(review): parameter %z is not referenced anywhere in the body.
261define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
262main_body:
263  %cond = icmp eq i32 %y, 0
  ; The false edge skips straight to END, so IF has no else counterpart.
264  br i1 %cond, label %IF, label %END
265
266IF:
267  %data = load float, float addrspace(1)* %ptr
268  %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
269  store float %data, float addrspace(1)* %gep
270  br label %END
271
272END:
273  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
274  ret <4 x float> %tex
275}
276
277; Kill is performed in WQM mode so that uniform kill behaves correctly ...
278;
279;CHECK-LABEL: {{^}}test_kill_0:
280;CHECK-NEXT: ; %main_body
281;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
282;CHECK-NEXT: s_wqm_b64 exec, exec
283;CHECK: image_sample
284;CHECK: s_and_b64 exec, exec, [[ORIG]]
285;SI: buffer_store_dword
286;VI: flat_store_dword
287;CHECK: s_wqm_b64 exec, exec
288;CHECK: v_cmpx_
289;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
290;SI: buffer_store_dword
291;VI: flat_store_dword
292;CHECK: s_mov_b64 exec, [[SAVE]]
293;CHECK: image_sample
; Fixture: sample, store, kill, store, sample in a single basic block.
; The kill intrinsic (operand %z) sits between two global stores, and an
; image sample appears both before the first store and after the second.
; The element-wise sum of the two samples is returned.
294define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
295main_body:
296  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
297
  ; First store: %data[0] to %ptr[%idx[0]].
298  %idx.0 = extractelement <2 x i32> %idx, i32 0
299  %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
300  %data.0 = extractelement <2 x float> %data, i32 0
301  store float %data.0, float addrspace(1)* %gep.0
302
303  call void @llvm.AMDGPU.kill(float %z)
304
  ; Second store: %data[1] to %ptr[%idx[1]].
305  %idx.1 = extractelement <2 x i32> %idx, i32 1
306  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
307  %data.1 = extractelement <2 x float> %data, i32 1
308  store float %data.1, float addrspace(1)* %gep.1
309
  ; A second sample after the kill, at %coord2.
310  %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
311  %out = fadd <4 x float> %tex, %tex2
312
313  ret <4 x float> %out
314}
315
316; ... but only if WQM is necessary.
317;
318; CHECK-LABEL: {{^}}test_kill_1:
319; CHECK-NEXT: ; %main_body
320; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
321; CHECK: s_wqm_b64 exec, exec
322; CHECK: image_sample
323; CHECK: s_and_b64 exec, exec, [[ORIG]]
324; SI: buffer_store_dword
325; VI: flat_store_dword
326; CHECK-NOT: wqm
327; CHECK: v_cmpx_
; Fixture: sample, store, kill with nothing but the return after the kill.
; Unlike test_kill_0 there is no second sample and no second store; the
; first (and only) sampled vector is returned.
; NOTE(review): parameter %coord2 is not referenced anywhere in the body.
328define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
329main_body:
330  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
331
  ; Store %data to %ptr[%idx].
332  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
333  store float %data, float addrspace(1)* %gep
334
  ; The kill is the last operation before the return.
335  call void @llvm.AMDGPU.kill(float %z)
336
337  ret <4 x float> %tex
338}
339
340; Check prolog shaders.
341;
342; CHECK-LABEL: {{^}}test_prolog_1:
343; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
344; CHECK: s_wqm_b64 exec, exec
345; CHECK: v_add_f32_e32 v0,
346; CHECK: s_and_b64 exec, exec, [[ORIG]]
; Fixture: a pixel shader with no image or memory operations at all; it only
; returns %a + %b. It carries attribute group #4, which this file defines
; as the string attribute "amdgpu-ps-wqm-outputs".
347define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
348main_body:
349  %s = fadd float %a, %b
350  ret float %s
351}
352
; Intrinsic declarations and attribute groups used by the functions in this
; file.

; Image store of a 4 x float value at a 4 x i32 coordinate (group #1).
353declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
354
; Image load returning 4 x float from a 4 x i32 coordinate (group #2).
355declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
356
; Image sample in two coordinate flavours: scalar i32 and 4 x i32 (group #3).
357declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
358declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
359
; Kill intrinsic, called by test_kill_0 and test_kill_1.
360declare void @llvm.AMDGPU.kill(float)
; NOTE(review): no function visible in this file calls @llvm.SI.export.
361declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
362
; #1 is attached to the image store, #2 to the image load, #3 to both image
; sample declarations, and #4 to test_prolog_1.
363attributes #1 = { nounwind }
364attributes #2 = { nounwind readonly }
365attributes #3 = { nounwind readnone }
366attributes #4 = { "amdgpu-ps-wqm-outputs" }
367