1; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement < %s | FileCheck -check-prefix=SI %s
2
; Work-item id intrinsic (x dimension, going by its name). Every kernel in this
; file calls it and branches on the result.
3declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4
5; SI-LABEL: {{^}}test_if:
6; Make sure the i1 values created by the cfg structurizer pass are
7; moved using VALU instructions
8
9
10; waitcnt should be inserted after exec modification (NOTE: the checks below do not currently match an s_waitcnt)
11; SI:      v_cmp_lt_i32_e32 vcc, 1,
12; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
13; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
14; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
15; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
16; SI-NEXT: s_cbranch_execz [[FLOW_BB:BB[0-9]+_[0-9]+]]
17
18; SI-NEXT: ; %bb.{{[0-9]+}}: ; %LeafBlock3
19; SI:      s_mov_b64 s[{{[0-9]+:[0-9]+}}], -1
20; SI:      s_and_saveexec_b64
21; SI-NEXT: s_cbranch_execnz
22
23; v_mov should be after exec modification (NOTE: no v_mov is matched below; only the exec save/flip sequence in the Flow block is checked)
24; SI: [[FLOW_BB]]:
25; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
26; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
27;
; Kernel under test: a three-way switch on the work-item id whose default arm
; contains a further if/else. Each of the four leaf blocks stores a distinct
; constant (13, 17, 19 or 21) to %dst[%b], and all paths join at %end.
; %src is not referenced in the body.
28define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
29entry:
30  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Dispatch on the work-item id: 1 -> %case1, 2 -> %case2, otherwise %default.
31  switch i32 %tid, label %default [
32    i32 1, label %case1
33    i32 2, label %case2
34  ]
35
36case1:
37  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
38  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
39  br label %end
40
41case2:
42  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
43  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
44  br label %end
45
46default:
; NOTE(review): %default is only reached when %tid is neither 1 nor 2, so this
; compare tests a value the switch already excluded. Presumably intentional, to
; produce a nested branch for the checks above; confirm before changing.
47  %cmp8 = icmp eq i32 %tid, 2
48  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
49  br i1 %cmp8, label %if, label %else
50
51if:
52  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
53  br label %end
54
55else:
56  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
57  br label %end
58
59end:
60  ret void
61}
62
63; SI-LABEL: {{^}}simple_test_v_if:
64; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
65; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
66; SI-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
67
68; SI-NEXT: ; %bb.{{[0-9]+}}:
69; SI: buffer_store_dword
70
71; SI-NEXT: {{^}}[[EXIT]]:
72; SI: s_endpgm
; Kernel under test: a single if-without-else on the work-item id. Work-items
; with a non-zero id store 999 to %dst[%tid]; both paths fall into the shared
; %exit block. %src is not referenced in the body.
73define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
74  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Branch condition derived from the work-item id.
75  %is.0 = icmp ne i32 %tid, 0
76  br i1 %is.0, label %then, label %exit
77
78then:
79  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
80  store i32 999, i32 addrspace(1)* %gep
81  br label %exit
82
83exit:
84  ret void
85}
86
87; FIXME: It would be better to endpgm in the then block.
88
89; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
90; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
91; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
92; SI-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
93
94; SI-NEXT: ; %bb.{{[0-9]+}}:
95; SI: buffer_store_dword
96
97; SI-NEXT: {{^}}[[EXIT]]:
98; SI: s_endpgm
; Kernel under test: same condition and store as @simple_test_v_if, but %then
; ends in its own ret instead of branching to %exit, so the function has two
; return blocks. %src is not referenced in the body.
99define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
100  %tid = call i32 @llvm.amdgcn.workitem.id.x()
101  %is.0 = icmp ne i32 %tid, 0
102  br i1 %is.0, label %then, label %exit
103
104then:
105  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
106  store i32 999, i32 addrspace(1)* %gep
; First of the two returns: taken by work-items with a non-zero id.
107  ret void
108
109exit:
; Second return: nothing but the ret executes on this path.
110  ret void
111}
112
113; Final block has more than a ret to execute. This was miscompiled
114; before function exit blocks were unified since the endpgm would
115; terminate the then wavefront before reaching the store.
116
117; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
118; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
119; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
120; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
121; SI: s_cbranch_execnz [[EXIT:BB[0-9]+_[0-9]+]]
122
123; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %Flow
124; SI-NEXT: s_or_saveexec_b64
125; SI-NEXT: s_xor_b64 exec, exec
126; SI-NEXT: s_cbranch_execz [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
127
128; SI-NEXT: ; %bb.{{[0-9]+}}: ; %then
129; SI: s_waitcnt
130; SI-NEXT: buffer_store_dword
131
132; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
133; SI: s_endpgm
134
135; SI-NEXT: {{^}}[[EXIT]]:
136; SI: ds_write_b32
; Kernel under test: two return blocks where the %exit path does real work (a
; volatile store) before returning. Work-items with a non-zero id store 999 to
; %dst[%tid] and return from %then; the others run %exit. %src is not
; referenced in the body.
137define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
138  %tid = call i32 @llvm.amdgcn.workitem.id.x()
139  %is.0 = icmp ne i32 %tid, 0
140  br i1 %is.0, label %then, label %exit
141
142then:
143  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
144  store i32 999, i32 addrspace(1)* %gep
145  ret void
146
147exit:
; Volatile store through an undef addrspace(3) pointer: the address is
; irrelevant, the store only has to survive so this block is more than a ret.
148  store volatile i32 7, i32 addrspace(3)* undef
149  ret void
150}
151
152; SI-LABEL: {{^}}simple_test_v_loop:
153; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
154; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
155; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
156
157; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
158
159; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
160; SI: buffer_load_dword
161; SI-DAG: buffer_store_dword
162; SI-DAG: s_cmpk_lg_i32 s{{[0-9]+}}, 0x100
163; SI: s_cbranch_scc1 [[LABEL_LOOP]]
164; SI: [[LABEL_EXIT]]:
165; SI: s_endpgm
166
; Kernel under test: a loop guarded by a branch on the work-item id. Work-items
; with a non-zero id enter %loop, which runs %i from %tid up to %tid + 64 and
; stores a value loaded from %src into %dst[%i] on every iteration.
167define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
168entry:
169  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
170  %is.0 = icmp ne i32 %tid, 0
; Exit bound for the induction variable: the loop leaves when %i.inc == %limit.
171  %limit = add i32 %tid, 64
172  br i1 %is.0, label %loop, label %exit
173
174loop:
175  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
176  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
177  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
; NOTE(review): the load reads %src itself, so %gep.src above is unused. This
; may be an oversight, but changing it would alter the generated code that the
; checks above were written against; confirm before touching.
178  %load = load i32, i32 addrspace(1)* %src
179  store i32 %load, i32 addrspace(1)* %gep.dst
180  %i.inc = add nsw i32 %i, 1
181  %cmp = icmp eq i32 %limit, %i.inc
182  br i1 %cmp, label %exit, label %loop
183
184exit:
185  ret void
186}
187
188; SI-LABEL: {{^}}multi_vcond_loop:
189
190; Load loop limit from buffer
191; Branch to exit if uniformly not taken
192; SI: ; %bb.0:
193; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
194; SI: v_cmp_lt_i32_e32 vcc
195; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
196; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
197
198; Initialize inner condition to false
199; SI: ; %bb.{{[0-9]+}}: ; %bb10.preheader
200; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}
201
202; Clear exec bits for workitems that load -1s
203; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
204; SI: buffer_load_dword [[B:v[0-9]+]]
205; SI: buffer_load_dword [[A:v[0-9]+]]
206; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
207; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
208; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
209; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
210; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
211
212; SI: ; %bb.{{[0-9]+}}: ; %bb20
213; SI: buffer_store_dword
214
215; SI: [[LABEL_FLOW]]:
216; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
217; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
218; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
219; SI-NEXT: s_or_b64 [[COND_STATE]], [[TMP1]], [[COND_STATE]]
220; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
221; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
222
223; SI: [[LABEL_EXIT]]:
224; SI-NOT: [[COND_STATE]]
225; SI: s_endpgm
226
; Kernel under test: a loop with two separate exit conditions.
;   %arg  - output buffer, receives %arg1[idx] + %arg2[idx]
;   %arg1 - first input buffer
;   %arg2 - second input buffer
;   %arg3 - per-work-item loop bound, read at index %tmp (the work-item id)
; The loop is entered only when the bound is > 0. It leaves early from %bb10
; when either loaded input is -1, and from %bb20 once the counter reaches the
; bound. All exits go to %bb26.
227define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
228bb:
229  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
230  %tmp4 = sext i32 %tmp to i64
231  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
; Loop bound for this work-item, loaded from %arg3.
232  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
233  %tmp7 = icmp sgt i32 %tmp6, 0
234  %tmp8 = sext i32 %tmp6 to i64
235  br i1 %tmp7, label %bb10, label %bb26
236
237bb10:                                             ; preds = %bb, %bb20
; %tmp11 is the iteration counter; %tmp12 = counter + work-item id is the
; element index used for all three buffers.
238  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
239  %tmp12 = add nsw i64 %tmp11, %tmp4
240  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
241  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
242  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
243  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
; First exit condition: continue only if neither loaded value is -1.
244  %tmp17 = icmp ne i32 %tmp14, -1
245  %tmp18 = icmp ne i32 %tmp16, -1
246  %tmp19 = and i1 %tmp17, %tmp18
247  br i1 %tmp19, label %bb20, label %bb26
248
249bb20:                                             ; preds = %bb10
250  %tmp21 = add nsw i32 %tmp16, %tmp14
251  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
252  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
; Second exit condition: advance the counter and loop while it is below the
; sign-extended bound.
253  %tmp23 = add nuw nsw i64 %tmp11, 1
254  %tmp24 = icmp slt i64 %tmp23, %tmp8
255  br i1 %tmp24, label %bb10, label %bb26
256
257bb26:                                             ; preds = %bb10, %bb20, %bb
258  ret void
259}
260
; Attribute groups referenced above: #0 is attached to the intrinsic call in
; @multi_vcond_loop, #1 to every kernel definition.
261attributes #0 = { nounwind readnone }
262attributes #1 = { nounwind }
263