1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s
4
; Kernel with a single-block self-loop (%ENDIF) whose exit condition %1 is
; computed once in %main_body, before the loop is entered. %1 is the low bit
; of (%a & mbcnt.lo(-1, 0)). After the loop, %ENDLOOP stores 0 to %out.
;
; For both run lines the expected code keeps the mask accumulation
; (s_and_b64 / s_or_b64 / s_andn2_b64 exec / s_cbranch_execnz) inside the loop
; block and restores exec with "s_or_b64 exec, exec, s[0:1]" in %ENDLOOP,
; i.e. after the loop has been left.
; NOTE(review): the function name suggests this guards against the loop-break
; bookkeeping being placed incorrectly -- confirm against the original commit.
5define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
6; SI-LABEL: break_inserted_outside_of_loop:
7; SI:       ; %bb.0: ; %main_body
8; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
9; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
10; SI-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
11; SI-NEXT:    s_waitcnt lgkmcnt(0)
12; SI-NEXT:    v_and_b32_e32 v0, s0, v0
13; SI-NEXT:    v_and_b32_e32 v0, 1, v0
14; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
15; SI-NEXT:    s_mov_b64 s[0:1], 0
16; SI-NEXT:  BB0_1: ; %ENDIF
17; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
18; SI-NEXT:    s_and_b64 s[2:3], exec, vcc
19; SI-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
20; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
21; SI-NEXT:    s_cbranch_execnz BB0_1
22; SI-NEXT:  ; %bb.2: ; %ENDLOOP
23; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
24; SI-NEXT:    s_mov_b32 s7, 0xf000
25; SI-NEXT:    s_mov_b32 s6, -1
26; SI-NEXT:    v_mov_b32_e32 v0, 0
27; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
28; SI-NEXT:    s_endpgm
29;
30; FLAT-LABEL: break_inserted_outside_of_loop:
31; FLAT:       ; %bb.0: ; %main_body
32; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
33; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
34; FLAT-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
35; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
36; FLAT-NEXT:    v_and_b32_e32 v0, s0, v0
37; FLAT-NEXT:    v_and_b32_e32 v0, 1, v0
38; FLAT-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
39; FLAT-NEXT:    s_mov_b64 s[0:1], 0
40; FLAT-NEXT:  BB0_1: ; %ENDIF
41; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
42; FLAT-NEXT:    s_and_b64 s[2:3], exec, vcc
43; FLAT-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
44; FLAT-NEXT:    s_andn2_b64 exec, exec, s[0:1]
45; FLAT-NEXT:    s_cbranch_execnz BB0_1
46; FLAT-NEXT:  ; %bb.2: ; %ENDLOOP
47; FLAT-NEXT:    s_or_b64 exec, exec, s[0:1]
48; FLAT-NEXT:    s_mov_b32 s7, 0xf000
49; FLAT-NEXT:    s_mov_b32 s6, -1
50; FLAT-NEXT:    v_mov_b32_e32 v0, 0
51; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
52; FLAT-NEXT:    s_endpgm
; The exit condition %1 is fully computed here, before the loop is entered.
53main_body:
54  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
55  %0 = and i32 %a, %tid
56  %1 = trunc i32 %0 to i1
57  br label %ENDIF
58
; Loop exit block: the only observable effect of the kernel is this store.
59ENDLOOP:
60  store i32 0, i32 addrspace(1)* %out
61  ret void
62
; Self-loop: branches back to itself until %1 is true, then goes to %ENDLOOP.
63ENDIF:
64  br i1 %1, label %ENDLOOP, label %ENDIF
65}
66
; Kernel whose self-loop (%loop) branches on %2, an i1 phi that is created in
; %endif, i.e. outside the loop. The phi is false when control arrives from
; %if (taken when mbcnt.lo(-1, 0) == 0) and (%b == 0) when it arrives from
; %else.
;
; For both run lines the expected code lowers the if/else region with
; s_and_saveexec_b64 and a matching "s_or_b64 exec, exec, s[6:7]" in %endif,
; pre-initialises the phi register pair s[4:5] to 0, and then runs the
; exec-mask loop (s_and_b64 / s_or_b64 / s_andn2_b64 exec / s_cbranch_execnz).
67define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
68; SI-LABEL: phi_cond_outside_loop:
69; SI:       ; %bb.0: ; %entry
70; SI-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
71; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
72; SI-NEXT:    s_mov_b64 s[2:3], 0
73; SI-NEXT:    s_mov_b64 s[4:5], 0
74; SI-NEXT:    s_and_saveexec_b64 s[6:7], vcc
75; SI-NEXT:    s_cbranch_execz BB1_2
76; SI-NEXT:  ; %bb.1: ; %else
77; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
78; SI-NEXT:    s_waitcnt lgkmcnt(0)
79; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
80; SI-NEXT:    s_and_b64 s[4:5], s[0:1], exec
81; SI-NEXT:  BB1_2: ; %endif
82; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
83; SI-NEXT:  BB1_3: ; %loop
84; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
85; SI-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
86; SI-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
87; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
88; SI-NEXT:    s_cbranch_execnz BB1_3
89; SI-NEXT:  ; %bb.4: ; %exit
90; SI-NEXT:    s_endpgm
91;
92; FLAT-LABEL: phi_cond_outside_loop:
93; FLAT:       ; %bb.0: ; %entry
94; FLAT-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
95; FLAT-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
96; FLAT-NEXT:    s_mov_b64 s[2:3], 0
97; FLAT-NEXT:    s_mov_b64 s[4:5], 0
98; FLAT-NEXT:    s_and_saveexec_b64 s[6:7], vcc
99; FLAT-NEXT:    s_cbranch_execz BB1_2
100; FLAT-NEXT:  ; %bb.1: ; %else
101; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x24
102; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
103; FLAT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
104; FLAT-NEXT:    s_and_b64 s[4:5], s[0:1], exec
105; FLAT-NEXT:  BB1_2: ; %endif
106; FLAT-NEXT:    s_or_b64 exec, exec, s[6:7]
107; FLAT-NEXT:  BB1_3: ; %loop
108; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
109; FLAT-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
110; FLAT-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
111; FLAT-NEXT:    s_andn2_b64 exec, exec, s[2:3]
112; FLAT-NEXT:    s_cbranch_execnz BB1_3
113; FLAT-NEXT:  ; %bb.4: ; %exit
114; FLAT-NEXT:    s_endpgm
; Selects %if when mbcnt.lo(-1, 0) is zero, %else otherwise.
115entry:
116  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
117  %0 = icmp eq i32 %tid , 0
118  br i1 %0, label %if, label %else
119
; Empty arm: contributes the constant 0 (false) to the phi in %endif.
120if:
121  br label %endif
122
; Contributes (%b == 0) to the phi in %endif.
123else:
124  %1 = icmp eq i32 %b, 0
125  br label %endif
126
; The loop condition %2 is defined here, before the loop is entered.
127endif:
128  %2 = phi i1 [0, %if], [%1, %else]
129  br label %loop
130
; Self-loop on %2: when %2 is false the branch targets %loop itself again.
131loop:
132  br i1 %2, label %exit, label %loop
133
134exit:
135  ret void
136}
137
; Kernel whose entry block is a switch on %x where every successor ends in
; unreachable: cases 0 and 60 go to %sw.bb, everything else to %sw.default.
; %sw.epilog is not branched to by any block in this function. The parameters
; %g and %l are not referenced in the body.
;
; For each run line only the function label and the entry-block comment are
; matched; no instructions are checked.
; NOTE(review): presumably this only verifies that llc gets through the
; function without crashing or failing -verify-machineinstrs -- confirm.
138define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
139; SI-LABEL: switch_unreachable:
140; SI:       ; %bb.0: ; %centry
141;
142; FLAT-LABEL: switch_unreachable:
143; FLAT:       ; %bb.0: ; %centry
144centry:
145  switch i32 %x, label %sw.default [
146    i32 0, label %sw.bb
147    i32 60, label %sw.bb
148  ]
149
; Target of both explicit switch cases.
150sw.bb:
151  unreachable
152
; Default switch target.
153sw.default:
154  unreachable
155
; No predecessor in this function; holds the only ret.
156sw.epilog:
157  ret void
158}
159
160declare float @llvm.fabs.f32(float) nounwind readnone
161
; Kernel with nested loops that have several exits:
;   * %while.cond.outer -> %while.cond -> %convex.exit -> %if.end forms the
;     outer cycle (back edge from %if.end when %cmp2 is false);
;   * %if.else branches back to %while.cond, forming the inner cycle;
;   * %while.cond leaves to %for.cond, %convex.exit leaves to %return;
;   * %for.cond / %for.body / %if.end.2 form a second cycle, and %self.loop is
;     an unconditional self-loop with no exit.
; The load in %while.cond.outer and the volatile store in %if.else both use an
; undef address. The parameters %c2, %x and %y are not referenced in the body.
;
; For both run lines the expected code contains only scalar branches
; (s_branch, s_cbranch_vccz, s_cbranch_vccnz, s_cbranch_scc0) plus the
; %Flow* / %loop.exit.guard* helper blocks, and no block for %if.end.2.
; NOTE(review): the name suggests a regression test for an assertion in
; loop/land-block handling -- confirm against the original commit.
162define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
163; SI-LABEL: loop_land_info_assert:
164; SI:       ; %bb.0: ; %entry
165; SI-NEXT:    s_mov_b32 s7, 0xf000
166; SI-NEXT:    s_mov_b32 s6, -1
167; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
168; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
169; SI-NEXT:    s_load_dword s8, s[0:1], 0xc
170; SI-NEXT:    s_brev_b32 s9, 44
171; SI-NEXT:    s_waitcnt lgkmcnt(0)
172; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], s2, 1
173; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], s3, 4
174; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s3, 3
175; SI-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
176; SI-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
177; SI-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
178; SI-NEXT:    s_waitcnt vmcnt(0)
179; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s9
180; SI-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
181; SI-NEXT:    v_mov_b32_e32 v0, 3
182; SI-NEXT:    s_branch BB3_4
183; SI-NEXT:  BB3_1: ; %Flow6
184; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
185; SI-NEXT:    s_mov_b64 s[10:11], 0
186; SI-NEXT:  BB3_2: ; %Flow5
187; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
188; SI-NEXT:    s_mov_b64 s[14:15], 0
189; SI-NEXT:  BB3_3: ; %Flow
190; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
191; SI-NEXT:    s_and_b64 vcc, exec, s[12:13]
192; SI-NEXT:    s_cbranch_vccnz BB3_8
193; SI-NEXT:  BB3_4: ; %while.cond
194; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
195; SI-NEXT:    s_mov_b64 s[14:15], -1
196; SI-NEXT:    s_mov_b64 s[10:11], -1
197; SI-NEXT:    s_mov_b64 s[12:13], -1
198; SI-NEXT:    s_mov_b64 vcc, s[0:1]
199; SI-NEXT:    s_cbranch_vccz BB3_3
200; SI-NEXT:  ; %bb.5: ; %convex.exit
201; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
202; SI-NEXT:    s_mov_b64 s[10:11], -1
203; SI-NEXT:    s_mov_b64 s[12:13], -1
204; SI-NEXT:    s_mov_b64 vcc, s[2:3]
205; SI-NEXT:    s_cbranch_vccz BB3_2
206; SI-NEXT:  ; %bb.6: ; %if.end
207; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
208; SI-NEXT:    s_mov_b64 s[12:13], -1
209; SI-NEXT:    s_mov_b64 vcc, s[4:5]
210; SI-NEXT:    s_cbranch_vccz BB3_1
211; SI-NEXT:  ; %bb.7: ; %if.else
212; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
213; SI-NEXT:    s_mov_b64 s[12:13], 0
214; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
215; SI-NEXT:    s_branch BB3_1
216; SI-NEXT:  BB3_8: ; %loop.exit.guard4
217; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
218; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
219; SI-NEXT:    s_cbranch_vccz BB3_4
220; SI-NEXT:  ; %bb.9: ; %loop.exit.guard
221; SI-NEXT:    s_and_b64 vcc, exec, s[14:15]
222; SI-NEXT:    s_cbranch_vccz BB3_13
223; SI-NEXT:  ; %bb.10: ; %for.cond.preheader
224; SI-NEXT:    s_cmpk_lt_i32 s8, 0x3e8
225; SI-NEXT:    s_cbranch_scc0 BB3_13
226; SI-NEXT:  ; %bb.11: ; %for.body
227; SI-NEXT:    s_and_b64 vcc, exec, 0
228; SI-NEXT:  BB3_12: ; %self.loop
229; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
230; SI-NEXT:    s_mov_b64 vcc, vcc
231; SI-NEXT:    s_cbranch_vccz BB3_12
232; SI-NEXT:  BB3_13: ; %DummyReturnBlock
233; SI-NEXT:    s_endpgm
234;
235; FLAT-LABEL: loop_land_info_assert:
236; FLAT:       ; %bb.0: ; %entry
237; FLAT-NEXT:    s_mov_b32 s7, 0xf000
238; FLAT-NEXT:    s_mov_b32 s6, -1
239; FLAT-NEXT:    buffer_load_dword v0, off, s[4:7], 0
240; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
241; FLAT-NEXT:    s_load_dword s8, s[0:1], 0x30
242; FLAT-NEXT:    s_brev_b32 s9, 44
243; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
244; FLAT-NEXT:    v_cmp_lt_i32_e64 s[0:1], s2, 1
245; FLAT-NEXT:    v_cmp_lt_i32_e64 s[4:5], s3, 4
246; FLAT-NEXT:    v_cmp_gt_i32_e64 s[2:3], s3, 3
247; FLAT-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
248; FLAT-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
249; FLAT-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
250; FLAT-NEXT:    s_waitcnt vmcnt(0)
251; FLAT-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s9
252; FLAT-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
253; FLAT-NEXT:    v_mov_b32_e32 v0, 3
254; FLAT-NEXT:    s_branch BB3_4
255; FLAT-NEXT:  BB3_1: ; %Flow6
256; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
257; FLAT-NEXT:    s_mov_b64 s[10:11], 0
258; FLAT-NEXT:  BB3_2: ; %Flow5
259; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
260; FLAT-NEXT:    s_mov_b64 s[14:15], 0
261; FLAT-NEXT:  BB3_3: ; %Flow
262; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
263; FLAT-NEXT:    s_and_b64 vcc, exec, s[12:13]
264; FLAT-NEXT:    s_cbranch_vccnz BB3_8
265; FLAT-NEXT:  BB3_4: ; %while.cond
266; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
267; FLAT-NEXT:    s_mov_b64 s[14:15], -1
268; FLAT-NEXT:    s_mov_b64 s[10:11], -1
269; FLAT-NEXT:    s_mov_b64 s[12:13], -1
270; FLAT-NEXT:    s_mov_b64 vcc, s[0:1]
271; FLAT-NEXT:    s_cbranch_vccz BB3_3
272; FLAT-NEXT:  ; %bb.5: ; %convex.exit
273; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
274; FLAT-NEXT:    s_mov_b64 s[10:11], -1
275; FLAT-NEXT:    s_mov_b64 s[12:13], -1
276; FLAT-NEXT:    s_mov_b64 vcc, s[2:3]
277; FLAT-NEXT:    s_cbranch_vccz BB3_2
278; FLAT-NEXT:  ; %bb.6: ; %if.end
279; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
280; FLAT-NEXT:    s_mov_b64 s[12:13], -1
281; FLAT-NEXT:    s_mov_b64 vcc, s[4:5]
282; FLAT-NEXT:    s_cbranch_vccz BB3_1
283; FLAT-NEXT:  ; %bb.7: ; %if.else
284; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
285; FLAT-NEXT:    s_mov_b64 s[12:13], 0
286; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
287; FLAT-NEXT:    s_branch BB3_1
288; FLAT-NEXT:  BB3_8: ; %loop.exit.guard4
289; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
290; FLAT-NEXT:    s_and_b64 vcc, exec, s[10:11]
291; FLAT-NEXT:    s_cbranch_vccz BB3_4
292; FLAT-NEXT:  ; %bb.9: ; %loop.exit.guard
293; FLAT-NEXT:    s_and_b64 vcc, exec, s[14:15]
294; FLAT-NEXT:    s_cbranch_vccz BB3_13
295; FLAT-NEXT:  ; %bb.10: ; %for.cond.preheader
296; FLAT-NEXT:    s_cmpk_lt_i32 s8, 0x3e8
297; FLAT-NEXT:    s_cbranch_scc0 BB3_13
298; FLAT-NEXT:  ; %bb.11: ; %for.body
299; FLAT-NEXT:    s_and_b64 vcc, exec, 0
300; FLAT-NEXT:  BB3_12: ; %self.loop
301; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
302; FLAT-NEXT:    s_mov_b64 vcc, vcc
303; FLAT-NEXT:    s_cbranch_vccz BB3_12
304; FLAT-NEXT:  BB3_13: ; %DummyReturnBlock
305; FLAT-NEXT:    s_endpgm
; %cmp (%c0 > 0) is computed once here and only consumed in %convex.exit.
306entry:
307  %cmp = icmp sgt i32 %c0, 0
308  br label %while.cond.outer
309
; Outer-cycle header: reloads %tmp (from an undef address) on every entry.
310while.cond.outer:
311  %tmp = load float, float addrspace(1)* undef
312  br label %while.cond
313
; Inner-cycle header: continues to %convex.exit while %c1 < 4, otherwise
; leaves towards the second loop at %for.cond.
314while.cond:
315  %cmp1 = icmp slt i32 %c1, 4
316  br i1 %cmp1, label %convex.exit, label %for.cond
317
; Leaves to %return when (%cmp | %cmp1) is true, otherwise goes on to %if.end.
318convex.exit:
319  %or = or i1 %cmp, %cmp1
320  br i1 %or, label %return, label %if.end
321
; Compares |%tmp| against 0x3E80000000000000, which is 2^-23 (matched in the
; expected output as "s_brev_b32 s9, 44", the bit-reverse of 44 = 0x34000000).
; True -> %if.else (inner cycle); false -> back to %while.cond.outer.
322if.end:
323  %tmp3 = call float @llvm.fabs.f32(float %tmp) nounwind readnone
324  %cmp2 = fcmp olt float %tmp3, 0x3E80000000000000
325  br i1 %cmp2, label %if.else, label %while.cond.outer
326
; Volatile store of 3 to an undef address, then back edge to %while.cond.
327if.else:
328  store volatile i32 3, i32 addrspace(1)* undef, align 4
329  br label %while.cond
330
; Header of the second cycle: enters %for.body while %c3 < 1000.
331for.cond:
332  %cmp3 = icmp slt i32 %c3, 1000
333  br i1 %cmp3, label %for.body, label %return
334
; Branches on the same %cmp3 that guarded entry to this block.
335for.body:
336  br i1 %cmp3, label %self.loop, label %if.end.2
337
; Only user of %arg; either returns or takes the back edge to %for.cond.
338if.end.2:
339  %or.cond2 = or i1 %cmp3, %arg
340  br i1 %or.cond2, label %return, label %for.cond
341
; Unconditional self-loop with no exit edge.
342self.loop:
343 br label %self.loop
344
345return:
346  ret void
347}
348
349declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
350
351attributes #0 = { nounwind readnone }
352