; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s


; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
;        See PR33579.
; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -o %t.o -filetype=obj %s
; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s

; OBJ:       Relocations [
; OBJ-NEXT: ]

; Restrict maximum branch to between +7 and -8 dwords

; s_sleep is used to emit an instruction that is always 4 bytes. Inline asm
; always assumes each instruction is the maximum size.
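; A rough accounting, for reference: with 4 branch bits the signed offset
; covers [-8, +7] dwords, i.e. -32 to +28 bytes measured from the instruction
; after the branch. Each v_nop_e64 used as padding is an 8-byte VOP3
; encoding, so three of them plus an s_sleep (28 bytes total) still fit a
; short branch, while four of them (32 bytes) force the long form.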
declare void @llvm.amdgcn.s.sleep(i32) #0

declare i32 @llvm.amdgcn.workitem.id.x() #1


; GCN-LABEL: {{^}}uniform_conditional_max_short_forward_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN: s_cmp_eq_u32 [[CND]], 0
; GCN-NEXT: s_cbranch_scc1 [[BB3:BB[0-9]+_[0-9]+]]


; GCN-NEXT: ; %bb.1: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_sleep 0

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
; 24 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  call void @llvm.amdgcn.s.sleep(i32 0)
  br label %bb3

bb3:
  store volatile i32 %cnd, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]
; GCN: s_cmp_eq_u32 [[CND]], 0
; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
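; In the expansion above, s_getpc_b64 reads the address of the instruction
; that follows it, i.e. [[LONG_JUMP]]+4; adding [[ENDBB]]-([[LONG_JUMP]]+4)
; with carry therefore leaves the absolute address of [[ENDBB]] in the
; register pair for s_setpc_b64 to jump to.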

; GCN-NEXT: [[LONGBB]]:
; GCN-NEXT: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[ENDBB]]:
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb0:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch

bb2:
; 32 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %cnd, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
; GCN: s_load_dword [[CND:s[0-9]+]]

; GCN-DAG: v_cmp_eq_f32_e64 [[UNMASKED:s\[[0-9]+:[0-9]+\]]], [[CND]], 0
; GCN-DAG: s_and_b64 vcc, exec, [[UNMASKED]]
; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}

; GCN-NEXT: [[LONGBB]]:
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64

; GCN: [[ENDBB]]:
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
bb0:
  %cmp = fcmp oeq float %cnd, 0.0
  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile float %cnd, float addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}min_long_forward_vbranch:

; GCN: buffer_load_dword
; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc

; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64

; GCN: s_or_b64 exec, exec, [[SAVE]]
; GCN: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tid.ext
  %load = load volatile i32, i32 addrspace(1)* %gep
  %cmp = icmp eq i32 %load, 0
  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %load, i32 addrspace(1)* %gep
  ret void
}

; GCN-LABEL: {{^}}long_backward_sbranch:
; GCN: s_mov_b32 [[LOOPIDX:s[0-9]+]], 0{{$}}

; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_add_i32 [[INC:s[0-9]+]], [[LOOPIDX]], 1
; GCN-NEXT: s_cmp_lt_i32 [[INC]], 10

; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1

; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONG_JUMP]]+4)-[[LOOPBB]]
; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
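; Same getpc/setpc idea as the forward case, but for a backward branch the
; positive distance ([[LONG_JUMP]]+4)-[[LOOPBB]] is subtracted (with borrow)
; from the PC value instead, leaving the address of [[LOOPBB]] for
; s_setpc_b64.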

; GCN-NEXT: [[ENDBB]]:
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
bb:
  br label %bb2

bb2:
  %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
   ; 24 bytes
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  %inc = add nsw i32 %loop.idx, 1 ; add cost 4
  %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
  br i1 %cmp, label %bb2, label %bb3 ; -

bb3:
  ret void
}

; Requires expansion of the unconditional branch from %bb2 to %bb4 (and
; expansion of the conditional branch from %bb0 to %bb3).

; GCN-LABEL: {{^}}uniform_unconditional_min_long_forward_branch:
; GCN: s_cmp_eq_u32
; GCN: s_cbranch_scc{{[0-1]}} [[BB2:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0
; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4)
; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], 0{{$}}
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}}

; GCN: [[BB2]]: ; %bb3
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND

; GCN: [[BB3]]:
; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
; GCN: buffer_store_dword [[BB2_K]]

; GCN: v_mov_b32_e32 [[BB4_K:v[0-9]+]], 63
; GCN: buffer_store_dword [[BB4_K]]
; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
bb0:
  %tmp = icmp ne i32 %arg1, 0
  br i1 %tmp, label %bb2, label %bb3

bb2:
  store volatile i32 17, i32 addrspace(1)* undef
  br label %bb4

bb3:
  ; 32 byte asm
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb4

bb4:
  store volatile i32 63, i32 addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch:
; GCN-NEXT: ; %bb.0: ; %entry

; GCN-NEXT: [[LOOP:BB[0-9]_[0-9]+]]: ; %loop
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1

; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONGBB]]+4)-[[LOOP]]
; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}}
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
entry:
  br label %loop

loop:
  ; 32 byte asm
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %loop
}

; Expansion of branch from %bb1 to %bb3 introduces need to expand
; branch from %bb0 to %bb2

; GCN-LABEL: {{^}}expand_requires_expand:
; GCN-NEXT: ; %bb.0: ; %bb0
; GCN: s_load_dword
; GCN: {{s|v}}_cmp_lt_i32
; GCN: s_cbranch

; GCN: s_load_dword
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_{{eq|ne}}_u32_e64
; GCN: s_cbranch_vccz [[BB2:BB[0-9]_[0-9]+]]

; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]:
; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], 0{{$}}
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC1_LO]]:[[PC1_HI]]{{\]}}

; GCN-NEXT: [[BB2]]: ; %bb2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND

; GCN-NEXT: [[BB3]]: ; %bb3
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
bb0:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %cmp0 = icmp slt i32 %cond0, 0
  br i1 %cmp0, label %bb2, label %bb1

bb1:
  %val = load volatile i32, i32 addrspace(4)* undef
  %cmp1 = icmp eq i32 %val, 3
  br i1 %cmp1, label %bb3, label %bb2

bb2:
  call void asm sideeffect
   "v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", ""() #0
  br label %bb3

bb3:
; These NOPs prevent tail-duplication-based outlining
; from firing, which defeats the need to expand the branches and this test.
  call void asm sideeffect
   "v_nop_e64", ""() #0
  call void asm sideeffect
   "v_nop_e64", ""() #0
  ret void
}

; Requires expansion of the required skip branch.

; GCN-LABEL: {{^}}uniform_inside_divergent:
; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry
; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4)
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}}
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}

; GCN-NEXT: [[IF]]: ; %if
; GCN: buffer_store_dword
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN-NEXT: ; %bb.2: ; %if_uniform
; GCN: buffer_store_dword

; GCN-NEXT: [[ENDIF]]: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN-NEXT: s_sleep 5
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %d_cmp = icmp ult i32 %tid, 16
  br i1 %d_cmp, label %if, label %endif

if:
  store i32 0, i32 addrspace(1)* %out
  %u_cmp = icmp eq i32 %cond, 0
  br i1 %u_cmp, label %if_uniform, label %endif

if_uniform:
  store i32 1, i32 addrspace(1)* %out
  br label %endif

endif:
  ; layout can remove the split branch if it can copy the return block.
  ; This call makes the return block long enough that it doesn't get copied.
  call void @llvm.amdgcn.s.sleep(i32 5)
  ret void
}

; si_mask_branch

; GCN-LABEL: {{^}}analyze_mask_branch:
; GCN: v_cmp_nlt_f32_e32 vcc
; GCN-NEXT: s_and_saveexec_b64 [[TEMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64  [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[TEMP_MASK]]

; GCN: BB{{[0-9]+_[0-9]+}}: ; %Flow
; GCN-NEXT: s_or_saveexec_b64 [[TEMP_MASK1:s\[[0-9]+:[0-9]+\]]], [[MASK]]
; GCN-NEXT: s_xor_b64 exec, exec, [[TEMP_MASK1]]

; GCN: [[LOOP_BODY:BB[0-9]+_[0-9]+]]: ; %loop{{$}}
; GCN: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND
; GCN: s_cbranch_{{vccz|vccnz}} [[RET:BB[0-9]+_[0-9]+]]

; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONGBB]]+4)-[[LOOP_BODY]]
; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0
; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}

; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @analyze_mask_branch() #0 {
entry:
  %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
  %cmp0 = fcmp ogt float %reg, 0.000000e+00
  br i1 %cmp0, label %loop, label %ret

loop:
  %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
  call void asm sideeffect
    "v_nop_e64
     v_nop_e64", ""() #0
  %cmp1 = fcmp olt float %phi, 8.0
  br i1 %cmp1, label %loop_body, label %ret

loop_body:
  call void asm sideeffect
  "v_nop_e64
   v_nop_e64
   v_nop_e64
   v_nop_e64", ""() #0
  br label %loop

ret:
  store volatile i32 7, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}long_branch_hang:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
; GCN: s_cbranch_scc{{[0-1]}} [[LONG_BR_0:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:

; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
; GCN-NEXT: [[LONG_BR_0]]:

; GCN: [[LONG_BR_DEST0]]:

; GCN-DAG: v_cmp_lt_i32
; GCN-DAG: v_cmp_ge_i32

; GCN: s_cbranch_vccz
; GCN: s_setpc_b64

; GCN: s_endpgm
define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
bb:
  %tmp = icmp slt i32 %arg2, 9
  %tmp6 = icmp eq i32 %arg1, 0
  %tmp7 = icmp sgt i32 %arg4, 0
  %tmp8 = icmp sgt i32 %arg4, 5
  br i1 %tmp8, label %bb9, label %bb13

bb9:                                              ; preds = %bb
  %tmp10 = and i1 %tmp7, %tmp
  %tmp11 = icmp slt i32 %arg3, %arg4
  %tmp12 = or i1 %tmp11, %tmp7
  br i1 %tmp12, label %bb19, label %bb14

bb13:                                             ; preds = %bb
  call void asm sideeffect
  "v_nop_e64
   v_nop_e64
   v_nop_e64
   v_nop_e64", ""() #0
  br i1 %tmp6, label %bb19, label %bb14

bb14:                                             ; preds = %bb13, %bb9
  %tmp15 = icmp slt i32 %arg3, %arg4
  %tmp16 = or i1 %tmp15, %tmp
  %tmp17 = and i1 %tmp6, %tmp16
  %tmp18 = zext i1 %tmp17 to i32
  br label %bb19

bb19:                                             ; preds = %bb14, %bb13, %bb9
  %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ]
  %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %arg5
  store i32 %tmp20, i32 addrspace(1)* %tmp21, align 4
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }