1; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3
; FIXME: This leaves behind a now-unnecessary `and` with exec.
5
; Triangle CFG (entry -> if -> endif) on a divergent (vcc) condition: the
; single cheap fadd in the if-block is speculated and the branch is replaced
; by a v_cndmask select. The compare is emitted inverted (neq for IR oeq),
; so vcc set selects the pass-through value (second source of the cndmask).
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}
27
; Diamond CFG (entry -> if/else -> endif): both sides are cheap, so both are
; speculated and merged with a v_cndmask select instead of branching.
; NOTE: the [[RESULT]] variable must be captured on the cndmask line below;
; without it FileCheck fails with an undefined-variable error on the
; buffer_store_dword check. Operand order follows the vcc-selects-src1
; convention: vcc set means "not equal", i.e. the else (mul) value.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %else

if:
  %u0 = fadd float %v, %v
  br label %endif

else:
  %u1 = fmul float %v, %v
  br label %endif

endif:
  %r = phi float [ %u0, %if ], [ %u1, %else ]
  store float %r, float addrspace(1)* %out
  ret void
}
53
; The inline asm in the if-block clobbers vcc, so when the block is
; speculated the compare result must be saved into an SGPR pair (e64 form)
; and moved back into vcc right before the cndmask.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
; GCN: ; clobber vcc
; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
; GCN: s_mov_b64 vcc, [[CMP]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
entry:
  %v = load i32, i32 addrspace(1)* %in
  %cc = fcmp oeq float %k, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  ; Inline asm declares only a vcc clobber; it forces the save/restore above.
  call void asm "; clobber $0", "~{vcc}"() #0
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, i32 addrspace(1)* %out
  ret void
}
76
; Longest chain of cheap instructions to convert
; Nine dependent fmuls are still within the if-conversion cost limit: all
; nine are speculated and the block ends in a cndmask with no branch.
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.8, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}
112
; Short chain of cheap instructions to not convert
; One more fmul than the _max_cheap test (ten total) exceeds the cost limit,
; so the branch over the chain is kept (s_cbranch_vccnz, no cndmask).
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
; GCN: v_mul_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u.0 = fmul float %v, %v
  %u.1 = fmul float %v, %u.0
  %u.2 = fmul float %v, %u.1
  %u.3 = fmul float %v, %u.2
  %u.4 = fmul float %v, %u.3
  %u.5 = fmul float %v, %u.4
  %u.6 = fmul float %v, %u.5
  %u.7 = fmul float %v, %u.6
  %u.8 = fmul float %v, %u.7
  %u.9 = fmul float %v, %u.8
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u.9, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}
154
; Should still branch over fdiv expansion
; A single IR fdiv lowers to the expensive div_scale expansion, so the
; if-block stays behind a branch rather than being speculated.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
; GCN: v_cmp_neq_f32_e32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_div_scale_f32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fdiv float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}
179
; vcc branch with SGPR inputs
; The condition is divergent-style (float compare -> vcc) but the selected
; values live in SGPRs (constant-address load, s_add_i32); the branch is
; kept rather than converted.
; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
; GCN: v_cmp_neq_f32_e64
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: s_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
entry:
  %v = load i32, i32 addrspace(4)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  store i32 %r, i32 addrspace(1)* %out
  ret void

}
205
; Same triangle as the first test but loading from constant address space
; (addrspace(4)); still if-converted to a cndmask select.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
entry:
  %v = load float, float addrspace(4)* %in
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}
223
; Due to broken cost heuristic, this is not if converted like
; test_vccnz_ifcvt_triangle_constant_load even though it should be.
; The value comes from a kernel argument instead of a load; the check only
; pins the presence of the select.

; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
; GCN: v_cndmask_b32
define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
entry:
  %cc = fcmp oeq float %v, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}
243
; Scalar branch and scalar inputs
; Uniform (scc) condition with SGPR values: converted to a scalar select
; (s_cmp + s_cselect_b32) instead of a branch. The asm "s" constraint keeps
; the result in an SGPR.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[VAL]], [[ADD]]
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load i32, i32 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i32 %v, %v
  br label %endif

endif:
  %r = phi i32 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
  ret void
}
265
; FIXME: Should be able to use VALU compare and select
; Scalar branch but VGPR select operands
; Uniform condition with vector values is currently left as a branch
; (s_cbranch_scc1) rather than converted to a VALU compare + cndmask.
; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
; GCN: s_cmp_lg_u32
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_f32_e32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
entry:
  %v = load float, float addrspace(1)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = fadd float %v, %v
  br label %endif

endif:
  %r = phi float [ %v, %entry ], [ %u, %if ]
  store float %r, float addrspace(1)* %out
  ret void
}
291
; 64-bit variant of the scalar-select triangle: the i64 add is split into
; s_add_u32/s_addc_u32 and the select becomes a single s_cselect_b64.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
; GCN: s_add_u32
; GCN: s_addc_u32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load i64, i64 addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add i64 %v, %v
  br label %endif

endif:
  %r = phi i64 [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
  ret void
}
312
; TODO: Can do s_cselect_b64; s_cselect_b32
; 96-bit (<3 x i32>) variant: currently selected as three s_cselect_b32s.
; The shufflevector pads the result to <4 x i32> only so the inline asm "s"
; constraint gets a legally-sized register tuple.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
; GCN-NEXT: s_cselect_b32 s
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <3 x i32> %v, %v
  br label %endif

endif:
  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
  %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
  ret void
}
338
; 128-bit (<4 x i32>) variant: the select is split into two s_cselect_b64s.
; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_add_i32
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
entry:
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
  %cc = icmp eq i32 %cond, 1
  br i1 %cc, label %if, label %endif

if:
  %u = add <4 x i32> %v, %v
  br label %endif

endif:
  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
  ret void
}
362
; Uniform diamond whose branch takes the else target on the eq condition
; (targets swapped relative to the compare); folds to a constant
; s_cselect between 0 and 1.
; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
entry:
  %cmp0 = icmp eq i32 %cond, 0
  br i1 %cmp0, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, i32 addrspace(1)* %out
  ret void
}
382
; Branch on undef: no compare is emitted at all — the s_cselect_b32 appears
; directly after the argument load, reading whatever scc happens to hold.
; GCN-LABEL: {{^}}ifcvt_undef_scc:
; GCN: {{^}}; %bb.0:
; GCN-NEXT: s_load_dwordx2
; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 0, 1{{$}}
define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
entry:
  br i1 undef, label %else, label %if

if:
  br label %done

else:
  br label %done

done:
  %value = phi i32 [0, %if], [1, %else]
  store i32 %value, i32 addrspace(1)* %out
  ret void
}
402
; <8 x i32> value: too many instructions to speculate, so the branch over
; the if-block is kept.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
  %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <8 x i32> %v, %v
  br label %endif

endif:
  %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
  store <8 x i32> %r, <8 x i32> addrspace(1)* %out
  ret void
}
427
; <16 x i32> value: likewise too wide to if-convert; the branch is kept.
; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
; GCN: v_cmp_neq_f32
; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: v_add_i32
; GCN: v_add_i32

; GCN: [[ENDIF]]:
; GCN: buffer_store_dword
define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
entry:
  %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
  %cc = fcmp oeq float %cnd, 1.000000e+00
  br i1 %cc, label %if, label %endif

if:
  %u = add <16 x i32> %v, %v
  br label %endif

endif:
  %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
  store <16 x i32> %r, <16 x i32> addrspace(1)* %out
  ret void
}
452
453attributes #0 = { nounwind }
454