1; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
3
; select_f16 -- scalar half select: r = (a olt b) ? c : d.
;
; All four operands are read with volatile global loads and the result is
; stored to %r. The checks bind the four buffer_load_ushort results to
; A/B/C/D in order of appearance.
; NOTE(review): the loads are presumably volatile so they stay in program
; order and the positional A/B/C/D bindings remain valid -- confirm.
;
; Expected code, per the check lines below:
;   * tahiti run (SI prefix): operands are converted to f32, the compare and
;     v_cndmask are done on f32 values, and the result is converted back.
;   * fiji run (VI prefix): the compare is a native v_cmp_lt_f16 and the
;     v_cndmask works directly on the loaded 16-bit values.
; GCN-LABEL: {{^}}select_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI-DAG:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  ; Ordered less-than compare drives the select between %c.val and %d.val.
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; select_f16_imm_a -- scalar half select whose first compare operand is the
; constant 0xH3800 (half 0.5): r = (0.5 olt b) ? c : d.
;
; %b, %c and %d are read with volatile global loads; the result is stored to
; %r. The checks expect the constant to appear as the inline operand 0.5 in
; the first source position of the compare: v_cmp_lt_f32 after conversion on
; the tahiti run (SI prefix), v_cmp_lt_f16 on the fiji run (VI prefix).
; GCN-LABEL: {{^}}select_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI-DAG:  v_cmp_lt_f32_e32 vcc, 0.5, v[[B_F32]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  ; 0xH3800 is half 0.5, used here as the left-hand compare operand.
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; select_f16_imm_b -- scalar half select whose second compare operand is the
; constant 0xH3800 (half 0.5): r = (a olt 0.5) ? c : d.
;
; %a, %c and %d are read with volatile global loads; the result is stored to
; %r. The checks expect the compare to be emitted in commuted form, with 0.5
; as the first source and "gt" instead of "lt": v_cmp_gt_f32 on the tahiti
; run (SI prefix), v_cmp_gt_f16 on the fiji run (VI prefix).
; GCN-LABEL: {{^}}select_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI-DAG:  v_cmp_gt_f32_e32 vcc, 0.5, v[[A_F32]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI:  v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  ; 0xH3800 is half 0.5, used here as the right-hand compare operand.
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; select_f16_imm_c -- scalar half select whose "true" value is the constant
; 0xH3800 (half 0.5): r = (a olt b) ? 0.5 : d.
;
; %a, %b and %d are read with volatile global loads; the result is stored to
; %r. Per the check lines below, the expected compare is the inverted form
; (v_cmp_nlt) with the v_cndmask sources swapped accordingly:
;   * tahiti run (SI prefix): 0.5 is an inline operand of the f32 v_cndmask.
;   * fiji run (VI prefix): the constant is first materialized with
;     v_mov_b32 ... 0x3800 and the v_cndmask uses that register.
; GCN-LABEL: {{^}}select_f16_imm_c:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[D_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[D_F32]], vcc
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x3800{{$}}
; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  ; 0xH3800 is half 0.5, selected when the compare is true.
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; select_f16_imm_d -- scalar half select whose "false" value is the constant
; 0xH3800 (half 0.5): r = (a olt b) ? c : 0.5.
;
; %a, %b and %c are read with volatile global loads; the result is stored to
; %r. Per the check lines below, the compare keeps its "lt" form:
;   * tahiti run (SI prefix): 0.5 is the inline first source of the f32
;     v_cndmask.
;   * fiji run (VI prefix): the constant is first materialized with
;     v_mov_b32 ... 0x3800 and the v_cndmask uses that register.
; GCN-LABEL: {{^}}select_f16_imm_d:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], 0.5, v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_mov_b32_e32 v[[D_F16:[0-9]+]], 0x3800{{$}}
; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @select_f16_imm_d(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  ; 0xH3800 is half 0.5, selected when the compare is false.
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

; select_v2f16 -- <2 x half> select: r = (a olt b) ? c : d, element-wise.
;
; All four vector operands are read with ordinary (non-volatile) global loads
; and the result is stored to %r. The checks only look for instruction
; mnemonics, not specific registers:
;   * tahiti run (SI prefix): f16-to-f32 conversions, then two
;     v_cmp_lt_f32 / v_cndmask pairs, then two conversions back to f16.
;   * fiji run (VI prefix): two v_cmp_lt_f16 / v_cndmask pairs.
; GCN-LABEL: {{^}}select_v2f16:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32

; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  ; Per-lane ordered less-than; each lane picks from %c.val or %d.val.
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; select_v2f16_imm_a -- <2 x half> select whose first compare operand is the
; constant vector <0xH3800, 0xH3900> (half 0.5 and half 0.625):
;   r = (<0.5, 0.625> olt b) ? c : d, element-wise.
;
; %b, %c and %d are read with ordinary global loads; the result is stored to
; %r. The checks match mnemonics only, except that the tahiti run (SI prefix)
; requires one compare to be "v_cmp_lt_f32_e32 vcc, 0.5". The two lanes are
; expected to use different compare directions (one "lt", one "gt") on both
; runs, in the order given by the check lines below.
; GCN-LABEL: {{^}}select_v2f16_imm_a:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI: v_cmp_gt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_lt_f32_e32 vcc, 0.5
; SI: v_cndmask_b32_e32

; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_gt_f16_e32
; VI: v_cndmask_b32_e32

; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  ; Constant vector is the left-hand operand: 0xH3800 = 0.5, 0xH3900 = 0.625.
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; select_v2f16_imm_b -- <2 x half> select whose second compare operand is the
; constant vector <0xH3800, 0xH3900> (half 0.5 and half 0.625):
;   r = (a olt <0.5, 0.625>) ? c : d, element-wise.
;
; %a, %c and %d are read with ordinary global loads; the result is stored to
; %r. The checks match mnemonics only, except that the tahiti run (SI prefix)
; requires one compare to be "v_cmp_gt_f32_e32 vcc, 0.5", i.e. the commuted
; form with the constant as first source. The two lanes are expected to use
; different compare directions on both runs, in the order given below.
; GCN-LABEL: {{^}}select_v2f16_imm_b:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI: v_cmp_lt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_gt_f32_e32 vcc, 0.5
; SI: v_cndmask_b32_e32

; VI: v_cmp_gt_f16_e32
; VI: v_cndmask_b32_e32
; VI: v_cmp_lt_f16_e32
; VI: v_cndmask_b32_e32

; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  ; Constant vector is the right-hand operand: 0xH3800 = 0.5, 0xH3900 = 0.625.
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; select_v2f16_imm_c -- <2 x half> select whose "true" value is the constant
; vector <0xH3800, 0xH3900> (half 0.5 and half 0.625):
;   r = (a olt b) ? <0.5, 0.625> : d, element-wise.
;
; %a, %b and %d are read with ordinary global loads; the result is stored to
; %r. The checks match mnemonics only and expect the inverted compare form
; (v_cmp_nlt) for both lanes: on f32 after conversion for the tahiti run
; (SI prefix), natively on f16 for the fiji run (VI prefix).
; GCN-LABEL: {{^}}select_v2f16_imm_c:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI: v_cmp_nlt_f32_e32
; SI: v_cndmask_b32_e32
; SI: v_cmp_nlt_f32_e32
; SI-DAG: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; VI: v_cmp_nlt_f16_e32
; VI: v_cndmask_b32_e32

; SI-DAG: v_cvt_f16_f32_e32
; SI: v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  ; Constant vector (0xH3800 = 0.5, 0xH3900 = 0.625) is chosen in lanes where
  ; the compare is true.
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; select_v2f16_imm_d -- <2 x half> select whose "false" value is the constant
; vector <0xH3800, 0xH3900> (half 0.5 and half 0.625):
;   r = (a olt b) ? c : <0.5, 0.625>, element-wise.
;
; %a, %b and %c are read with ordinary global loads; the result is stored to
; %r. The checks match mnemonics only and expect the "lt" compare form for
; both lanes: on f32 after conversion for the tahiti run (SI prefix),
; natively on f16 for the fiji run (VI prefix). The v_cndmask_b32 checks
; deliberately omit the encoding suffix, so any encoding matches.
; GCN-LABEL: {{^}}select_v2f16_imm_d:
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32
; SI:  v_cvt_f32_f16_e32

; SI:  v_cmp_lt_f32_e32
; SI: v_cndmask_b32
; SI:  v_cmp_lt_f32_e32
; SI: v_cndmask_b32

; VI:  v_cmp_lt_f16_e32
; VI: v_cndmask_b32
; VI:  v_cmp_lt_f16_e32
; VI: v_cndmask_b32

; SI:  v_cvt_f16_f32_e32
; SI:  v_cvt_f16_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16_imm_d(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  ; Constant vector (0xH3800 = 0.5, 0xH3900 = 0.625) is chosen in lanes where
  ; the compare is false.
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
