1; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
3
; Scalar half `fcmp olt` (ordered less-than).
; The label pattern below ends with ':' so that it anchors on this kernel's
; own symbol line and cannot also match the fcmp_f16_lt_abs symbol, which
; shares the same prefix.
; Expected selection, per the directives that follow: the SI run converts both
; halves to f32 and compares with v_cmp_lt_f32, while the VI run compares the
; loaded halves directly with v_cmp_lt_f16.
4; GCN-LABEL: {{^}}fcmp_f16_lt:
5; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
6; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
7; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
8; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
9; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
10; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
11; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
12; GCN: buffer_store_dword v[[R_I32]]
13; GCN: s_endpgm
; Kernel under test: reads one half from %a and one from %b, compares them,
; sign-extends the i1 result to i32 and stores it to %r.
14define amdgpu_kernel void @fcmp_f16_lt(
15    i32 addrspace(1)* %r,
16    half addrspace(1)* %a,
17    half addrspace(1)* %b) {
18entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
19  %a.val = load volatile half, half addrspace(1)* %a
20  %b.val = load volatile half, half addrspace(1)* %b
21  %r.val = fcmp olt half %a.val, %b.val
22  %r.val.sext = sext i1 %r.val to i32
23  store i32 %r.val.sext, i32 addrspace(1)* %r
24  ret void
25}
26
27; GCN-LABEL: {{^}}fcmp_f16_lt_abs:
28; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
29; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
30
31; SI:  v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
32; SI:  v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|
33
34; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
35; VI:  v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|
36
37; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
38; GCN: buffer_store_dword v[[R_I32]]
39; GCN: s_endpgm
; Kernel under test: scalar half `fcmp olt` where both operands first pass
; through llvm.fabs.f16. Reads one half from %a and one from %b, takes the
; absolute value of each, compares them, sign-extends the i1 result to i32
; and stores it to %r.
; The directives above expect the fabs calls to appear as |v..| source
; modifiers: on the f32 conversions for the SI run, and on the f16 compare
; itself (e64 encoding) for the VI run.
40define amdgpu_kernel void @fcmp_f16_lt_abs(
41    i32 addrspace(1)* %r,
42    half addrspace(1)* %a,
43    half addrspace(1)* %b) {
44entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
45  %a.val = load volatile half, half addrspace(1)* %a
46  %b.val = load volatile half, half addrspace(1)* %b
47  %a.abs = call half @llvm.fabs.f16(half %a.val)
48  %b.abs = call half @llvm.fabs.f16(half %b.val)
49  %r.val = fcmp olt half %a.abs, %b.abs
50  %r.val.sext = sext i1 %r.val to i32
51  store i32 %r.val.sext, i32 addrspace(1)* %r
52  ret void
53}
54
55; GCN-LABEL: {{^}}fcmp_f16_eq
56; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
57; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
58; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
59; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
60; SI:  v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
61; VI:  v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
62; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
63; GCN: buffer_store_dword v[[R_I32]]
64; GCN: s_endpgm
; Kernel under test: scalar half `fcmp oeq` (ordered equal).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect a
; v_cmp_eq compare (f32 after conversion on SI, f16 directly on VI).
65define amdgpu_kernel void @fcmp_f16_eq(
66    i32 addrspace(1)* %r,
67    half addrspace(1)* %a,
68    half addrspace(1)* %b) {
69entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
70  %a.val = load volatile half, half addrspace(1)* %a
71  %b.val = load volatile half, half addrspace(1)* %b
72  %r.val = fcmp oeq half %a.val, %b.val
73  %r.val.sext = sext i1 %r.val to i32
74  store i32 %r.val.sext, i32 addrspace(1)* %r
75  ret void
76}
77
78; GCN-LABEL: {{^}}fcmp_f16_le
79; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
80; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
81; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
82; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
83; SI:  v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
84; VI:  v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
85; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
86; GCN: buffer_store_dword v[[R_I32]]
87; GCN: s_endpgm
; Kernel under test: scalar half `fcmp ole` (ordered less-or-equal).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect a
; v_cmp_le compare (f32 after conversion on SI, f16 directly on VI).
88define amdgpu_kernel void @fcmp_f16_le(
89    i32 addrspace(1)* %r,
90    half addrspace(1)* %a,
91    half addrspace(1)* %b) {
92entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
93  %a.val = load volatile half, half addrspace(1)* %a
94  %b.val = load volatile half, half addrspace(1)* %b
95  %r.val = fcmp ole half %a.val, %b.val
96  %r.val.sext = sext i1 %r.val to i32
97  store i32 %r.val.sext, i32 addrspace(1)* %r
98  ret void
99}
100
101; GCN-LABEL: {{^}}fcmp_f16_gt
102; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
103; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
104; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
105; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
106; SI:  v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
107; VI:  v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
108; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
109; GCN: buffer_store_dword v[[R_I32]]
110; GCN: s_endpgm
; Kernel under test: scalar half `fcmp ogt` (ordered greater-than).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect a
; v_cmp_gt compare (f32 after conversion on SI, f16 directly on VI).
111define amdgpu_kernel void @fcmp_f16_gt(
112    i32 addrspace(1)* %r,
113    half addrspace(1)* %a,
114    half addrspace(1)* %b) {
115entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
116  %a.val = load volatile half, half addrspace(1)* %a
117  %b.val = load volatile half, half addrspace(1)* %b
118  %r.val = fcmp ogt half %a.val, %b.val
119  %r.val.sext = sext i1 %r.val to i32
120  store i32 %r.val.sext, i32 addrspace(1)* %r
121  ret void
122}
123
124; GCN-LABEL: {{^}}fcmp_f16_lg
125; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
126; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
127; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
128; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
129; SI:  v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
130; VI:  v_cmp_lg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
131; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
132; GCN: buffer_store_dword v[[R_I32]]
133; GCN: s_endpgm
; Kernel under test: scalar half `fcmp one` (ordered not-equal).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect this
; predicate to select a v_cmp_lg compare (f32 after conversion on SI, f16
; directly on VI), which is where the kernel name comes from.
134define amdgpu_kernel void @fcmp_f16_lg(
135    i32 addrspace(1)* %r,
136    half addrspace(1)* %a,
137    half addrspace(1)* %b) {
138entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
139  %a.val = load volatile half, half addrspace(1)* %a
140  %b.val = load volatile half, half addrspace(1)* %b
141  %r.val = fcmp one half %a.val, %b.val
142  %r.val.sext = sext i1 %r.val to i32
143  store i32 %r.val.sext, i32 addrspace(1)* %r
144  ret void
145}
146
147; GCN-LABEL: {{^}}fcmp_f16_ge
148; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
149; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
150; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
151; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
152; SI:  v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
153; VI:  v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
154; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
155; GCN: buffer_store_dword v[[R_I32]]
156; GCN: s_endpgm
; Kernel under test: scalar half `fcmp oge` (ordered greater-or-equal).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect a
; v_cmp_ge compare (f32 after conversion on SI, f16 directly on VI).
157define amdgpu_kernel void @fcmp_f16_ge(
158    i32 addrspace(1)* %r,
159    half addrspace(1)* %a,
160    half addrspace(1)* %b) {
161entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
162  %a.val = load volatile half, half addrspace(1)* %a
163  %b.val = load volatile half, half addrspace(1)* %b
164  %r.val = fcmp oge half %a.val, %b.val
165  %r.val.sext = sext i1 %r.val to i32
166  store i32 %r.val.sext, i32 addrspace(1)* %r
167  ret void
168}
169
170; GCN-LABEL: {{^}}fcmp_f16_o
171; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
172; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
173; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
174; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
175; SI:  v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
176; VI:  v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
177; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
178; GCN: buffer_store_dword v[[R_I32]]
179; GCN: s_endpgm
; Kernel under test: scalar half `fcmp ord` (true when neither operand is
; a NaN). Reads one half from %a and one from %b, compares them,
; sign-extends the i1 result to i32 and stores it to %r. The directives
; above expect a v_cmp_o compare (f32 after conversion on SI, f16 directly
; on VI).
180define amdgpu_kernel void @fcmp_f16_o(
181    i32 addrspace(1)* %r,
182    half addrspace(1)* %a,
183    half addrspace(1)* %b) {
184entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
185  %a.val = load volatile half, half addrspace(1)* %a
186  %b.val = load volatile half, half addrspace(1)* %b
187  %r.val = fcmp ord half %a.val, %b.val
188  %r.val.sext = sext i1 %r.val to i32
189  store i32 %r.val.sext, i32 addrspace(1)* %r
190  ret void
191}
192
193; GCN-LABEL: {{^}}fcmp_f16_u
194; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
195; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
196; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
197; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
198; SI:  v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
199; VI:  v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
200; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
201; GCN: buffer_store_dword v[[R_I32]]
202; GCN: s_endpgm
; Kernel under test: scalar half `fcmp uno` (true when either operand is
; a NaN). Reads one half from %a and one from %b, compares them,
; sign-extends the i1 result to i32 and stores it to %r. The directives
; above expect a v_cmp_u compare (f32 after conversion on SI, f16 directly
; on VI).
203define amdgpu_kernel void @fcmp_f16_u(
204    i32 addrspace(1)* %r,
205    half addrspace(1)* %a,
206    half addrspace(1)* %b) {
207entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
208  %a.val = load volatile half, half addrspace(1)* %a
209  %b.val = load volatile half, half addrspace(1)* %b
210  %r.val = fcmp uno half %a.val, %b.val
211  %r.val.sext = sext i1 %r.val to i32
212  store i32 %r.val.sext, i32 addrspace(1)* %r
213  ret void
214}
215
216; GCN-LABEL: {{^}}fcmp_f16_nge
217; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
218; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
219; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
220; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
221; SI:  v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
222; VI:  v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
223; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
224; GCN: buffer_store_dword v[[R_I32]]
225; GCN: s_endpgm
; Kernel under test: scalar half `fcmp ult` (unordered or less-than).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect this
; predicate to select a v_cmp_nge compare (f32 after conversion on SI, f16
; directly on VI), which is where the kernel name comes from.
226define amdgpu_kernel void @fcmp_f16_nge(
227    i32 addrspace(1)* %r,
228    half addrspace(1)* %a,
229    half addrspace(1)* %b) {
230entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
231  %a.val = load volatile half, half addrspace(1)* %a
232  %b.val = load volatile half, half addrspace(1)* %b
233  %r.val = fcmp ult half %a.val, %b.val
234  %r.val.sext = sext i1 %r.val to i32
235  store i32 %r.val.sext, i32 addrspace(1)* %r
236  ret void
237}
238
239; GCN-LABEL: {{^}}fcmp_f16_nlg
240; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
241; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
242; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
243; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
244; SI:  v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
245; VI:  v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
246; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
247; GCN: buffer_store_dword v[[R_I32]]
248; GCN: s_endpgm
; Kernel under test: scalar half `fcmp ueq` (unordered or equal).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect this
; predicate to select a v_cmp_nlg compare (f32 after conversion on SI, f16
; directly on VI), which is where the kernel name comes from.
249define amdgpu_kernel void @fcmp_f16_nlg(
250    i32 addrspace(1)* %r,
251    half addrspace(1)* %a,
252    half addrspace(1)* %b) {
253entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
254  %a.val = load volatile half, half addrspace(1)* %a
255  %b.val = load volatile half, half addrspace(1)* %b
256  %r.val = fcmp ueq half %a.val, %b.val
257  %r.val.sext = sext i1 %r.val to i32
258  store i32 %r.val.sext, i32 addrspace(1)* %r
259  ret void
260}
261
262; GCN-LABEL: {{^}}fcmp_f16_ngt
263; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
264; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
265; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
266; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
267; SI:  v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
268; VI:  v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
269; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
270; GCN: buffer_store_dword v[[R_I32]]
271; GCN: s_endpgm
; Kernel under test: scalar half `fcmp ule` (unordered or less-or-equal).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect this
; predicate to select a v_cmp_ngt compare (f32 after conversion on SI, f16
; directly on VI), which is where the kernel name comes from.
272define amdgpu_kernel void @fcmp_f16_ngt(
273    i32 addrspace(1)* %r,
274    half addrspace(1)* %a,
275    half addrspace(1)* %b) {
276entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
277  %a.val = load volatile half, half addrspace(1)* %a
278  %b.val = load volatile half, half addrspace(1)* %b
279  %r.val = fcmp ule half %a.val, %b.val
280  %r.val.sext = sext i1 %r.val to i32
281  store i32 %r.val.sext, i32 addrspace(1)* %r
282  ret void
283}
284
285; GCN-LABEL: {{^}}fcmp_f16_nle
286; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
287; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
288; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
289; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
290; SI:  v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
291; VI:  v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
292; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
293; GCN: buffer_store_dword v[[R_I32]]
294; GCN: s_endpgm
; Kernel under test: scalar half `fcmp ugt` (unordered or greater-than).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect this
; predicate to select a v_cmp_nle compare (f32 after conversion on SI, f16
; directly on VI), which is where the kernel name comes from.
295define amdgpu_kernel void @fcmp_f16_nle(
296    i32 addrspace(1)* %r,
297    half addrspace(1)* %a,
298    half addrspace(1)* %b) {
299entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
300  %a.val = load volatile half, half addrspace(1)* %a
301  %b.val = load volatile half, half addrspace(1)* %b
302  %r.val = fcmp ugt half %a.val, %b.val
303  %r.val.sext = sext i1 %r.val to i32
304  store i32 %r.val.sext, i32 addrspace(1)* %r
305  ret void
306}
307
308; GCN-LABEL: {{^}}fcmp_f16_neq
309; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
310; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
311; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
312; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
313; SI:  v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
314; VI:  v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
315; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
316; GCN: buffer_store_dword v[[R_I32]]
317; GCN: s_endpgm
; Kernel under test: scalar half `fcmp une` (unordered or not-equal).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect a
; v_cmp_neq compare (f32 after conversion on SI, f16 directly on VI).
318define amdgpu_kernel void @fcmp_f16_neq(
319    i32 addrspace(1)* %r,
320    half addrspace(1)* %a,
321    half addrspace(1)* %b) {
322entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
323  %a.val = load volatile half, half addrspace(1)* %a
324  %b.val = load volatile half, half addrspace(1)* %b
325  %r.val = fcmp une half %a.val, %b.val
326  %r.val.sext = sext i1 %r.val to i32
327  store i32 %r.val.sext, i32 addrspace(1)* %r
328  ret void
329}
330
331; GCN-LABEL: {{^}}fcmp_f16_nlt
332; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
333; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
334; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
335; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
336; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
337; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
338; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
339; GCN: buffer_store_dword v[[R_I32]]
340; GCN: s_endpgm
; Kernel under test: scalar half `fcmp uge` (unordered or greater-or-equal).
; Reads one half from %a and one from %b, compares them, sign-extends the
; i1 result to i32 and stores it to %r. The directives above expect this
; predicate to select a v_cmp_nlt compare (f32 after conversion on SI, f16
; directly on VI), which is where the kernel name comes from.
341define amdgpu_kernel void @fcmp_f16_nlt(
342    i32 addrspace(1)* %r,
343    half addrspace(1)* %a,
344    half addrspace(1)* %b) {
345entry:
; Both loads are volatile; presumably this pins their order so the A/B
; register captures in the directives above stay aligned - TODO confirm.
346  %a.val = load volatile half, half addrspace(1)* %a
347  %b.val = load volatile half, half addrspace(1)* %b
348  %r.val = fcmp uge half %a.val, %b.val
349  %r.val.sext = sext i1 %r.val to i32
350  store i32 %r.val.sext, i32 addrspace(1)* %r
351  ret void
352}
353
354; GCN-LABEL: {{^}}fcmp_v2f16_lt:
355; SI: v_cmp_lt_f32_e32 vcc,
356; SI: v_cmp_lt_f32_e32 vcc,
357
358; VI: v_cmp_lt_f16_e32 vcc,
359; VI: v_cmp_lt_f16_e32 vcc,
; Kernel under test: element-wise `fcmp olt` (ordered less-than) on
; <2 x half>. Loads one vector from %a and one from %b (plain, non-volatile
; loads), compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect two v_cmp_lt
; compares per run (f32 on SI, f16 on VI), i.e. one per lane.
360define amdgpu_kernel void @fcmp_v2f16_lt(
361    <2 x i32> addrspace(1)* %r,
362    <2 x half> addrspace(1)* %a,
363    <2 x half> addrspace(1)* %b) {
364entry:
365  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
366  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
367  %r.val = fcmp olt <2 x half> %a.val, %b.val
368  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
369  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
370  ret void
371}
372
373; GCN-LABEL: {{^}}fcmp_v2f16_eq
374; SI:  v_cmp_eq_f32_e32 vcc,
375; SI:  v_cmp_eq_f32_e32 vcc,
376
377; VI:  v_cmp_eq_f16_e32 vcc,
378; VI:  v_cmp_eq_f16_e32 vcc,
; Kernel under test: element-wise `fcmp oeq` (ordered equal) on <2 x half>.
; Loads one vector from %a and one from %b (plain, non-volatile loads),
; compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect two v_cmp_eq
; compares per run (f32 on SI, f16 on VI), i.e. one per lane.
379define amdgpu_kernel void @fcmp_v2f16_eq(
380    <2 x i32> addrspace(1)* %r,
381    <2 x half> addrspace(1)* %a,
382    <2 x half> addrspace(1)* %b) {
383entry:
384  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
385  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
386  %r.val = fcmp oeq <2 x half> %a.val, %b.val
387  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
388  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
389  ret void
390}
391
392; GCN-LABEL: {{^}}fcmp_v2f16_le:
393; SI:  v_cmp_le_f32_e32 vcc
394; SI:  v_cmp_le_f32_e32 vcc
395; VI:  v_cmp_le_f16_e32 vcc
396; VI:  v_cmp_le_f16_e32 vcc
; Kernel under test: element-wise `fcmp ole` (ordered less-or-equal) on
; <2 x half>. Loads one vector from %a and one from %b (plain, non-volatile
; loads), compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect two v_cmp_le
; compares per run (f32 on SI, f16 on VI), i.e. one per lane.
397define amdgpu_kernel void @fcmp_v2f16_le(
398    <2 x i32> addrspace(1)* %r,
399    <2 x half> addrspace(1)* %a,
400    <2 x half> addrspace(1)* %b) {
401entry:
402  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
403  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
404  %r.val = fcmp ole <2 x half> %a.val, %b.val
405  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
406  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
407  ret void
408}
409
410; GCN-LABEL: {{^}}fcmp_v2f16_gt:
411; SI: v_cmp_gt_f32_e32 vcc,
412; SI: v_cmp_gt_f32_e32 vcc,
413
414; VI: v_cmp_gt_f16_e32 vcc,
415; VI: v_cmp_gt_f16_e32 vcc,
; Kernel under test: element-wise `fcmp ogt` (ordered greater-than) on
; <2 x half>. Loads one vector from %a and one from %b (plain, non-volatile
; loads), compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect two v_cmp_gt
; compares per run (f32 on SI, f16 on VI), i.e. one per lane.
416define amdgpu_kernel void @fcmp_v2f16_gt(
417    <2 x i32> addrspace(1)* %r,
418    <2 x half> addrspace(1)* %a,
419    <2 x half> addrspace(1)* %b) {
420entry:
421  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
422  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
423  %r.val = fcmp ogt <2 x half> %a.val, %b.val
424  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
425  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
426  ret void
427}
428
429; GCN-LABEL: {{^}}fcmp_v2f16_lg:
430; SI: v_cmp_lg_f32_e32 vcc,
431; SI: v_cmp_lg_f32_e32 vcc,
432
433; VI: v_cmp_lg_f16_e32 vcc,
434; VI: v_cmp_lg_f16_e32 vcc,
; Kernel under test: element-wise `fcmp one` (ordered not-equal) on
; <2 x half>. Loads one vector from %a and one from %b (plain, non-volatile
; loads), compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect this predicate
; to select two v_cmp_lg compares per run (f32 on SI, f16 on VI).
435define amdgpu_kernel void @fcmp_v2f16_lg(
436    <2 x i32> addrspace(1)* %r,
437    <2 x half> addrspace(1)* %a,
438    <2 x half> addrspace(1)* %b) {
439entry:
440  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
441  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
442  %r.val = fcmp one <2 x half> %a.val, %b.val
443  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
444  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
445  ret void
446}
447
448; GCN-LABEL: {{^}}fcmp_v2f16_ge:
449; SI:  v_cmp_ge_f32_e32 vcc,
450; SI:  v_cmp_ge_f32_e32 vcc,
451
452; VI:  v_cmp_ge_f16_e32 vcc,
453; VI:  v_cmp_ge_f16_e32 vcc,
; Kernel under test: element-wise `fcmp oge` (ordered greater-or-equal) on
; <2 x half>. Loads one vector from %a and one from %b (plain, non-volatile
; loads), compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect two v_cmp_ge
; compares per run (f32 on SI, f16 on VI), i.e. one per lane.
454define amdgpu_kernel void @fcmp_v2f16_ge(
455    <2 x i32> addrspace(1)* %r,
456    <2 x half> addrspace(1)* %a,
457    <2 x half> addrspace(1)* %b) {
458entry:
459  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
460  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
461  %r.val = fcmp oge <2 x half> %a.val, %b.val
462  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
463  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
464  ret void
465}
466
467; GCN-LABEL: {{^}}fcmp_v2f16_o:
468; SI:  v_cmp_o_f32_e32 vcc,
469; SI:  v_cmp_o_f32_e32 vcc,
470
471; VI:  v_cmp_o_f16_e32 vcc,
472; VI:  v_cmp_o_f16_e32 vcc,
; Kernel under test: element-wise `fcmp ord` (true when neither lane
; operand is a NaN) on <2 x half>. Loads one vector from %a and one from %b
; (plain, non-volatile loads), compares them lane by lane, sign-extends the
; <2 x i1> result to <2 x i32> and stores it to %r. The directives above
; expect two v_cmp_o compares per run (f32 on SI, f16 on VI).
473define amdgpu_kernel void @fcmp_v2f16_o(
474    <2 x i32> addrspace(1)* %r,
475    <2 x half> addrspace(1)* %a,
476    <2 x half> addrspace(1)* %b) {
477entry:
478  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
479  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
480  %r.val = fcmp ord <2 x half> %a.val, %b.val
481  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
482  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
483  ret void
484}
485
486; GCN-LABEL: {{^}}fcmp_v2f16_u:
487; SI:  v_cmp_u_f32_e32 vcc,
488; SI:  v_cmp_u_f32_e32 vcc,
489
490; VI:  v_cmp_u_f16_e32 vcc,
491; VI:  v_cmp_u_f16_e32 vcc,
; Kernel under test: element-wise `fcmp uno` (true when either lane
; operand is a NaN) on <2 x half>. Loads one vector from %a and one from %b
; (plain, non-volatile loads), compares them lane by lane, sign-extends the
; <2 x i1> result to <2 x i32> and stores it to %r. The directives above
; expect two v_cmp_u compares per run (f32 on SI, f16 on VI).
492define amdgpu_kernel void @fcmp_v2f16_u(
493    <2 x i32> addrspace(1)* %r,
494    <2 x half> addrspace(1)* %a,
495    <2 x half> addrspace(1)* %b) {
496entry:
497  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
498  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
499  %r.val = fcmp uno <2 x half> %a.val, %b.val
500  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
501  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
502  ret void
503}
504
505; GCN-LABEL: {{^}}fcmp_v2f16_nge
506; SI:  v_cmp_nge_f32_e32 vcc,
507; SI:  v_cmp_nge_f32_e32 vcc,
508
509; VI:  v_cmp_nge_f16_e32 vcc,
510; VI:  v_cmp_nge_f16_e32 vcc,
; Kernel under test: element-wise `fcmp ult` (unordered or less-than) on
; <2 x half>. Loads one vector from %a and one from %b (plain, non-volatile
; loads), compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect this predicate
; to select two v_cmp_nge compares per run (f32 on SI, f16 on VI).
511define amdgpu_kernel void @fcmp_v2f16_nge(
512    <2 x i32> addrspace(1)* %r,
513    <2 x half> addrspace(1)* %a,
514    <2 x half> addrspace(1)* %b) {
515entry:
516  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
517  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
518  %r.val = fcmp ult <2 x half> %a.val, %b.val
519  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
520  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
521  ret void
522}
523
524; GCN-LABEL: {{^}}fcmp_v2f16_nlg
525; SI:  v_cmp_nlg_f32_e32 vcc
526; SI:  v_cmp_nlg_f32_e32 vcc
527
528; VI:  v_cmp_nlg_f16_e32 vcc
529; VI:  v_cmp_nlg_f16_e32 vcc
; Kernel under test: element-wise `fcmp ueq` (unordered or equal) on
; <2 x half>. Loads one vector from %a and one from %b (plain, non-volatile
; loads), compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect this predicate
; to select two v_cmp_nlg compares per run (f32 on SI, f16 on VI).
530define amdgpu_kernel void @fcmp_v2f16_nlg(
531    <2 x i32> addrspace(1)* %r,
532    <2 x half> addrspace(1)* %a,
533    <2 x half> addrspace(1)* %b) {
534entry:
535  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
536  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
537  %r.val = fcmp ueq <2 x half> %a.val, %b.val
538  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
539  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
540  ret void
541}
542
543; GCN-LABEL: {{^}}fcmp_v2f16_ngt
544; SI:  v_cmp_ngt_f32_e32 vcc,
545; SI:  v_cmp_ngt_f32_e32 vcc,
546
547; VI:  v_cmp_ngt_f16_e32 vcc,
548; VI:  v_cmp_ngt_f16_e32 vcc,
; Kernel under test: element-wise `fcmp ule` (unordered or less-or-equal)
; on <2 x half>. Loads one vector from %a and one from %b (plain,
; non-volatile loads), compares them lane by lane, sign-extends the
; <2 x i1> result to <2 x i32> and stores it to %r. The directives above
; expect this predicate to select two v_cmp_ngt compares per run (f32 on
; SI, f16 on VI).
549define amdgpu_kernel void @fcmp_v2f16_ngt(
550    <2 x i32> addrspace(1)* %r,
551    <2 x half> addrspace(1)* %a,
552    <2 x half> addrspace(1)* %b) {
553entry:
554  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
555  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
556  %r.val = fcmp ule <2 x half> %a.val, %b.val
557  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
558  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
559  ret void
560}
561
562; GCN-LABEL: {{^}}fcmp_v2f16_nle
563; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
564; SI: v_cmp_nle_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
565
566; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
567; VI: v_cmp_nle_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; Kernel under test: element-wise `fcmp ugt` (unordered or greater-than)
; on <2 x half>. Loads one vector from %a and one from %b (plain,
; non-volatile loads), compares them lane by lane, sign-extends the
; <2 x i1> result to <2 x i32> and stores it to %r. The directives above
; expect this predicate to select two v_cmp_nle compares per run, each
; taking two VGPR operands (f32 on SI, f16 on VI).
568define amdgpu_kernel void @fcmp_v2f16_nle(
569    <2 x i32> addrspace(1)* %r,
570    <2 x half> addrspace(1)* %a,
571    <2 x half> addrspace(1)* %b) {
572entry:
573  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
574  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
575  %r.val = fcmp ugt <2 x half> %a.val, %b.val
576  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
577  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
578  ret void
579}
580
581; GCN-LABEL: {{^}}fcmp_v2f16_neq
582; SI:  v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
583; SI:  v_cmp_neq_f32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
584
585; VI:  v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
586; VI:  v_cmp_neq_f16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
; Kernel under test: element-wise `fcmp une` (unordered or not-equal) on
; <2 x half>. Loads one vector from %a and one from %b (plain, non-volatile
; loads), compares them lane by lane, sign-extends the <2 x i1> result to
; <2 x i32> and stores it to %r. The directives above expect two v_cmp_neq
; compares per run, each taking two VGPR operands (f32 on SI, f16 on VI).
587define amdgpu_kernel void @fcmp_v2f16_neq(
588    <2 x i32> addrspace(1)* %r,
589    <2 x half> addrspace(1)* %a,
590    <2 x half> addrspace(1)* %b) {
591entry:
592  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
593  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
594  %r.val = fcmp une <2 x half> %a.val, %b.val
595  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
596  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
597  ret void
598}
599
; Element-wise `fcmp uge` (unordered or greater-or-equal) on <2 x half>,
; checked in full: both dword loads, the 16-bit shifts that extract the
; high lanes, the per-lane compares, the two selects and the final
; dwordx2 store.
; `uge` is "not less-than" with the operands in source order, so every
; compare below takes the A-derived register first and the B-derived one
; second. The VI compare directives previously listed B before A, which
; disagreed with the SI directives in this same function and with the
; scalar fcmp_f16_nlt test for the same predicate.
; NOTE(review): the loads here are not volatile, so "A" and "B" simply
; name the first and second buffer_load_dword matched - confirm against
; actual llc output when regenerating.
600; GCN-LABEL: {{^}}fcmp_v2f16_nlt
601; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
602; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
603; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
604; GCN-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
605; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
606; SI-DAG:  v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
607
608; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
609; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
610; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
611; SI-DAG:  v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
612; VI-DAG:  v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
613; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
614
615; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]]
616; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
617; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
618; GCN: s_endpgm
; Kernel under test: loads one <2 x half> from %a and one from %b, compares
; them lane by lane, sign-extends the <2 x i1> result to <2 x i32> and
; stores it to %r.
619define amdgpu_kernel void @fcmp_v2f16_nlt(
620    <2 x i32> addrspace(1)* %r,
621    <2 x half> addrspace(1)* %a,
622    <2 x half> addrspace(1)* %b) {
623entry:
624  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
625  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
626  %r.val = fcmp uge <2 x half> %a.val, %b.val
627  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
628  store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
629  ret void
630}
631
; Declaration of the half-precision fabs intrinsic called by
; @fcmp_f16_lt_abs; it carries attribute group #1.
632declare half @llvm.fabs.f16(half) #1
633
; Attribute groups. #1 is attached to the intrinsic declaration above.
; #0 is not referenced by any definition visible in this view.
634attributes #0 = { nounwind }
635attributes #1 = { nounwind readnone }
636