; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs  | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI

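; f16 select driven by an olt compare of two loaded values. SI has no f16 compare, so the
; operands are extended to f32 and the result truncated back; VI compares in f16 directly.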
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

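; Same select, but with the immediate 0.5 (0xH3800) as the first compare operand.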
define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

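; Immediate 0.5 as the second compare operand; the compare is commuted to gt so the constant can sit in src0.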
define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

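; Immediate 0.5 as the true value of the select; the compare is inverted to nlt so the constant lands in the first cndmask source.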
define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

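; Immediate 0.5 as the false value of the select, used directly as the first cndmask source.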
define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

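; <2 x half> version: the two halves are compared and selected independently, then repacked with shift/or.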
define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

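; <2 x half> select with the constant vector <0.5, 0.625> as the first compare operand.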
define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

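; <2 x half> select with <0.5, 0.625> as the second compare operand (compares commuted to gt).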
define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

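; <2 x half> select with <0.5, 0.625> as the true value (compares inverted to nlt).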
define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

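; <2 x half> select with <0.5, 0.625> as the false value.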
define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
