; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f32_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f32_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f32_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f32_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr7_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr7_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.7 = lshr i32 %arg0, 7
  %masked = and i32 %lshr.7, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  store i32 %lshr.8, i32 addrspace(1)* undef
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to float
  ret float %cvt
}

define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
; SI-LABEL: v_uitofp_v2i8_to_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v2i8_to_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i16 %arg0 to <2 x i8>
  %cvt = uitofp <2 x i8> %val to <2 x float>
  ret <2 x float> %cvt
}

define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v3i8_to_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v3i8_to_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i32 %arg0 to i24
  %val = bitcast i24 %trunc to <3 x i8>
  %cvt = uitofp <3 x i8> %val to <3 x float>
  ret <3 x float> %cvt
}

define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v4i8_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_and_b32_e32 v3, s4, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v3
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v4i8_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    v_mov_b32_e32 v0, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg0 to <4 x i8>
  %cvt = uitofp <4 x i8> %val to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v1, s4, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_and_b32_e32 v1, s4, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    v_mov_b32_e32 v0, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %mask.arg0 = and i32 %arg0, 255
  %cvt0 = uitofp i32 %mask.arg0 to float

  %lshr.8 = lshr i32 %arg0, 8
  %mask.lshr.8 = and i32 %lshr.8, 255
  %cvt1 = uitofp i32 %mask.lshr.8 to float

  %lshr.16 = lshr i32 %arg0, 16
  %mask.lshr.16 = and i32 %lshr.16, 255
  %cvt2 = uitofp i32 %mask.lshr.16 to float

  %lshr.24 = lshr i32 %arg0, 24
  %mask.lshr.24 = and i32 %lshr.24, 255
  %cvt3 = uitofp i32 %mask.lshr.24 to float

  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
  ret <4 x float> %ins.3
}

define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to half
  ret half %cvt
}

define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f64_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f64_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i8_to_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to double
  ret double %cvt
}

define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %cvt = uitofp i8 %load to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; FIXME:
; define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
;   %cvt = uitofp <2 x i8> %load to <2 x float>
;   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
;   ret void
; }

; FIXME:
; define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
;   %cvt = uitofp <3 x i8> %load to <3 x float>
;   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
;   ret void
; }

; define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
;   %cvt = uitofp <4 x i8> %load to <4 x float>
;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
;   ret void
; }

; This should not add instructions to shift each component's byte into
; its position in the word; see the sketch below.
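;
; Since the four bytes are loaded individually, each loaded byte can feed
; the conversion directly (a hand-written sketch of the expected codegen,
; not autogenerated output):
;   v_cvt_f32_ubyte0_e32 v0, v0
; rather than first being packed into a dword and shifted back out.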

; FIXME: Packing bytes
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_movk_i32 s0, 0xff
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_and_b32_e32 v1, s0, v2
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_and_b32_e32 v2, s0, v3
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_and_b32_e32 v3, s0, v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v4, s0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    flat_load_ubyte v1, v[2:3]
; VI-NEXT:    flat_load_ubyte v2, v[4:5]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; FIXME: Need to handle the non-uniform case for the function below (load without gep).
; Instructions are still emitted to repack the bytes for the add use; see
; the sketch after the commented-out function.
; define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
;   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
;   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
;   %cvt = uitofp <4 x i8> %load to <4 x float>
;   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
;   %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
;   store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
;   ret void
; }
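;
; A rough sketch of the repacking cost (hand-written, not autogenerated):
; each byte of the <4 x i8> add result has to be shifted back into its
; lane of the dword before the store, e.g.
;   v_lshlrev_b32_e32 v1, 8, v1
;   v_or_b32_e32 v0, v0, v1
; so the conversion combine alone cannot remove those instructions.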

; Make sure this doesn't crash.
; FIXME:
; define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
;   %cvt = uitofp <7 x i8> %load to <7 x float>
;   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
;   ret void
; }

; FIXME
; define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
;   %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
;   %cvt = uitofp <8 x i8> %load to <8 x float>
;   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
;   ret void
; }

define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %add = add i32 %load, 2
  %inreg = and i32 %add, 255
  %cvt = uitofp i32 %inreg to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_hi1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff00, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %load = load i32, i32 addrspace(1)* %gep, align 4
  %inreg = and i32 %load, 65280
  %shr = lshr i32 %inreg, 8
  %cvt = uitofp i32 %shr to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

; We don't match these cases because of the zext, but instcombine removes
; the zext so it shouldn't really matter; see the example below.
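; For reference, instcombine folds (a hand-written example, not checked
; output):
;   %ext = zext i8 %load to i32
;   %cvt = uitofp i32 %ext to float
; into the direct conversion
;   %cvt = uitofp i8 %load to float
; which is the form the combines here already handle.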
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %ext = zext i8 %load to i32
  %cvt = uitofp i32 %ext to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_movk_i32 s0, 0xff
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_and_b32_e32 v1, s0, v2
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_and_b32_e32 v2, s0, v3
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_and_b32_e32 v3, s0, v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v4, s0, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    flat_load_ubyte v1, v[2:3]
; VI-NEXT:    flat_load_ubyte v2, v[4:5]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    v_mov_b32_e32 v5, s3
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %ext = zext <4 x i8> %load to <4 x i32>
  %cvt = uitofp <4 x i32> %ext to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %and = and i32 %val, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 8
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte2_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte2_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0xff
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 16
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte3_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte3_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %srl = lshr i32 %val, 24
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_add_f32_e32 v2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
  %load = load i32, i32 addrspace(1)* %gep
  %or = or i32 %load, -2147483647
  %and = and i32 %or, 255
  %uitofp = uitofp i32 %and to float
  %cast = bitcast i32 %or to float
  %add = fadd float %cast, %uitofp
  store float %add, float addrspace(1)* %out
  ret void
}

define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s6, 0xff
; SI-NEXT:    v_and_b32_e32 v2, s6, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0, v2
; SI-NEXT:    v_ffbh_u32_e32 v4, v2
; SI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
; SI-NEXT:    v_ffbh_u32_e32 v5, v3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v5, 0xbe
; SI-NEXT:    v_sub_i32_e32 v6, vcc, v5, v4
; SI-NEXT:    v_lshl_b64 v[4:5], v[2:3], v4
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v5
; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
; SI-NEXT:    v_and_b32_e32 v5, s6, v3
; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; SI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; SI-NEXT:    s_mov_b32 s4, 0
; SI-NEXT:    s_movk_i32 s5, 0x80
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; SI-NEXT:    v_and_b32_e32 v3, 1, v2
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_cndmask_b32_e64 v3, v3, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, v2, -v2, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s6, 0xff
; VI-NEXT:    v_and_b32_e32 v2, s6, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, 0, v2
; VI-NEXT:    v_ffbh_u32_e32 v4, v2
; VI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v4
; VI-NEXT:    v_ffbh_u32_e32 v5, v3
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v5, 0xbe
; VI-NEXT:    v_sub_u32_e32 v6, vcc, v5, v4
; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, v[2:3]
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; VI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v5
; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
; VI-NEXT:    v_and_b32_e32 v5, s6, v3
; VI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; VI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    s_movk_i32 s5, 0x80
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
; VI-NEXT:    v_and_b32_e32 v3, 1, v2
; VI-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_cndmask_b32_e64 v3, v3, 1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, v2, -v2, vcc
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i64 %arg0, 255
  %itofp = sitofp i64 %masked to float
  ret float %itofp
}

define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_movk_i32 s4, 0xff
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_ffbh_u32_e32 v2, v0
; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
; SI-NEXT:    v_ffbh_u32_e32 v3, 0
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v3, 0xbe
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_sub_i32_e32 v4, vcc, v3, v2
; SI-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
; SI-NEXT:    v_and_b32_e32 v3, s4, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; SI-NEXT:    s_mov_b32 s4, 0
; SI-NEXT:    s_movk_i32 s5, 0x80
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT:    v_and_b32_e32 v1, 1, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v1, v1, 1, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_movk_i32 s4, 0xff
; VI-NEXT:    v_and_b32_e32 v0, s4, v0
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v2
; VI-NEXT:    v_ffbh_u32_e32 v3, 0
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v3, 0xbe
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_sub_u32_e32 v4, vcc, v3, v2
; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
; VI-NEXT:    v_and_b32_e32 v3, s4, v1
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    s_movk_i32 s5, 0x80
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT:    v_and_b32_e32 v1, 1, v0
; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v1, v1, 1, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i64 %arg0, 255
  %itofp = uitofp i64 %masked to float
  ret float %itofp
}

define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i16 %arg0, 255
  %itofp = sitofp i16 %masked to float
  ret float %itofp
}

define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_bfe_u32 v0, v0, 0, 16
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i16 %arg0, 255
  %itofp = uitofp i16 %masked to float
  ret float %itofp
}