1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
3; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
4
5declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
6declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
7
; Masking the source to its low 8 bits before uitofp should fold to a single
; v_cvt_f32_ubyte0 (convert byte 0 of the register), with no explicit v_and.
define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f32_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
18
; After masking with 255 the value is provably non-negative, so the signed
; convert lowers to the same unsigned v_cvt_f32_ubyte0 as the uitofp case.
define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_sitofp_i32_to_f32_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to float
  ret float %cvt
}
29
; A shift amount of 7 is not byte aligned, so an explicit v_lshrrev must
; remain before the byte-0 convert (only shifts of 0/8/16/24 fold into the
; ubyte0..3 selection).
define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.7 = lshr i32 %arg0, 7
  %masked = and i32 %lshr.7, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
42
; lshr 8 + mask 255 selects byte 1, folding to a single v_cvt_f32_ubyte1.
define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
54
; Multi-use case: the shifted value is stored as well as converted, so the
; shift must be materialized (v_lshrrev into v1 for the store) while the
; convert still folds the shift+mask into v_cvt_f32_ubyte1 on the original
; source register.
define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  store i32 %lshr.8, i32 addrspace(1)* undef
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
83
; lshr 16 + mask 255 selects byte 2, folding to a single v_cvt_f32_ubyte2.
define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
95
; lshr 24 + mask 255 selects byte 3, folding to a single v_cvt_f32_ubyte3.
; (The mask is redundant after a shift by 24, but is kept to exercise the
; fold.)  Note: the SSA value was renamed from %lshr.16 to %lshr.24 — it was
; a copy-paste leftover from the lshr16 test; value names do not affect the
; generated code or the autogenerated checks.
define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}
107
; A plain i8 argument converts with v_cvt_f32_ubyte0, which reads only
; byte 0 of the register, so no explicit mask instruction is needed.
define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i8_to_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to float
  ret float %cvt
}
117
; <2 x i8> bitcast from an i16: both lanes convert straight from the packed
; register with ubyte0/ubyte1.  v2 is used as a temporary because v0 is also
; the common source of both converts.
define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v2i8_to_v2f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT:    v_mov_b32_e32 v0, v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i16 %arg0 to <2 x i8>
  %cvt = uitofp <2 x i8> %val to <2 x float>
  ret <2 x float> %cvt
}
130
; <3 x i8> packed in the low 24 bits of an i32: three byte converts issued
; from the same source register, no unpacking shifts.
define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v3i8_to_v3f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v0, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i32 %arg0 to i24
  %val = bitcast i24 %trunc to <3 x i8>
  %cvt = uitofp <3 x i8> %val to <3 x float>
  ret <3 x float> %cvt
}
145
; <4 x i8> bitcast from an i32: all four lanes come from byte converts of
; the single packed register.
define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_v4i8_to_v4f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GCN-NEXT:    v_mov_b32_e32 v0, v4
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg0 to <4 x i8>
  %cvt = uitofp <4 x i8> %val to <4 x float>
  ret <4 x float> %cvt
}
160
; Manually unpacking an i32 into four masked bytes with lshr/and should
; produce exactly the same code as the <4 x i8> bitcast case: four
; cvt_f32_ubyteN instructions and no shift or mask instructions.
define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; GCN-NEXT:    v_mov_b32_e32 v0, v4
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %mask.arg0 = and i32 %arg0, 255
  %cvt0 = uitofp i32 %mask.arg0 to float

  %lshr.8 = lshr i32 %arg0, 8
  %mask.lshr.8 = and i32 %lshr.8, 255
  %cvt1 = uitofp i32 %mask.lshr.8 to float

  %lshr.16 = lshr i32 %arg0, 16
  %mask.lshr.16 = and i32 %lshr.16, 255
  %cvt2 = uitofp i32 %mask.lshr.16 to float

  %lshr.24 = lshr i32 %arg0, 24
  %mask.lshr.24 = and i32 %lshr.24, 255
  %cvt3 = uitofp i32 %mask.lshr.24 to float

  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
  ret <4 x float> %ins.3
}
192
; f16 result: the masked byte still converts with cvt_f32_ubyte0, then the
; f32 is narrowed to f16.  On the SI target an extra cvt_f32_f16 appears
; (the half return value is carried widened to f32 there).
define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}
212
; Signed variant of the f16 mask255 test: the 255 mask makes the value
; non-negative, so the same unsigned cvt_f32_ubyte0 sequence is used as in
; the uitofp case above.
define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to half
  ret half %cvt
}
232
; lshr 8 + mask 255 to f16: byte 1 is selected by cvt_f32_ubyte1 before the
; f32->f16 narrowing (SI additionally widens the half result back to f32).
define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}
253
; lshr 16 + mask 255 to f16: byte 2 is selected by cvt_f32_ubyte2 before
; the f32->f16 narrowing.
define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}
274
; lshr 24 + mask 255 to f16: byte 3 is selected by cvt_f32_ubyte3 before
; the f32->f16 narrowing.  Note: the SSA value was renamed from %lshr.16 to
; %lshr.24 — a copy-paste leftover from the lshr16 test; value names do not
; affect the generated code or the autogenerated checks.
define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr24_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr24_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}
295
; i8 -> f16: SI converts through f32 (ubyte0 then f16 narrow); VI uses a
; single SDWA v_cvt_f16_u16 reading byte 0 of the source directly.
define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to half
  ret half %cvt
}
313
; f64 result: there is no byte-to-f64 convert instruction, so the mask is
; kept as an explicit v_and before v_cvt_f64_u32.
define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}
325
; lshr 8 + mask 255 to f64: the shift+mask combines into a single
; v_bfe_u32 (bitfield extract of 8 bits at offset 8) feeding v_cvt_f64_u32.
define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}
338
; lshr 16 + mask 255 to f64: combines into v_bfe_u32 (8 bits at offset 16)
; feeding v_cvt_f64_u32.
define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 8
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}
351
; lshr 24 to f64: shifting by 24 already isolates the top byte, so the
; redundant mask is dropped and only v_lshrrev remains before
; v_cvt_f64_u32.  Note: the SSA value was renamed from %lshr.16 to
; %lshr.24 — a copy-paste leftover from the lshr16 test; value names do not
; affect the generated code or the autogenerated checks.
define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.24 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.24, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}
364
; i8 -> f64: the i8 is masked to 8 bits then converted with v_cvt_f64_u32.
; VI forms the mask with an SDWA v_and selecting byte 0 of the source.
define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to double
  ret double %cvt
}
383
; Kernel: load one byte per workitem and convert to f32.  The loaded ubyte
; feeds v_cvt_f32_ubyte0 directly with no intermediate mask or extend.
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
  %load = load i8, i8 addrspace(1)* %gep, align 1
  %cvt = uitofp i8 %load to float
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}
423
; Kernel: <2 x i8> loaded as a single ushort, then both bytes are converted
; from the loaded register with ubyte0/ubyte1 — no unpacking instructions.
define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v2i8_to_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v2i8_to_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
  %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
  %cvt = uitofp <2 x i8> %load to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
  ret void
}
467
; Kernel: <3 x i8> with align 4 is loaded as one full dword and the three
; bytes convert from that register.  SI splits the result store into
; dword + dwordx2 pieces; VI uses a single store_dwordx3.
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v3i8_to_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v3i8_to_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
  %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
  %cvt = uitofp <3 x i8> %load to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
  ret void
}
514
; Kernel: aligned <4 x i8> is loaded as one dword and expanded with the
; four byte converts, then stored with a single dwordx4.
define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}
562
563; This should not be adding instructions to shift into the correct
564; position in the word for the component.
565
566; FIXME: Packing bytes
; Kernel: unaligned <4 x i8> becomes four separate ubyte loads.  Some bytes
; are still re-packed with shift/or before converting (the FIXME above);
; each byte nevertheless ends up in a cvt_f32_ubyteN.
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v4, v[4:5]
; VI-NEXT:    flat_load_ubyte v5, v[6:7]
; VI-NEXT:    flat_load_ubyte v6, v[2:3]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v5
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v6
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  ret void
}
633
634; FIXME: Need to handle non-uniform case for function below (load without gep).
635; Instructions still emitted to repack bytes for add use.
; Kernel: the loaded <4 x i8> has two uses.  The float converts still come
; straight from the loaded dword; the second use (the per-byte +9 add that
; is stored to %out2) forces the shift/or re-packing sequence after the
; dwordx4 store.
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_movk_i32 s0, 0xff
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
; SI-NEXT:    v_and_b32_e32 v7, 0xff00, v4
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, s0, v4
; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
; SI-NEXT:    v_or_b32_e32 v0, v7, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
; SI-NEXT:    v_and_b32_e32 v2, s0, v2
; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    v_mov_b32_e32 v5, 9
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_movk_i32 s0, 0x900
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
; VI-NEXT:    v_add_u16_e32 v8, 9, v4
; VI-NEXT:    v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v6
; VI-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_add_u16_e32 v0, s0, v0
; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
  %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
  store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
  ret void
}
723
724; Make sure this doesn't crash.
; Kernel: odd-width <7 x i8> (crash-regression test).  Seven separate byte
; loads are converted and the results are stored in dword / dwordx2 /
; dwordx4 pieces; a couple of bytes are re-packed with shift/or first.
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v7i8_to_v7f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
; SI-NEXT:    buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(6)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT:    s_waitcnt vmcnt(5)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v3
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v4
; SI-NEXT:    v_or_b32_e32 v3, v9, v6
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v8
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:24
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v7i8_to_v7f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v12, v[4:5]
; VI-NEXT:    v_add_u32_e32 v4, vcc, 6, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v8, vcc, 5, v0
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v10, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v8, v[8:9]
; VI-NEXT:    flat_load_ubyte v9, v[10:11]
; VI-NEXT:    flat_load_ubyte v6, v[6:7]
; VI-NEXT:    flat_load_ubyte v7, v[4:5]
; VI-NEXT:    flat_load_ubyte v2, v[2:3]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(5)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT:    s_waitcnt vmcnt(4)
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v9
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v6
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v7
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT:    v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
  %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
  %cvt = uitofp <7 x i8> %load to <7 x float>
  store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
  ret void
}
820
; Checks that a naturally aligned (align 8) <8 x i8> load feeding uitofp is
; lowered as a single dwordx2 load plus eight v_cvt_f32_ubyte{0..3}
; instructions -- no per-byte loads or shift/mask extraction.
821define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
822; SI-LABEL: load_v8i8_to_v8f32:
823; SI:       ; %bb.0:
824; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
825; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
826; SI-NEXT:    s_mov_b32 s7, 0xf000
827; SI-NEXT:    s_mov_b32 s2, 0
828; SI-NEXT:    s_mov_b32 s3, s7
829; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
830; SI-NEXT:    v_mov_b32_e32 v1, 0
831; SI-NEXT:    s_waitcnt lgkmcnt(0)
832; SI-NEXT:    buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64
833; SI-NEXT:    s_mov_b32 s6, -1
834; SI-NEXT:    s_waitcnt vmcnt(0)
835; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
836; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
837; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v7
838; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
839; SI-NEXT:    v_cvt_f32_ubyte3_e32 v7, v8
840; SI-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
841; SI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
842; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
843; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
844; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
845; SI-NEXT:    s_endpgm
846;
847; VI-LABEL: load_v8i8_to_v8f32:
848; VI:       ; %bb.0:
849; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
850; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
851; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
852; VI-NEXT:    s_mov_b32 s7, 0xf000
853; VI-NEXT:    s_mov_b32 s6, -1
854; VI-NEXT:    s_waitcnt lgkmcnt(0)
855; VI-NEXT:    v_mov_b32_e32 v1, s1
856; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
857; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
858; VI-NEXT:    flat_load_dwordx2 v[7:8], v[0:1]
859; VI-NEXT:    s_waitcnt vmcnt(0)
860; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
861; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
862; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v7
863; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
864; VI-NEXT:    v_cvt_f32_ubyte3_e32 v7, v8
865; VI-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
866; VI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
867; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
868; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
869; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
870; VI-NEXT:    s_endpgm
871  %tid = call i32 @llvm.amdgcn.workitem.id.x()
872  %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
873  %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 ; aligned load -> single wide load in codegen
874  %cvt = uitofp <8 x i8> %load to <8 x float>
875  store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
876  ret void
877}
878
; Checks that (uitofp (and (add x, 2), 255)) selects v_cvt_f32_ubyte0 directly
; on the add result: the explicit 255 mask is folded away (no v_and in the
; generated code) because ubyte0 only reads the low byte.
879define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
880; SI-LABEL: i8_zext_inreg_i32_to_f32:
881; SI:       ; %bb.0:
882; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
883; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
884; SI-NEXT:    s_mov_b32 s7, 0xf000
885; SI-NEXT:    s_mov_b32 s2, 0
886; SI-NEXT:    s_mov_b32 s3, s7
887; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
888; SI-NEXT:    v_mov_b32_e32 v1, 0
889; SI-NEXT:    s_waitcnt lgkmcnt(0)
890; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
891; SI-NEXT:    s_mov_b32 s6, -1
892; SI-NEXT:    s_waitcnt vmcnt(0)
893; SI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
894; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
895; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
896; SI-NEXT:    s_endpgm
897;
898; VI-LABEL: i8_zext_inreg_i32_to_f32:
899; VI:       ; %bb.0:
900; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
901; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
902; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
903; VI-NEXT:    s_mov_b32 s7, 0xf000
904; VI-NEXT:    s_mov_b32 s6, -1
905; VI-NEXT:    s_waitcnt lgkmcnt(0)
906; VI-NEXT:    v_mov_b32_e32 v1, s1
907; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
908; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
909; VI-NEXT:    flat_load_dword v0, v[0:1]
910; VI-NEXT:    s_waitcnt vmcnt(0)
911; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
912; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
913; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
914; VI-NEXT:    s_endpgm
915  %tid = call i32 @llvm.amdgcn.workitem.id.x()
916  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
917  %load = load i32, i32 addrspace(1)* %gep, align 4
918  %add = add i32 %load, 2
919  %inreg = and i32 %add, 255 ; mask to the low byte; folded into the cvt instruction
920  %cvt = uitofp i32 %inreg to float
921  store float %cvt, float addrspace(1)* %out, align 4
922  ret void
923}
924
; Checks that extracting byte 1 as ((x & 0xff00) >> 8) before uitofp folds
; entirely into a single v_cvt_f32_ubyte1 -- no mask or shift survives.
925define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
926; SI-LABEL: i8_zext_inreg_hi1_to_f32:
927; SI:       ; %bb.0:
928; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
929; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
930; SI-NEXT:    s_mov_b32 s7, 0xf000
931; SI-NEXT:    s_mov_b32 s2, 0
932; SI-NEXT:    s_mov_b32 s3, s7
933; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
934; SI-NEXT:    v_mov_b32_e32 v1, 0
935; SI-NEXT:    s_waitcnt lgkmcnt(0)
936; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
937; SI-NEXT:    s_mov_b32 s6, -1
938; SI-NEXT:    s_waitcnt vmcnt(0)
939; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
940; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
941; SI-NEXT:    s_endpgm
942;
943; VI-LABEL: i8_zext_inreg_hi1_to_f32:
944; VI:       ; %bb.0:
945; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
946; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
947; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
948; VI-NEXT:    s_mov_b32 s7, 0xf000
949; VI-NEXT:    s_mov_b32 s6, -1
950; VI-NEXT:    s_waitcnt lgkmcnt(0)
951; VI-NEXT:    v_mov_b32_e32 v1, s1
952; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
953; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
954; VI-NEXT:    flat_load_dword v0, v[0:1]
955; VI-NEXT:    s_waitcnt vmcnt(0)
956; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
957; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
958; VI-NEXT:    s_endpgm
959  %tid = call i32 @llvm.amdgcn.workitem.id.x()
960  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
961  %load = load i32, i32 addrspace(1)* %gep, align 4
962  %inreg = and i32 %load, 65280 ; 0xff00: isolate byte 1 before the shift
963  %shr = lshr i32 %inreg, 8
964  %cvt = uitofp i32 %shr to float
965  store float %cvt, float addrspace(1)* %out, align 4
966  ret void
967}
968
969; We don't get these ones because of the zext, but instcombine removes
970; them so it shouldn't really matter.
; Checks that (uitofp (zext (load i8))) still selects a ubyte load followed by
; v_cvt_f32_ubyte0 (see the preceding file comment: the explicit zext blocks
; some combines, but instcombine normally removes it anyway).
971define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
972; SI-LABEL: i8_zext_i32_to_f32:
973; SI:       ; %bb.0:
974; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
975; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
976; SI-NEXT:    s_mov_b32 s7, 0xf000
977; SI-NEXT:    v_mov_b32_e32 v1, 0
978; SI-NEXT:    s_mov_b32 s2, 0
979; SI-NEXT:    s_mov_b32 s3, s7
980; SI-NEXT:    s_waitcnt lgkmcnt(0)
981; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
982; SI-NEXT:    s_mov_b32 s6, -1
983; SI-NEXT:    s_waitcnt vmcnt(0)
984; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
985; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
986; SI-NEXT:    s_endpgm
987;
988; VI-LABEL: i8_zext_i32_to_f32:
989; VI:       ; %bb.0:
990; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
991; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
992; VI-NEXT:    s_mov_b32 s7, 0xf000
993; VI-NEXT:    s_mov_b32 s6, -1
994; VI-NEXT:    s_waitcnt lgkmcnt(0)
995; VI-NEXT:    v_mov_b32_e32 v1, s1
996; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
997; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
998; VI-NEXT:    flat_load_ubyte v0, v[0:1]
999; VI-NEXT:    s_waitcnt vmcnt(0)
1000; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1001; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1002; VI-NEXT:    s_endpgm
1003  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1004  %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
1005  %load = load i8, i8 addrspace(1)* %gep, align 1
1006  %ext = zext i8 %load to i32 ; explicit widening before the conversion
1007  %cvt = uitofp i32 %ext to float
1008  store float %cvt, float addrspace(1)* %out, align 4
1009  ret void
1010}
1011
; Checks codegen for an under-aligned (align 1) <4 x i8> load zext'ed and
; converted to <4 x float>: the load is split into four byte loads, some bytes
; are re-packed with shift/or, and the conversions still use the
; v_cvt_f32_ubyte{0,2,3} byte-lane forms rather than scalar masks.
1012define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
1013; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
1014; SI:       ; %bb.0:
1015; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1016; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1017; SI-NEXT:    s_mov_b32 s7, 0xf000
1018; SI-NEXT:    s_mov_b32 s2, 0
1019; SI-NEXT:    s_mov_b32 s3, s7
1020; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1021; SI-NEXT:    v_mov_b32_e32 v1, 0
1022; SI-NEXT:    s_waitcnt lgkmcnt(0)
1023; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
1024; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:1
1025; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:2
1026; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
1027; SI-NEXT:    s_mov_b32 s6, -1
1028; SI-NEXT:    s_waitcnt vmcnt(2)
1029; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v2
1030; SI-NEXT:    s_waitcnt vmcnt(0)
1031; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1032; SI-NEXT:    v_or_b32_e32 v0, v0, v3
1033; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1034; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
1035; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1036; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
1037; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1038; SI-NEXT:    s_endpgm
1039;
1040; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
1041; VI:       ; %bb.0:
1042; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1043; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1044; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1045; VI-NEXT:    s_mov_b32 s7, 0xf000
1046; VI-NEXT:    s_mov_b32 s6, -1
1047; VI-NEXT:    s_waitcnt lgkmcnt(0)
1048; VI-NEXT:    v_mov_b32_e32 v1, s1
1049; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1050; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1051; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v0
1052; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1053; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
1054; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1055; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v0
1056; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
1057; VI-NEXT:    flat_load_ubyte v4, v[4:5]
1058; VI-NEXT:    flat_load_ubyte v5, v[6:7]
1059; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1060; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1061; VI-NEXT:    s_waitcnt vmcnt(1)
1062; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
1063; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1064; VI-NEXT:    s_waitcnt vmcnt(0)
1065; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1066; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
1067; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v1
1068; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v5
1069; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1070; VI-NEXT:    s_endpgm
1071  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1072  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
1073  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 ; under-aligned: forces byte-wise loads
1074  %ext = zext <4 x i8> %load to <4 x i32>
1075  %cvt = uitofp <4 x i32> %ext to <4 x float>
1076  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
1077  ret void
1078}
1079
; Checks that (uitofp (and x, 255)) on a loaded i32 selects v_cvt_f32_ubyte0
; with no separate mask instruction.
1080define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1081; SI-LABEL: extract_byte0_to_f32:
1082; SI:       ; %bb.0:
1083; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1084; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1085; SI-NEXT:    s_mov_b32 s7, 0xf000
1086; SI-NEXT:    s_mov_b32 s2, 0
1087; SI-NEXT:    s_mov_b32 s3, s7
1088; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1089; SI-NEXT:    v_mov_b32_e32 v1, 0
1090; SI-NEXT:    s_waitcnt lgkmcnt(0)
1091; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1092; SI-NEXT:    s_mov_b32 s6, -1
1093; SI-NEXT:    s_waitcnt vmcnt(0)
1094; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1095; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1096; SI-NEXT:    s_endpgm
1097;
1098; VI-LABEL: extract_byte0_to_f32:
1099; VI:       ; %bb.0:
1100; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1101; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1102; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1103; VI-NEXT:    s_mov_b32 s7, 0xf000
1104; VI-NEXT:    s_mov_b32 s6, -1
1105; VI-NEXT:    s_waitcnt lgkmcnt(0)
1106; VI-NEXT:    v_mov_b32_e32 v1, s1
1107; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1108; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1109; VI-NEXT:    flat_load_dword v0, v[0:1]
1110; VI-NEXT:    s_waitcnt vmcnt(0)
1111; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1112; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1113; VI-NEXT:    s_endpgm
1114  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1115  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1116  %val = load i32, i32 addrspace(1)* %gep
1117  %and = and i32 %val, 255 ; select byte 0
1118  %cvt = uitofp i32 %and to float
1119  store float %cvt, float addrspace(1)* %out
1120  ret void
1121}
1122
; Checks that (uitofp (and (lshr x, 8), 255)) selects v_cvt_f32_ubyte1,
; folding both the shift and the mask.
1123define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1124; SI-LABEL: extract_byte1_to_f32:
1125; SI:       ; %bb.0:
1126; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1127; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1128; SI-NEXT:    s_mov_b32 s7, 0xf000
1129; SI-NEXT:    s_mov_b32 s2, 0
1130; SI-NEXT:    s_mov_b32 s3, s7
1131; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1132; SI-NEXT:    v_mov_b32_e32 v1, 0
1133; SI-NEXT:    s_waitcnt lgkmcnt(0)
1134; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1135; SI-NEXT:    s_mov_b32 s6, -1
1136; SI-NEXT:    s_waitcnt vmcnt(0)
1137; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
1138; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1139; SI-NEXT:    s_endpgm
1140;
1141; VI-LABEL: extract_byte1_to_f32:
1142; VI:       ; %bb.0:
1143; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1144; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1145; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1146; VI-NEXT:    s_mov_b32 s7, 0xf000
1147; VI-NEXT:    s_mov_b32 s6, -1
1148; VI-NEXT:    s_waitcnt lgkmcnt(0)
1149; VI-NEXT:    v_mov_b32_e32 v1, s1
1150; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1151; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1152; VI-NEXT:    flat_load_dword v0, v[0:1]
1153; VI-NEXT:    s_waitcnt vmcnt(0)
1154; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
1155; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1156; VI-NEXT:    s_endpgm
1157  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1158  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1159  %val = load i32, i32 addrspace(1)* %gep
1160  %srl = lshr i32 %val, 8 ; select byte 1
1161  %and = and i32 %srl, 255
1162  %cvt = uitofp i32 %and to float
1163  store float %cvt, float addrspace(1)* %out
1164  ret void
1165}
1166
; Checks that (uitofp (and (lshr x, 16), 255)) selects v_cvt_f32_ubyte2,
; folding both the shift and the mask.
1167define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1168; SI-LABEL: extract_byte2_to_f32:
1169; SI:       ; %bb.0:
1170; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1171; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1172; SI-NEXT:    s_mov_b32 s7, 0xf000
1173; SI-NEXT:    s_mov_b32 s2, 0
1174; SI-NEXT:    s_mov_b32 s3, s7
1175; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1176; SI-NEXT:    v_mov_b32_e32 v1, 0
1177; SI-NEXT:    s_waitcnt lgkmcnt(0)
1178; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1179; SI-NEXT:    s_mov_b32 s6, -1
1180; SI-NEXT:    s_waitcnt vmcnt(0)
1181; SI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
1182; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1183; SI-NEXT:    s_endpgm
1184;
1185; VI-LABEL: extract_byte2_to_f32:
1186; VI:       ; %bb.0:
1187; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1188; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1189; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1190; VI-NEXT:    s_mov_b32 s7, 0xf000
1191; VI-NEXT:    s_mov_b32 s6, -1
1192; VI-NEXT:    s_waitcnt lgkmcnt(0)
1193; VI-NEXT:    v_mov_b32_e32 v1, s1
1194; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1195; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1196; VI-NEXT:    flat_load_dword v0, v[0:1]
1197; VI-NEXT:    s_waitcnt vmcnt(0)
1198; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
1199; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1200; VI-NEXT:    s_endpgm
1201  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1202  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1203  %val = load i32, i32 addrspace(1)* %gep
1204  %srl = lshr i32 %val, 16 ; select byte 2
1205  %and = and i32 %srl, 255
1206  %cvt = uitofp i32 %and to float
1207  store float %cvt, float addrspace(1)* %out
1208  ret void
1209}
1210
; Checks that (uitofp (and (lshr x, 24), 255)) selects v_cvt_f32_ubyte3.
; The mask is redundant after a 24-bit shift of an i32 but must still fold.
1211define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
1212; SI-LABEL: extract_byte3_to_f32:
1213; SI:       ; %bb.0:
1214; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1215; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1216; SI-NEXT:    s_mov_b32 s7, 0xf000
1217; SI-NEXT:    s_mov_b32 s2, 0
1218; SI-NEXT:    s_mov_b32 s3, s7
1219; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1220; SI-NEXT:    v_mov_b32_e32 v1, 0
1221; SI-NEXT:    s_waitcnt lgkmcnt(0)
1222; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1223; SI-NEXT:    s_mov_b32 s6, -1
1224; SI-NEXT:    s_waitcnt vmcnt(0)
1225; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
1226; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1227; SI-NEXT:    s_endpgm
1228;
1229; VI-LABEL: extract_byte3_to_f32:
1230; VI:       ; %bb.0:
1231; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1232; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1233; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1234; VI-NEXT:    s_mov_b32 s7, 0xf000
1235; VI-NEXT:    s_mov_b32 s6, -1
1236; VI-NEXT:    s_waitcnt lgkmcnt(0)
1237; VI-NEXT:    v_mov_b32_e32 v1, s1
1238; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1239; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1240; VI-NEXT:    flat_load_dword v0, v[0:1]
1241; VI-NEXT:    s_waitcnt vmcnt(0)
1242; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
1243; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1244; VI-NEXT:    s_endpgm
1245  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1246  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
1247  %val = load i32, i32 addrspace(1)* %gep
1248  %srl = lshr i32 %val, 24 ; select byte 3 (high byte)
1249  %and = and i32 %srl, 255
1250  %cvt = uitofp i32 %and to float
1251  store float %cvt, float addrspace(1)* %out
1252  ret void
1253}
1254
; Checks the multi-use case: %or feeds both a masked uitofp and a bitcast.
; The generated code keeps a single v_or_b32 and still applies
; v_cvt_f32_ubyte0 directly to the or result (the 255 mask folds away even
; though %or has another user).
1255define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
1256; SI-LABEL: cvt_ubyte0_or_multiuse:
1257; SI:       ; %bb.0: ; %bb
1258; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1259; SI-NEXT:    s_mov_b32 s3, 0xf000
1260; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1261; SI-NEXT:    v_mov_b32_e32 v1, 0
1262; SI-NEXT:    s_mov_b32 s2, -1
1263; SI-NEXT:    s_waitcnt lgkmcnt(0)
1264; SI-NEXT:    s_mov_b32 s0, s6
1265; SI-NEXT:    s_mov_b32 s1, s7
1266; SI-NEXT:    s_mov_b32 s6, 0
1267; SI-NEXT:    s_mov_b32 s7, s3
1268; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1269; SI-NEXT:    s_waitcnt vmcnt(0)
1270; SI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
1271; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
1272; SI-NEXT:    v_add_f32_e32 v0, v0, v1
1273; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1274; SI-NEXT:    s_endpgm
1275;
1276; VI-LABEL: cvt_ubyte0_or_multiuse:
1277; VI:       ; %bb.0: ; %bb
1278; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1279; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1280; VI-NEXT:    s_mov_b32 s3, 0xf000
1281; VI-NEXT:    s_mov_b32 s2, -1
1282; VI-NEXT:    s_waitcnt lgkmcnt(0)
1283; VI-NEXT:    v_mov_b32_e32 v1, s5
1284; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
1285; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1286; VI-NEXT:    flat_load_dword v0, v[0:1]
1287; VI-NEXT:    s_mov_b32 s0, s6
1288; VI-NEXT:    s_mov_b32 s1, s7
1289; VI-NEXT:    s_waitcnt vmcnt(0)
1290; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
1291; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
1292; VI-NEXT:    v_add_f32_e32 v0, v0, v1
1293; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1294; VI-NEXT:    s_endpgm
1295bb:
1296  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
1297  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid
1298  %load = load i32, i32 addrspace(1)* %gep
1299  %or = or i32 %load, -2147483647 ; 0x80000001; %or has two users below
1300  %and = and i32 %or, 255
1301  %uitofp = uitofp i32 %and to float
1302  %cast = bitcast i32 %or to float ; second use of %or
1303  %add = fadd float %cast, %uitofp
1304  store float %add, float addrspace(1)* %out
1305  ret void
1306}
1307