; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI

define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}
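
; Illustrative note (not checked output): sext of i1 true is all ones, so
; sext i1 (icmp eq i32 5, 5) to i32 yields 0xffffffff (-1) and the false case
; yields 0; the v_cmp plus v_cndmask_b32 v0, 0, -1 pair implements exactly
; that select.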

define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s1, s2, s3
; SI-NEXT:    s_add_i32 s1, s1, s0
; SI-NEXT:    s_ashr_i32 s0, s1, 31
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s1, s2, s3
; VI-NEXT:    s_add_i32 s1, s1, s0
; VI-NEXT:    s_ashr_i32 s0, s1, 31
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}
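
; Note: extending i32 to i64 only has to materialize the sign word; s_ashr_i32
; by 31 replicates bit 31 across the high dword. E.g. sext i32 -2 to i64 gives
; lo = 0xfffffffe, hi = 0xffffffff.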

define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}
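
; Note: for i1 -> i64 the selected mask is already all zeros or all ones, so
; the high dword is just a copy of the low one (v_mov_b32_e32 v1, v0) rather
; than a second select or shift.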

define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s1, s0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s1, s0, 31
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}
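
; Note: when the source comes from memory it is loaded into a VGPR, so the
; same extension is done on the VALU with v_ashrrev_i32 instead of the scalar
; s_ashr_i32 used above.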

define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}
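
; Note on the s_bfe_i64 operand above: the SALU bitfield encoding packs the
; field offset in bits [5:0] and the width in bits [22:16], so 0x100000
; (16 << 16) means offset 0, width 16, i.e. sign-extend the low 16 bits of
; s[0:1] across the full 64-bit pair.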

define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so that we can verify
; it is selected correctly.  In s_sext_i1_to_i16, the sign_extend node is
; optimized to a select very early.
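; A sketch of that early fold (illustrative, not checked output):
;   sext i1 %cmp to i16  ==>  select i1 %cmp, i16 -1, i16 0
; The 'and' of two i1 values here is presumably what keeps that fold from
; firing, so the sign_extend survives into selection.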
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}
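
; Note: on GCN an i1 is a wave-wide condition mask in a 64-bit SGPR pair, so
; the 'and i1' above lowers to s_and_b64 of the two compare masks before the
; per-lane v_cndmask materializes 0 or -1.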

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register.  To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s1, s0, 24
; SI-NEXT:    s_bfe_i32 s2, s0, 0x80010
; SI-NEXT:    s_bfe_i32 s3, s0, 0x80008
; SI-NEXT:    s_sext_i32_i8 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s0
; VI-NEXT:    s_ashr_i32 s1, s0, 24
; VI-NEXT:    s_bfe_i32 s2, s0, 0x80010
; VI-NEXT:    s_sext_i32_i8 s0, s0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}
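
; Decoding the scalar bfe operands above (offset in bits [5:0], width in bits
; [22:16]): 0x80008 is width 8, offset 8 (byte 1) and 0x80010 is width 8,
; offset 16 (byte 2); byte 0 uses s_sext_i32_i8 and byte 3 falls out of a
; plain arithmetic shift by 24.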

; FIXME: We need to optimize the same sequence as in the test above to
; avoid this shift.
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}
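
; Note: unlike the scalar form, v_bfe_i32 takes the offset and width as
; separate operands, e.g. v_bfe_i32 v3, v0, 8, 8 above extracts byte 1
; sign-extended.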

; FIXME: s_bfe_i64, same on SI and VI
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 48
; SI-NEXT:    s_ashr_i32 s5, s6, 16
; SI-NEXT:    s_sext_i32_i16 s6, s6
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    s_sext_i32_i16 s7, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_ashr_i32 s5, s6, 16
; VI-NEXT:    s_sext_i32_i16 s6, s6
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_ashr_i32 s4, s7, 16
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_sext_i32_i16 s7, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}
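
; Note: in the SI output above, s_ashr_i64 s[4:5], s[6:7], 48 extracts and
; sign-extends element 3 of the <4 x i16> in a single shift; the remaining
; elements use s_sext_i32_i16 and 16-bit arithmetic shifts.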

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }