1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI %s
4
5define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
6; SI-LABEL: bfe_u32_arg_arg_arg:
7; SI:       ; %bb.0:
8; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
9; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
10; SI-NEXT:    s_mov_b32 s7, 0xf000
11; SI-NEXT:    s_mov_b32 s6, -1
12; SI-NEXT:    s_waitcnt lgkmcnt(0)
13; SI-NEXT:    v_mov_b32_e32 v0, s2
14; SI-NEXT:    v_bfe_u32 v0, v0, s3, s3
15; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
16; SI-NEXT:    s_endpgm
17;
18; VI-LABEL: bfe_u32_arg_arg_arg:
19; VI:       ; %bb.0:
20; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
21; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
22; VI-NEXT:    s_mov_b32 s7, 0xf000
23; VI-NEXT:    s_mov_b32 s6, -1
24; VI-NEXT:    s_waitcnt lgkmcnt(0)
25; VI-NEXT:    v_mov_b32_e32 v0, s0
26; VI-NEXT:    v_bfe_u32 v0, v0, s1, s1
27; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
28; VI-NEXT:    s_endpgm
29  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1)
30  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
31  ret void
32}
33
34define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
35; SI-LABEL: bfe_u32_arg_arg_imm:
36; SI:       ; %bb.0:
37; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
38; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
39; SI-NEXT:    s_mov_b32 s7, 0xf000
40; SI-NEXT:    s_mov_b32 s6, -1
41; SI-NEXT:    v_mov_b32_e32 v0, 0x7b
42; SI-NEXT:    s_waitcnt lgkmcnt(0)
43; SI-NEXT:    v_mov_b32_e32 v1, s3
44; SI-NEXT:    v_bfe_u32 v0, s2, v1, v0
45; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
46; SI-NEXT:    s_endpgm
47;
48; VI-LABEL: bfe_u32_arg_arg_imm:
49; VI:       ; %bb.0:
50; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
51; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
52; VI-NEXT:    v_mov_b32_e32 v1, 0x7b
53; VI-NEXT:    s_mov_b32 s7, 0xf000
54; VI-NEXT:    s_mov_b32 s6, -1
55; VI-NEXT:    s_waitcnt lgkmcnt(0)
56; VI-NEXT:    v_mov_b32_e32 v0, s1
57; VI-NEXT:    v_bfe_u32 v0, s0, v0, v1
58; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
59; VI-NEXT:    s_endpgm
60  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
61  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
62  ret void
63}
64
65define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
66; SI-LABEL: bfe_u32_arg_imm_arg:
67; SI:       ; %bb.0:
68; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
69; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
70; SI-NEXT:    s_mov_b32 s7, 0xf000
71; SI-NEXT:    s_mov_b32 s6, -1
72; SI-NEXT:    v_mov_b32_e32 v0, 0x7b
73; SI-NEXT:    s_waitcnt lgkmcnt(0)
74; SI-NEXT:    v_mov_b32_e32 v1, s3
75; SI-NEXT:    v_bfe_u32 v0, s2, v0, v1
76; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
77; SI-NEXT:    s_endpgm
78;
79; VI-LABEL: bfe_u32_arg_imm_arg:
80; VI:       ; %bb.0:
81; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
82; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
83; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
84; VI-NEXT:    s_mov_b32 s7, 0xf000
85; VI-NEXT:    s_mov_b32 s6, -1
86; VI-NEXT:    s_waitcnt lgkmcnt(0)
87; VI-NEXT:    v_mov_b32_e32 v1, s1
88; VI-NEXT:    v_bfe_u32 v0, s0, v0, v1
89; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
90; VI-NEXT:    s_endpgm
91  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
92  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
93  ret void
94}
95
96define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
97; SI-LABEL: bfe_u32_imm_arg_arg:
98; SI:       ; %bb.0:
99; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
100; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
101; SI-NEXT:    s_mov_b32 s7, 0xf000
102; SI-NEXT:    s_mov_b32 s6, -1
103; SI-NEXT:    s_movk_i32 s0, 0x7b
104; SI-NEXT:    s_waitcnt lgkmcnt(0)
105; SI-NEXT:    v_mov_b32_e32 v0, s2
106; SI-NEXT:    v_mov_b32_e32 v1, s3
107; SI-NEXT:    v_bfe_u32 v0, s0, v0, v1
108; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
109; SI-NEXT:    s_endpgm
110;
111; VI-LABEL: bfe_u32_imm_arg_arg:
112; VI:       ; %bb.0:
113; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
114; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
115; VI-NEXT:    s_movk_i32 s2, 0x7b
116; VI-NEXT:    s_mov_b32 s7, 0xf000
117; VI-NEXT:    s_mov_b32 s6, -1
118; VI-NEXT:    s_waitcnt lgkmcnt(0)
119; VI-NEXT:    v_mov_b32_e32 v0, s0
120; VI-NEXT:    v_mov_b32_e32 v1, s1
121; VI-NEXT:    v_bfe_u32 v0, s2, v0, v1
122; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
123; VI-NEXT:    s_endpgm
124  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
125  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
126  ret void
127}
128
129define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
130; SI-LABEL: bfe_u32_arg_0_width_reg_offset:
131; SI:       ; %bb.0:
132; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
133; SI-NEXT:    s_mov_b32 s3, 0xf000
134; SI-NEXT:    s_mov_b32 s2, -1
135; SI-NEXT:    v_mov_b32_e32 v0, 0
136; SI-NEXT:    s_waitcnt lgkmcnt(0)
137; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
138; SI-NEXT:    s_endpgm
139;
140; VI-LABEL: bfe_u32_arg_0_width_reg_offset:
141; VI:       ; %bb.0:
142; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
143; VI-NEXT:    s_mov_b32 s3, 0xf000
144; VI-NEXT:    s_mov_b32 s2, -1
145; VI-NEXT:    v_mov_b32_e32 v0, 0
146; VI-NEXT:    s_waitcnt lgkmcnt(0)
147; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
148; VI-NEXT:    s_endpgm
149  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
150  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
151  ret void
152}
153
154define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
155; SI-LABEL: bfe_u32_arg_0_width_imm_offset:
156; SI:       ; %bb.0:
157; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
158; SI-NEXT:    s_mov_b32 s3, 0xf000
159; SI-NEXT:    s_mov_b32 s2, -1
160; SI-NEXT:    v_mov_b32_e32 v0, 0
161; SI-NEXT:    s_waitcnt lgkmcnt(0)
162; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
163; SI-NEXT:    s_endpgm
164;
165; VI-LABEL: bfe_u32_arg_0_width_imm_offset:
166; VI:       ; %bb.0:
167; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
168; VI-NEXT:    s_mov_b32 s3, 0xf000
169; VI-NEXT:    s_mov_b32 s2, -1
170; VI-NEXT:    v_mov_b32_e32 v0, 0
171; VI-NEXT:    s_waitcnt lgkmcnt(0)
172; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
173; VI-NEXT:    s_endpgm
174  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
175  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
176  ret void
177}
178
179define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
180; SI-LABEL: bfe_u32_zextload_i8:
181; SI:       ; %bb.0:
182; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
183; SI-NEXT:    s_mov_b32 s7, 0xf000
184; SI-NEXT:    s_mov_b32 s6, -1
185; SI-NEXT:    s_mov_b32 s10, s6
186; SI-NEXT:    s_mov_b32 s11, s7
187; SI-NEXT:    s_waitcnt lgkmcnt(0)
188; SI-NEXT:    s_mov_b32 s8, s2
189; SI-NEXT:    s_mov_b32 s9, s3
190; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
191; SI-NEXT:    s_mov_b32 s4, s0
192; SI-NEXT:    s_mov_b32 s5, s1
193; SI-NEXT:    s_waitcnt vmcnt(0)
194; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
195; SI-NEXT:    s_endpgm
196;
197; VI-LABEL: bfe_u32_zextload_i8:
198; VI:       ; %bb.0:
199; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
200; VI-NEXT:    s_mov_b32 s3, 0xf000
201; VI-NEXT:    s_mov_b32 s2, -1
202; VI-NEXT:    s_waitcnt lgkmcnt(0)
203; VI-NEXT:    s_mov_b32 s0, s4
204; VI-NEXT:    s_mov_b32 s1, s5
205; VI-NEXT:    s_mov_b32 s4, s6
206; VI-NEXT:    s_mov_b32 s5, s7
207; VI-NEXT:    s_mov_b32 s6, s2
208; VI-NEXT:    s_mov_b32 s7, s3
209; VI-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
210; VI-NEXT:    s_waitcnt vmcnt(0)
211; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
212; VI-NEXT:    s_endpgm
213  %load = load i8, i8 addrspace(1)* %in
214  %ext = zext i8 %load to i32
215  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
216  store i32 %bfe, i32 addrspace(1)* %out, align 4
217  ret void
218}
219
220; FIXME: Should be using s_add_i32
221define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
222; SI-LABEL: bfe_u32_zext_in_reg_i8:
223; SI:       ; %bb.0:
224; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
225; SI-NEXT:    s_mov_b32 s7, 0xf000
226; SI-NEXT:    s_mov_b32 s6, -1
227; SI-NEXT:    s_mov_b32 s10, s6
228; SI-NEXT:    s_mov_b32 s11, s7
229; SI-NEXT:    s_waitcnt lgkmcnt(0)
230; SI-NEXT:    s_mov_b32 s8, s2
231; SI-NEXT:    s_mov_b32 s9, s3
232; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
233; SI-NEXT:    s_mov_b32 s4, s0
234; SI-NEXT:    s_mov_b32 s5, s1
235; SI-NEXT:    s_waitcnt vmcnt(0)
236; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
237; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
238; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
239; SI-NEXT:    s_endpgm
240;
241; VI-LABEL: bfe_u32_zext_in_reg_i8:
242; VI:       ; %bb.0:
243; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
244; VI-NEXT:    s_mov_b32 s3, 0xf000
245; VI-NEXT:    s_mov_b32 s2, -1
246; VI-NEXT:    s_waitcnt lgkmcnt(0)
247; VI-NEXT:    s_mov_b32 s0, s4
248; VI-NEXT:    s_mov_b32 s1, s5
249; VI-NEXT:    s_mov_b32 s4, s6
250; VI-NEXT:    s_mov_b32 s5, s7
251; VI-NEXT:    s_mov_b32 s6, s2
252; VI-NEXT:    s_mov_b32 s7, s3
253; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
254; VI-NEXT:    s_waitcnt vmcnt(0)
255; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
256; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
257; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
258; VI-NEXT:    s_endpgm
259  %load = load i32, i32 addrspace(1)* %in, align 4
260  %add = add i32 %load, 1
261  %ext = and i32 %add, 255
262  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
263  store i32 %bfe, i32 addrspace(1)* %out, align 4
264  ret void
265}
266
267define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
268; SI-LABEL: bfe_u32_zext_in_reg_i16:
269; SI:       ; %bb.0:
270; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
271; SI-NEXT:    s_mov_b32 s7, 0xf000
272; SI-NEXT:    s_mov_b32 s6, -1
273; SI-NEXT:    s_mov_b32 s10, s6
274; SI-NEXT:    s_mov_b32 s11, s7
275; SI-NEXT:    s_waitcnt lgkmcnt(0)
276; SI-NEXT:    s_mov_b32 s8, s2
277; SI-NEXT:    s_mov_b32 s9, s3
278; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
279; SI-NEXT:    s_mov_b32 s4, s0
280; SI-NEXT:    s_mov_b32 s5, s1
281; SI-NEXT:    s_waitcnt vmcnt(0)
282; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
283; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
284; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
285; SI-NEXT:    s_endpgm
286;
287; VI-LABEL: bfe_u32_zext_in_reg_i16:
288; VI:       ; %bb.0:
289; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
290; VI-NEXT:    s_mov_b32 s3, 0xf000
291; VI-NEXT:    s_mov_b32 s2, -1
292; VI-NEXT:    s_waitcnt lgkmcnt(0)
293; VI-NEXT:    s_mov_b32 s0, s4
294; VI-NEXT:    s_mov_b32 s1, s5
295; VI-NEXT:    s_mov_b32 s4, s6
296; VI-NEXT:    s_mov_b32 s5, s7
297; VI-NEXT:    s_mov_b32 s6, s2
298; VI-NEXT:    s_mov_b32 s7, s3
299; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
300; VI-NEXT:    s_waitcnt vmcnt(0)
301; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
302; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
303; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
304; VI-NEXT:    s_endpgm
305  %load = load i32, i32 addrspace(1)* %in, align 4
306  %add = add i32 %load, 1
307  %ext = and i32 %add, 65535
308  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 16)
309  store i32 %bfe, i32 addrspace(1)* %out, align 4
310  ret void
311}
312
313define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
314; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
315; SI:       ; %bb.0:
316; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
317; SI-NEXT:    s_mov_b32 s7, 0xf000
318; SI-NEXT:    s_mov_b32 s6, -1
319; SI-NEXT:    s_mov_b32 s10, s6
320; SI-NEXT:    s_mov_b32 s11, s7
321; SI-NEXT:    s_waitcnt lgkmcnt(0)
322; SI-NEXT:    s_mov_b32 s8, s2
323; SI-NEXT:    s_mov_b32 s9, s3
324; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
325; SI-NEXT:    s_mov_b32 s4, s0
326; SI-NEXT:    s_mov_b32 s5, s1
327; SI-NEXT:    s_waitcnt vmcnt(0)
328; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
329; SI-NEXT:    v_and_b32_e32 v0, 0xfe, v0
330; SI-NEXT:    v_bfe_u32 v0, v0, 1, 8
331; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
332; SI-NEXT:    s_endpgm
333;
334; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
335; VI:       ; %bb.0:
336; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
337; VI-NEXT:    s_mov_b32 s3, 0xf000
338; VI-NEXT:    s_mov_b32 s2, -1
339; VI-NEXT:    s_waitcnt lgkmcnt(0)
340; VI-NEXT:    s_mov_b32 s0, s4
341; VI-NEXT:    s_mov_b32 s1, s5
342; VI-NEXT:    s_mov_b32 s4, s6
343; VI-NEXT:    s_mov_b32 s5, s7
344; VI-NEXT:    s_mov_b32 s6, s2
345; VI-NEXT:    s_mov_b32 s7, s3
346; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
347; VI-NEXT:    s_waitcnt vmcnt(0)
348; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
349; VI-NEXT:    v_and_b32_e32 v0, 0xfe, v0
350; VI-NEXT:    v_bfe_u32 v0, v0, 1, 8
351; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
352; VI-NEXT:    s_endpgm
353  %load = load i32, i32 addrspace(1)* %in, align 4
354  %add = add i32 %load, 1
355  %ext = and i32 %add, 255
356  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 1, i32 8)
357  store i32 %bfe, i32 addrspace(1)* %out, align 4
358  ret void
359}
360
361define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
362; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
363; SI:       ; %bb.0:
364; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
365; SI-NEXT:    s_mov_b32 s7, 0xf000
366; SI-NEXT:    s_mov_b32 s6, -1
367; SI-NEXT:    s_mov_b32 s10, s6
368; SI-NEXT:    s_mov_b32 s11, s7
369; SI-NEXT:    s_waitcnt lgkmcnt(0)
370; SI-NEXT:    s_mov_b32 s8, s2
371; SI-NEXT:    s_mov_b32 s9, s3
372; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
373; SI-NEXT:    s_mov_b32 s4, s0
374; SI-NEXT:    s_mov_b32 s5, s1
375; SI-NEXT:    s_waitcnt vmcnt(0)
376; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
377; SI-NEXT:    v_and_b32_e32 v0, 0xf8, v0
378; SI-NEXT:    v_bfe_u32 v0, v0, 3, 8
379; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
380; SI-NEXT:    s_endpgm
381;
382; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
383; VI:       ; %bb.0:
384; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
385; VI-NEXT:    s_mov_b32 s3, 0xf000
386; VI-NEXT:    s_mov_b32 s2, -1
387; VI-NEXT:    s_waitcnt lgkmcnt(0)
388; VI-NEXT:    s_mov_b32 s0, s4
389; VI-NEXT:    s_mov_b32 s1, s5
390; VI-NEXT:    s_mov_b32 s4, s6
391; VI-NEXT:    s_mov_b32 s5, s7
392; VI-NEXT:    s_mov_b32 s6, s2
393; VI-NEXT:    s_mov_b32 s7, s3
394; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
395; VI-NEXT:    s_waitcnt vmcnt(0)
396; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
397; VI-NEXT:    v_and_b32_e32 v0, 0xf8, v0
398; VI-NEXT:    v_bfe_u32 v0, v0, 3, 8
399; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
400; VI-NEXT:    s_endpgm
401  %load = load i32, i32 addrspace(1)* %in, align 4
402  %add = add i32 %load, 1
403  %ext = and i32 %add, 255
404  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 3, i32 8)
405  store i32 %bfe, i32 addrspace(1)* %out, align 4
406  ret void
407}
408
409define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
410; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
411; SI:       ; %bb.0:
412; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
413; SI-NEXT:    s_mov_b32 s7, 0xf000
414; SI-NEXT:    s_mov_b32 s6, -1
415; SI-NEXT:    s_mov_b32 s10, s6
416; SI-NEXT:    s_mov_b32 s11, s7
417; SI-NEXT:    s_waitcnt lgkmcnt(0)
418; SI-NEXT:    s_mov_b32 s8, s2
419; SI-NEXT:    s_mov_b32 s9, s3
420; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
421; SI-NEXT:    s_mov_b32 s4, s0
422; SI-NEXT:    s_mov_b32 s5, s1
423; SI-NEXT:    s_waitcnt vmcnt(0)
424; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
425; SI-NEXT:    v_and_b32_e32 v0, 0x80, v0
426; SI-NEXT:    v_bfe_u32 v0, v0, 7, 8
427; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
428; SI-NEXT:    s_endpgm
429;
430; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
431; VI:       ; %bb.0:
432; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
433; VI-NEXT:    s_mov_b32 s3, 0xf000
434; VI-NEXT:    s_mov_b32 s2, -1
435; VI-NEXT:    s_waitcnt lgkmcnt(0)
436; VI-NEXT:    s_mov_b32 s0, s4
437; VI-NEXT:    s_mov_b32 s1, s5
438; VI-NEXT:    s_mov_b32 s4, s6
439; VI-NEXT:    s_mov_b32 s5, s7
440; VI-NEXT:    s_mov_b32 s6, s2
441; VI-NEXT:    s_mov_b32 s7, s3
442; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
443; VI-NEXT:    s_waitcnt vmcnt(0)
444; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
445; VI-NEXT:    v_and_b32_e32 v0, 0x80, v0
446; VI-NEXT:    v_bfe_u32 v0, v0, 7, 8
447; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
448; VI-NEXT:    s_endpgm
449  %load = load i32, i32 addrspace(1)* %in, align 4
450  %add = add i32 %load, 1
451  %ext = and i32 %add, 255
452  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 7, i32 8)
453  store i32 %bfe, i32 addrspace(1)* %out, align 4
454  ret void
455}
456
457define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
458; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
459; SI:       ; %bb.0:
460; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
461; SI-NEXT:    s_mov_b32 s7, 0xf000
462; SI-NEXT:    s_mov_b32 s6, -1
463; SI-NEXT:    s_mov_b32 s10, s6
464; SI-NEXT:    s_mov_b32 s11, s7
465; SI-NEXT:    s_waitcnt lgkmcnt(0)
466; SI-NEXT:    s_mov_b32 s8, s2
467; SI-NEXT:    s_mov_b32 s9, s3
468; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
469; SI-NEXT:    s_mov_b32 s4, s0
470; SI-NEXT:    s_mov_b32 s5, s1
471; SI-NEXT:    s_waitcnt vmcnt(0)
472; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
473; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
474; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
475; SI-NEXT:    s_endpgm
476;
477; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
478; VI:       ; %bb.0:
479; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
480; VI-NEXT:    s_mov_b32 s3, 0xf000
481; VI-NEXT:    s_mov_b32 s2, -1
482; VI-NEXT:    s_waitcnt lgkmcnt(0)
483; VI-NEXT:    s_mov_b32 s0, s4
484; VI-NEXT:    s_mov_b32 s1, s5
485; VI-NEXT:    s_mov_b32 s4, s6
486; VI-NEXT:    s_mov_b32 s5, s7
487; VI-NEXT:    s_mov_b32 s6, s2
488; VI-NEXT:    s_mov_b32 s7, s3
489; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
490; VI-NEXT:    s_waitcnt vmcnt(0)
491; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
492; VI-NEXT:    v_bfe_u32 v0, v0, 8, 8
493; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
494; VI-NEXT:    s_endpgm
495  %load = load i32, i32 addrspace(1)* %in, align 4
496  %add = add i32 %load, 1
497  %ext = and i32 %add, 65535
498  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 8, i32 8)
499  store i32 %bfe, i32 addrspace(1)* %out, align 4
500  ret void
501}
502
503define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
504; SI-LABEL: bfe_u32_test_1:
505; SI:       ; %bb.0:
506; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
507; SI-NEXT:    s_mov_b32 s7, 0xf000
508; SI-NEXT:    s_mov_b32 s6, -1
509; SI-NEXT:    s_mov_b32 s10, s6
510; SI-NEXT:    s_mov_b32 s11, s7
511; SI-NEXT:    s_waitcnt lgkmcnt(0)
512; SI-NEXT:    s_mov_b32 s8, s2
513; SI-NEXT:    s_mov_b32 s9, s3
514; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
515; SI-NEXT:    s_mov_b32 s4, s0
516; SI-NEXT:    s_mov_b32 s5, s1
517; SI-NEXT:    s_waitcnt vmcnt(0)
518; SI-NEXT:    v_and_b32_e32 v0, 1, v0
519; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
520; SI-NEXT:    s_endpgm
521;
522; VI-LABEL: bfe_u32_test_1:
523; VI:       ; %bb.0:
524; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
525; VI-NEXT:    s_mov_b32 s3, 0xf000
526; VI-NEXT:    s_mov_b32 s2, -1
527; VI-NEXT:    s_waitcnt lgkmcnt(0)
528; VI-NEXT:    s_mov_b32 s0, s4
529; VI-NEXT:    s_mov_b32 s1, s5
530; VI-NEXT:    s_mov_b32 s4, s6
531; VI-NEXT:    s_mov_b32 s5, s7
532; VI-NEXT:    s_mov_b32 s6, s2
533; VI-NEXT:    s_mov_b32 s7, s3
534; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
535; VI-NEXT:    s_waitcnt vmcnt(0)
536; VI-NEXT:    v_and_b32_e32 v0, 1, v0
537; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
538; VI-NEXT:    s_endpgm
539  %x = load i32, i32 addrspace(1)* %in, align 4
540  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
541  store i32 %bfe, i32 addrspace(1)* %out, align 4
542  ret void
543}
544
545define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
546; SI-LABEL: bfe_u32_test_2:
547; SI:       ; %bb.0:
548; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
549; SI-NEXT:    s_waitcnt lgkmcnt(0)
550; SI-NEXT:    s_mov_b32 s3, 0xf000
551; SI-NEXT:    s_mov_b32 s2, -1
552; SI-NEXT:    v_mov_b32_e32 v0, 0
553; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
554; SI-NEXT:    s_endpgm
555;
556; VI-LABEL: bfe_u32_test_2:
557; VI:       ; %bb.0:
558; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
559; VI-NEXT:    s_waitcnt lgkmcnt(0)
560; VI-NEXT:    s_mov_b32 s3, 0xf000
561; VI-NEXT:    s_mov_b32 s2, -1
562; VI-NEXT:    v_mov_b32_e32 v0, 0
563; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
564; VI-NEXT:    s_endpgm
565  %x = load i32, i32 addrspace(1)* %in, align 4
566  %shl = shl i32 %x, 31
567  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8)
568  store i32 %bfe, i32 addrspace(1)* %out, align 4
569  ret void
570}
571
572define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
573; SI-LABEL: bfe_u32_test_3:
574; SI:       ; %bb.0:
575; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
576; SI-NEXT:    s_waitcnt lgkmcnt(0)
577; SI-NEXT:    s_mov_b32 s3, 0xf000
578; SI-NEXT:    s_mov_b32 s2, -1
579; SI-NEXT:    v_mov_b32_e32 v0, 0
580; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
581; SI-NEXT:    s_endpgm
582;
583; VI-LABEL: bfe_u32_test_3:
584; VI:       ; %bb.0:
585; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
586; VI-NEXT:    s_waitcnt lgkmcnt(0)
587; VI-NEXT:    s_mov_b32 s3, 0xf000
588; VI-NEXT:    s_mov_b32 s2, -1
589; VI-NEXT:    v_mov_b32_e32 v0, 0
590; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
591; VI-NEXT:    s_endpgm
592  %x = load i32, i32 addrspace(1)* %in, align 4
593  %shl = shl i32 %x, 31
594  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1)
595  store i32 %bfe, i32 addrspace(1)* %out, align 4
596  ret void
597}
598
599define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
600; SI-LABEL: bfe_u32_test_4:
601; SI:       ; %bb.0:
602; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
603; SI-NEXT:    s_waitcnt lgkmcnt(0)
604; SI-NEXT:    s_mov_b32 s3, 0xf000
605; SI-NEXT:    s_mov_b32 s2, -1
606; SI-NEXT:    v_mov_b32_e32 v0, 0
607; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
608; SI-NEXT:    s_endpgm
609;
610; VI-LABEL: bfe_u32_test_4:
611; VI:       ; %bb.0:
612; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
613; VI-NEXT:    s_waitcnt lgkmcnt(0)
614; VI-NEXT:    s_mov_b32 s3, 0xf000
615; VI-NEXT:    s_mov_b32 s2, -1
616; VI-NEXT:    v_mov_b32_e32 v0, 0
617; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
618; VI-NEXT:    s_endpgm
619  %x = load i32, i32 addrspace(1)* %in, align 4
620  %shl = shl i32 %x, 31
621  %shr = lshr i32 %shl, 31
622  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1)
623  store i32 %bfe, i32 addrspace(1)* %out, align 4
624  ret void
625}
626
627define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
628; SI-LABEL: bfe_u32_test_5:
629; SI:       ; %bb.0:
630; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
631; SI-NEXT:    s_mov_b32 s7, 0xf000
632; SI-NEXT:    s_mov_b32 s6, -1
633; SI-NEXT:    s_mov_b32 s10, s6
634; SI-NEXT:    s_mov_b32 s11, s7
635; SI-NEXT:    s_waitcnt lgkmcnt(0)
636; SI-NEXT:    s_mov_b32 s8, s2
637; SI-NEXT:    s_mov_b32 s9, s3
638; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
639; SI-NEXT:    s_mov_b32 s4, s0
640; SI-NEXT:    s_mov_b32 s5, s1
641; SI-NEXT:    s_waitcnt vmcnt(0)
642; SI-NEXT:    v_bfe_i32 v0, v0, 0, 1
643; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
644; SI-NEXT:    s_endpgm
645;
646; VI-LABEL: bfe_u32_test_5:
647; VI:       ; %bb.0:
648; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
649; VI-NEXT:    s_mov_b32 s3, 0xf000
650; VI-NEXT:    s_mov_b32 s2, -1
651; VI-NEXT:    s_waitcnt lgkmcnt(0)
652; VI-NEXT:    s_mov_b32 s0, s4
653; VI-NEXT:    s_mov_b32 s1, s5
654; VI-NEXT:    s_mov_b32 s4, s6
655; VI-NEXT:    s_mov_b32 s5, s7
656; VI-NEXT:    s_mov_b32 s6, s2
657; VI-NEXT:    s_mov_b32 s7, s3
658; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
659; VI-NEXT:    s_waitcnt vmcnt(0)
660; VI-NEXT:    v_bfe_i32 v0, v0, 0, 1
661; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
662; VI-NEXT:    s_endpgm
663  %x = load i32, i32 addrspace(1)* %in, align 4
664  %shl = shl i32 %x, 31
665  %shr = ashr i32 %shl, 31
666  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 0, i32 1)
667  store i32 %bfe, i32 addrspace(1)* %out, align 4
668  ret void
669}
670
671define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
672; SI-LABEL: bfe_u32_test_6:
673; SI:       ; %bb.0:
674; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
675; SI-NEXT:    s_mov_b32 s7, 0xf000
676; SI-NEXT:    s_mov_b32 s6, -1
677; SI-NEXT:    s_mov_b32 s10, s6
678; SI-NEXT:    s_mov_b32 s11, s7
679; SI-NEXT:    s_waitcnt lgkmcnt(0)
680; SI-NEXT:    s_mov_b32 s8, s2
681; SI-NEXT:    s_mov_b32 s9, s3
682; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
683; SI-NEXT:    s_mov_b32 s4, s0
684; SI-NEXT:    s_mov_b32 s5, s1
685; SI-NEXT:    s_waitcnt vmcnt(0)
686; SI-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
687; SI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
688; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
689; SI-NEXT:    s_endpgm
690;
691; VI-LABEL: bfe_u32_test_6:
692; VI:       ; %bb.0:
693; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
694; VI-NEXT:    s_mov_b32 s3, 0xf000
695; VI-NEXT:    s_mov_b32 s2, -1
696; VI-NEXT:    s_waitcnt lgkmcnt(0)
697; VI-NEXT:    s_mov_b32 s0, s4
698; VI-NEXT:    s_mov_b32 s1, s5
699; VI-NEXT:    s_mov_b32 s4, s6
700; VI-NEXT:    s_mov_b32 s5, s7
701; VI-NEXT:    s_mov_b32 s6, s2
702; VI-NEXT:    s_mov_b32 s7, s3
703; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
704; VI-NEXT:    s_waitcnt vmcnt(0)
705; VI-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
706; VI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
707; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
708; VI-NEXT:    s_endpgm
709  %x = load i32, i32 addrspace(1)* %in, align 4
710  %shl = shl i32 %x, 31
711  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31)
712  store i32 %bfe, i32 addrspace(1)* %out, align 4
713  ret void
714}
715
716define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
717; SI-LABEL: bfe_u32_test_7:
718; SI:       ; %bb.0:
719; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
720; SI-NEXT:    s_mov_b32 s7, 0xf000
721; SI-NEXT:    s_mov_b32 s6, -1
722; SI-NEXT:    s_mov_b32 s10, s6
723; SI-NEXT:    s_mov_b32 s11, s7
724; SI-NEXT:    s_waitcnt lgkmcnt(0)
725; SI-NEXT:    s_mov_b32 s8, s2
726; SI-NEXT:    s_mov_b32 s9, s3
727; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
728; SI-NEXT:    s_mov_b32 s4, s0
729; SI-NEXT:    s_mov_b32 s5, s1
730; SI-NEXT:    s_waitcnt vmcnt(0)
731; SI-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
732; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
733; SI-NEXT:    s_endpgm
734;
735; VI-LABEL: bfe_u32_test_7:
736; VI:       ; %bb.0:
737; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
738; VI-NEXT:    s_mov_b32 s3, 0xf000
739; VI-NEXT:    s_mov_b32 s2, -1
740; VI-NEXT:    s_waitcnt lgkmcnt(0)
741; VI-NEXT:    s_mov_b32 s0, s4
742; VI-NEXT:    s_mov_b32 s1, s5
743; VI-NEXT:    s_mov_b32 s4, s6
744; VI-NEXT:    s_mov_b32 s5, s7
745; VI-NEXT:    s_mov_b32 s6, s2
746; VI-NEXT:    s_mov_b32 s7, s3
747; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
748; VI-NEXT:    s_waitcnt vmcnt(0)
749; VI-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
750; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
751; VI-NEXT:    s_endpgm
752  %x = load i32, i32 addrspace(1)* %in, align 4
753  %shl = shl i32 %x, 31
754  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31)
755  store i32 %bfe, i32 addrspace(1)* %out, align 4
756  ret void
757}
758
759define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
760; SI-LABEL: bfe_u32_test_8:
761; SI:       ; %bb.0:
762; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
763; SI-NEXT:    s_mov_b32 s7, 0xf000
764; SI-NEXT:    s_mov_b32 s6, -1
765; SI-NEXT:    s_mov_b32 s10, s6
766; SI-NEXT:    s_mov_b32 s11, s7
767; SI-NEXT:    s_waitcnt lgkmcnt(0)
768; SI-NEXT:    s_mov_b32 s8, s2
769; SI-NEXT:    s_mov_b32 s9, s3
770; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
771; SI-NEXT:    s_mov_b32 s4, s0
772; SI-NEXT:    s_mov_b32 s5, s1
773; SI-NEXT:    s_waitcnt vmcnt(0)
774; SI-NEXT:    v_and_b32_e32 v0, 1, v0
775; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
776; SI-NEXT:    s_endpgm
777;
778; VI-LABEL: bfe_u32_test_8:
779; VI:       ; %bb.0:
780; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
781; VI-NEXT:    s_mov_b32 s3, 0xf000
782; VI-NEXT:    s_mov_b32 s2, -1
783; VI-NEXT:    s_waitcnt lgkmcnt(0)
784; VI-NEXT:    s_mov_b32 s0, s4
785; VI-NEXT:    s_mov_b32 s1, s5
786; VI-NEXT:    s_mov_b32 s4, s6
787; VI-NEXT:    s_mov_b32 s5, s7
788; VI-NEXT:    s_mov_b32 s6, s2
789; VI-NEXT:    s_mov_b32 s7, s3
790; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
791; VI-NEXT:    s_waitcnt vmcnt(0)
792; VI-NEXT:    v_and_b32_e32 v0, 1, v0
793; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
794; VI-NEXT:    s_endpgm
795  %x = load i32, i32 addrspace(1)* %in, align 4
796  %shl = shl i32 %x, 31
797  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
798  store i32 %bfe, i32 addrspace(1)* %out, align 4
799  ret void
800}
801
802define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
803; SI-LABEL: bfe_u32_test_9:
804; SI:       ; %bb.0:
805; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
806; SI-NEXT:    s_mov_b32 s7, 0xf000
807; SI-NEXT:    s_mov_b32 s6, -1
808; SI-NEXT:    s_mov_b32 s10, s6
809; SI-NEXT:    s_mov_b32 s11, s7
810; SI-NEXT:    s_waitcnt lgkmcnt(0)
811; SI-NEXT:    s_mov_b32 s8, s2
812; SI-NEXT:    s_mov_b32 s9, s3
813; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
814; SI-NEXT:    s_mov_b32 s4, s0
815; SI-NEXT:    s_mov_b32 s5, s1
816; SI-NEXT:    s_waitcnt vmcnt(0)
817; SI-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
818; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
819; SI-NEXT:    s_endpgm
820;
821; VI-LABEL: bfe_u32_test_9:
822; VI:       ; %bb.0:
823; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
824; VI-NEXT:    s_mov_b32 s3, 0xf000
825; VI-NEXT:    s_mov_b32 s2, -1
826; VI-NEXT:    s_waitcnt lgkmcnt(0)
827; VI-NEXT:    s_mov_b32 s0, s4
828; VI-NEXT:    s_mov_b32 s1, s5
829; VI-NEXT:    s_mov_b32 s4, s6
830; VI-NEXT:    s_mov_b32 s5, s7
831; VI-NEXT:    s_mov_b32 s6, s2
832; VI-NEXT:    s_mov_b32 s7, s3
833; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
834; VI-NEXT:    s_waitcnt vmcnt(0)
835; VI-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
836; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
837; VI-NEXT:    s_endpgm
838  %x = load i32, i32 addrspace(1)* %in, align 4
839  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
840  store i32 %bfe, i32 addrspace(1)* %out, align 4
841  ret void
842}
843
844define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
845; SI-LABEL: bfe_u32_test_10:
846; SI:       ; %bb.0:
847; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
848; SI-NEXT:    s_mov_b32 s7, 0xf000
849; SI-NEXT:    s_mov_b32 s6, -1
850; SI-NEXT:    s_mov_b32 s10, s6
851; SI-NEXT:    s_mov_b32 s11, s7
852; SI-NEXT:    s_waitcnt lgkmcnt(0)
853; SI-NEXT:    s_mov_b32 s8, s2
854; SI-NEXT:    s_mov_b32 s9, s3
855; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
856; SI-NEXT:    s_mov_b32 s4, s0
857; SI-NEXT:    s_mov_b32 s5, s1
858; SI-NEXT:    s_waitcnt vmcnt(0)
859; SI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
860; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
861; SI-NEXT:    s_endpgm
862;
863; VI-LABEL: bfe_u32_test_10:
864; VI:       ; %bb.0:
865; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
866; VI-NEXT:    s_mov_b32 s3, 0xf000
867; VI-NEXT:    s_mov_b32 s2, -1
868; VI-NEXT:    s_waitcnt lgkmcnt(0)
869; VI-NEXT:    s_mov_b32 s0, s4
870; VI-NEXT:    s_mov_b32 s1, s5
871; VI-NEXT:    s_mov_b32 s4, s6
872; VI-NEXT:    s_mov_b32 s5, s7
873; VI-NEXT:    s_mov_b32 s6, s2
874; VI-NEXT:    s_mov_b32 s7, s3
875; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
876; VI-NEXT:    s_waitcnt vmcnt(0)
877; VI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
878; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
879; VI-NEXT:    s_endpgm
880  %x = load i32, i32 addrspace(1)* %in, align 4
881  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
882  store i32 %bfe, i32 addrspace(1)* %out, align 4
883  ret void
884}
885
886define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
887; SI-LABEL: bfe_u32_test_11:
888; SI:       ; %bb.0:
889; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
890; SI-NEXT:    s_mov_b32 s7, 0xf000
891; SI-NEXT:    s_mov_b32 s6, -1
892; SI-NEXT:    s_mov_b32 s10, s6
893; SI-NEXT:    s_mov_b32 s11, s7
894; SI-NEXT:    s_waitcnt lgkmcnt(0)
895; SI-NEXT:    s_mov_b32 s8, s2
896; SI-NEXT:    s_mov_b32 s9, s3
897; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
898; SI-NEXT:    s_mov_b32 s4, s0
899; SI-NEXT:    s_mov_b32 s5, s1
900; SI-NEXT:    s_waitcnt vmcnt(0)
901; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
902; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
903; SI-NEXT:    s_endpgm
904;
905; VI-LABEL: bfe_u32_test_11:
906; VI:       ; %bb.0:
907; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
908; VI-NEXT:    s_mov_b32 s3, 0xf000
909; VI-NEXT:    s_mov_b32 s2, -1
910; VI-NEXT:    s_waitcnt lgkmcnt(0)
911; VI-NEXT:    s_mov_b32 s0, s4
912; VI-NEXT:    s_mov_b32 s1, s5
913; VI-NEXT:    s_mov_b32 s4, s6
914; VI-NEXT:    s_mov_b32 s5, s7
915; VI-NEXT:    s_mov_b32 s6, s2
916; VI-NEXT:    s_mov_b32 s7, s3
917; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
918; VI-NEXT:    s_waitcnt vmcnt(0)
919; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
920; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
921; VI-NEXT:    s_endpgm
922  %x = load i32, i32 addrspace(1)* %in, align 4
923  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
924  store i32 %bfe, i32 addrspace(1)* %out, align 4
925  ret void
926}
927
928define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
929; SI-LABEL: bfe_u32_test_12:
930; SI:       ; %bb.0:
931; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
932; SI-NEXT:    s_mov_b32 s7, 0xf000
933; SI-NEXT:    s_mov_b32 s6, -1
934; SI-NEXT:    s_mov_b32 s10, s6
935; SI-NEXT:    s_mov_b32 s11, s7
936; SI-NEXT:    s_waitcnt lgkmcnt(0)
937; SI-NEXT:    s_mov_b32 s8, s2
938; SI-NEXT:    s_mov_b32 s9, s3
939; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
940; SI-NEXT:    s_mov_b32 s4, s0
941; SI-NEXT:    s_mov_b32 s5, s1
942; SI-NEXT:    s_waitcnt vmcnt(0)
943; SI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
944; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
945; SI-NEXT:    s_endpgm
946;
947; VI-LABEL: bfe_u32_test_12:
948; VI:       ; %bb.0:
949; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
950; VI-NEXT:    s_mov_b32 s3, 0xf000
951; VI-NEXT:    s_mov_b32 s2, -1
952; VI-NEXT:    s_waitcnt lgkmcnt(0)
953; VI-NEXT:    s_mov_b32 s0, s4
954; VI-NEXT:    s_mov_b32 s1, s5
955; VI-NEXT:    s_mov_b32 s4, s6
956; VI-NEXT:    s_mov_b32 s5, s7
957; VI-NEXT:    s_mov_b32 s6, s2
958; VI-NEXT:    s_mov_b32 s7, s3
959; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
960; VI-NEXT:    s_waitcnt vmcnt(0)
961; VI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
962; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
963; VI-NEXT:    s_endpgm
964  %x = load i32, i32 addrspace(1)* %in, align 4
965  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
966  store i32 %bfe, i32 addrspace(1)* %out, align 4
967  ret void
968}
969
970; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
971define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
972; SI-LABEL: bfe_u32_test_13:
973; SI:       ; %bb.0:
974; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
975; SI-NEXT:    s_mov_b32 s7, 0xf000
976; SI-NEXT:    s_mov_b32 s6, -1
977; SI-NEXT:    s_mov_b32 s10, s6
978; SI-NEXT:    s_mov_b32 s11, s7
979; SI-NEXT:    s_waitcnt lgkmcnt(0)
980; SI-NEXT:    s_mov_b32 s8, s2
981; SI-NEXT:    s_mov_b32 s9, s3
982; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
983; SI-NEXT:    s_mov_b32 s4, s0
984; SI-NEXT:    s_mov_b32 s5, s1
985; SI-NEXT:    s_waitcnt vmcnt(0)
986; SI-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
987; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
988; SI-NEXT:    s_endpgm
989;
990; VI-LABEL: bfe_u32_test_13:
991; VI:       ; %bb.0:
992; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
993; VI-NEXT:    s_mov_b32 s3, 0xf000
994; VI-NEXT:    s_mov_b32 s2, -1
995; VI-NEXT:    s_waitcnt lgkmcnt(0)
996; VI-NEXT:    s_mov_b32 s0, s4
997; VI-NEXT:    s_mov_b32 s1, s5
998; VI-NEXT:    s_mov_b32 s4, s6
999; VI-NEXT:    s_mov_b32 s5, s7
1000; VI-NEXT:    s_mov_b32 s6, s2
1001; VI-NEXT:    s_mov_b32 s7, s3
1002; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1003; VI-NEXT:    s_waitcnt vmcnt(0)
1004; VI-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
1005; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1006; VI-NEXT:    s_endpgm
1007  %x = load i32, i32 addrspace(1)* %in, align 4
1008  %shl = ashr i32 %x, 31
1009  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
1010  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
1011}
1012
1013define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
1014; SI-LABEL: bfe_u32_test_14:
1015; SI:       ; %bb.0:
1016; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1017; SI-NEXT:    s_waitcnt lgkmcnt(0)
1018; SI-NEXT:    s_mov_b32 s3, 0xf000
1019; SI-NEXT:    s_mov_b32 s2, -1
1020; SI-NEXT:    v_mov_b32_e32 v0, 0
1021; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1022; SI-NEXT:    s_endpgm
1023;
1024; VI-LABEL: bfe_u32_test_14:
1025; VI:       ; %bb.0:
1026; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1027; VI-NEXT:    s_waitcnt lgkmcnt(0)
1028; VI-NEXT:    s_mov_b32 s3, 0xf000
1029; VI-NEXT:    s_mov_b32 s2, -1
1030; VI-NEXT:    v_mov_b32_e32 v0, 0
1031; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1032; VI-NEXT:    s_endpgm
1033  %x = load i32, i32 addrspace(1)* %in, align 4
1034  %shl = lshr i32 %x, 31
1035  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
1036  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
1037}
1038
1039; EG-NOT: BFE
1040define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
1041; SI-LABEL: bfe_u32_constant_fold_test_0:
1042; SI:       ; %bb.0:
1043; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1044; SI-NEXT:    s_mov_b32 s3, 0xf000
1045; SI-NEXT:    s_mov_b32 s2, -1
1046; SI-NEXT:    v_mov_b32_e32 v0, 0
1047; SI-NEXT:    s_waitcnt lgkmcnt(0)
1048; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1049; SI-NEXT:    s_endpgm
1050;
1051; VI-LABEL: bfe_u32_constant_fold_test_0:
1052; VI:       ; %bb.0:
1053; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1054; VI-NEXT:    s_mov_b32 s3, 0xf000
1055; VI-NEXT:    s_mov_b32 s2, -1
1056; VI-NEXT:    v_mov_b32_e32 v0, 0
1057; VI-NEXT:    s_waitcnt lgkmcnt(0)
1058; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1059; VI-NEXT:    s_endpgm
1060  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
1061  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1062  ret void
1063}
1064
1065; EG-NOT: BFE
1066define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
1067; SI-LABEL: bfe_u32_constant_fold_test_1:
1068; SI:       ; %bb.0:
1069; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1070; SI-NEXT:    s_mov_b32 s3, 0xf000
1071; SI-NEXT:    s_mov_b32 s2, -1
1072; SI-NEXT:    v_mov_b32_e32 v0, 0
1073; SI-NEXT:    s_waitcnt lgkmcnt(0)
1074; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1075; SI-NEXT:    s_endpgm
1076;
1077; VI-LABEL: bfe_u32_constant_fold_test_1:
1078; VI:       ; %bb.0:
1079; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1080; VI-NEXT:    s_mov_b32 s3, 0xf000
1081; VI-NEXT:    s_mov_b32 s2, -1
1082; VI-NEXT:    v_mov_b32_e32 v0, 0
1083; VI-NEXT:    s_waitcnt lgkmcnt(0)
1084; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1085; VI-NEXT:    s_endpgm
1086  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
1087  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1088  ret void
1089}
1090
1091; EG-NOT: BFE
1092define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
1093; SI-LABEL: bfe_u32_constant_fold_test_2:
1094; SI:       ; %bb.0:
1095; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1096; SI-NEXT:    s_mov_b32 s3, 0xf000
1097; SI-NEXT:    s_mov_b32 s2, -1
1098; SI-NEXT:    v_mov_b32_e32 v0, 0
1099; SI-NEXT:    s_waitcnt lgkmcnt(0)
1100; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1101; SI-NEXT:    s_endpgm
1102;
1103; VI-LABEL: bfe_u32_constant_fold_test_2:
1104; VI:       ; %bb.0:
1105; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1106; VI-NEXT:    s_mov_b32 s3, 0xf000
1107; VI-NEXT:    s_mov_b32 s2, -1
1108; VI-NEXT:    v_mov_b32_e32 v0, 0
1109; VI-NEXT:    s_waitcnt lgkmcnt(0)
1110; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1111; VI-NEXT:    s_endpgm
1112  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
1113  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1114  ret void
1115}
1116
1117; EG-NOT: BFE
1118define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
1119; SI-LABEL: bfe_u32_constant_fold_test_3:
1120; SI:       ; %bb.0:
1121; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1122; SI-NEXT:    s_mov_b32 s3, 0xf000
1123; SI-NEXT:    s_mov_b32 s2, -1
1124; SI-NEXT:    v_mov_b32_e32 v0, 1
1125; SI-NEXT:    s_waitcnt lgkmcnt(0)
1126; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1127; SI-NEXT:    s_endpgm
1128;
1129; VI-LABEL: bfe_u32_constant_fold_test_3:
1130; VI:       ; %bb.0:
1131; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1132; VI-NEXT:    s_mov_b32 s3, 0xf000
1133; VI-NEXT:    s_mov_b32 s2, -1
1134; VI-NEXT:    v_mov_b32_e32 v0, 1
1135; VI-NEXT:    s_waitcnt lgkmcnt(0)
1136; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1137; VI-NEXT:    s_endpgm
1138  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
1139  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1140  ret void
1141}
1142
1143; EG-NOT: BFE
1144define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
1145; SI-LABEL: bfe_u32_constant_fold_test_4:
1146; SI:       ; %bb.0:
1147; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1148; SI-NEXT:    s_mov_b32 s3, 0xf000
1149; SI-NEXT:    s_mov_b32 s2, -1
1150; SI-NEXT:    v_mov_b32_e32 v0, -1
1151; SI-NEXT:    s_waitcnt lgkmcnt(0)
1152; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1153; SI-NEXT:    s_endpgm
1154;
1155; VI-LABEL: bfe_u32_constant_fold_test_4:
1156; VI:       ; %bb.0:
1157; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1158; VI-NEXT:    s_mov_b32 s3, 0xf000
1159; VI-NEXT:    s_mov_b32 s2, -1
1160; VI-NEXT:    v_mov_b32_e32 v0, -1
1161; VI-NEXT:    s_waitcnt lgkmcnt(0)
1162; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1163; VI-NEXT:    s_endpgm
1164  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
1165  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1166  ret void
1167}
1168
1169; EG-NOT: BFE
1170define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
1171; SI-LABEL: bfe_u32_constant_fold_test_5:
1172; SI:       ; %bb.0:
1173; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1174; SI-NEXT:    s_mov_b32 s3, 0xf000
1175; SI-NEXT:    s_mov_b32 s2, -1
1176; SI-NEXT:    v_mov_b32_e32 v0, 1
1177; SI-NEXT:    s_waitcnt lgkmcnt(0)
1178; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1179; SI-NEXT:    s_endpgm
1180;
1181; VI-LABEL: bfe_u32_constant_fold_test_5:
1182; VI:       ; %bb.0:
1183; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1184; VI-NEXT:    s_mov_b32 s3, 0xf000
1185; VI-NEXT:    s_mov_b32 s2, -1
1186; VI-NEXT:    v_mov_b32_e32 v0, 1
1187; VI-NEXT:    s_waitcnt lgkmcnt(0)
1188; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1189; VI-NEXT:    s_endpgm
1190  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
1191  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1192  ret void
1193}
1194
1195; EG-NOT: BFE
1196define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
1197; SI-LABEL: bfe_u32_constant_fold_test_6:
1198; SI:       ; %bb.0:
1199; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1200; SI-NEXT:    s_mov_b32 s3, 0xf000
1201; SI-NEXT:    s_mov_b32 s2, -1
1202; SI-NEXT:    v_mov_b32_e32 v0, 0x80
1203; SI-NEXT:    s_waitcnt lgkmcnt(0)
1204; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1205; SI-NEXT:    s_endpgm
1206;
1207; VI-LABEL: bfe_u32_constant_fold_test_6:
1208; VI:       ; %bb.0:
1209; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1210; VI-NEXT:    s_mov_b32 s3, 0xf000
1211; VI-NEXT:    s_mov_b32 s2, -1
1212; VI-NEXT:    v_mov_b32_e32 v0, 0x80
1213; VI-NEXT:    s_waitcnt lgkmcnt(0)
1214; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1215; VI-NEXT:    s_endpgm
1216  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
1217  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1218  ret void
1219}
1220
1221; EG-NOT: BFE
1222define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
1223; SI-LABEL: bfe_u32_constant_fold_test_7:
1224; SI:       ; %bb.0:
1225; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1226; SI-NEXT:    s_mov_b32 s3, 0xf000
1227; SI-NEXT:    s_mov_b32 s2, -1
1228; SI-NEXT:    v_mov_b32_e32 v0, 0x7f
1229; SI-NEXT:    s_waitcnt lgkmcnt(0)
1230; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1231; SI-NEXT:    s_endpgm
1232;
1233; VI-LABEL: bfe_u32_constant_fold_test_7:
1234; VI:       ; %bb.0:
1235; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1236; VI-NEXT:    s_mov_b32 s3, 0xf000
1237; VI-NEXT:    s_mov_b32 s2, -1
1238; VI-NEXT:    v_mov_b32_e32 v0, 0x7f
1239; VI-NEXT:    s_waitcnt lgkmcnt(0)
1240; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1241; VI-NEXT:    s_endpgm
1242  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
1243  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1244  ret void
1245}
1246
1247; EG-NOT: BFE
1248define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
1249; SI-LABEL: bfe_u32_constant_fold_test_8:
1250; SI:       ; %bb.0:
1251; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1252; SI-NEXT:    s_mov_b32 s3, 0xf000
1253; SI-NEXT:    s_mov_b32 s2, -1
1254; SI-NEXT:    v_mov_b32_e32 v0, 1
1255; SI-NEXT:    s_waitcnt lgkmcnt(0)
1256; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1257; SI-NEXT:    s_endpgm
1258;
1259; VI-LABEL: bfe_u32_constant_fold_test_8:
1260; VI:       ; %bb.0:
1261; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1262; VI-NEXT:    s_mov_b32 s3, 0xf000
1263; VI-NEXT:    s_mov_b32 s2, -1
1264; VI-NEXT:    v_mov_b32_e32 v0, 1
1265; VI-NEXT:    s_waitcnt lgkmcnt(0)
1266; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1267; VI-NEXT:    s_endpgm
1268  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
1269  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1270  ret void
1271}
1272
1273; EG-NOT: BFE
1274define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
1275; SI-LABEL: bfe_u32_constant_fold_test_9:
1276; SI:       ; %bb.0:
1277; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1278; SI-NEXT:    s_mov_b32 s3, 0xf000
1279; SI-NEXT:    s_mov_b32 s2, -1
1280; SI-NEXT:    v_mov_b32_e32 v0, 1
1281; SI-NEXT:    s_waitcnt lgkmcnt(0)
1282; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1283; SI-NEXT:    s_endpgm
1284;
1285; VI-LABEL: bfe_u32_constant_fold_test_9:
1286; VI:       ; %bb.0:
1287; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1288; VI-NEXT:    s_mov_b32 s3, 0xf000
1289; VI-NEXT:    s_mov_b32 s2, -1
1290; VI-NEXT:    v_mov_b32_e32 v0, 1
1291; VI-NEXT:    s_waitcnt lgkmcnt(0)
1292; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1293; VI-NEXT:    s_endpgm
1294  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
1295  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1296  ret void
1297}
1298
1299; EG-NOT: BFE
1300define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
1301; SI-LABEL: bfe_u32_constant_fold_test_10:
1302; SI:       ; %bb.0:
1303; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1304; SI-NEXT:    s_mov_b32 s3, 0xf000
1305; SI-NEXT:    s_mov_b32 s2, -1
1306; SI-NEXT:    v_mov_b32_e32 v0, 0
1307; SI-NEXT:    s_waitcnt lgkmcnt(0)
1308; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1309; SI-NEXT:    s_endpgm
1310;
1311; VI-LABEL: bfe_u32_constant_fold_test_10:
1312; VI:       ; %bb.0:
1313; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1314; VI-NEXT:    s_mov_b32 s3, 0xf000
1315; VI-NEXT:    s_mov_b32 s2, -1
1316; VI-NEXT:    v_mov_b32_e32 v0, 0
1317; VI-NEXT:    s_waitcnt lgkmcnt(0)
1318; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1319; VI-NEXT:    s_endpgm
1320  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
1321  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1322  ret void
1323}
1324
1325; EG-NOT: BFE
1326define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
1327; SI-LABEL: bfe_u32_constant_fold_test_11:
1328; SI:       ; %bb.0:
1329; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1330; SI-NEXT:    s_mov_b32 s3, 0xf000
1331; SI-NEXT:    s_mov_b32 s2, -1
1332; SI-NEXT:    v_mov_b32_e32 v0, 10
1333; SI-NEXT:    s_waitcnt lgkmcnt(0)
1334; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1335; SI-NEXT:    s_endpgm
1336;
1337; VI-LABEL: bfe_u32_constant_fold_test_11:
1338; VI:       ; %bb.0:
1339; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1340; VI-NEXT:    s_mov_b32 s3, 0xf000
1341; VI-NEXT:    s_mov_b32 s2, -1
1342; VI-NEXT:    v_mov_b32_e32 v0, 10
1343; VI-NEXT:    s_waitcnt lgkmcnt(0)
1344; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1345; VI-NEXT:    s_endpgm
1346  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
1347  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1348  ret void
1349}
1350
1351; EG-NOT: BFE
1352define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
1353; SI-LABEL: bfe_u32_constant_fold_test_12:
1354; SI:       ; %bb.0:
1355; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1356; SI-NEXT:    s_mov_b32 s3, 0xf000
1357; SI-NEXT:    s_mov_b32 s2, -1
1358; SI-NEXT:    v_mov_b32_e32 v0, 0
1359; SI-NEXT:    s_waitcnt lgkmcnt(0)
1360; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1361; SI-NEXT:    s_endpgm
1362;
1363; VI-LABEL: bfe_u32_constant_fold_test_12:
1364; VI:       ; %bb.0:
1365; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1366; VI-NEXT:    s_mov_b32 s3, 0xf000
1367; VI-NEXT:    s_mov_b32 s2, -1
1368; VI-NEXT:    v_mov_b32_e32 v0, 0
1369; VI-NEXT:    s_waitcnt lgkmcnt(0)
1370; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1371; VI-NEXT:    s_endpgm
1372  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
1373  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1374  ret void
1375}
1376
1377; EG-NOT: BFE
1378define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
1379; SI-LABEL: bfe_u32_constant_fold_test_13:
1380; SI:       ; %bb.0:
1381; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1382; SI-NEXT:    s_mov_b32 s3, 0xf000
1383; SI-NEXT:    s_mov_b32 s2, -1
1384; SI-NEXT:    v_mov_b32_e32 v0, 1
1385; SI-NEXT:    s_waitcnt lgkmcnt(0)
1386; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1387; SI-NEXT:    s_endpgm
1388;
1389; VI-LABEL: bfe_u32_constant_fold_test_13:
1390; VI:       ; %bb.0:
1391; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1392; VI-NEXT:    s_mov_b32 s3, 0xf000
1393; VI-NEXT:    s_mov_b32 s2, -1
1394; VI-NEXT:    v_mov_b32_e32 v0, 1
1395; VI-NEXT:    s_waitcnt lgkmcnt(0)
1396; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1397; VI-NEXT:    s_endpgm
1398  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
1399  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1400  ret void
1401}
1402
1403; EG-NOT: BFE
1404define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
1405; SI-LABEL: bfe_u32_constant_fold_test_14:
1406; SI:       ; %bb.0:
1407; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1408; SI-NEXT:    s_mov_b32 s3, 0xf000
1409; SI-NEXT:    s_mov_b32 s2, -1
1410; SI-NEXT:    v_mov_b32_e32 v0, 40
1411; SI-NEXT:    s_waitcnt lgkmcnt(0)
1412; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1413; SI-NEXT:    s_endpgm
1414;
1415; VI-LABEL: bfe_u32_constant_fold_test_14:
1416; VI:       ; %bb.0:
1417; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1418; VI-NEXT:    s_mov_b32 s3, 0xf000
1419; VI-NEXT:    s_mov_b32 s2, -1
1420; VI-NEXT:    v_mov_b32_e32 v0, 40
1421; VI-NEXT:    s_waitcnt lgkmcnt(0)
1422; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1423; VI-NEXT:    s_endpgm
1424  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
1425  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1426  ret void
1427}
1428
1429; EG-NOT: BFE
1430define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
1431; SI-LABEL: bfe_u32_constant_fold_test_15:
1432; SI:       ; %bb.0:
1433; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1434; SI-NEXT:    s_mov_b32 s3, 0xf000
1435; SI-NEXT:    s_mov_b32 s2, -1
1436; SI-NEXT:    v_mov_b32_e32 v0, 10
1437; SI-NEXT:    s_waitcnt lgkmcnt(0)
1438; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1439; SI-NEXT:    s_endpgm
1440;
1441; VI-LABEL: bfe_u32_constant_fold_test_15:
1442; VI:       ; %bb.0:
1443; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1444; VI-NEXT:    s_mov_b32 s3, 0xf000
1445; VI-NEXT:    s_mov_b32 s2, -1
1446; VI-NEXT:    v_mov_b32_e32 v0, 10
1447; VI-NEXT:    s_waitcnt lgkmcnt(0)
1448; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1449; VI-NEXT:    s_endpgm
1450  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
1451  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1452  ret void
1453}
1454
1455; EG-NOT: BFE
1456define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
1457; SI-LABEL: bfe_u32_constant_fold_test_16:
1458; SI:       ; %bb.0:
1459; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1460; SI-NEXT:    s_mov_b32 s3, 0xf000
1461; SI-NEXT:    s_mov_b32 s2, -1
1462; SI-NEXT:    v_mov_b32_e32 v0, 0x7f
1463; SI-NEXT:    s_waitcnt lgkmcnt(0)
1464; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1465; SI-NEXT:    s_endpgm
1466;
1467; VI-LABEL: bfe_u32_constant_fold_test_16:
1468; VI:       ; %bb.0:
1469; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1470; VI-NEXT:    s_mov_b32 s3, 0xf000
1471; VI-NEXT:    s_mov_b32 s2, -1
1472; VI-NEXT:    v_mov_b32_e32 v0, 0x7f
1473; VI-NEXT:    s_waitcnt lgkmcnt(0)
1474; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1475; VI-NEXT:    s_endpgm
1476  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
1477  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1478  ret void
1479}
1480
1481; EG-NOT: BFE
1482define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
1483; SI-LABEL: bfe_u32_constant_fold_test_17:
1484; SI:       ; %bb.0:
1485; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1486; SI-NEXT:    s_mov_b32 s3, 0xf000
1487; SI-NEXT:    s_mov_b32 s2, -1
1488; SI-NEXT:    v_mov_b32_e32 v0, 0x7f
1489; SI-NEXT:    s_waitcnt lgkmcnt(0)
1490; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1491; SI-NEXT:    s_endpgm
1492;
1493; VI-LABEL: bfe_u32_constant_fold_test_17:
1494; VI:       ; %bb.0:
1495; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1496; VI-NEXT:    s_mov_b32 s3, 0xf000
1497; VI-NEXT:    s_mov_b32 s2, -1
1498; VI-NEXT:    v_mov_b32_e32 v0, 0x7f
1499; VI-NEXT:    s_waitcnt lgkmcnt(0)
1500; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1501; VI-NEXT:    s_endpgm
1502  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
1503  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1504  ret void
1505}
1506
1507; EG-NOT: BFE
1508define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
1509; SI-LABEL: bfe_u32_constant_fold_test_18:
1510; SI:       ; %bb.0:
1511; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1512; SI-NEXT:    s_mov_b32 s3, 0xf000
1513; SI-NEXT:    s_mov_b32 s2, -1
1514; SI-NEXT:    v_mov_b32_e32 v0, 0
1515; SI-NEXT:    s_waitcnt lgkmcnt(0)
1516; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1517; SI-NEXT:    s_endpgm
1518;
1519; VI-LABEL: bfe_u32_constant_fold_test_18:
1520; VI:       ; %bb.0:
1521; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1522; VI-NEXT:    s_mov_b32 s3, 0xf000
1523; VI-NEXT:    s_mov_b32 s2, -1
1524; VI-NEXT:    v_mov_b32_e32 v0, 0
1525; VI-NEXT:    s_waitcnt lgkmcnt(0)
1526; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1527; VI-NEXT:    s_endpgm
1528  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
1529  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
1530  ret void
1531}
1532
1533; Make sure that SimplifyDemandedBits doesn't cause the and to be
1534; reduced to the bits demanded by the bfe.
1535
1536; XXX: The operand to v_bfe_u32 could also just directly be the load register.
1537define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
1538; SI-LABEL: simplify_bfe_u32_multi_use_arg:
1539; SI:       ; %bb.0:
1540; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
1541; SI-NEXT:    s_mov_b32 s3, 0xf000
1542; SI-NEXT:    s_mov_b32 s2, -1
1543; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
1544; SI-NEXT:    s_mov_b32 s6, s2
1545; SI-NEXT:    s_mov_b32 s7, s3
1546; SI-NEXT:    s_waitcnt lgkmcnt(0)
1547; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1548; SI-NEXT:    s_mov_b32 s0, s8
1549; SI-NEXT:    s_mov_b32 s1, s9
1550; SI-NEXT:    s_mov_b32 s4, s10
1551; SI-NEXT:    s_mov_b32 s5, s11
1552; SI-NEXT:    s_waitcnt vmcnt(0)
1553; SI-NEXT:    v_and_b32_e32 v0, 63, v0
1554; SI-NEXT:    v_bfe_u32 v1, v0, 2, 2
1555; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1556; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1557; SI-NEXT:    s_endpgm
1558;
1559; VI-LABEL: simplify_bfe_u32_multi_use_arg:
1560; VI:       ; %bb.0:
1561; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1562; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1563; VI-NEXT:    s_mov_b32 s11, 0xf000
1564; VI-NEXT:    s_mov_b32 s10, -1
1565; VI-NEXT:    s_mov_b32 s2, s10
1566; VI-NEXT:    s_mov_b32 s3, s11
1567; VI-NEXT:    s_waitcnt lgkmcnt(0)
1568; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
1569; VI-NEXT:    s_mov_b32 s8, s4
1570; VI-NEXT:    s_mov_b32 s9, s5
1571; VI-NEXT:    s_mov_b32 s0, s6
1572; VI-NEXT:    s_mov_b32 s1, s7
1573; VI-NEXT:    s_waitcnt vmcnt(0)
1574; VI-NEXT:    v_and_b32_e32 v0, 63, v0
1575; VI-NEXT:    v_bfe_u32 v1, v0, 2, 2
1576; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
1577; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1578; VI-NEXT:    s_endpgm
1579                                            i32 addrspace(1)* %out1,
1580                                            i32 addrspace(1)* %in) #0 {
1581  %src = load i32, i32 addrspace(1)* %in, align 4
1582  %and = and i32 %src, 63
1583  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2)
1584  store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
1585  store i32 %and, i32 addrspace(1)* %out1, align 4
1586  ret void
1587}
1588
1589define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
1590; SI-LABEL: lshr_and:
1591; SI:       ; %bb.0:
1592; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
1593; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1594; SI-NEXT:    s_mov_b32 s7, 0xf000
1595; SI-NEXT:    s_waitcnt lgkmcnt(0)
1596; SI-NEXT:    s_bfe_u32 s0, s2, 0x30006
1597; SI-NEXT:    s_mov_b32 s6, -1
1598; SI-NEXT:    v_mov_b32_e32 v0, s0
1599; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1600; SI-NEXT:    s_endpgm
1601;
1602; VI-LABEL: lshr_and:
1603; VI:       ; %bb.0:
1604; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1605; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
1606; VI-NEXT:    s_mov_b32 s7, 0xf000
1607; VI-NEXT:    s_mov_b32 s6, -1
1608; VI-NEXT:    s_waitcnt lgkmcnt(0)
1609; VI-NEXT:    s_bfe_u32 s0, s0, 0x30006
1610; VI-NEXT:    v_mov_b32_e32 v0, s0
1611; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1612; VI-NEXT:    s_endpgm
1613  %b = lshr i32 %a, 6
1614  %c = and i32 %b, 7
1615  store i32 %c, i32 addrspace(1)* %out, align 8
1616  ret void
1617}
1618
1619define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
1620; SI-LABEL: v_lshr_and:
1621; SI:       ; %bb.0:
1622; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
1623; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1624; SI-NEXT:    s_mov_b32 s7, 0xf000
1625; SI-NEXT:    s_waitcnt lgkmcnt(0)
1626; SI-NEXT:    s_lshr_b32 s0, s2, s3
1627; SI-NEXT:    s_and_b32 s0, s0, 7
1628; SI-NEXT:    s_mov_b32 s6, -1
1629; SI-NEXT:    v_mov_b32_e32 v0, s0
1630; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1631; SI-NEXT:    s_endpgm
1632;
1633; VI-LABEL: v_lshr_and:
1634; VI:       ; %bb.0:
1635; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1636; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1637; VI-NEXT:    s_mov_b32 s7, 0xf000
1638; VI-NEXT:    s_mov_b32 s6, -1
1639; VI-NEXT:    s_waitcnt lgkmcnt(0)
1640; VI-NEXT:    s_lshr_b32 s0, s0, s1
1641; VI-NEXT:    s_and_b32 s0, s0, 7
1642; VI-NEXT:    v_mov_b32_e32 v0, s0
1643; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1644; VI-NEXT:    s_endpgm
1645  %c = lshr i32 %a, %b
1646  %d = and i32 %c, 7
1647  store i32 %d, i32 addrspace(1)* %out, align 8
1648  ret void
1649}
1650
1651define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
1652; SI-LABEL: and_lshr:
1653; SI:       ; %bb.0:
1654; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
1655; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1656; SI-NEXT:    s_mov_b32 s7, 0xf000
1657; SI-NEXT:    s_waitcnt lgkmcnt(0)
1658; SI-NEXT:    s_bfe_u32 s0, s2, 0x30006
1659; SI-NEXT:    s_mov_b32 s6, -1
1660; SI-NEXT:    v_mov_b32_e32 v0, s0
1661; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1662; SI-NEXT:    s_endpgm
1663;
1664; VI-LABEL: and_lshr:
1665; VI:       ; %bb.0:
1666; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1667; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
1668; VI-NEXT:    s_mov_b32 s7, 0xf000
1669; VI-NEXT:    s_mov_b32 s6, -1
1670; VI-NEXT:    s_waitcnt lgkmcnt(0)
1671; VI-NEXT:    s_bfe_u32 s0, s0, 0x30006
1672; VI-NEXT:    v_mov_b32_e32 v0, s0
1673; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1674; VI-NEXT:    s_endpgm
1675  %b = and i32 %a, 448
1676  %c = lshr i32 %b, 6
1677  store i32 %c, i32 addrspace(1)* %out, align 8
1678  ret void
1679}
1680
1681define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
1682; SI-LABEL: and_lshr2:
1683; SI:       ; %bb.0:
1684; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
1685; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1686; SI-NEXT:    s_mov_b32 s7, 0xf000
1687; SI-NEXT:    s_waitcnt lgkmcnt(0)
1688; SI-NEXT:    s_bfe_u32 s0, s2, 0x30006
1689; SI-NEXT:    s_mov_b32 s6, -1
1690; SI-NEXT:    v_mov_b32_e32 v0, s0
1691; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1692; SI-NEXT:    s_endpgm
1693;
1694; VI-LABEL: and_lshr2:
1695; VI:       ; %bb.0:
1696; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1697; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
1698; VI-NEXT:    s_mov_b32 s7, 0xf000
1699; VI-NEXT:    s_mov_b32 s6, -1
1700; VI-NEXT:    s_waitcnt lgkmcnt(0)
1701; VI-NEXT:    s_bfe_u32 s0, s0, 0x30006
1702; VI-NEXT:    v_mov_b32_e32 v0, s0
1703; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1704; VI-NEXT:    s_endpgm
1705  %b = and i32 %a, 511
1706  %c = lshr i32 %b, 6
1707  store i32 %c, i32 addrspace(1)* %out, align 8
1708  ret void
1709}
1710
1711define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
1712; SI-LABEL: shl_lshr:
1713; SI:       ; %bb.0:
1714; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
1715; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1716; SI-NEXT:    s_mov_b32 s7, 0xf000
1717; SI-NEXT:    s_waitcnt lgkmcnt(0)
1718; SI-NEXT:    s_bfe_u32 s0, s2, 0x150002
1719; SI-NEXT:    s_mov_b32 s6, -1
1720; SI-NEXT:    v_mov_b32_e32 v0, s0
1721; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1722; SI-NEXT:    s_endpgm
1723;
1724; VI-LABEL: shl_lshr:
1725; VI:       ; %bb.0:
1726; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1727; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
1728; VI-NEXT:    s_mov_b32 s7, 0xf000
1729; VI-NEXT:    s_mov_b32 s6, -1
1730; VI-NEXT:    s_waitcnt lgkmcnt(0)
1731; VI-NEXT:    s_bfe_u32 s0, s0, 0x150002
1732; VI-NEXT:    v_mov_b32_e32 v0, s0
1733; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1734; VI-NEXT:    s_endpgm
1735  %b = shl i32 %a, 9
1736  %c = lshr i32 %b, 11
1737  store i32 %c, i32 addrspace(1)* %out, align 8
1738  ret void
1739}
1740
1741declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) #1
1742
1743attributes #0 = { nounwind }
1744attributes #1 = { nounwind readnone }
1745