1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
3
4declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
5declare i32 @llvm.r600.read.tidig.x() nounwind readnone
6
7
8; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
9; SI: s_load_dword [[ARG:s[0-9]+]],
10; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
11; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
12; SI: buffer_store_dword [[EXTRACT]],
13
14; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
15; EG: LSHR * [[ADDR]]
16; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
; Sign-extends bit 0 of %in across the full i32 and stores it to %out.
define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
  ; Move bit 0 into the sign position, then smear it back down.
  %bit.at.top = shl i32 %in, 31
  %extended = ashr i32 %bit.at.top, 31
  store i32 %extended, i32 addrspace(1)* %out
  ret void
}
23
24; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32:
25; SI: s_add_i32 [[VAL:s[0-9]+]],
26; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
27; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
28; SI: buffer_store_dword [[VEXTRACT]],
29
30; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
31; EG: ADD_INT
32; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
33; EG-NEXT: LSHR * [[ADDR]]
; Sign-extends the low 8 bits of (%a + %b) to i32 and stores the result.
define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add i32 %a, %b
  %byte.at.top = shl i32 %sum, 24
  %extended = ashr i32 %byte.at.top, 24
  store i32 %extended, i32 addrspace(1)* %out, align 4
  ret void
}
41
42; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32:
43; SI: s_add_i32 [[VAL:s[0-9]+]],
44; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]]
45; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
46; SI: buffer_store_dword [[VEXTRACT]],
47
48; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
49; EG: ADD_INT
50; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
51; EG-NEXT: LSHR * [[ADDR]]
; Sign-extends the low 16 bits of (%a + %b) to i32 and stores the result.
define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add i32 %a, %b
  %half.at.top = shl i32 %sum, 16
  %extended = ashr i32 %half.at.top, 16
  store i32 %extended, i32 addrspace(1)* %out, align 4
  ret void
}
59
60; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32:
61; SI: s_add_i32 [[VAL:s[0-9]+]],
62; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
63; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
64; SI: buffer_store_dword [[VEXTRACT]],
65
66; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
67; EG: ADD_INT
68; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
69; EG-NEXT: LSHR * [[ADDR]]
; Single-element vector form of the 8-bit in-register sign extension.
define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add <1 x i32> %a, %b
  %byte.at.top = shl <1 x i32> %sum, <i32 24>
  %extended = ashr <1 x i32> %byte.at.top, <i32 24>
  store <1 x i32> %extended, <1 x i32> addrspace(1)* %out, align 4
  ret void
}
77
78; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
79; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
80; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
81; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
82; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
83; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
; Sign-extends bit 0 of (%a << %b) across the full i64 and stores it.
define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  ; Variable shift of the kernel arguments produces the value to extend.
  %src = shl i64 %a, %b
  %bit.at.top = shl i64 %src, 63
  %extended = ashr i64 %bit.at.top, 63
  store i64 %extended, i64 addrspace(1)* %out, align 8
  ret void
}
91
92; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
93; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
94; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
95; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
96; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
97; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
; Sign-extends the low 8 bits of (%a << %b) to i64 and stores the result.
define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  ; Variable shift of the kernel arguments produces the value to extend.
  %src = shl i64 %a, %b
  %byte.at.top = shl i64 %src, 56
  %extended = ashr i64 %byte.at.top, 56
  store i64 %extended, i64 addrspace(1)* %out, align 8
  ret void
}
105
106; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
107; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
108; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
109; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
110; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
111; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
112
; Sign-extends the low 16 bits of (%a << %b) to i64 and stores the result.
define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  ; Variable shift of the kernel arguments produces the value to extend.
  %src = shl i64 %a, %b
  %half.at.top = shl i64 %src, 48
  %extended = ashr i64 %half.at.top, 48
  store i64 %extended, i64 addrspace(1)* %out, align 8
  ret void
}
120
121; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
122; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
123; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
124; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
125; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
126; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
; Sign-extends the low 32 bits of (%a << %b) to i64 and stores the result.
define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  ; Variable shift of the kernel arguments produces the value to extend.
  %src = shl i64 %a, %b
  %word.at.top = shl i64 %src, 32
  %extended = ashr i64 %word.at.top, 32
  store i64 %extended, i64 addrspace(1)* %out, align 8
  ret void
}
134
135; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
136; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64:
137; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
138; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31
139; XSI: buffer_store_dword
140; XEG: BFE_INT
141; XEG: ASHR
142; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
143;   %c = add <1 x i64> %a, %b
144;   %shl = shl <1 x i64> %c, <i64 56>
145;   %ashr = ashr <1 x i64> %shl, <i64 56>
146;   store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8
147;   ret void
148; }
149
150; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64:
151; SI: buffer_load_dwordx2
152; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
153; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
154; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
155; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
; Per-thread variant: loads two i64 values indexed by the result of
; @llvm.r600.read.tidig.x, shifts one by the other, sign-extends bit 0 of the
; result to i64 and stores it at the matching index of %out.
define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  ; Index %bptr here; the shift amount previously came from %aptr as well,
  ; which left the %bptr argument unused.
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  ; shl/ashr by 63 replicates bit 0 of %c through all 64 bits.
  %shl = shl i64 %c, 63
  %ashr = ashr i64 %shl, 63
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}
170
171; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64:
172; SI: buffer_load_dwordx2
173; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
174; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
175; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
176; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
; Per-thread variant: loads two i64 values indexed by the result of
; @llvm.r600.read.tidig.x, shifts one by the other, sign-extends the low
; 8 bits of the result to i64 and stores it at the matching index of %out.
define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  ; Index %bptr here; the shift amount previously came from %aptr as well,
  ; which left the %bptr argument unused.
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  ; shl/ashr by 56 sign-extends the low byte of %c.
  %shl = shl i64 %c, 56
  %ashr = ashr i64 %shl, 56
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}
191
192; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64:
193; SI: buffer_load_dwordx2
194; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
195; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
196; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
197; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
; Per-thread variant: loads two i64 values indexed by the result of
; @llvm.r600.read.tidig.x, shifts one by the other, sign-extends the low
; 16 bits of the result to i64 and stores it at the matching index of %out.
define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  ; Index %bptr here; the shift amount previously came from %aptr as well,
  ; which left the %bptr argument unused.
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  ; shl/ashr by 48 sign-extends the low 16 bits of %c.
  %shl = shl i64 %c, 48
  %ashr = ashr i64 %shl, 48
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}
212
213; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64:
214; SI: buffer_load_dwordx2
215; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
216; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
217; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
; Per-thread variant: loads two i64 values indexed by the result of
; @llvm.r600.read.tidig.x, shifts one by the other, sign-extends the low
; 32 bits of the result to i64 and stores it at the matching index of %out.
define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  ; Index %bptr here; the shift amount previously came from %aptr as well,
  ; which left the %bptr argument unused.
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  ; shl/ashr by 32 sign-extends the low dword of %c.
  %shl = shl i64 %c, 32
  %ashr = ashr i64 %shl, 32
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}
232
233; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
234; SI-NOT: s_lshl
235; SI-NOT: s_ashr
236; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
237
238; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
239; EG-NOT: BFE
240; EG: ADD_INT
241; EG: LSHL
242; EG: ASHR [[RES]]
243; EG: LSHR {{\*?}} [[ADDR]]
; Shift pair with unequal amounts: shl by 6 then ashr by 7 yields the signed
; 25-bit field that starts at bit 1 of (%a + %b).
define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %sum = add i32 %a, %b
  %up = shl i32 %sum, 6
  %down = ashr i32 %up, 7
  store i32 %down, i32 addrspace(1)* %out
  ret void
}
251
252; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
253; SI-NOT: s_lshl
254; SI-NOT: s_ashr
255; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
256; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
257; SI: s_endpgm
258
259; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
260; EG-NOT: BFE
261; EG: ADD_INT
262; EG: LSHL
263; EG: ASHR [[RES]]
264; EG: LSHL
265; EG: ASHR [[RES]]
266; EG: LSHR {{\*?}} [[ADDR]]
; Two-element vector form of the unequal shift pair (shl 6, ashr 7) applied
; to each lane of (%a + %b).
define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %sum = add <2 x i32> %a, %b
  %up = shl <2 x i32> %sum, <i32 6, i32 6>
  %down = ashr <2 x i32> %up, <i32 7, i32 7>
  store <2 x i32> %down, <2 x i32> addrspace(1)* %out
  ret void
}
274
275
276; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32:
277; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
278; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
279; SI: buffer_store_dwordx2
280
281; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
282; EG: BFE_INT [[RES]]
283; EG: BFE_INT [[RES]]
284; EG: LSHR {{\*?}} [[ADDR]]
; Sign-extends bit 0 of each lane of (%a + %b) across a <2 x i32>.
define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add <2 x i32> %a, %b
  %bit.at.top = shl <2 x i32> %sum, <i32 31, i32 31>
  %extended = ashr <2 x i32> %bit.at.top, <i32 31, i32 31>
  store <2 x i32> %extended, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
292
293; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32:
294; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
295; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
296; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
297; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
298; SI: buffer_store_dwordx4
299
300; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
301; EG: BFE_INT [[RES]]
302; EG: BFE_INT [[RES]]
303; EG: BFE_INT [[RES]]
304; EG: BFE_INT [[RES]]
305; EG: LSHR {{\*?}} [[ADDR]]
; Sign-extends bit 0 of each lane of (%a + %b) across a <4 x i32>.
define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add <4 x i32> %a, %b
  %bit.at.top = shl <4 x i32> %sum, <i32 31, i32 31, i32 31, i32 31>
  %extended = ashr <4 x i32> %bit.at.top, <i32 31, i32 31, i32 31, i32 31>
  store <4 x i32> %extended, <4 x i32> addrspace(1)* %out, align 8
  ret void
}
313
314; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32:
315; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
316; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
317; SI: buffer_store_dwordx2
318
319; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
320; EG: BFE_INT [[RES]]
321; EG: BFE_INT [[RES]]
322; EG: LSHR {{\*?}} [[ADDR]]
; Sign-extends the low byte of each lane of (%a + %b) in a <2 x i32>.
define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add <2 x i32> %a, %b
  %byte.at.top = shl <2 x i32> %sum, <i32 24, i32 24>
  %extended = ashr <2 x i32> %byte.at.top, <i32 24, i32 24>
  store <2 x i32> %extended, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
330
331; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32:
332; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
333; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
334; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
335; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
336; SI: buffer_store_dwordx4
337
338; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
339; EG: BFE_INT [[RES]]
340; EG: BFE_INT [[RES]]
341; EG: BFE_INT [[RES]]
342; EG: BFE_INT [[RES]]
343; EG: LSHR {{\*?}} [[ADDR]]
; Sign-extends the low byte of each lane of (%a + %b) in a <4 x i32>.
define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add <4 x i32> %a, %b
  %byte.at.top = shl <4 x i32> %sum, <i32 24, i32 24, i32 24, i32 24>
  %extended = ashr <4 x i32> %byte.at.top, <i32 24, i32 24, i32 24, i32 24>
  store <4 x i32> %extended, <4 x i32> addrspace(1)* %out, align 8
  ret void
}
351
352; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32:
353; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
354; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
355; SI: buffer_store_dwordx2
356
357; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
358; EG: BFE_INT [[RES]]
359; EG: BFE_INT [[RES]]
360; EG: LSHR {{\*?}} [[ADDR]]
; Sign-extends the low 16 bits of each lane of (%a + %b) in a <2 x i32>.
define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add <2 x i32> %a, %b
  %half.at.top = shl <2 x i32> %sum, <i32 16, i32 16>
  %extended = ashr <2 x i32> %half.at.top, <i32 16, i32 16>
  store <2 x i32> %extended, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
368
369; FUNC-LABEL: {{^}}testcase:
; i8 kernel that XORs two selects of %a: one zeroed when %a is negative, the
; other zeroed when bit 0 of %a is clear. Only the function label is checked.
define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
  %lsb = and i8 %a, 1
  %lsb.clear = icmp eq i8 %lsb, 0
  %is.neg = icmp slt i8 %a, 0
  %keep.nonneg = select i1 %is.neg, i8 0, i8 %a
  %keep.odd = select i1 %lsb.clear, i8 0, i8 %a
  %mixed = xor i8 %keep.nonneg, %keep.odd
  store i8 %mixed, i8 addrspace(1)* %out
  ret void
}
380
381; FUNC-LABEL: {{^}}testcase_3:
; i8 kernel that XORs two selects of %a: one zeroed when %a is negative, the
; other zeroed when bit 0 of %a is clear. Only the function label is checked.
; NOTE(review): the body matches @testcase instruction for instruction;
; confirm whether a different predicate was intended for this variant.
define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
  %lsb = and i8 %a, 1
  %lsb.clear = icmp eq i8 %lsb, 0
  %is.neg = icmp slt i8 %a, 0
  %keep.nonneg = select i1 %is.neg, i8 0, i8 %a
  %keep.odd = select i1 %lsb.clear, i8 0, i8 %a
  %mixed = xor i8 %keep.nonneg, %keep.odd
  store i8 %mixed, i8 addrspace(1)* %out
  ret void
}
392
393; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32:
394; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
395; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
396; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
397; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; Loads two <4 x i32> vectors from memory, adds them, and sign-extends the
; low byte of every lane before storing to %out.
define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
  %lhs = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
  %rhs = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
  ; The add keeps the extension from being folded into an extending load.
  %sum = add <4 x i32> %lhs, %rhs
  %byte.at.top = shl <4 x i32> %sum, <i32 24, i32 24, i32 24, i32 24>
  %extended = ashr <4 x i32> %byte.at.top, <i32 24, i32 24, i32 24, i32 24>
  store <4 x i32> %extended, <4 x i32> addrspace(1)* %out, align 8
  ret void
}
407
408; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
409; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
410; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
; Loads two <4 x i32> vectors from memory, adds them, and sign-extends the
; low 16 bits of every lane before storing to %out.
define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
  %lhs = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
  %rhs = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
  ; The add keeps the extension from being folded into an extending load.
  %sum = add <4 x i32> %lhs, %rhs
  %half.at.top = shl <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %extended = ashr <4 x i32> %half.at.top, <i32 16, i32 16, i32 16, i32 16>
  store <4 x i32> %extended, <4 x i32> addrspace(1)* %out, align 8
  ret void
}
420
421; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type:
422; SI: buffer_load_sbyte
423; SI: v_max_i32
424; SI-NOT: bfe
425; SI: buffer_store_short
; Loads an i8, widens it to i32, clamps it at zero from below (icmp sgt plus
; select), then narrows to i8 and sign-extends to i16 for the store.
define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
  %byte = load i8, i8 addrspace(1)* %src, align 1
  %wide = sext i8 %byte to i32
  %is.pos = icmp sgt i32 %wide, 0
  %clamped = select i1 %is.pos, i32 %wide, i32 0
  %narrow = trunc i32 %clamped to i8
  %result = sext i8 %narrow to i16
  store i16 %result, i16 addrspace(1)* %out, align 2
  ret void
}
436
437declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
438
439; FUNC-LABEL: {{^}}bfe_0_width:
440; SI-NOT: {{[^@]}}bfe
441; SI: s_endpgm
; Calls the signed BFE intrinsic with operands (value, 8, 0) on a loaded i32
; and stores the result.
define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %val = load i32, i32 addrspace(1)* %ptr, align 4
  %field = call i32 @llvm.AMDGPU.bfe.i32(i32 %val, i32 8, i32 0) nounwind readnone
  store i32 %field, i32 addrspace(1)* %out, align 4
  ret void
}
448
449; FUNC-LABEL: {{^}}bfe_8_bfe_8:
450; SI: v_bfe_i32
451; SI-NOT: {{[^@]}}bfe
452; SI: s_endpgm
; Applies the signed BFE intrinsic with operands (0, 8) twice in a row.
define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %val = load i32, i32 addrspace(1)* %ptr, align 4
  %inner = call i32 @llvm.AMDGPU.bfe.i32(i32 %val, i32 0, i32 8) nounwind readnone
  %outer = call i32 @llvm.AMDGPU.bfe.i32(i32 %inner, i32 0, i32 8) nounwind readnone
  store i32 %outer, i32 addrspace(1)* %out, align 4
  ret void
}
460
461; FUNC-LABEL: {{^}}bfe_8_bfe_16:
462; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
463; SI: s_endpgm
; Applies the signed BFE intrinsic with operands (0, 8), then again with
; operands (0, 16) on the first result.
define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %val = load i32, i32 addrspace(1)* %ptr, align 4
  %inner = call i32 @llvm.AMDGPU.bfe.i32(i32 %val, i32 0, i32 8) nounwind readnone
  %outer = call i32 @llvm.AMDGPU.bfe.i32(i32 %inner, i32 0, i32 16) nounwind readnone
  store i32 %outer, i32 addrspace(1)* %out, align 4
  ret void
}
471
; The two nested BFEs should really be folded into a single BFE.
473; FUNC-LABEL: {{^}}bfe_16_bfe_8:
474; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
475; SI-NOT: {{[^@]}}bfe
476; SI: s_endpgm
; Applies the signed BFE intrinsic with operands (0, 16), then again with
; operands (0, 8) on the first result.
define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %val = load i32, i32 addrspace(1)* %ptr, align 4
  %inner = call i32 @llvm.AMDGPU.bfe.i32(i32 %val, i32 0, i32 16) nounwind readnone
  %outer = call i32 @llvm.AMDGPU.bfe.i32(i32 %inner, i32 0, i32 8) nounwind readnone
  store i32 %outer, i32 addrspace(1)* %out, align 4
  ret void
}
484
485; Make sure there isn't a redundant BFE
486; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
487; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
488; SI-NOT: {{[^@]}}bfe
489; SI: s_endpgm
; Feeds a BFE with operands (0, 8) into an shl/ashr-by-24 pair, so the same
; 8-bit sign extension is expressed twice.
define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add i32 %a, %b
  %field = call i32 @llvm.AMDGPU.bfe.i32(i32 %sum, i32 0, i32 8) nounwind readnone
  %byte.at.top = shl i32 %field, 24
  %extended = ashr i32 %byte.at.top, 24
  store i32 %extended, i32 addrspace(1)* %out, align 4
  ret void
}
498
499; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
; Same shape as @sext_in_reg_i8_to_i32_bfe, but the BFE operands are (8, 0)
; instead of (0, 8). Only the function label is checked for this kernel.
define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  ; The add keeps the extension from being folded into an extending load.
  %sum = add i32 %a, %b
  %field = call i32 @llvm.AMDGPU.bfe.i32(i32 %sum, i32 8, i32 0) nounwind readnone
  %byte.at.top = shl i32 %field, 24
  %extended = ashr i32 %byte.at.top, 24
  store i32 %extended, i32 addrspace(1)* %out, align 4
  ret void
}
508
509; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe:
510; SI: buffer_load_sbyte
511; SI-NOT: {{[^@]}}bfe
512; SI: s_endpgm
; Sign-extending i8 load followed by a BFE with operands (0, 8) and an
; shl/ashr-by-24 pair on the already-extended value.
define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
  %byte = load i8, i8 addrspace(1)* %ptr, align 1
  %wide = sext i8 %byte to i32
  %field = call i32 @llvm.AMDGPU.bfe.i32(i32 %wide, i32 0, i32 8) nounwind readnone
  %byte.at.top = shl i32 %field, 24
  %extended = ashr i32 %byte.at.top, 24
  store i32 %extended, i32 addrspace(1)* %out, align 4
  ret void
}
522
523; SI: .text
524; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
525; SI-NOT: {{[^@]}}bfe
526; SI: s_endpgm
; Sign-extending i8 load followed by a BFE with operands (8, 0) and an
; shl/ashr-by-24 pair.
define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
  %byte = load i8, i8 addrspace(1)* %ptr, align 1
  %wide = sext i8 %byte to i32
  %field = call i32 @llvm.AMDGPU.bfe.i32(i32 %wide, i32 8, i32 0) nounwind readnone
  %byte.at.top = shl i32 %field, 24
  %extended = ashr i32 %byte.at.top, 24
  store i32 %extended, i32 addrspace(1)* %out, align 4
  ret void
}
536
537; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
538; SI-NOT: shr
539; SI-NOT: shl
540; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
541; SI: s_endpgm
; Sign-extends bit 0 of a loaded i32 with shl/ashr by 31, then applies a BFE
; with operands (0, 1) to the extended value.
define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %bit.at.top = shl i32 %val, 31
  %extended = ashr i32 %bit.at.top, 31
  %field = call i32 @llvm.AMDGPU.bfe.i32(i32 %extended, i32 0, i32 1)
  store i32 %field, i32 addrspace(1)* %out, align 4
  ret void
}
550
551; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
552; SI: buffer_load_dword
553; SI-NOT: shl
554; SI-NOT: shr
555; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
556; SI: s_endpgm
; Sign-extends the low 2 bits of a loaded i32 with shl/ashr by 30, then
; applies a BFE with operands (1, 1) to the extended value.
define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %pair.at.top = shl i32 %val, 30
  %extended = ashr i32 %pair.at.top, 30
  %field = call i32 @llvm.AMDGPU.bfe.i32(i32 %extended, i32 1, i32 1)
  store i32 %field, i32 addrspace(1)* %out, align 4
  ret void
}
565
566; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
567; SI: buffer_load_dword
568; SI-NOT: v_lshl
569; SI-NOT: v_ashr
570; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
571; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
572; SI: s_endpgm
; Sign-extends the low 2 bits of a loaded i32 with shl/ashr by 30, then
; applies a BFE with operands (1, 2) to the extended value.
define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %pair.at.top = shl i32 %val, 30
  %extended = ashr i32 %pair.at.top, 30
  %field = call i32 @llvm.AMDGPU.bfe.i32(i32 %extended, i32 1, i32 2)
  store i32 %field, i32 addrspace(1)* %out, align 4
  ret void
}
581
582; Make sure we propagate the VALUness to users of a moved scalar BFE.
583
584; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64_move_use:
585; SI: buffer_load_dwordx2
586; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
587; SI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
588; SI-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
589; SI-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]]
590; SI-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]]
591; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
; Per-thread variant with a dependent user: loads two i64 values indexed by
; the result of @llvm.r600.read.tidig.x, shifts one by the other, sign-extends
; bit 0 of the result to i64, ANDs it with the kernel argument %s.val and
; stores it at the matching index of %out.
define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  ; Index %bptr here; the shift amount previously came from %aptr as well,
  ; which left the %bptr argument unused.
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  ; shl/ashr by 63 replicates bit 0 of %c through all 64 bits.
  %shl = shl i64 %c, 63
  %ashr = ashr i64 %shl, 63

  ; The AND is the extra user of the sign-extended value.
  %and = and i64 %ashr, %s.val
  store i64 %and, i64 addrspace(1)* %out.gep, align 8
  ret void
}
608
609; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64_move_use:
610; SI: buffer_load_dwordx2
611; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
612; SI-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
613; SI-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]]
614; SI-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]]
615; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
; Per-thread variant with a dependent user: loads two i64 values indexed by
; the result of @llvm.r600.read.tidig.x, shifts one by the other, sign-extends
; the low 32 bits of the result to i64, ANDs it with the kernel argument
; %s.val and stores it at the matching index of %out.
define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  ; Index %bptr here; the shift amount previously came from %aptr as well,
  ; which left the %bptr argument unused.
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  ; shl/ashr by 32 sign-extends the low dword of %c.
  %shl = shl i64 %c, 32
  %ashr = ashr i64 %shl, 32
  ; The AND is the extra user of the sign-extended value.
  %and = and i64 %ashr, %s.val
  store i64 %and, i64 addrspace(1)* %out.gep, align 8
  ret void
}
631