; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

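; The i16 load is dword-aligned in the constant address space, so it is
; widened to a full 32-bit s_load_dword on both targets.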
define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(4)* %arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, i16 addrspace(1)* null
  ret void
}

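; The zext to i32 of the widened value is lowered as an s_and_b32 with 0xffff.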
define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_constant_load_zext_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_constant_load_zext_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(4)* %arg, align 4
  %ext = zext i16 %load to i32
  %add = add i32 %ext, 999
  %or = or i32 %add, 4
  store i32 %or, i32 addrspace(1)* null
  ret void
}

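; With a sext to i32, the widened load is sign-extended with s_sext_i32_i16.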
define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_constant_load_sext_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_sext_i32_i16 s1, s1
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_constant_load_sext_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_sext_i32_i16 s0, s0
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(4)* %arg, align 4
  %ext = sext i16 %load to i32
  %add = add i32 %ext, 999
  %or = or i32 %add, 4
  store i32 %or, i32 addrspace(1)* null
  ret void
}

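; The non-power-of-two i17 type is widened too; the result is stored as a
; 16-bit short plus one extra bit extracted with s_bfe_u32 and stored as a byte.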
define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
; SI-LABEL: widen_i17_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s7, s[6:7], 0x0
; SI-NEXT:    s_mov_b32 s4, 2
; SI-NEXT:    s_mov_b32 s5, s0
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_add_i32 s7, s7, 34
; SI-NEXT:    s_or_b32 s7, s7, 4
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    s_bfe_u32 s8, s7, 0x10010
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i17_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v2, 2
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_mov_b32_e32 v3, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_i32 s0, s0, 34
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_bfe_u32 s0, s0, 0x10010
; VI-NEXT:    flat_store_short v[0:1], v4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
  %load = load i17, i17 addrspace(4)* %arg, align 4
  %add = add i17 %load, 34
  %or = or i17 %add, 4
  store i17 %or, i17 addrspace(1)* null
  ret void
}

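; Half loads are widened as well; SI has no f16 arithmetic and round-trips
; through f32, while VI adds directly with v_add_f16.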
define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) {
; SI-LABEL: widen_f16_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_add_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_f16_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_f16_e64 v2, s0, 4.0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load half, half addrspace(4)* %arg, align 4
  %add = fadd half %load, 4.0
  store half %add, half addrspace(1)* null
  ret void
}

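; A <2 x i8> load with dword alignment is also widened to a single 32-bit load.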
; FIXME: valu usage on VI
define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) {
; SI-LABEL: widen_v2i8_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s1, 0xff00
; SI-NEXT:    s_add_i32 s1, s1, 12
; SI-NEXT:    s_or_b32 s1, s1, 4
; SI-NEXT:    s_and_b32 s1, s1, 0xff
; SI-NEXT:    s_or_b32 s1, s4, s1
; SI-NEXT:    s_addk_i32 s1, 0x2c00
; SI-NEXT:    s_or_b32 s4, s1, 0x300
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_v2i8_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 44
; VI-NEXT:    v_mov_b32_e32 v1, 3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s1, s0, 0xffff
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    s_add_i32 s1, s1, 12
; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; VI-NEXT:    s_or_b32 s0, s1, 4
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    s_and_b32 s0, s0, 0xff
; VI-NEXT:    v_or_b32_e32 v2, s0, v0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4
  %add = add <2 x i8> %load, <i8 12, i8 44>
  %or = or <2 x i8> %add, <i8 4, i8 3>
  store <2 x i8> %or, <2 x i8> addrspace(1)* null
  ret void
}

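; A divergent load (indexed by the workitem ID) must not be widened; it
; remains a 16-bit buffer/flat load.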
define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) {
; SI-LABEL: no_widen_i16_constant_divergent_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x3e7, v0
; SI-NEXT:    v_or_b32_e32 v0, 4, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: no_widen_i16_constant_divergent_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_e32 v0, 0x3e7, v0
; VI-NEXT:    v_or_b32_e32 v2, 4, v0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep.arg = getelementptr inbounds i16, i16 addrspace(4)* %arg, i64 %tid.ext
  %load = load i16, i16 addrspace(4)* %gep.arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, i16 addrspace(1)* null
  ret void
}

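; An i1 load with dword alignment is widened; only the low bit survives the
; s_and_b32 with 1.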
define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) {
; SI-LABEL: widen_i1_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s1, 1
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i1_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i1, i1 addrspace(4)* %arg, align 4
  %and = and i1 %load, true
  store i1 %and, i1 addrspace(1)* null
  ret void
}

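; Note the extension here is to i32 despite the function name; the widened
; load feeds the same mask/add/or sequence as the zext_i32 case above.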
define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_zextload_i64_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_zextload_i64_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(4)* %arg, align 4
  %zext = zext i16 %load to i32
  %add = add i32 %zext, 999
  %or = or i32 %add, 4
  store i32 %or, i32 addrspace(1)* null
  ret void
}

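; The widened i1 feeds a 64-bit add, so the carry into the high half is
; materialized with s_addc_u32.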
define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) {
; SI-LABEL: widen_i1_zext_to_i64_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s1, s1, 1
; SI-NEXT:    s_add_u32 s4, s1, 0x3e7
; SI-NEXT:    s_addc_u32 s5, 0, 0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i1_zext_to_i64_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s0, 1
; VI-NEXT:    s_add_u32 s0, s0, 0x3e7
; VI-NEXT:    s_addc_u32 s1, 0, 0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
  %load = load i1, i1 addrspace(4)* %arg, align 4
  %zext = zext i1 %load to i64
  %add = add i64 %zext, 999
  store i64 %add, i64 addrspace(1)* null
  ret void
}

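; addrspace(6) is a 32-bit constant pointer; the high half of the address is
; zeroed (s_mov_b32 s1, 0) before the widened s_load_dword.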
define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
; SI-LABEL: widen_i16_constant32_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s1, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_addk_i32 s0, 0x3e7
; SI-NEXT:    s_or_b32 s4, s0, 4
; SI-NEXT:    s_mov_b32 s0, s1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_constant32_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s1, 0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(6)* %arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, i16 addrspace(1)* null
  ret void
}

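; A global-memory load marked !invariant.load is treated like a constant load
; and widened to s_load_dword.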
define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) {
; SI-LABEL: widen_i16_global_invariant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 1
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_global_invariant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(1)* %arg, align 4, !invariant.load !0
  %add = add i16 %load, 999
  %or = or i16 %add, 1
  store i16 %or, i16 addrspace(1)* null
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

!0 = !{}