; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-GFX9,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s

; FUNC-LABEL: {{^}}i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


; EGCM: VTX_READ_8{{.*}} #3
; EGCM: KC0[2].Y
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb

; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-GFX9: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-GFX9: global_store_dword


; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
  %ext = sext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb

; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}

; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-GFX9: global_store_dword

; EGCM: VTX_READ_16
; EGCM: KC0[2].Y
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-GFX9: global_store_dword

; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c


; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-GFX9: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-GFX9: global_store_dword

; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
  %ext = sext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-GFX9: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-GFX9: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; GCN-DAG: s_load_dwordx2 s
; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c

; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
entry:
  store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd

; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind {
entry:
  store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind {
entry:
  store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind {
entry:
  store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5i64_arg:
; HSA-GFX9: kernarg_segment_byte_size = 128
; HSA-GFX9: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
entry:
  store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v5f64_arg:
; HSA-GFX9: kernarg_segment_byte_size = 128
; HSA-GFX9: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
entry:
  store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
  ret void
}

; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; SI-NOT: {{buffer|flat|global}}_load
; SI: s_load_dwordx2 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx2 s
; VI-NEXT: s_load_dwordx2 s
; VI-NOT: lshl
; VI-NOT: _or
; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dwordx4
; SI-NEXT: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34

; HSA-GFX9: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X

; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-GFX9: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Pack/repack on VI

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; SI: s_load_dwordx4 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; VI: s_load_dwordx4 s
; VI-NOT: shr
; VI-NOT: shl
; VI-NOT: _sdwa
; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dwordx8 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44

; HSA-GFX9: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 128
; HSA-GFX9: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-GFX9: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 128
; HSA-GFX9: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-GFX9: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0

; MESA-GCN: buffer_store_dwordx2
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-GCN: buffer_store_dwordx2

; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i65_arg:
; HSA-GFX9: kernarg_segment_byte_size = 24
; HSA-GFX9: kernarg_segment_alignment = 4
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
entry:
  store i65 %in, i65 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
; GCN: {{buffer|flat|global}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat|global}}_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: {{buffer|flat|global}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat|global}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-GFX9: kernarg_segment_byte_size = 12
; HSA-GFX9: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: s_bfe_i64
; GCN: {{buffer|flat|global}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-GFX9: kernarg_segment_byte_size = 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
  ret void
}

; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8,
; load 4 from 24,
; load 8 from 32

; With the SelectionDAG argument lowering, the alignments of the struct
; members are not properly considered, making these wrong.

; FIXME: Total argument size is computed wrong
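; The checks below expect the second {i32, i64} at its natural 8-byte
; alignment after the i8 at offset 16: its i32 at 0x18 and its i64 at 0x20.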
; FUNC-LABEL: {{^}}struct_argument_alignment:
; HSA-GFX9: kernarg_segment_byte_size = 40
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; No padding between the i8 and the next struct, but the total is rounded up
; to a 4-byte multiple.
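; Packed layout: the first <{i32, i64}> occupies bytes 0-11, the i8 sits at
; byte 12, and the second struct starts at byte 13 (i32 at 13, i64 at 17);
; 25 bytes rounds up to a 28-byte segment.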
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-GFX9: kernarg_segment_byte_size = 28
; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17
; HSA-GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}struct_argument_alignment_after:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-GFX9: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}array_3xi32:
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
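; The i16 is loaded from the first dword and the [3 x i32] elements follow at
; offsets 0x4, 0x8 and 0xc.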
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  store volatile i16 %arg0, i16 addrspace(1)* undef
  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
  ret void
}

; FIXME: Why not all scalar loads?
; GCN-LABEL: {{^}}array_3xi16:
; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:2
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:4
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:6
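; The i8 sits at byte 0 and the [3 x i16] elements follow at byte offsets 2,
; 4 and 6, so they are fetched with ushort loads.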
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  store volatile i8 %arg0, i8 addrspace(1)* undef
  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}small_array_round_down_offset:
; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:1
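; The [1 x i8] element immediately follows the leading i8, so it is loaded
; from byte offset 1.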
define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
  %val = extractvalue [1 x i8] %arg, 0
  store volatile i8 %val, i8 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}byref_align_constant_i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 264
; HSA-GFX9-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x100{{$}}
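; align(256) on the byref argument pushes it to offset 0x100; %after.offset
; follows at 0x104, so both values are covered by the dwordx2 load and the
; segment is 264 bytes.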
define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
  %in = load i32, i32 addrspace(4)* %in.byref
  store volatile i32 %in, i32 addrspace(1)* %out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 132
; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80
; HSA-GFX9-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}}
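; The <16 x i32> byref argument is naturally 64-byte aligned, so it is loaded
; from offset 0x40 and %after.offset from 0x80, giving a 132-byte segment.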
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}