1; Verifies correctness of load/store of parameters and return values.
2; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
3
; Single-member struct wrappers, one per scalar type.
4%s_i1 = type { i1 }
5%s_i8 = type { i8 }
6%s_i16 = type { i16 }
7%s_f16 = type { half }
8%s_i32 = type { i32 }
9%s_f32 = type { float }
10%s_i64 = type { i64 }
11%s_f64 = type { double }
12
13; More complicated types. i64 is used to increase natural alignment
14; requirement for the type.
15%s_i32x4 = type { i32, i32, i32, i32, i64}
16%s_i32f32 = type { i32, float, i32, float, i64}
17%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64}
; Same member list as %s_i8i32x4, but declared as a packed struct (<{ ... }>).
18%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}>
; Mixes a scalar, an array, a vector and an array of structs in one aggregate.
19%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]}
20; All scalar parameters must be at least 32 bits in size.
21; i1 is loaded/stored as i8.
22
23; CHECK: .func  (.param .b32 func_retval0)
24; CHECK-LABEL: test_i1(
25; CHECK-NEXT: .param .b32 test_i1_param_0
26; CHECK:      ld.param.u8 [[A8:%rs[0-9]+]], [test_i1_param_0];
27; CHECK:      and.b16 [[A:%rs[0-9]+]], [[A8]], 1;
28; CHECK:      setp.eq.b16 %p1, [[A]], 1
29; CHECK:      cvt.u32.u16 [[B:%r[0-9]+]], [[A8]]
30; CHECK:      and.b32 [[C:%r[0-9]+]], [[B]], 1;
31; CHECK:      .param .b32 param0;
32; CHECK:      st.param.b32    [param0+0], [[C]]
33; CHECK:      .param .b32 retval0;
34; CHECK:      call.uni
35; CHECK-NEXT: test_i1,
36; CHECK:      ld.param.b32    [[R8:%r[0-9]+]], [retval0+0];
37; CHECK:      and.b32         [[R:%r[0-9]+]], [[R8]], 1;
38; CHECK:      st.param.b32    [func_retval0+0], [[R]];
39; CHECK:      ret;
; Recursive self-call: passes %a through and returns the callee's i1 result.
40define i1 @test_i1(i1 %a) {
41    %ret = tail call i1 @test_i1(i1 %a)
42    ret i1 %ret
43}
44
45; Signed i1 is a somewhat special case. We only care about one bit and
46; then use neg.s32 to convert it to 32-bit -1 if it's set.
47; CHECK: .func  (.param .b32 func_retval0)
48; CHECK-LABEL: test_i1s(
49; CHECK-NEXT: .param .b32 test_i1s_param_0
50; CHECK:      ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
51; CHECK:      cvt.u32.u16     [[A32:%r[0-9]+]], [[A8]];
52; CHECK:      and.b32         [[A1:%r[0-9]+]], [[A32]], 1;
53; CHECK:      neg.s32         [[A:%r[0-9]+]], [[A1]];
54; CHECK:      .param .b32 param0;
55; CHECK:      st.param.b32    [param0+0], [[A]];
56; CHECK:      .param .b32 retval0;
57; CHECK:      call.uni
58; CHECK:      ld.param.b32    [[R8:%r[0-9]+]], [retval0+0];
59; CHECK:      and.b32         [[R1:%r[0-9]+]], [[R8]], 1;
60; CHECK:      neg.s32         [[R:%r[0-9]+]], [[R1]];
61; CHECK:      st.param.b32    [func_retval0+0], [[R]];
62; CHECK-NEXT: ret;
; Recursive self-call with signext on both the i1 argument and the i1 result.
63define signext i1 @test_i1s(i1 signext %a) {
64  %ret = tail call signext i1 @test_i1s(i1 signext %a)
65  ret i1 %ret
66}
67
68; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment.
69; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
70; CHECK-LABEL: test_v3i1(
71; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4]
72; CHECK-DAG:  ld.param.u8     [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
73; CHECK-DAG:  ld.param.v2.u8  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0]
74; CHECK:      .param .align 4 .b8 param0[4];
75; CHECK-DAG:  st.param.v2.b8  [param0+0], {[[E0]], [[E1]]};
76; CHECK-DAG:  st.param.b8     [param0+2], [[E2]];
77; CHECK:      .param .align 4 .b8 retval0[4];
78; CHECK:      call.uni (retval0),
79; CHECK-NEXT: test_v3i1,
80; CHECK-DAG:  ld.param.v2.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
81; CHECK-DAG:  ld.param.b8     [[RE2:%rs[0-9]+]], [retval0+2];
82; CHECK-DAG:  st.param.v2.b8  [func_retval0+0], {[[RE0]], [[RE1]]}
83; CHECK-DAG:  st.param.b8     [func_retval0+2], [[RE2]];
84; CHECK-NEXT: ret;
; Recursive self-call: passes the <3 x i1> argument through and returns the callee's result.
85define <3 x i1> @test_v3i1(<3 x i1> %a) {
86  %ret = tail call <3 x i1> @test_v3i1(<3 x i1> %a)
87  ret <3 x i1> %ret
88}
89
90; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
91; CHECK-LABEL: test_v4i1(
92; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4]
93; CHECK:      ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0]
94; CHECK:      .param .align 4 .b8 param0[4];
95; CHECK:      st.param.v4.b8  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
96; CHECK:      .param .align 4 .b8 retval0[4];
97; CHECK:      call.uni (retval0),
98; CHECK:      test_v4i1,
99; CHECK:      ld.param.v4.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
100; CHECK:      st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]};
101; CHECK-NEXT: ret;
; Recursive self-call: passes the <4 x i1> argument through and returns the callee's result.
102define <4 x i1> @test_v4i1(<4 x i1> %a) {
103  %ret = tail call <4 x i1> @test_v4i1(<4 x i1> %a)
104  ret <4 x i1> %ret
105}
106
107; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
108; CHECK-LABEL: test_v5i1(
109; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8]
110; CHECK-DAG:  ld.param.u8     [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
111; CHECK-DAG:  ld.param.v4.u8  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
112; CHECK:      .param .align 8 .b8 param0[8];
113; CHECK-DAG:  st.param.v4.b8  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
114; CHECK-DAG:  st.param.b8     [param0+4], [[E4]];
115; CHECK:      .param .align 8 .b8 retval0[8];
116; CHECK:      call.uni (retval0),
117; CHECK-NEXT: test_v5i1,
118; CHECK-DAG:  ld.param.v4.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
119; CHECK-DAG:  ld.param.b8     [[RE4:%rs[0-9]+]], [retval0+4];
120; CHECK-DAG:  st.param.v4.b8  [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
121; CHECK-DAG:  st.param.b8     [func_retval0+4], [[RE4]];
122; CHECK-NEXT: ret;
; Recursive self-call: passes the <5 x i1> argument through and returns the callee's result.
123define <5 x i1> @test_v5i1(<5 x i1> %a) {
124  %ret = tail call <5 x i1> @test_v5i1(<5 x i1> %a)
125  ret <5 x i1> %ret
126}
127
128; Unsigned i8 is loaded into a 16-bit register, zero-extended to 32 bits and masked to 8 bits.
129; CHECK: .func  (.param .b32 func_retval0)
130; CHECK-LABEL: test_i8(
131; CHECK-NEXT: .param .b32 test_i8_param_0
132; CHECK:      ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
133; CHECK:      cvt.u32.u16     [[A32:%r[0-9]+]], [[A8]];
134; CHECK:      and.b32         [[A:%r[0-9]+]], [[A32]], 255;
135; CHECK:      .param .b32 param0;
136; CHECK:      st.param.b32    [param0+0], [[A]];
137; CHECK:      .param .b32 retval0;
138; CHECK:      call.uni (retval0),
139; CHECK:      test_i8,
140; CHECK:      ld.param.b32    [[R32:%r[0-9]+]], [retval0+0];
141; CHECK:      and.b32         [[R:%r[0-9]+]], [[R32]], 255;
142; CHECK:      st.param.b32    [func_retval0+0], [[R]];
143; CHECK-NEXT: ret;
; Recursive self-call: passes %a through and returns the callee's i8 result.
144define i8 @test_i8(i8 %a) {
145  %ret = tail call i8 @test_i8(i8 %a)
146  ret i8 %ret
147}
148
149; signed i8 is loaded into 16-bit register which is then sign-extended to i32.
150; CHECK: .func  (.param .b32 func_retval0)
151; CHECK-LABEL: test_i8s(
152; CHECK-NEXT: .param .b32 test_i8s_param_0
153; CHECK:      ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
154; CHECK:      cvt.s32.s16     [[A:%r[0-9]+]], [[A8]];
155; CHECK:      .param .b32 param0;
156; CHECK:      st.param.b32    [param0+0], [[A]];
157; CHECK:      .param .b32 retval0;
158; CHECK:      call.uni (retval0),
159; CHECK:      test_i8s,
160; CHECK:      ld.param.b32    [[R32:%r[0-9]+]], [retval0+0];
161; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
162; CHECK:      cvt.u16.u32     [[R16:%rs[0-9]+]], [[R32]];
163; CHECK:      cvt.s32.s16     [[R:%r[0-9]+]], [[R16]];
164; CHECK:      st.param.b32    [func_retval0+0], [[R]];
165; CHECK-NEXT: ret;
; Recursive self-call with signext on both the i8 argument and the i8 result.
166define signext i8 @test_i8s(i8 signext %a) {
167  %ret = tail call signext i8 @test_i8s(i8 signext %a)
168  ret i8 %ret
169}
170
171; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
172; CHECK-LABEL: test_v3i8(
173; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
174; CHECK-DAG:  ld.param.u8     [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
175; CHECK-DAG:  ld.param.v2.u8  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
176; CHECK:      .param .align 4 .b8 param0[4];
177; CHECK:      st.param.v2.b8  [param0+0], {[[E0]], [[E1]]};
178; CHECK:      st.param.b8     [param0+2], [[E2]];
179; CHECK:      .param .align 4 .b8 retval0[4];
180; CHECK:      call.uni (retval0),
181; CHECK-NEXT: test_v3i8,
182; CHECK-DAG:  ld.param.v2.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
183; CHECK-DAG:  ld.param.b8     [[RE2:%rs[0-9]+]], [retval0+2];
184; CHECK-DAG:  st.param.v2.b8  [func_retval0+0], {[[RE0]], [[RE1]]};
185; CHECK-DAG:  st.param.b8     [func_retval0+2], [[RE2]];
186; CHECK-NEXT: ret;
; Recursive self-call: passes the <3 x i8> argument through and returns the callee's result.
187define <3 x i8> @test_v3i8(<3 x i8> %a) {
188  %ret = tail call <3 x i8> @test_v3i8(<3 x i8> %a)
189  ret <3 x i8> %ret
190}
191
192; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
193; CHECK-LABEL: test_v4i8(
194; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
195; CHECK:      ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
196; CHECK:      .param .align 4 .b8 param0[4];
197; CHECK:      st.param.v4.b8  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
198; CHECK:      .param .align 4 .b8 retval0[4];
199; CHECK:      call.uni (retval0),
200; CHECK-NEXT: test_v4i8,
201; CHECK:      ld.param.v4.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
202; CHECK:      st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
203; CHECK-NEXT: ret;
; Recursive self-call: passes the <4 x i8> argument through and returns the callee's result.
204define <4 x i8> @test_v4i8(<4 x i8> %a) {
205  %ret = tail call <4 x i8> @test_v4i8(<4 x i8> %a)
206  ret <4 x i8> %ret
207}
208
; <5 x i8> round trip through a recursive self-call. Elements 0-3 are expected
; to move as one v4 access and element 4 as a scalar access, for the incoming
; parameter, the outgoing argument, the call result and the return value.
; The v4 parameter load pattern needs its trailing colon to be an active
; directive; it is what captures E0-E3 for the st.param pattern below.
209; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
210; CHECK-LABEL: test_v5i8(
211; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
212; CHECK-DAG:  ld.param.u8     [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
213; CHECK-DAG:  ld.param.v4.u8  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
214; CHECK:      .param .align 8 .b8 param0[8];
215; CHECK-DAG:  st.param.v4.b8  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
216; CHECK-DAG:  st.param.b8     [param0+4], [[E4]];
217; CHECK:      .param .align 8 .b8 retval0[8];
218; CHECK:      call.uni (retval0),
219; CHECK-NEXT: test_v5i8,
220; CHECK-DAG:  ld.param.v4.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
221; CHECK-DAG:  ld.param.b8     [[RE4:%rs[0-9]+]], [retval0+4];
222; CHECK-DAG:  st.param.v4.b8  [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
223; CHECK-DAG:  st.param.b8     [func_retval0+4], [[RE4]];
224; CHECK-NEXT: ret;
225define <5 x i8> @test_v5i8(<5 x i8> %a) {
226       %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
227       ret <5 x i8> %r;
228}
229
230; CHECK: .func  (.param .b32 func_retval0)
231; CHECK-LABEL: test_i16(
232; CHECK-NEXT: .param .b32 test_i16_param_0
233; CHECK:      ld.param.u16    [[E16:%rs[0-9]+]], [test_i16_param_0];
234; CHECK:      cvt.u32.u16     [[E32:%r[0-9]+]], [[E16]];
235; CHECK:      .param .b32 param0;
236; CHECK:      st.param.b32    [param0+0], [[E32]];
237; CHECK:      .param .b32 retval0;
238; CHECK:      call.uni (retval0),
239; CHECK-NEXT: test_i16,
240; CHECK:      ld.param.b32    [[RE32:%r[0-9]+]], [retval0+0];
241; CHECK:      and.b32         [[R:%r[0-9]+]], [[RE32]], 65535;
242; CHECK:      st.param.b32    [func_retval0+0], [[R]];
243; CHECK-NEXT: ret;
; Recursive self-call: passes %a through and returns the callee's i16 result.
244define i16 @test_i16(i16 %a) {
245  %ret = tail call i16 @test_i16(i16 %a)
246  ret i16 %ret
247}
248
249; CHECK: .func  (.param .b32 func_retval0)
250; CHECK-LABEL: test_i16s(
251; CHECK-NEXT: .param .b32 test_i16s_param_0
252; CHECK:      ld.param.u16    [[E16:%rs[0-9]+]], [test_i16s_param_0];
253; CHECK:      cvt.s32.s16     [[E32:%r[0-9]+]], [[E16]];
254; CHECK:      .param .b32 param0;
255; CHECK:      st.param.b32    [param0+0], [[E32]];
256; CHECK:      .param .b32 retval0;
257; CHECK:      call.uni (retval0),
258; CHECK-NEXT: test_i16s,
259; CHECK:      ld.param.b32    [[RE32:%r[0-9]+]], [retval0+0];
260; CHECK:      cvt.s32.s16     [[R:%r[0-9]+]], [[RE32]];
261; CHECK:      st.param.b32    [func_retval0+0], [[R]];
262; CHECK-NEXT: ret;
; Recursive self-call with signext on both the i16 argument and the i16 result.
263define signext i16 @test_i16s(i16 signext %a) {
264  %ret = tail call signext i16 @test_i16s(i16 signext %a)
265  ret i16 %ret
266}
267
268; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
269; CHECK-LABEL: test_v3i16(
270; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
271; CHECK-DAG:  ld.param.u16    [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
272; CHECK-DAG:  ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
273; CHECK:      .param .align 8 .b8 param0[8];
274; CHECK:      st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
275; CHECK:      st.param.b16    [param0+4], [[E2]];
276; CHECK:      .param .align 8 .b8 retval0[8];
277; CHECK:      call.uni (retval0),
278; CHECK-NEXT: test_v3i16,
279; CHECK:      ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
280; CHECK:      ld.param.b16    [[RE2:%rs[0-9]+]], [retval0+4];
281; CHECK-DAG:  st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
282; CHECK-DAG:  st.param.b16    [func_retval0+4], [[RE2]];
283; CHECK-NEXT: ret;
; Recursive self-call: passes the <3 x i16> argument through and returns the callee's result.
284define <3 x i16> @test_v3i16(<3 x i16> %a) {
285  %ret = tail call <3 x i16> @test_v3i16(<3 x i16> %a)
286  ret <3 x i16> %ret
287}
288
289; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
290; CHECK-LABEL: test_v4i16(
291; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
292; CHECK:      ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
293; CHECK:      .param .align 8 .b8 param0[8];
294; CHECK:      st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
295; CHECK:      .param .align 8 .b8 retval0[8];
296; CHECK:      call.uni (retval0),
297; CHECK-NEXT: test_v4i16,
298; CHECK:      ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
299; CHECK:      st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
300; CHECK-NEXT: ret;
; Recursive self-call: passes the <4 x i16> argument through and returns the callee's result.
301define <4 x i16> @test_v4i16(<4 x i16> %a) {
302  %ret = tail call <4 x i16> @test_v4i16(<4 x i16> %a)
303  ret <4 x i16> %ret
304}
305
; <5 x i16> round trip through a recursive self-call. Elements 0-3 are expected
; to move as one v4 access and element 4 as a scalar access at offset 8.
; The v4 parameter load pattern needs its trailing colon to be an active
; directive; it is what captures E0-E3 for the st.param pattern below.
306; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
307; CHECK-LABEL: test_v5i16(
308; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
309; CHECK-DAG:  ld.param.u16    [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
310; CHECK-DAG:  ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
311; CHECK:      .param .align 16 .b8 param0[16];
312; CHECK-DAG:  st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
313; CHECK-DAG:  st.param.b16    [param0+8], [[E4]];
314; CHECK:      .param .align 16 .b8 retval0[16];
315; CHECK:      call.uni (retval0),
316; CHECK-NEXT: test_v5i16,
317; CHECK-DAG:  ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
318; CHECK-DAG:  ld.param.b16    [[RE4:%rs[0-9]+]], [retval0+8];
319; CHECK-DAG:  st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
320; CHECK-DAG:  st.param.b16    [func_retval0+8], [[RE4]];
321; CHECK-NEXT: ret;
322define <5 x i16> @test_v5i16(<5 x i16> %a) {
323       %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
324       ret <5 x i16> %r;
325}
326
327; CHECK: .func  (.param .b32 func_retval0)
328; CHECK-LABEL: test_f16(
329; CHECK-NEXT: .param .b32 test_f16_param_0
330; CHECK:      ld.param.b16    [[E:%h[0-9]+]], [test_f16_param_0];
331; CHECK:      .param .b32 param0;
332; CHECK:      st.param.b16    [param0+0], [[E]];
333; CHECK:      .param .b32 retval0;
334; CHECK:      call.uni (retval0),
335; CHECK-NEXT: test_f16,
336; CHECK:      ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
337; CHECK:      st.param.b16    [func_retval0+0], [[R]]
338; CHECK-NEXT: ret;
; Recursive self-call: passes %a through and returns the callee's half result.
339define half @test_f16(half %a) {
340  %ret = tail call half @test_f16(half %a)
341  ret half %ret
342}
343
344; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
345; CHECK-LABEL: test_v2f16(
346; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
347; CHECK:      ld.param.b32    [[E:%hh[0-9]+]], [test_v2f16_param_0];
348; CHECK:      .param .align 4 .b8 param0[4];
349; CHECK:      st.param.b32    [param0+0], [[E]];
350; CHECK:      .param .align 4 .b8 retval0[4];
351; CHECK:      call.uni (retval0),
352; CHECK-NEXT: test_v2f16,
353; CHECK:      ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
354; CHECK:      st.param.b32    [func_retval0+0], [[R]]
355; CHECK-NEXT: ret;
; Recursive self-call: passes the <2 x half> argument through and returns the callee's result.
356define <2 x half> @test_v2f16(<2 x half> %a) {
357  %ret = tail call <2 x half> @test_v2f16(<2 x half> %a)
358  ret <2 x half> %ret
359}
360
361; CHECK:.func  (.param .align 8 .b8 func_retval0[8])
362; CHECK-LABEL: test_v3f16(
363; CHECK:      .param .align 8 .b8 test_v3f16_param_0[8]
364; CHECK-DAG:  ld.param.b32    [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
365; CHECK-DAG:  mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
366; CHECK-DAG:  ld.param.b16    [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
367; CHECK:      .param .align 8 .b8 param0[8];
368; CHECK-DAG:  st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
369; CHECK-DAG:  st.param.b16    [param0+4], [[E2]];
370; CHECK:      .param .align 8 .b8 retval0[8];
371; CHECK:      call.uni (retval0),
372; CHECK:      test_v3f16,
373; CHECK-DAG:  ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
374; CHECK-DAG:  ld.param.b16    [[R2:%h[0-9]+]], [retval0+4];
375; CHECK-DAG:  st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
376; CHECK-DAG:  st.param.b16    [func_retval0+4], [[R2]];
377; CHECK:      ret;
; Recursive self-call: passes the <3 x half> argument through and returns the callee's result.
378define <3 x half> @test_v3f16(<3 x half> %a) {
379  %ret = tail call <3 x half> @test_v3f16(<3 x half> %a)
380  ret <3 x half> %ret
381}
382
383; CHECK:.func  (.param .align 8 .b8 func_retval0[8])
384; CHECK-LABEL: test_v4f16(
385; CHECK:      .param .align 8 .b8 test_v4f16_param_0[8]
386; CHECK:      ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
387; CHECK-DAG:  mov.b32         [[HH01:%hh[0-9]+]], [[R01]];
388; CHECK-DAG:  mov.b32         [[HH23:%hh[0-9]+]], [[R23]];
389; CHECK:      .param .align 8 .b8 param0[8];
390; CHECK:      st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
391; CHECK:      .param .align 8 .b8 retval0[8];
392; CHECK:      call.uni (retval0),
393; CHECK:      test_v4f16,
394; CHECK:      ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
395; CHECK:      st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
396; CHECK:      ret;
; Recursive self-call: passes the <4 x half> argument through and returns the callee's result.
397define <4 x half> @test_v4f16(<4 x half> %a) {
398  %ret = tail call <4 x half> @test_v4f16(<4 x half> %a)
399  ret <4 x half> %ret
400}
401
402; CHECK:.func  (.param .align 16 .b8 func_retval0[16])
403; CHECK-LABEL: test_v5f16(
404; CHECK:      .param .align 16 .b8 test_v5f16_param_0[16]
405; CHECK-DAG:  ld.param.v4.b16  {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
406; CHECK-DAG:  mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
407; CHECK-DAG:  ld.param.b16    [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
408; CHECK:      .param .align 16 .b8 param0[16];
409; CHECK-DAG:  st.param.v4.b16 [param0+0],
410; CHECK-DAG:  st.param.b16    [param0+8], [[E4]];
411; CHECK:      .param .align 16 .b8 retval0[16];
412; CHECK:      call.uni (retval0),
413; CHECK:      test_v5f16,
414; CHECK-DAG:  ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
415; CHECK-DAG:  ld.param.b16    [[R4:%h[0-9]+]], [retval0+8];
416; CHECK-DAG:  st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
417; CHECK-DAG:  st.param.b16    [func_retval0+8], [[R4]];
418; CHECK:      ret;
; Recursive self-call: passes the <5 x half> argument through and returns the callee's result.
419define <5 x half> @test_v5f16(<5 x half> %a) {
420  %ret = tail call <5 x half> @test_v5f16(<5 x half> %a)
421  ret <5 x half> %ret
422}
423
424; CHECK:.func  (.param .align 16 .b8 func_retval0[16])
425; CHECK-LABEL: test_v8f16(
426; CHECK:      .param .align 16 .b8 test_v8f16_param_0[16]
427; CHECK:      ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
428; CHECK-DAG:  mov.b32         [[HH01:%hh[0-9]+]], [[R01]];
429; CHECK-DAG:  mov.b32         [[HH23:%hh[0-9]+]], [[R23]];
430; CHECK-DAG:  mov.b32         [[HH45:%hh[0-9]+]], [[R45]];
431; CHECK-DAG:  mov.b32         [[HH67:%hh[0-9]+]], [[R67]];
432; CHECK:      .param .align 16 .b8 param0[16];
433; CHECK:      st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]};
434; CHECK:      .param .align 16 .b8 retval0[16];
435; CHECK:      call.uni (retval0),
436; CHECK:      test_v8f16,
437; CHECK:      ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0];
438; CHECK:      st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
439; CHECK:      ret;
; Recursive self-call: passes the <8 x half> argument through and returns the callee's result.
440define <8 x half> @test_v8f16(<8 x half> %a) {
441  %ret = tail call <8 x half> @test_v8f16(<8 x half> %a)
442  ret <8 x half> %ret
443}
444
445; CHECK:.func  (.param .align 32 .b8 func_retval0[32])
446; CHECK-LABEL: test_v9f16(
447; CHECK:      .param .align 32 .b8 test_v9f16_param_0[32]
448; CHECK-DAG:  ld.param.v4.b16  {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0];
449; CHECK-DAG:  ld.param.v4.b16  {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8];
450; CHECK-DAG:  ld.param.b16     [[E8:%h[0-9]+]], [test_v9f16_param_0+16];
451; CHECK:      .param .align 32 .b8 param0[32];
452; CHECK-DAG:  st.param.v4.b16 [param0+0],
453; CHECK-DAG:  st.param.v4.b16 [param0+8],
454; CHECK-DAG:  st.param.b16    [param0+16], [[E8]];
455; CHECK:      .param .align 32 .b8 retval0[32];
456; CHECK:      call.uni (retval0),
457; CHECK:      test_v9f16,
458; CHECK-DAG:  ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
459; CHECK-DAG:  ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8];
460; CHECK-DAG:  ld.param.b16    [[R8:%h[0-9]+]], [retval0+16];
461; CHECK-DAG:  st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
462; CHECK-DAG:  st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
463; CHECK-DAG:  st.param.b16    [func_retval0+16], [[R8]];
464; CHECK:      ret;
; Recursive self-call: passes the <9 x half> argument through and returns the callee's result.
465define <9 x half> @test_v9f16(<9 x half> %a) {
466  %ret = tail call <9 x half> @test_v9f16(<9 x half> %a)
467  ret <9 x half> %ret
468}
469
470; CHECK: .func  (.param .b32 func_retval0)
471; CHECK-LABEL: test_i32(
472; CHECK-NEXT: .param .b32 test_i32_param_0
473; CHECK:      ld.param.u32    [[E:%r[0-9]+]], [test_i32_param_0];
474; CHECK:      .param .b32 param0;
475; CHECK:      st.param.b32    [param0+0], [[E]];
476; CHECK:      .param .b32 retval0;
477; CHECK:      call.uni (retval0),
478; CHECK-NEXT: test_i32,
479; CHECK:      ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
480; CHECK:      st.param.b32    [func_retval0+0], [[R]];
481; CHECK-NEXT: ret;
; Recursive self-call: passes %a through and returns the callee's i32 result.
482define i32 @test_i32(i32 %a) {
483  %ret = tail call i32 @test_i32(i32 %a)
484  ret i32 %ret
485}
486
487; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
488; CHECK-LABEL: test_v3i32(
489; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
490; CHECK-DAG:  ld.param.u32     [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
491; CHECK-DAG:  ld.param.v2.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
492; CHECK:      .param .align 16 .b8 param0[16];
493; CHECK:      st.param.v2.b32  [param0+0], {[[E0]], [[E1]]};
494; CHECK:      st.param.b32     [param0+8], [[E2]];
495; CHECK:      .param .align 16 .b8 retval0[16];
496; CHECK:      call.uni (retval0),
497; CHECK-NEXT: test_v3i32,
498; CHECK:      ld.param.v2.b32  {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
499; CHECK:      ld.param.b32     [[RE2:%r[0-9]+]], [retval0+8];
500; CHECK-DAG:  st.param.v2.b32  [func_retval0+0], {[[RE0]], [[RE1]]};
501; CHECK-DAG:  st.param.b32     [func_retval0+8], [[RE2]];
502; CHECK-NEXT: ret;
; Recursive self-call: passes the <3 x i32> argument through and returns the callee's result.
503define <3 x i32> @test_v3i32(<3 x i32> %a) {
504  %ret = tail call <3 x i32> @test_v3i32(<3 x i32> %a)
505  ret <3 x i32> %ret
506}
507
; <4 x i32> round trip through a recursive self-call. The whole vector is
; expected to move as a single v4 access in each direction, and the return
; value store must be immediately followed by ret (the final directive has to
; use the correctly spelled prefix to be active).
508; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
509; CHECK-LABEL: test_v4i32(
510; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
511; CHECK:      ld.param.v4.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
512; CHECK:      .param .align 16 .b8 param0[16];
513; CHECK:      st.param.v4.b32  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
514; CHECK:      .param .align 16 .b8 retval0[16];
515; CHECK:      call.uni (retval0),
516; CHECK-NEXT: test_v4i32,
517; CHECK:      ld.param.v4.b32  {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
518; CHECK:      st.param.v4.b32  [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
519; CHECK-NEXT: ret;
520define <4 x i32> @test_v4i32(<4 x i32> %a) {
521       %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
522       ret <4 x i32> %r;
523}
524
; <5 x i32> round trip through a recursive self-call. Elements 0-3 are expected
; to move as one v4 access and element 4 as a scalar access at offset 16.
; The v4 parameter load pattern needs its trailing colon to be an active
; directive; it is what captures E0-E3 for the st.param pattern below.
525; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
526; CHECK-LABEL: test_v5i32(
527; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
528; CHECK-DAG:  ld.param.u32     [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
529; CHECK-DAG:  ld.param.v4.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
530; CHECK:      .param .align 32 .b8 param0[32];
531; CHECK-DAG:  st.param.v4.b32  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
532; CHECK-DAG:  st.param.b32     [param0+16], [[E4]];
533; CHECK:      .param .align 32 .b8 retval0[32];
534; CHECK:      call.uni (retval0),
535; CHECK-NEXT: test_v5i32,
536; CHECK-DAG:  ld.param.v4.b32  {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
537; CHECK-DAG:  ld.param.b32     [[RE4:%r[0-9]+]], [retval0+16];
538; CHECK-DAG:  st.param.v4.b32  [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
539; CHECK-DAG:  st.param.b32     [func_retval0+16], [[RE4]];
540; CHECK-NEXT: ret;
541define <5 x i32> @test_v5i32(<5 x i32> %a) {
542       %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
543       ret <5 x i32> %r;
544}
545
546; CHECK: .func  (.param .b32 func_retval0)
547; CHECK-LABEL: test_f32(
548; CHECK-NEXT: .param .b32 test_f32_param_0
549; CHECK:      ld.param.f32    [[E:%f[0-9]+]], [test_f32_param_0];
550; CHECK:      .param .b32 param0;
551; CHECK:      st.param.f32    [param0+0], [[E]];
552; CHECK:      .param .b32 retval0;
553; CHECK:      call.uni (retval0),
554; CHECK-NEXT: test_f32,
555; CHECK:      ld.param.f32    [[R:%f[0-9]+]], [retval0+0];
556; CHECK:      st.param.f32    [func_retval0+0], [[R]];
557; CHECK-NEXT: ret;
; Recursive self-call: passes %a through and returns the callee's float result.
558define float @test_f32(float %a) {
559  %ret = tail call float @test_f32(float %a)
560  ret float %ret
561}
562
563; CHECK: .func  (.param .b64 func_retval0)
564; CHECK-LABEL: test_i64(
565; CHECK-NEXT: .param .b64 test_i64_param_0
566; CHECK:      ld.param.u64    [[E:%rd[0-9]+]], [test_i64_param_0];
567; CHECK:      .param .b64 param0;
568; CHECK:      st.param.b64    [param0+0], [[E]];
569; CHECK:      .param .b64 retval0;
570; CHECK:      call.uni (retval0),
571; CHECK-NEXT: test_i64,
572; CHECK:      ld.param.b64    [[R:%rd[0-9]+]], [retval0+0];
573; CHECK:      st.param.b64    [func_retval0+0], [[R]];
574; CHECK-NEXT: ret;
; Recursive self-call: passes %a through and returns the callee's i64 result.
575define i64 @test_i64(i64 %a) {
576  %ret = tail call i64 @test_i64(i64 %a)
577  ret i64 %ret
578}
579
; <3 x i64> round trip through a recursive self-call. Elements 0-1 are expected
; to move as one v2 access and element 2 as a scalar access at offset 16.
; Each return-value store is listed exactly once.
580; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
581; CHECK-LABEL: test_v3i64(
582; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
583; CHECK-DAG:  ld.param.u64     [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
584; CHECK-DAG:  ld.param.v2.u64  {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
585; CHECK:      .param .align 32 .b8 param0[32];
586; CHECK:      st.param.v2.b64  [param0+0], {[[E0]], [[E1]]};
587; CHECK:      st.param.b64     [param0+16], [[E2]];
588; CHECK:      .param .align 32 .b8 retval0[32];
589; CHECK:      call.uni (retval0),
590; CHECK-NEXT: test_v3i64,
591; CHECK:      ld.param.v2.b64  {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
592; CHECK:      ld.param.b64     [[RE2:%rd[0-9]+]], [retval0+16];
593; CHECK-DAG:  st.param.v2.b64  [func_retval0+0], {[[RE0]], [[RE1]]};
594; CHECK-DAG:  st.param.b64     [func_retval0+16], [[RE2]];
597; CHECK-NEXT: ret;
598define <3 x i64> @test_v3i64(<3 x i64> %a) {
599       %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
600       ret <3 x i64> %r;
601}
602
603; For i64 vector loads are limited by PTX to 2 elements.
604; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
605; CHECK-LABEL: test_v4i64(
606; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
607; CHECK-DAG:  ld.param.v2.u64  {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
608; CHECK-DAG:  ld.param.v2.u64  {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
609; CHECK:      .param .align 32 .b8 param0[32];
610; CHECK:      st.param.v2.b64  [param0+0], {[[E0]], [[E1]]};
611; CHECK:      st.param.v2.b64  [param0+16], {[[E2]], [[E3]]};
612; CHECK:      .param .align 32 .b8 retval0[32];
613; CHECK:      call.uni (retval0),
614; CHECK-NEXT: test_v4i64,
615; CHECK:      ld.param.v2.b64  {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
616; CHECK:      ld.param.v2.b64  {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
617; CHECK-DAG:  st.param.v2.b64  [func_retval0+16], {[[RE2]], [[RE3]]};
618; CHECK-DAG:  st.param.v2.b64  [func_retval0+0], {[[RE0]], [[RE1]]};
619; CHECK-NEXT: ret;
; Recursive self-call: passes the <4 x i64> argument through and returns the callee's result.
620define <4 x i64> @test_v4i64(<4 x i64> %a) {
621  %ret = tail call <4 x i64> @test_v4i64(<4 x i64> %a)
622  ret <4 x i64> %ret
623}
624
625; Aggregates, on the other hand, do not get extended.
626
; %s_i1 (struct wrapping a single i1) round trip through a recursive self-call.
; The aggregate is expected to be passed and returned as a 1-byte, 1-aligned
; .param array and accessed with 8-bit loads/stores.
; The parameter declaration pattern spells out the full ".param .align ..."
; text, like the other struct tests in this file.
627; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
628; CHECK-LABEL: test_s_i1(
629; CHECK-NEXT: .param .align 1 .b8 test_s_i1_param_0[1]
630; CHECK:      ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
631; CHECK:      .param .align 1 .b8 param0[1];
632; CHECK:      st.param.b8    [param0+0], [[A]]
633; CHECK:      .param .align 1 .b8 retval0[1];
634; CHECK:      call.uni
635; CHECK-NEXT: test_s_i1,
636; CHECK:      ld.param.b8    [[R:%rs[0-9]+]], [retval0+0];
637; CHECK:      st.param.b8    [func_retval0+0], [[R]];
638; CHECK-NEXT: ret;
639define %s_i1 @test_s_i1(%s_i1 %a) {
640       %r = tail call %s_i1 @test_s_i1(%s_i1 %a);
641       ret %s_i1 %r;
642}
643
644; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
645; CHECK-LABEL: test_s_i8(
646; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
647; CHECK:      ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
648; CHECK:      .param .align 1 .b8 param0[1];
649; CHECK:      st.param.b8    [param0+0], [[A]]
650; CHECK:      .param .align 1 .b8 retval0[1];
651; CHECK:      call.uni
652; CHECK-NEXT: test_s_i8,
653; CHECK:      ld.param.b8    [[R:%rs[0-9]+]], [retval0+0];
654; CHECK:      st.param.b8    [func_retval0+0], [[R]];
655; CHECK-NEXT: ret;
; Recursive self-call: passes the %s_i8 struct argument through and returns the callee's result.
656define %s_i8 @test_s_i8(%s_i8 %a) {
657  %ret = tail call %s_i8 @test_s_i8(%s_i8 %a)
658  ret %s_i8 %ret
659}
660
661; CHECK: .func  (.param .align 2 .b8 func_retval0[2])
662; CHECK-LABEL: test_s_i16(
663; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
664; CHECK:      ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
665; CHECK:      .param .align 2 .b8 param0[2];
666; CHECK:      st.param.b16    [param0+0], [[A]]
667; CHECK:      .param .align 2 .b8 retval0[2];
668; CHECK:      call.uni
669; CHECK-NEXT: test_s_i16,
670; CHECK:      ld.param.b16    [[R:%rs[0-9]+]], [retval0+0];
671; CHECK:      st.param.b16    [func_retval0+0], [[R]];
672; CHECK-NEXT: ret;
673define %s_i16 @test_s_i16(%s_i16 %a) {
674  %ret = tail call %s_i16 @test_s_i16(%s_i16 %a) ; self-call: argument goes out via param0, result comes back via retval0
675  ret %s_i16 %ret
676}
677
678; CHECK: .func  (.param .align 2 .b8 func_retval0[2])
679; CHECK-LABEL: test_s_f16(
680; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
681; CHECK:      ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
682; CHECK:      .param .align 2 .b8 param0[2];
683; CHECK:      st.param.b16    [param0+0], [[A]]
684; CHECK:      .param .align 2 .b8 retval0[2];
685; CHECK:      call.uni
686; CHECK-NEXT: test_s_f16,
687; CHECK:      ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
688; CHECK:      st.param.b16    [func_retval0+0], [[R]];
689; CHECK-NEXT: ret;
690define %s_f16 @test_s_f16(%s_f16 %a) {
691  %ret = tail call %s_f16 @test_s_f16(%s_f16 %a) ; self-call: argument goes out via param0, result comes back via retval0
692  ret %s_f16 %ret
693}
694
695; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
696; CHECK-LABEL: test_s_i32(
697; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
698; CHECK:      ld.param.u32    [[E:%r[0-9]+]], [test_s_i32_param_0];
699; CHECK:      .param .align 4 .b8 param0[4]
700; CHECK:      st.param.b32    [param0+0], [[E]];
701; CHECK:      .param .align 4 .b8 retval0[4];
702; CHECK:      call.uni (retval0),
703; CHECK-NEXT: test_s_i32,
704; CHECK:      ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
705; CHECK:      st.param.b32    [func_retval0+0], [[R]];
706; CHECK-NEXT: ret;
707define %s_i32 @test_s_i32(%s_i32 %a) {
708  %ret = tail call %s_i32 @test_s_i32(%s_i32 %a) ; self-call: argument goes out via param0, result comes back via retval0
709  ret %s_i32 %ret
710}
711
712; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
713; CHECK-LABEL: test_s_f32(
714; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
715; CHECK:      ld.param.f32    [[E:%f[0-9]+]], [test_s_f32_param_0];
716; CHECK:      .param .align 4 .b8 param0[4]
717; CHECK:      st.param.f32    [param0+0], [[E]];
718; CHECK:      .param .align 4 .b8 retval0[4];
719; CHECK:      call.uni (retval0),
720; CHECK-NEXT: test_s_f32,
721; CHECK:      ld.param.f32    [[R:%f[0-9]+]], [retval0+0];
722; CHECK:      st.param.f32    [func_retval0+0], [[R]];
723; CHECK-NEXT: ret;
724define %s_f32 @test_s_f32(%s_f32 %a) {
725  %ret = tail call %s_f32 @test_s_f32(%s_f32 %a) ; self-call: argument goes out via param0, result comes back via retval0
726  ret %s_f32 %ret
727}
728
729; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
730; CHECK-LABEL: test_s_i64(
731; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
732; CHECK:      ld.param.u64    [[E:%rd[0-9]+]], [test_s_i64_param_0];
733; CHECK:      .param .align 8 .b8 param0[8];
734; CHECK:      st.param.b64    [param0+0], [[E]];
735; CHECK:      .param .align 8 .b8 retval0[8];
736; CHECK:      call.uni (retval0),
737; CHECK-NEXT: test_s_i64,
738; CHECK:      ld.param.b64    [[R:%rd[0-9]+]], [retval0+0];
739; CHECK:      st.param.b64    [func_retval0+0], [[R]];
740; CHECK-NEXT: ret;
741define %s_i64 @test_s_i64(%s_i64 %a) {
742  %ret = tail call %s_i64 @test_s_i64(%s_i64 %a) ; self-call: argument goes out via param0, result comes back via retval0
743  ret %s_i64 %ret
744}
745
746; Fields that have different types but identical sizes are not vectorized.
747; CHECK: .func  (.param .align 8 .b8 func_retval0[24])
748; CHECK-LABEL: test_s_i32f32(
749; CHECK:        .param .align 8 .b8 test_s_i32f32_param_0[24]
750; CHECK-DAG:    ld.param.u64    [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
751; CHECK-DAG:    ld.param.f32    [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
752; CHECK-DAG:    ld.param.u32    [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
753; CHECK-DAG:    ld.param.f32    [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
754; CHECK-DAG:    ld.param.u32    [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
755; CHECK:        .param .align 8 .b8 param0[24];
756; CHECK-DAG:    st.param.b32    [param0+0], [[E0]];
757; CHECK-DAG:    st.param.f32    [param0+4], [[E1]];
758; CHECK-DAG:    st.param.b32    [param0+8], [[E2]];
759; CHECK-DAG:    st.param.f32    [param0+12], [[E3]];
760; CHECK-DAG:    st.param.b64    [param0+16], [[E4]];
761; CHECK:        .param .align 8 .b8 retval0[24];
762; CHECK:        call.uni (retval0),
763; CHECK-NEXT:   test_s_i32f32,
764; CHECK-DAG:    ld.param.b32    [[RE0:%r[0-9]+]], [retval0+0];
765; CHECK-DAG:    ld.param.f32    [[RE1:%f[0-9]+]], [retval0+4];
766; CHECK-DAG:    ld.param.b32    [[RE2:%r[0-9]+]], [retval0+8];
767; CHECK-DAG:    ld.param.f32    [[RE3:%f[0-9]+]], [retval0+12];
768; CHECK-DAG:    ld.param.b64    [[RE4:%rd[0-9]+]], [retval0+16];
769; CHECK-DAG:    st.param.b32    [func_retval0+0], [[RE0]];
770; CHECK-DAG:    st.param.f32    [func_retval0+4], [[RE1]];
771; CHECK-DAG:    st.param.b32    [func_retval0+8], [[RE2]];
772; CHECK-DAG:    st.param.f32    [func_retval0+12], [[RE3]];
773; CHECK-DAG:    st.param.b64    [func_retval0+16], [[RE4]];
774; CHECK:        ret;
775define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
776  %ret = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a) ; self-call: all five fields are stored to param0 and reloaded from retval0
777  ret %s_i32f32 %ret
778}
779
780; We do vectorize consecutive fields with matching types: the four i32 fields move as two v2 accesses, the trailing i64 on its own.
781; CHECK:.visible .func  (.param .align 8 .b8 func_retval0[24])
782; CHECK-LABEL: test_s_i32x4(
783; CHECK:        .param .align 8 .b8 test_s_i32x4_param_0[24]
784; CHECK-DAG:    ld.param.u64    [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
785; CHECK-DAG:    ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
786; CHECK-DAG:    ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
787; CHECK:        .param .align 8 .b8 param0[24];
788; CHECK:        st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
789; CHECK:        st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
790; CHECK:        st.param.b64    [param0+16], [[E4]];
791; CHECK:        .param .align 8 .b8 retval0[24];
792; CHECK:        call.uni (retval0),
793; CHECK-NEXT:   test_s_i32x4,
794; CHECK:        ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
795; CHECK:        ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
796; CHECK:        ld.param.b64    [[RE4:%rd[0-9]+]], [retval0+16];
797; CHECK-DAG:    st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
798; CHECK-DAG:    st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
799; CHECK-DAG:    st.param.b64    [func_retval0+16], [[RE4]];
800; CHECK:        ret;
801; E4 is captured from the +16 load in this function, so the param0+16 store check is self-contained.
802define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
803       %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
804       ret %s_i32x4 %r;
805}
806
807; CHECK:.visible .func  (.param .align 8 .b8 func_retval0[32])
808; CHECK-LABEL: test_s_i1i32x4(
809; CHECK:        .param .align 8 .b8 test_s_i1i32x4_param_0[32]
810; CHECK:        ld.param.u64    [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
811; CHECK:        ld.param.u32    [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
812; CHECK:        ld.param.u32    [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
813; CHECK:        ld.param.u8     [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
814; CHECK:        ld.param.v2.u32         {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
815; CHECK:        .param .align 8 .b8 param0[32];
816; CHECK:        st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
817; CHECK:        st.param.b8     [param0+8], [[E2]];
818; CHECK:        st.param.b32    [param0+12], [[E3]];
819; CHECK:        st.param.b32    [param0+16], [[E4]];
820; CHECK:        st.param.b64    [param0+24], [[E5]];
821; CHECK:        .param .align 8 .b8 retval0[32];
822; CHECK:        call.uni (retval0),
823; CHECK:        test_s_i1i32x4,
824; CHECK:        (
825; CHECK:        param0
826; CHECK:        );
827; CHECK:        ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
828; CHECK:        ld.param.b8     [[RE2:%rs[0-9]+]], [retval0+8];
829; CHECK:        ld.param.b32    [[RE3:%r[0-9]+]], [retval0+12];
830; CHECK:        ld.param.b32    [[RE4:%r[0-9]+]], [retval0+16];
831; CHECK:        ld.param.b64    [[RE5:%rd[0-9]+]], [retval0+24];
832; CHECK:        st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
833; CHECK:        st.param.b8     [func_retval0+8], [[RE2]];
834; CHECK:        st.param.b32    [func_retval0+12], [[RE3]];
835; CHECK:        st.param.b32    [func_retval0+16], [[RE4]];
836; CHECK:        st.param.b64    [func_retval0+24], [[RE5]];
837; CHECK:        ret;
838
839define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
840  %ret = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) ; self-call: the 32-byte aggregate is stored to param0 and reloaded from retval0
841  ret %s_i8i32x4 %ret
842}
843
844; -- All loads/stores from parameters aligned by one must be done one
845; -- byte at a time.
846; CHECK:.visible .func  (.param .align 1 .b8 func_retval0[25])
847; CHECK-LABEL: test_s_i1i32x4p(
848; CHECK-DAG:        .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
849; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+24];
850; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+23];
851; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+22];
852; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+21];
853; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+20];
854; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+19];
855; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+18];
856; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+17];
857; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+16];
858; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+15];
859; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+14];
860; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+13];
861; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+12];
862; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+11];
863; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+10];
864; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+9];
865; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+8];
866; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+7];
867; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+6];
868; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+5];
869; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+4];
870; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+3];
871; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+2];
872; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+1];
873; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0];
874; --- TODO
875; --- Unaligned parameter store / return value load is broken in both nvcc
876; --- and llvm and needs to be fixed.
877; CHECK:        .param .align 1 .b8 param0[25];
878; CHECK-DAG:        st.param.b32    [param0+0],
879; CHECK-DAG:        st.param.b32    [param0+4],
880; CHECK-DAG:        st.param.b8     [param0+8],
881; CHECK-DAG:        st.param.b32    [param0+9],
882; CHECK-DAG:        st.param.b32    [param0+13],
883; CHECK-DAG:        st.param.b64    [param0+17],
884; CHECK:            .param .align 1 .b8 retval0[25];
885; CHECK:            call.uni (retval0),
886; CHECK-NEXT:       test_s_i1i32x4p,
887; CHECK-DAG:        ld.param.b32    %r{{[0-9]+}}, [retval0+0];
888; CHECK-DAG:        ld.param.b32    %r{{[0-9]+}}, [retval0+4];
889; CHECK-DAG:        ld.param.b8     %rs{{[0-9]+}}, [retval0+8];
890; CHECK-DAG:        ld.param.b32    %r{{[0-9]+}}, [retval0+9];
891; CHECK-DAG:        ld.param.b32    %r{{[0-9]+}}, [retval0+13];
892; CHECK-DAG:        ld.param.b64    %rd{{[0-9]+}}, [retval0+17];
893; CHECK-DAG:        st.param.b32    [func_retval0+0],
894; CHECK-DAG:        st.param.b32    [func_retval0+4],
895; CHECK-DAG:        st.param.b8     [func_retval0+8],
896; CHECK-DAG:        st.param.b32    [func_retval0+9],
897; CHECK-DAG:        st.param.b32    [func_retval0+13],
898; CHECK-DAG:        st.param.b64    [func_retval0+17],
899; The retval loads match any register of the expected class; exact register numbers are an allocation detail.
900define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
901       %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
902       ret %s_i8i32x4p %r;
903}
904
905; Check that we can vectorize loads that span multiple aggregate fields.
906; CHECK:.visible .func  (.param .align 16 .b8 func_retval0[80])
907; CHECK-LABEL: test_s_crossfield(
908; CHECK:        .param .align 16 .b8 test_s_crossfield_param_0[80]
909; CHECK:        ld.param.u32    [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64];
910; CHECK:        ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48];
911; CHECK:        ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32];
912; CHECK:        ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16];
913; CHECK:        ld.param.u32    [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
914; CHECK:        ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
915; CHECK:        .param .align 16 .b8 param0[80];
916; CHECK:        st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
917; CHECK:        st.param.b32    [param0+8], [[E2]];
918; CHECK:        st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
919; CHECK:        st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
920; CHECK:        st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
921; CHECK:        st.param.b32    [param0+64], [[E15]];
922; CHECK:        .param .align 16 .b8 retval0[80];
923; CHECK:        call.uni (retval0),
924; CHECK:        test_s_crossfield,
925; CHECK:        ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
926; CHECK:        ld.param.b32    [[RE2:%r[0-9]+]], [retval0+8];
927; CHECK:        ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16];
928; CHECK:        ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32];
929; CHECK:        ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48];
930; CHECK:        ld.param.b32    [[RE15:%r[0-9]+]], [retval0+64];
931; CHECK:        st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
932; CHECK:        st.param.b32    [func_retval0+8], [[RE2]];
933; CHECK:        st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]};
934; CHECK:        st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]};
935; CHECK:        st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]};
936; CHECK:        st.param.b32    [func_retval0+64], [[RE15]];
937; CHECK:        ret;
938
939define %s_crossfield @test_s_crossfield(%s_crossfield %a) {
940  %ret = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a) ; self-call: the 80-byte aggregate is stored to param0 and reloaded from retval0
941  ret %s_crossfield %ret
942}
943