; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
2
; Aggregate results of the vld2 intrinsics: pairs of 64-bit vectors.
%struct.__neon_int8x8x2_t    = type { <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x2_t   = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t   = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
%struct.__neon_int64x1x2_t   = type { <1 x i64>, <1 x i64> }

; Aggregate results of the vld2 intrinsics: pairs of 128-bit vectors.
%struct.__neon_int8x16x2_t   = type { <16 x i8>, <16 x i8> }
%struct.__neon_int16x8x2_t   = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t   = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
13
define <8 x i8> @vld2i8(i8* %A) nounwind {
;CHECK-LABEL: vld2i8:
; The intrinsic is called with an alignment argument of 8 bytes, which is
; expected to appear as a 64-bit alignment qualifier on the address operand
; (the instruction accepts at most 128 bits).
;CHECK: vld2.8 {d16, d17}, [r0:64]
	%pair = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8.p0i8(i8* %A, i32 8)
	%vec0 = extractvalue %struct.__neon_int8x8x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_int8x8x2_t %pair, 1
	; Both loaded vectors feed the returned sum.
	%sum = add <8 x i8> %vec0, %vec1
	ret <8 x i8> %sum
}
24
define <4 x i16> @vld2i16(i16* %A) nounwind {
;CHECK-LABEL: vld2i16:
; The intrinsic is called with an alignment argument of 32 bytes, more than
; the 128-bit maximum of this instruction; the expected output carries the
; 128-bit qualifier.
;CHECK: vld2.16 {d16, d17}, [r0:128]
	%addr = bitcast i16* %A to i8*
	%pair = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16.p0i8(i8* %addr, i32 32)
	%vec0 = extractvalue %struct.__neon_int16x4x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_int16x4x2_t %pair, 1
	; Both loaded vectors feed the returned sum.
	%sum = add <4 x i16> %vec0, %vec1
	ret <4 x i16> %sum
}
36
define <2 x i32> @vld2i32(i32* %A) nounwind {
;CHECK-LABEL: vld2i32:
; Alignment argument is 1; only the mnemonic is matched here.
;CHECK: vld2.32
	%addr = bitcast i32* %A to i8*
	%pair = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32.p0i8(i8* %addr, i32 1)
	%vec0 = extractvalue %struct.__neon_int32x2x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_int32x2x2_t %pair, 1
	; Both loaded vectors feed the returned sum.
	%sum = add <2 x i32> %vec0, %vec1
	ret <2 x i32> %sum
}
47
define <2 x float> @vld2f(float* %A) nounwind {
;CHECK-LABEL: vld2f:
; Alignment argument is 1; only the mnemonic is matched here.
;CHECK: vld2.32
	%addr = bitcast float* %A to i8*
	%pair = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8* %addr, i32 1)
	%vec0 = extractvalue %struct.__neon_float32x2x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_float32x2x2_t %pair, 1
	; Both loaded vectors feed the returned floating-point sum.
	%sum = fadd <2 x float> %vec0, %vec1
	ret <2 x float> %sum
}
58
; Post-increment updating load: the pointer read from %ptr is advanced by
; four floats after the vld2 and written back, and the expected output uses
; the writeback ("!") addressing form.
define <2 x float> @vld2f_update(float** %ptr) nounwind {
;CHECK-LABEL: vld2f_update:
;CHECK: vld2.32 {d16, d17}, [r1]!
	%base = load float*, float** %ptr
	%addr = bitcast float* %base to i8*
	%pair = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8* %addr, i32 1)
	%vec0 = extractvalue %struct.__neon_float32x2x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_float32x2x2_t %pair, 1
	%sum = fadd <2 x float> %vec0, %vec1
	; Step the pointer past the four floats and store it back through %ptr.
	%next = getelementptr float, float* %base, i32 4
	store float* %next, float** %ptr
	ret <2 x float> %sum
}
73
define <1 x i64> @vld2i64(i64* %A) nounwind {
;CHECK-LABEL: vld2i64:
; For the <1 x i64> case the expected instruction is vld1.64 rather than a
; vld2. The intrinsic is called with an alignment argument of 32 bytes; the
; expected output carries the 128-bit qualifier, the maximum for this
; instruction.
;CHECK: vld1.64 {d16, d17}, [r0:128]
	%addr = bitcast i64* %A to i8*
	%pair = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64.p0i8(i8* %addr, i32 32)
	%vec0 = extractvalue %struct.__neon_int64x1x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_int64x1x2_t %pair, 1
	; Both loaded vectors feed the returned sum.
	%sum = add <1 x i64> %vec0, %vec1
	ret <1 x i64> %sum
}
85
define <16 x i8> @vld2Qi8(i8* %A) nounwind {
;CHECK-LABEL: vld2Qi8:
; The intrinsic is called with an alignment argument of 8 bytes, which is
; expected to appear as a 64-bit alignment qualifier (the instruction
; accepts at most 256 bits).
;CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64]
	%pair = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8* %A, i32 8)
	%vec0 = extractvalue %struct.__neon_int8x16x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_int8x16x2_t %pair, 1
	; Both loaded vectors feed the returned sum.
	%sum = add <16 x i8> %vec0, %vec1
	ret <16 x i8> %sum
}
96
; Post-increment updating load with a register increment: the pointer read
; from %ptr is advanced by %inc bytes after the vld2 and written back, and
; the expected output uses the register-offset writeback form.
define <16 x i8> @vld2Qi8_update(i8** %ptr, i32 %inc) nounwind {
;CHECK-LABEL: vld2Qi8_update:
;CHECK: vld2.8 {d16, d17, d18, d19}, [r2:128], r1
	%base = load i8*, i8** %ptr
	%pair = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8* %base, i32 16)
	%vec0 = extractvalue %struct.__neon_int8x16x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_int8x16x2_t %pair, 1
	%sum = add <16 x i8> %vec0, %vec1
	; Step the pointer by the runtime increment and store it back through %ptr.
	%next = getelementptr i8, i8* %base, i32 %inc
	store i8* %next, i8** %ptr
	ret <16 x i8> %sum
}
110
define <8 x i16> @vld2Qi16(i16* %A) nounwind {
;CHECK-LABEL: vld2Qi16:
; The intrinsic is called with an alignment argument of 16 bytes, which is
; expected to appear as a 128-bit alignment qualifier (the instruction
; accepts at most 256 bits).
;CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128]
	%addr = bitcast i16* %A to i8*
	%pair = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16.p0i8(i8* %addr, i32 16)
	%vec0 = extractvalue %struct.__neon_int16x8x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_int16x8x2_t %pair, 1
	; Both loaded vectors feed the returned sum.
	%sum = add <8 x i16> %vec0, %vec1
	ret <8 x i16> %sum
}
122
define <4 x i32> @vld2Qi32(i32* %A) nounwind {
;CHECK-LABEL: vld2Qi32:
; The intrinsic is called with an alignment argument of 64 bytes, more than
; the 256-bit maximum of this instruction; the expected output carries the
; 256-bit qualifier.
;CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256]
	%addr = bitcast i32* %A to i8*
	%pair = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8* %addr, i32 64)
	%vec0 = extractvalue %struct.__neon_int32x4x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_int32x4x2_t %pair, 1
	; Both loaded vectors feed the returned sum.
	%sum = add <4 x i32> %vec0, %vec1
	ret <4 x i32> %sum
}
134
define <4 x float> @vld2Qf(float* %A) nounwind {
;CHECK-LABEL: vld2Qf:
; Alignment argument is 1; only the mnemonic is matched here.
;CHECK: vld2.32
	%addr = bitcast float* %A to i8*
	%pair = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32.p0i8(i8* %addr, i32 1)
	%vec0 = extractvalue %struct.__neon_float32x4x2_t %pair, 0
	%vec1 = extractvalue %struct.__neon_float32x4x2_t %pair, 1
	; Both loaded vectors feed the returned floating-point sum.
	%sum = fadd <4 x float> %vec0, %vec1
	ret <4 x float> %sum
}
145
; vld2 intrinsics returning pairs of 64-bit vectors. Each takes an i8*
; address and an i32 alignment argument.
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8.p0i8(i8*, i32) readonly nounwind
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16.p0i8(i8*, i32) readonly nounwind
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32.p0i8(i8*, i32) readonly nounwind
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32.p0i8(i8*, i32) readonly nounwind
declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64.p0i8(i8*, i32) readonly nounwind

; vld2 intrinsics returning pairs of 128-bit vectors.
declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8.p0i8(i8*, i32) readonly nounwind
declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16.p0i8(i8*, i32) readonly nounwind
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32.p0i8(i8*, i32) readonly nounwind
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32.p0i8(i8*, i32) readonly nounwind
156