1; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s
2
3; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic %s -o - \
4; RUN:	| FileCheck %s
5
; Loads one i8 from %A (declared align 8) and inserts it into lane 3 of the
; <8 x i8> vector loaded from %B.  The expected output is a single-lane
; vld1.8 targeting d16[3] with a plain [r0] address (no alignment suffix).
6define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
7;CHECK-LABEL: vld1lanei8:
8;Check the (default) alignment value.
9;CHECK: vld1.8 {d16[3]}, [r0]
10	%tmp1 = load <8 x i8>, <8 x i8>* %B
11	%tmp2 = load i8, i8* %A, align 8
12	%tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
13        ret <8 x i8> %tmp3
14}
15
; Loads one i16 from %A (declared align 8) and inserts it into lane 2 of the
; <4 x i16> vector loaded from %B.  Although the IR alignment is 8 bytes, the
; expected pattern carries only a :16 suffix on the address.
16define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
17;CHECK-LABEL: vld1lanei16:
18;Check the alignment value.  Max for this instruction is 16 bits:
19;CHECK: vld1.16 {d16[2]}, [r0:16]
20	%tmp1 = load <4 x i16>, <4 x i16>* %B
21	%tmp2 = load i16, i16* %A, align 8
22	%tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
23        ret <4 x i16> %tmp3
24}
25
; Loads one i32 from %A (declared align 8) and inserts it into lane 1 of the
; <2 x i32> vector loaded from %B.  The 8-byte IR alignment is expected to be
; emitted as a :32 suffix on the address.
26define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
27;CHECK-LABEL: vld1lanei32:
28;Check the alignment value.  Max for this instruction is 32 bits:
29;CHECK: vld1.32 {d16[1]}, [r0:32]
30	%tmp1 = load <2 x i32>, <2 x i32>* %B
31	%tmp2 = load i32, i32* %A, align 8
32	%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
33        ret <2 x i32> %tmp3
34}
35
; Same shape as @vld1lanei32 but the scalar load is declared with align 4,
; i.e. exactly the element size.  The expected address still carries :32.
36define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
37;CHECK-LABEL: vld1lanei32a32:
38;Check the alignment value.  Legal values are none or :32.
39;CHECK: vld1.32 {d16[1]}, [r0:32]
40	%tmp1 = load <2 x i32>, <2 x i32>* %B
41	%tmp2 = load i32, i32* %A, align 4
42	%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
43        ret <2 x i32> %tmp3
44}
45
; Floating-point variant: loads one float from %A (align 4) and inserts it
; into lane 1 of the <2 x float> vector loaded from %B.  The expected
; instruction is the same vld1.32 lane form used for i32, with a :32 suffix.
46define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
47;CHECK-LABEL: vld1lanef:
48;CHECK: vld1.32 {d16[1]}, [r0:32]
49	%tmp1 = load <2 x float>, <2 x float>* %B
50	%tmp2 = load float, float* %A, align 4
51	%tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
52	ret <2 x float> %tmp3
53}
54
; 128-bit variant: inserts an i8 loaded from %A (align 8) into lane 9 of the
; <16 x i8> vector loaded from %B.  The expected pattern addresses that lane
; as d17[1], with no alignment suffix on [r0].
55define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
56;CHECK-LABEL: vld1laneQi8:
57;CHECK: vld1.8 {d17[1]}, [r0]
58	%tmp1 = load <16 x i8>, <16 x i8>* %B
59	%tmp2 = load i8, i8* %A, align 8
60	%tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
61	ret <16 x i8> %tmp3
62}
63
; 128-bit variant: inserts an i16 loaded from %A (align 8) into lane 5 of the
; <8 x i16> vector loaded from %B.  The expected pattern addresses that lane
; as d17[1] and uses a :16 suffix on the address.
64define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
65;CHECK-LABEL: vld1laneQi16:
66;CHECK: vld1.16 {d17[1]}, [r0:16]
67	%tmp1 = load <8 x i16>, <8 x i16>* %B
68	%tmp2 = load i16, i16* %A, align 8
69	%tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
70	ret <8 x i16> %tmp3
71}
72
; 128-bit variant: inserts an i32 loaded from %A (align 8) into lane 3 of the
; <4 x i32> vector loaded from %B.  The expected pattern addresses that lane
; as d17[1] and uses a :32 suffix on the address.
73define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
74;CHECK-LABEL: vld1laneQi32:
75;CHECK: vld1.32 {d17[1]}, [r0:32]
76	%tmp1 = load <4 x i32>, <4 x i32>* %B
77	%tmp2 = load i32, i32* %A, align 8
78	%tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
79	ret <4 x i32> %tmp3
80}
81
; 128-bit float variant: inserts a float loaded from %A into lane 0 of the
; <4 x float> vector loaded from %B.  The scalar load has no explicit align
; attribute; the expected pattern is d16[0] with a :32 suffix on the address.
82define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
83;CHECK-LABEL: vld1laneQf:
84;CHECK: vld1.32 {d16[0]}, [r0:32]
85	%tmp1 = load <4 x float>, <4 x float>* %B
86	%tmp2 = load float, float* %A
87	%tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
88	ret <4 x float> %tmp3
89}
90
; Two-element aggregate types returned by the llvm.arm.neon.vld2lane
; intrinsics exercised below.  The first group pairs 64-bit vectors, the
; second group pairs 128-bit vectors.
91%struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
92%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
93%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
94%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
95
96%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
97%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
98%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
99
; Calls the v8i8 vld2lane intrinsic on %A, passing the vector loaded from %B
; as both inputs, lane index 1 and a final (alignment) operand of 4.  Both
; returned vectors are added together to form the result.  The expected
; address suffix is :16 even though the requested alignment is larger.
100define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
101;CHECK-LABEL: vld2lanei8:
102;Check the alignment value.  Max for this instruction is 16 bits:
103;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16]
104	%tmp1 = load <8 x i8>, <8 x i8>* %B
105	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
106        %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
107        %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
108        %tmp5 = add <8 x i8> %tmp3, %tmp4
109	ret <8 x i8> %tmp5
110}
111
; Calls the v4i16 vld2lane intrinsic on %A (bitcast to i8*), lane index 1,
; with a final (alignment) operand of 8.  The two returned vectors are summed.
; The expected address suffix is :32.
112define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
113;CHECK-LABEL: vld2lanei16:
114;Check the alignment value.  Max for this instruction is 32 bits:
115;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32]
116	%tmp0 = bitcast i16* %A to i8*
117	%tmp1 = load <4 x i16>, <4 x i16>* %B
118	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
119        %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
120        %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
121        %tmp5 = add <4 x i16> %tmp3, %tmp4
122	ret <4 x i16> %tmp5
123}
124
; Calls the v2i32 vld2lane intrinsic on %A (bitcast to i8*), lane index 1,
; with a final (alignment) operand of 1, and sums the two returned vectors.
; Only the vld2.32 mnemonic is verified; registers and address are not.
125define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
126;CHECK-LABEL: vld2lanei32:
127;CHECK: vld2.32
128	%tmp0 = bitcast i32* %A to i8*
129	%tmp1 = load <2 x i32>, <2 x i32>* %B
130	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
131        %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
132        %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
133        %tmp5 = add <2 x i32> %tmp3, %tmp4
134	ret <2 x i32> %tmp5
135}
136
; The pointer is read from %ptr, used as the address of a v2i32 vld2lane
; (lane 1), then advanced by two i32 elements (8 bytes, the amount the
; two-register lane load reads) and stored back to %ptr.  The expected
; pattern is the writeback form of the instruction (address followed by "!").
137;Check for a post-increment updating load.
138define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
139;CHECK-LABEL: vld2lanei32_update:
140;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
141	%A = load i32*, i32** %ptr
142	%tmp0 = bitcast i32* %A to i8*
143	%tmp1 = load <2 x i32>, <2 x i32>* %B
144	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
145	%tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
146	%tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
147	%tmp5 = add <2 x i32> %tmp3, %tmp4
148	%tmp6 = getelementptr i32, i32* %A, i32 2
149	store i32* %tmp6, i32** %ptr
150	ret <2 x i32> %tmp5
151}
152
; Float counterpart of @vld2lanei32: v2f32 vld2lane on %A (bitcast to i8*),
; lane index 1, final (alignment) operand 1; the two results are combined
; with fadd.  Only the vld2.32 mnemonic is verified.
153define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
154;CHECK-LABEL: vld2lanef:
155;CHECK: vld2.32
156	%tmp0 = bitcast float* %A to i8*
157	%tmp1 = load <2 x float>, <2 x float>* %B
158	%tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
159        %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
160        %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
161        %tmp5 = fadd <2 x float> %tmp3, %tmp4
162	ret <2 x float> %tmp5
163}
164
; 128-bit variant: v8i16 vld2lane on %A (bitcast to i8*) with lane index 5
; and a final (alignment) operand of 1.  The expected pattern addresses the
; lane as d17[1]/d19[1] and has no alignment suffix on the address.
165define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
166;CHECK-LABEL: vld2laneQi16:
167;Check the (default) alignment.
168;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
169	%tmp0 = bitcast i16* %A to i8*
170	%tmp1 = load <8 x i16>, <8 x i16>* %B
171	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
172        %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
173        %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
174        %tmp5 = add <8 x i16> %tmp3, %tmp4
175	ret <8 x i16> %tmp5
176}
177
; 128-bit variant: v4i32 vld2lane on %A (bitcast to i8*) with lane index 2
; and a final (alignment) operand of 16.  The expected pattern addresses the
; lane as d17[0]/d19[0] and uses a :64 suffix on the address.
178define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
179;CHECK-LABEL: vld2laneQi32:
180;Check the alignment value.  Max for this instruction is 64 bits:
181;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64]
182	%tmp0 = bitcast i32* %A to i8*
183	%tmp1 = load <4 x i32>, <4 x i32>* %B
184	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
185        %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
186        %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
187        %tmp5 = add <4 x i32> %tmp3, %tmp4
188	ret <4 x i32> %tmp5
189}
190
; 128-bit float variant: v4f32 vld2lane on %A (bitcast to i8*), lane index 1,
; final (alignment) operand 1; the two results are combined with fadd.
; Only the vld2.32 mnemonic is verified.
191define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
192;CHECK-LABEL: vld2laneQf:
193;CHECK: vld2.32
194	%tmp0 = bitcast float* %A to i8*
195	%tmp1 = load <4 x float>, <4 x float>* %B
196	%tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
197        %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
198        %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
199        %tmp5 = fadd <4 x float> %tmp3, %tmp4
200	ret <4 x float> %tmp5
201}
202
; Declarations of the vld2lane intrinsics called above.  Each takes an i8*
; address, two input vectors and two trailing i32 operands; the callers in
; this file pass the lane index first and the alignment second.
203declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
204declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
205declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
206declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly
207
208declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
209declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
210declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
211
; Three-element aggregate types returned by the llvm.arm.neon.vld3lane
; intrinsics exercised below.  The first group holds 64-bit vectors, the
; second group holds 128-bit vectors.
212%struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
213%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
214%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
215%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
216
217%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
218%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
219%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
220
; Calls the v8i8 vld3lane intrinsic on %A, passing the vector loaded from %B
; as all three inputs, lane index 1 and a final (alignment) operand of 1.
; The three returned vectors are summed.  Only the vld3.8 mnemonic is
; verified.
221define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
222;CHECK-LABEL: vld3lanei8:
223;CHECK: vld3.8
224	%tmp1 = load <8 x i8>, <8 x i8>* %B
225	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
226        %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
227        %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
228        %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
229        %tmp6 = add <8 x i8> %tmp3, %tmp4
230        %tmp7 = add <8 x i8> %tmp5, %tmp6
231	ret <8 x i8> %tmp7
232}
233
; Calls the v4i16 vld3lane intrinsic on %A (bitcast to i8*), lane index 1,
; with a final (alignment) operand of 8.  Despite the requested alignment,
; the expected address has no alignment suffix.  D register numbers are
; matched loosely; only the lane index [1] is pinned.
234define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
235;CHECK-LABEL: vld3lanei16:
236;Check the (default) alignment value.  VLD3 does not support alignment.
237;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
238	%tmp0 = bitcast i16* %A to i8*
239	%tmp1 = load <4 x i16>, <4 x i16>* %B
240	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
241        %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
242        %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
243        %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
244        %tmp6 = add <4 x i16> %tmp3, %tmp4
245        %tmp7 = add <4 x i16> %tmp5, %tmp6
246	ret <4 x i16> %tmp7
247}
248
; Calls the v2i32 vld3lane intrinsic on %A (bitcast to i8*), lane index 1,
; final (alignment) operand 1, and sums the three returned vectors.
; Only the vld3.32 mnemonic is verified.
249define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
250;CHECK-LABEL: vld3lanei32:
251;CHECK: vld3.32
252	%tmp0 = bitcast i32* %A to i8*
253	%tmp1 = load <2 x i32>, <2 x i32>* %B
254	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
255        %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
256        %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
257        %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
258        %tmp6 = add <2 x i32> %tmp3, %tmp4
259        %tmp7 = add <2 x i32> %tmp5, %tmp6
260	ret <2 x i32> %tmp7
261}
262
; Float counterpart of @vld3lanei32: v2f32 vld3lane on %A (bitcast to i8*),
; lane index 1, final (alignment) operand 1; the three results are combined
; with fadd.  Only the vld3.32 mnemonic is verified.
263define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
264;CHECK-LABEL: vld3lanef:
265;CHECK: vld3.32
266	%tmp0 = bitcast float* %A to i8*
267	%tmp1 = load <2 x float>, <2 x float>* %B
268	%tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
269        %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
270        %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
271        %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
272        %tmp6 = fadd <2 x float> %tmp3, %tmp4
273        %tmp7 = fadd <2 x float> %tmp5, %tmp6
274	ret <2 x float> %tmp7
275}
276
; 128-bit variant: v8i16 vld3lane on %A (bitcast to i8*), lane index 1, with
; a final (alignment) operand of 8.  The expected address has no alignment
; suffix; D register numbers are matched loosely and only lane [1] is pinned.
277define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
278;CHECK-LABEL: vld3laneQi16:
279;Check the (default) alignment value.  VLD3 does not support alignment.
280;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
281	%tmp0 = bitcast i16* %A to i8*
282	%tmp1 = load <8 x i16>, <8 x i16>* %B
283	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
284        %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
285        %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
286        %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
287        %tmp6 = add <8 x i16> %tmp3, %tmp4
288        %tmp7 = add <8 x i16> %tmp5, %tmp6
289	ret <8 x i16> %tmp7
290}
291
; The pointer is read from %ptr, used as the address of a v8i16 vld3lane
; (lane 1), then advanced by %inc i16 elements and stored back to %ptr.
; Because the increment is a run-time value, the expected pattern is the
; form that takes a second general-purpose register after the address.
292;Check for a post-increment updating load with register increment.
293define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
294;CHECK-LABEL: vld3laneQi16_update:
295;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
296	%A = load i16*, i16** %ptr
297	%tmp0 = bitcast i16* %A to i8*
298	%tmp1 = load <8 x i16>, <8 x i16>* %B
299	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
300	%tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
301	%tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
302	%tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
303	%tmp6 = add <8 x i16> %tmp3, %tmp4
304	%tmp7 = add <8 x i16> %tmp5, %tmp6
305	%tmp8 = getelementptr i16, i16* %A, i32 %inc
306	store i16* %tmp8, i16** %ptr
307	ret <8 x i16> %tmp7
308}
309
; 128-bit variant: v4i32 vld3lane on %A (bitcast to i8*), lane index 3,
; final (alignment) operand 1; the three returned vectors are summed.
; Only the vld3.32 mnemonic is verified.
310define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
311;CHECK-LABEL: vld3laneQi32:
312;CHECK: vld3.32
313	%tmp0 = bitcast i32* %A to i8*
314	%tmp1 = load <4 x i32>, <4 x i32>* %B
315	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
316        %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
317        %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
318        %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
319        %tmp6 = add <4 x i32> %tmp3, %tmp4
320        %tmp7 = add <4 x i32> %tmp5, %tmp6
321	ret <4 x i32> %tmp7
322}
323
; 128-bit float variant: v4f32 vld3lane on %A (bitcast to i8*), lane index 1,
; final (alignment) operand 1; the three results are combined with fadd.
; Only the vld3.32 mnemonic is verified.
324define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
325;CHECK-LABEL: vld3laneQf:
326;CHECK: vld3.32
327	%tmp0 = bitcast float* %A to i8*
328	%tmp1 = load <4 x float>, <4 x float>* %B
329	%tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
330        %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
331        %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
332        %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
333        %tmp6 = fadd <4 x float> %tmp3, %tmp4
334        %tmp7 = fadd <4 x float> %tmp5, %tmp6
335	ret <4 x float> %tmp7
336}
337
; Declarations of the vld3lane intrinsics called above.  Each takes an i8*
; address, three input vectors and two trailing i32 operands; the callers in
; this file pass the lane index first and the alignment second.
338declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
339declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
340declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
341declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
342
343declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
344declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
345declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
346
; Four-element aggregate types returned by the llvm.arm.neon.vld4lane
; intrinsics exercised below.  The first group holds 64-bit vectors, the
; second group holds 128-bit vectors.
347%struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>,  <8 x i8>,  <8 x i8> }
348%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
349%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
350%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
351
352%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
353%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
354%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
355
; Calls the v8i8 vld4lane intrinsic on %A, passing the vector loaded from %B
; as all four inputs, lane index 1 and a final (alignment) operand of 8.
; The four returned vectors are summed pairwise.  The expected address
; suffix is :32; D register numbers are matched loosely.
356define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
357;CHECK-LABEL: vld4lanei8:
358;Check the alignment value.  Max for this instruction is 32 bits:
359;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32]
360	%tmp1 = load <8 x i8>, <8 x i8>* %B
361	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
362        %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
363        %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
364        %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
365        %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
366        %tmp7 = add <8 x i8> %tmp3, %tmp4
367        %tmp8 = add <8 x i8> %tmp5, %tmp6
368        %tmp9 = add <8 x i8> %tmp7, %tmp8
369	ret <8 x i8> %tmp9
370}
371
; The pointer is read from %ptr, used as the address of a v8i8 vld4lane
; (lane 1, alignment operand 8), then advanced by 4 bytes (one i8 for each
; of the four registers) and stored back to %ptr.  The expected pattern is
; the writeback form: a :32-aligned address followed by "!".
372;Check for a post-increment updating load.
373define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
374;CHECK-LABEL: vld4lanei8_update:
375;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]!
376	%A = load i8*, i8** %ptr
377	%tmp1 = load <8 x i8>, <8 x i8>* %B
378	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
379	%tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
380	%tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
381	%tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
382	%tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
383	%tmp7 = add <8 x i8> %tmp3, %tmp4
384	%tmp8 = add <8 x i8> %tmp5, %tmp6
385	%tmp9 = add <8 x i8> %tmp7, %tmp8
386	%tmp10 = getelementptr i8, i8* %A, i32 4
387	store i8* %tmp10, i8** %ptr
388	ret <8 x i8> %tmp9
389}
390
; Calls the v4i16 vld4lane intrinsic on %A (bitcast to i8*), lane index 1,
; with a final (alignment) operand of 4.  The lane load reads four i16
; values (8 bytes), so the requested 4-byte alignment is below the access
; size and the expected address has no alignment suffix.
391define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
392;CHECK-LABEL: vld4lanei16:
393;Check that a power-of-two alignment smaller than the total size of the memory
394;being loaded is ignored.
395;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
396	%tmp0 = bitcast i16* %A to i8*
397	%tmp1 = load <4 x i16>, <4 x i16>* %B
398	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
399        %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
400        %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
401        %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
402        %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
403        %tmp7 = add <4 x i16> %tmp3, %tmp4
404        %tmp8 = add <4 x i16> %tmp5, %tmp6
405        %tmp9 = add <4 x i16> %tmp7, %tmp8
406	ret <4 x i16> %tmp9
407}
408
; Calls the v2i32 vld4lane intrinsic on %A (bitcast to i8*), lane index 1,
; with a final (alignment) operand of 8.  The lane load reads four i32
; values (16 bytes); the 8-byte alignment is nevertheless expected to be
; emitted as a :64 suffix on the address.
409define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
410;CHECK-LABEL: vld4lanei32:
411;Check the alignment value.  An 8-byte alignment is allowed here even though
412;it is smaller than the total size of the memory being loaded.
413;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64]
414	%tmp0 = bitcast i32* %A to i8*
415	%tmp1 = load <2 x i32>, <2 x i32>* %B
416	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
417        %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
418        %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
419        %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
420        %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
421        %tmp7 = add <2 x i32> %tmp3, %tmp4
422        %tmp8 = add <2 x i32> %tmp5, %tmp6
423        %tmp9 = add <2 x i32> %tmp7, %tmp8
424	ret <2 x i32> %tmp9
425}
426
; Float variant: v2f32 vld4lane on %A (bitcast to i8*), lane index 1, final
; (alignment) operand 1; the four results are combined with fadd.
; Only the vld4.32 mnemonic is verified.
427define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
428;CHECK-LABEL: vld4lanef:
429;CHECK: vld4.32
430	%tmp0 = bitcast float* %A to i8*
431	%tmp1 = load <2 x float>, <2 x float>* %B
432	%tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
433        %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
434        %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
435        %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
436        %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
437        %tmp7 = fadd <2 x float> %tmp3, %tmp4
438        %tmp8 = fadd <2 x float> %tmp5, %tmp6
439        %tmp9 = fadd <2 x float> %tmp7, %tmp8
440	ret <2 x float> %tmp9
441}
442
; 128-bit variant: v8i16 vld4lane on %A (bitcast to i8*), lane index 1, with
; a final (alignment) operand of 16.  The expected pattern uses every other
; D register (d16, d18, d20, d22) at lane [1] and a :64 address suffix.
443define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
444;CHECK-LABEL: vld4laneQi16:
445;Check the alignment value.  Max for this instruction is 64 bits:
446;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64]
447	%tmp0 = bitcast i16* %A to i8*
448	%tmp1 = load <8 x i16>, <8 x i16>* %B
449	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
450        %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
451        %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
452        %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
453        %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
454        %tmp7 = add <8 x i16> %tmp3, %tmp4
455        %tmp8 = add <8 x i16> %tmp5, %tmp6
456        %tmp9 = add <8 x i16> %tmp7, %tmp8
457	ret <8 x i16> %tmp9
458}
459
; 128-bit variant: v4i32 vld4lane on %A (bitcast to i8*), lane index 2, with
; a final (alignment) operand of 1.  The expected pattern uses the odd D
; registers (d17, d19, d21, d23) at lane [0] and no alignment suffix.
460define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
461;CHECK-LABEL: vld4laneQi32:
462;Check the (default) alignment.
463;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
464	%tmp0 = bitcast i32* %A to i8*
465	%tmp1 = load <4 x i32>, <4 x i32>* %B
466	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
467        %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
468        %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
469        %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
470        %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
471        %tmp7 = add <4 x i32> %tmp3, %tmp4
472        %tmp8 = add <4 x i32> %tmp5, %tmp6
473        %tmp9 = add <4 x i32> %tmp7, %tmp8
474	ret <4 x i32> %tmp9
475}
476
; 128-bit float variant: v4f32 vld4lane on %A (bitcast to i8*), lane index 1,
; final (alignment) operand 1; the four results are combined with fadd.
; Only the vld4.32 mnemonic is verified.
477define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
478;CHECK-LABEL: vld4laneQf:
479;CHECK: vld4.32
480	%tmp0 = bitcast float* %A to i8*
481	%tmp1 = load <4 x float>, <4 x float>* %B
482	%tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
483        %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
484        %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
485        %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
486        %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
487        %tmp7 = fadd <4 x float> %tmp3, %tmp4
488        %tmp8 = fadd <4 x float> %tmp5, %tmp6
489        %tmp9 = fadd <4 x float> %tmp7, %tmp8
490	ret <4 x float> %tmp9
491}
492
; Declarations of the vld4lane intrinsics called above.  Each takes an i8*
; address, four input vectors and two trailing i32 operands; the callers in
; this file pass the lane index first and the alignment second.
493declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
494declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
495declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
496declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
497
498declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
499declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
500declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
501
; Regression test.  Element 5 of the [6 x i64] argument is zero-extended to
; i128, shifted into the upper 64 bits (the lower 64 bits stay zero) and
; bitcast to <8 x i16>.  That value is the third vector operand of a v8i16
; vld3lane whose address and first two vector operands are undef.  Only the
; presence of a vld3.16 instruction is verified.
502; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
503; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
504; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
505; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
506define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
507;CHECK-LABEL: test_qqqq_regsequence_subreg:
508;CHECK: vld3.16
509  %tmp63 = extractvalue [6 x i64] %b, 5
510  %tmp64 = zext i64 %tmp63 to i128
511  %tmp65 = shl i128 %tmp64, 64
512  %ins67 = or i128 %tmp65, 0
513  %tmp78 = bitcast i128 %ins67 to <8 x i16>
514  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
515  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
516  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
517  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
518  %tmp6 = add <8 x i16> %tmp3, %tmp4
519  %tmp7 = add <8 x i16> %tmp5, %tmp6
520  ret <8 x i16> %tmp7
521}
522
; Declaration of the llvm.trap intrinsic.  NOTE(review): no function visible
; in this file calls it; presumably left over from an earlier version of the
; regression test above -- confirm before removing.
523declare void @llvm.trap() nounwind
524