; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; RUN: llc < %s -march=arm -mattr=+neon -regalloc=basic | FileCheck %s

define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld1lanei8:
;Check the (default) alignment value.
;CHECK: vld1.8 {d16[3]}, [r0]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = load i8* %A, align 8
  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
  ret <8 x i8> %tmp3
}

define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld1lanei16:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vld1.16 {d16[2]}, [r0, :16]
  %tmp1 = load <4 x i16>* %B
  %tmp2 = load i16* %A, align 8
  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
  ret <4 x i16> %tmp3
}

define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld1lanei32:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld1.32 {d16[1]}, [r0, :32]
  %tmp1 = load <2 x i32>* %B
  %tmp2 = load i32* %A, align 8
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld1lanef:
;A 4-byte alignment is too small for an alignment specifier here.
;CHECK: vld1.32 {d16[1]}, [r0]
  %tmp1 = load <2 x float>* %B
  %tmp2 = load float* %A, align 4
  %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
  ret <2 x float> %tmp3
}

define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vld1laneQi8:
;Lane 9 of the Q register maps to lane 1 of the high D register.
;CHECK: vld1.8 {d17[1]}, [r0]
  %tmp1 = load <16 x i8>* %B
  %tmp2 = load i8* %A, align 8
  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
  ret <16 x i8> %tmp3
}

define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld1laneQi16:
;CHECK: vld1.16 {d17[1]}, [r0, :16]
  %tmp1 = load <8 x i16>* %B
  %tmp2 = load i16* %A, align 8
  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
  ret <8 x i16> %tmp3
}

define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld1laneQi32:
;CHECK: vld1.32 {d17[1]}, [r0, :32]
  %tmp1 = load <4 x i32>* %B
  %tmp2 = load i32* %A, align 8
  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
  ret <4 x i32> %tmp3
}

define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld1laneQf:
;CHECK: vld1.32 {d16[0]}, [r0]
  %tmp1 = load <4 x float>* %B
  %tmp2 = load float* %A
  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
  ret <4 x float> %tmp3
}

; Aggregate return types for the 2-register vld2lane intrinsics.
%struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld2lanei8:
;Check the alignment value.  Max for this instruction is 16 bits:
;CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld2lanei16:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld2lanei32:
;CHECK: vld2.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

;Check for a post-increment updating load.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
;CHECK: vld2lanei32_update:
;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
  %A = load i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32* %A, i32 2
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}

define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld2lanef:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld2laneQi16:
;Check the (default) alignment.
;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld2laneQi32:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld2laneQf:
;CHECK: vld2.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Aggregate return types for the 3-register vld3lane intrinsics.
%struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld3lanei8:
;CHECK: vld3.8
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
  %tmp6 = add <8 x i8> %tmp3, %tmp4
  %tmp7 = add <8 x i8> %tmp5, %tmp6
  ret <8 x i8> %tmp7
}

define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld3lanei16:
;Check the (default) alignment value.  VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
  %tmp6 = add <4 x i16> %tmp3, %tmp4
  %tmp7 = add <4 x i16> %tmp5, %tmp6
  ret <4 x i16> %tmp7
}

define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld3lanei32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
  %tmp6 = add <2 x i32> %tmp3, %tmp4
  %tmp7 = add <2 x i32> %tmp5, %tmp6
  ret <2 x i32> %tmp7
}

define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld3lanef:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
  %tmp6 = fadd <2 x float> %tmp3, %tmp4
  %tmp7 = fadd <2 x float> %tmp5, %tmp6
  ret <2 x float> %tmp7
}

define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld3laneQi16:
;Check the (default) alignment value.  VLD3 does not support alignment.
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

;Check for a post-increment updating load with register increment.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
;CHECK: vld3laneQi16_update:
;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
  %A = load i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  %tmp8 = getelementptr i16* %A, i32 %inc
  store i16* %tmp8, i16** %ptr
  ret <8 x i16> %tmp7
}

define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld3laneQi32:
;CHECK: vld3.32
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
  %tmp6 = add <4 x i32> %tmp3, %tmp4
  %tmp7 = add <4 x i32> %tmp5, %tmp6
  ret <4 x i32> %tmp7
}

define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld3laneQf:
;CHECK: vld3.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
  %tmp6 = fadd <4 x float> %tmp3, %tmp4
  %tmp7 = fadd <4 x float> %tmp5, %tmp6
  ret <4 x float> %tmp7
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Aggregate return types for the 4-register vld4lane intrinsics.
%struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>,  <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }

define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vld4lanei8:
;Check the alignment value.  Max for this instruction is 32 bits:
;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}, :32]
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  ret <8 x i8> %tmp9
}

;Check for a post-increment updating load.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK: vld4lanei8_update:
;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :32]!
  %A = load i8** %ptr
  %tmp1 = load <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  %tmp10 = getelementptr i8* %A, i32 4
  store i8* %tmp10, i8** %ptr
  ret <8 x i8> %tmp9
}

define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld4lanei16:
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
  %tmp7 = add <4 x i16> %tmp3, %tmp4
  %tmp8 = add <4 x i16> %tmp5, %tmp6
  %tmp9 = add <4 x i16> %tmp7, %tmp8
  ret <4 x i16> %tmp9
}

define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vld4lanei32:
;Check the alignment value.  An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
  %tmp7 = add <2 x i32> %tmp3, %tmp4
  %tmp8 = add <2 x i32> %tmp5, %tmp6
  %tmp9 = add <2 x i32> %tmp7, %tmp8
  ret <2 x i32> %tmp9
}

define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld4lanef:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
  %tmp7 = fadd <2 x float> %tmp3, %tmp4
  %tmp8 = fadd <2 x float> %tmp5, %tmp6
  %tmp9 = fadd <2 x float> %tmp7, %tmp8
  ret <2 x float> %tmp9
}

define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
;CHECK: vld4laneQi16:
;Check the alignment value.  Max for this instruction is 64 bits:
;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}, :64]
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
  %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
  %tmp7 = add <8 x i16> %tmp3, %tmp4
  %tmp8 = add <8 x i16> %tmp5, %tmp6
  %tmp9 = add <8 x i16> %tmp7, %tmp8
  ret <8 x i16> %tmp9
}

define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld4laneQi32:
;Check the (default) alignment.
;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
  %tmp7 = add <4 x i32> %tmp3, %tmp4
  %tmp8 = add <4 x i32> %tmp5, %tmp6
  %tmp9 = add <4 x i32> %tmp7, %tmp8
  ret <4 x i32> %tmp9
}

define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vld4laneQf:
;CHECK: vld4.32
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
  %tmp7 = fadd <4 x float> %tmp3, %tmp4
  %tmp8 = fadd <4 x float> %tmp5, %tmp6
  %tmp9 = fadd <4 x float> %tmp7, %tmp8
  ret <4 x float> %tmp9
}

declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly

; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
;CHECK: test_qqqq_regsequence_subreg
;CHECK: vld3.16
  %tmp63 = extractvalue [6 x i64] %b, 5
  %tmp64 = zext i64 %tmp63 to i128
  %tmp65 = shl i128 %tmp64, 64
  %ins67 = or i128 %tmp65, 0
  %tmp78 = bitcast i128 %ins67 to <8 x i16>
  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

declare void @llvm.trap() nounwind
