1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm -mattr=+neon | FileCheck %s
3
4;Check the (default) alignment.
; Loads <8 x i8> from %B and stores lane 3 to %A. The store is marked align 8,
; yet the expected vst1.8 carries no alignment qualifier on [r0].
5define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
6; CHECK-LABEL: vst1lanei8:
7; CHECK:       @ %bb.0:
8; CHECK-NEXT:    vldr d16, [r1]
9; CHECK-NEXT:    vst1.8 {d16[3]}, [r0]
10; CHECK-NEXT:    mov pc, lr
11	%tmp1 = load <8 x i8>, <8 x i8>* %B
12	%tmp2 = extractelement <8 x i8> %tmp1, i32 3
13	store i8 %tmp2, i8* %A, align 8
14	ret void
15}
16
17;Check for a post-increment updating store.
; The pointer is loaded from %ptr, lane 3 is stored through it, and the pointer
; advanced by one i8 is written back. The expected output folds the advance
; into the store as a writeback ("[r2]!").
18define void @vst1lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
19; CHECK-LABEL: vst1lanei8_update:
20; CHECK:       @ %bb.0:
21; CHECK-NEXT:    ldr r2, [r0]
22; CHECK-NEXT:    vldr d16, [r1]
23; CHECK-NEXT:    vst1.8 {d16[3]}, [r2]!
24; CHECK-NEXT:    str r2, [r0]
25; CHECK-NEXT:    mov pc, lr
26	%A = load i8*, i8** %ptr
27	%tmp1 = load <8 x i8>, <8 x i8>* %B
28	%tmp2 = extractelement <8 x i8> %tmp1, i32 3
29	store i8 %tmp2, i8* %A, align 8
30	%tmp3 = getelementptr i8, i8* %A, i32 1
31	store i8* %tmp3, i8** %ptr
32	ret void
33}
34
35;Check the alignment value.  Max for this instruction is 16 bits:
; Stores lane 2 of a <4 x i16> with align 8 on the store; the expected
; vst1.16 uses the ":16" qualifier rather than anything larger.
36define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind {
37; CHECK-LABEL: vst1lanei16:
38; CHECK:       @ %bb.0:
39; CHECK-NEXT:    vldr d16, [r1]
40; CHECK-NEXT:    vst1.16 {d16[2]}, [r0:16]
41; CHECK-NEXT:    mov pc, lr
42	%tmp1 = load <4 x i16>, <4 x i16>* %B
43	%tmp2 = extractelement <4 x i16> %tmp1, i32 2
44	store i16 %tmp2, i16* %A, align 8
45	ret void
46}
47
48;Check the alignment value.  Max for this instruction is 32 bits:
; Stores lane 1 of a <2 x i32> with align 8 on the store; the expected
; vst1.32 uses the ":32" qualifier.
49define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind {
50; CHECK-LABEL: vst1lanei32:
51; CHECK:       @ %bb.0:
52; CHECK-NEXT:    vldr d16, [r1]
53; CHECK-NEXT:    vst1.32 {d16[1]}, [r0:32]
54; CHECK-NEXT:    mov pc, lr
55	%tmp1 = load <2 x i32>, <2 x i32>* %B
56	%tmp2 = extractelement <2 x i32> %tmp1, i32 1
57	store i32 %tmp2, i32* %A, align 8
58	ret void
59}
60
; Float variant of the 32-bit single-lane store: lane 1 of a <2 x float> is
; stored to %A. The store has no explicit align, and the expected vst1.32
; uses the ":32" qualifier.
61define void @vst1lanef(float* %A, <2 x float>* %B) nounwind {
62; CHECK-LABEL: vst1lanef:
63; CHECK:       @ %bb.0:
64; CHECK-NEXT:    vldr d16, [r1]
65; CHECK-NEXT:    vst1.32 {d16[1]}, [r0:32]
66; CHECK-NEXT:    mov pc, lr
67	%tmp1 = load <2 x float>, <2 x float>* %B
68	%tmp2 = extractelement <2 x float> %tmp1, i32 1
69	store float %tmp2, float* %A
70	ret void
71}
72
73; // Can use scalar load. No need to use vectors.
74; // CHE-CK: vst1.8 {d17[1]}, [r0]
; NOTE(review): the note above does not match the expected output below, which
; still loads the full vector (vld1.64) and stores with vst1.8 - confirm
; whether the note is stale for this function.
; Lane 9 of the <16 x i8> shows up as lane 1 of d17 in the expected store.
75define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
76; CHECK-LABEL: vst1laneQi8:
77; CHECK:       @ %bb.0:
78; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
79; CHECK-NEXT:    vst1.8 {d17[1]}, [r0]
80; CHECK-NEXT:    mov pc, lr
81	%tmp1 = load <16 x i8>, <16 x i8>* %B
82	%tmp2 = extractelement <16 x i8> %tmp1, i32 9
83	store i8 %tmp2, i8* %A, align 8
84	ret void
85}
86
; 128-bit source vector: lane 5 of the <8 x i16> shows up as lane 1 of d17 in
; the expected store. The store is marked align 8 and the expected vst1.16
; uses the ":16" qualifier.
87define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
88; CHECK-LABEL: vst1laneQi16:
89; CHECK:       @ %bb.0:
90; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
91; CHECK-NEXT:    vst1.16 {d17[1]}, [r0:16]
92; CHECK-NEXT:    mov pc, lr
93	%tmp1 = load <8 x i16>, <8 x i16>* %B
94	%tmp2 = extractelement <8 x i16> %tmp1, i32 5
95	store i16 %tmp2, i16* %A, align 8
96	ret void
97}
98
99; // Can use scalar load. No need to use vectors.
100; // CHE-CK: vst1.32 {d17[1]}, [r0:32]
; Lane 3 of the <4 x i32> sits at byte offset 12, so the expected output is a
; plain ldr from [r1, #12] followed by a str; no NEON instruction is expected.
101define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
102; CHECK-LABEL: vst1laneQi32:
103; CHECK:       @ %bb.0:
104; CHECK-NEXT:    ldr r1, [r1, #12]
105; CHECK-NEXT:    str r1, [r0]
106; CHECK-NEXT:    mov pc, lr
107	%tmp1 = load <4 x i32>, <4 x i32>* %B
108	%tmp2 = extractelement <4 x i32> %tmp1, i32 3
109	store i32 %tmp2, i32* %A, align 8
110	ret void
111}
112
113;Check for a post-increment updating store.
114; // Can use scalar load. No need to use vectors.
115; // CHE-CK: vst1.32 {d17[1]}, [r1:32]!
; Same scalar form as vst1laneQi32, with the pointer advanced by one i32 and
; written back to %ptr; the expected str is post-indexed by #4.
116define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
117; CHECK-LABEL: vst1laneQi32_update:
118; CHECK:       @ %bb.0:
119; CHECK-NEXT:    ldr r2, [r0]
120; CHECK-NEXT:    ldr r1, [r1, #12]
121; CHECK-NEXT:    str r1, [r2], #4
122; CHECK-NEXT:    str r2, [r0]
123; CHECK-NEXT:    mov pc, lr
124	%A = load i32*, i32** %ptr
125	%tmp1 = load <4 x i32>, <4 x i32>* %B
126	%tmp2 = extractelement <4 x i32> %tmp1, i32 3
127	store i32 %tmp2, i32* %A, align 8
128	%tmp3 = getelementptr i32, i32* %A, i32 1
129	store i32* %tmp3, i32** %ptr
130	ret void
131}
132
133; // Can use scalar load. No need to use vectors.
134; // CHE-CK: vst1.32 {d17[1]}, [r0]
; Float variant of vst1laneQi32: lane 3 of the <4 x float> is expected to be
; moved with an integer ldr from [r1, #12] and a str, with no NEON instruction.
135define void @vst1laneQf(float* %A, <4 x float>* %B) nounwind {
136; CHECK-LABEL: vst1laneQf:
137; CHECK:       @ %bb.0:
138; CHECK-NEXT:    ldr r1, [r1, #12]
139; CHECK-NEXT:    str r1, [r0]
140; CHECK-NEXT:    mov pc, lr
141	%tmp1 = load <4 x float>, <4 x float>* %B
142	%tmp2 = extractelement <4 x float> %tmp1, i32 3
143	store float %tmp2, float* %A
144	ret void
145}
146
147;Check the alignment value.  Max for this instruction is 16 bits:
; Calls the vst2lane intrinsic with the same vector for both operands, lane
; operand 1 and final (alignment) operand 4; the expected vst2.8 uses ":16".
; The vorr in the expected output copies d16 into d17 for the second operand.
148define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind {
149; CHECK-LABEL: vst2lanei8:
150; CHECK:       @ %bb.0:
151; CHECK-NEXT:    vldr d16, [r1]
152; CHECK-NEXT:    vorr d17, d16, d16
153; CHECK-NEXT:    vst2.8 {d16[1], d17[1]}, [r0:16]
154; CHECK-NEXT:    mov pc, lr
155	%tmp1 = load <8 x i8>, <8 x i8>* %B
156	call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
157	ret void
158}
159
160;Check the alignment value.  Max for this instruction is 32 bits:
; Lane operand 1 and final (alignment) operand 8 are passed to the intrinsic;
; the expected vst2.16 uses ":32". %A is bitcast to i8* because the intrinsic
; takes an i8* address.
161define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
162; CHECK-LABEL: vst2lanei16:
163; CHECK:       @ %bb.0:
164; CHECK-NEXT:    vldr d16, [r1]
165; CHECK-NEXT:    vorr d17, d16, d16
166; CHECK-NEXT:    vst2.16 {d16[1], d17[1]}, [r0:32]
167; CHECK-NEXT:    mov pc, lr
168	%tmp0 = bitcast i16* %A to i8*
169	%tmp1 = load <4 x i16>, <4 x i16>* %B
170	call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
171	ret void
172}
173
174;Check for a post-increment updating store with register increment.
; The pointer is advanced by %inc i16 elements and written back to %ptr. The
; expected output scales %inc to bytes (lsl #1) and passes it as the register
; post-increment of vst2.16. The final intrinsic operand is 2 and no alignment
; qualifier is expected on the address.
175define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind {
176; CHECK-LABEL: vst2lanei16_update:
177; CHECK:       @ %bb.0:
178; CHECK-NEXT:    vldr d16, [r1]
179; CHECK-NEXT:    lsl r1, r2, #1
180; CHECK-NEXT:    ldr r3, [r0]
181; CHECK-NEXT:    vorr d17, d16, d16
182; CHECK-NEXT:    vst2.16 {d16[1], d17[1]}, [r3], r1
183; CHECK-NEXT:    str r3, [r0]
184; CHECK-NEXT:    mov pc, lr
185	%A = load i16*, i16** %ptr
186	%tmp0 = bitcast i16* %A to i8*
187	%tmp1 = load <4 x i16>, <4 x i16>* %B
188	call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
189	%tmp2 = getelementptr i16, i16* %A, i32 %inc
190	store i16* %tmp2, i16** %ptr
191	ret void
192}
193
; vst2lane on <2 x i32> with lane operand 1 and final (alignment) operand 1;
; the expected vst2.32 has no alignment qualifier on [r0].
194define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
195; CHECK-LABEL: vst2lanei32:
196; CHECK:       @ %bb.0:
197; CHECK-NEXT:    vldr d16, [r1]
198; CHECK-NEXT:    vorr d17, d16, d16
199; CHECK-NEXT:    vst2.32 {d16[1], d17[1]}, [r0]
200; CHECK-NEXT:    mov pc, lr
201	%tmp0 = bitcast i32* %A to i8*
202	%tmp1 = load <2 x i32>, <2 x i32>* %B
203	call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
204	ret void
205}
206
; Float variant of vst2lanei32: same lane (1) and final operand (1), and the
; same expected vst2.32 without an alignment qualifier.
207define void @vst2lanef(float* %A, <2 x float>* %B) nounwind {
208; CHECK-LABEL: vst2lanef:
209; CHECK:       @ %bb.0:
210; CHECK-NEXT:    vldr d16, [r1]
211; CHECK-NEXT:    vorr d17, d16, d16
212; CHECK-NEXT:    vst2.32 {d16[1], d17[1]}, [r0]
213; CHECK-NEXT:    mov pc, lr
214	%tmp0 = bitcast float* %A to i8*
215	%tmp1 = load <2 x float>, <2 x float>* %B
216	call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
217	ret void
218}
219
220;Check the (default) alignment.
; 128-bit operands: lane 5 of each <8 x i16> shows up as lane 1 of the odd
; D registers (d17, d19) in the expected store. The final intrinsic operand
; is 1 and no alignment qualifier is expected.
221define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
222; CHECK-LABEL: vst2laneQi16:
223; CHECK:       @ %bb.0:
224; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
225; CHECK-NEXT:    vorr q9, q8, q8
226; CHECK-NEXT:    vst2.16 {d17[1], d19[1]}, [r0]
227; CHECK-NEXT:    mov pc, lr
228	%tmp0 = bitcast i16* %A to i8*
229	%tmp1 = load <8 x i16>, <8 x i16>* %B
230	call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
231	ret void
232}
233
234;Check the alignment value.  Max for this instruction is 64 bits:
; Lane 2 of each <4 x i32> shows up as lane 0 of d17/d19. The final intrinsic
; operand is 16, while the expected vst2.32 uses only ":64".
235define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
236; CHECK-LABEL: vst2laneQi32:
237; CHECK:       @ %bb.0:
238; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
239; CHECK-NEXT:    vorr q9, q8, q8
240; CHECK-NEXT:    vst2.32 {d17[0], d19[0]}, [r0:64]
241; CHECK-NEXT:    mov pc, lr
242	%tmp0 = bitcast i32* %A to i8*
243	%tmp1 = load <4 x i32>, <4 x i32>* %B
244	call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
245	ret void
246}
247
; Float 128-bit variant: lane 3 of each <4 x float> shows up as lane 1 of
; d17/d19. The final intrinsic operand is 1 and no alignment qualifier is
; expected.
248define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind {
249; CHECK-LABEL: vst2laneQf:
250; CHECK:       @ %bb.0:
251; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
252; CHECK-NEXT:    vorr q9, q8, q8
253; CHECK-NEXT:    vst2.32 {d17[1], d19[1]}, [r0]
254; CHECK-NEXT:    mov pc, lr
255	%tmp0 = bitcast float* %A to i8*
256	%tmp1 = load <4 x float>, <4 x float>* %B
257	call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
258	ret void
259}
260
; Declarations of the vst2lane intrinsics called by the tests above. Each takes
; an i8* address, two vectors of the same type, and two i32 operands; judging
; by the call sites and expected output these are the lane index and the
; alignment. The first group covers 64-bit vector types, the second 128-bit.
261declare void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind
262declare void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind
263declare void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind
264declare void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind
265
266declare void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind
267declare void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind
268declare void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind
269
; vst3lane on <8 x i8>, passing the same vector three times with lane operand
; 1 and final (alignment) operand 1. The expected output copies d16 into d17
; and d18 and emits vst3.8 without an alignment qualifier.
270define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
271; CHECK-LABEL: vst3lanei8:
272; CHECK:       @ %bb.0:
273; CHECK-NEXT:    vldr d16, [r1]
274; CHECK-NEXT:    vorr d17, d16, d16
275; CHECK-NEXT:    vorr d18, d16, d16
276; CHECK-NEXT:    vst3.8 {d16[1], d17[1], d18[1]}, [r0]
277; CHECK-NEXT:    mov pc, lr
278	%tmp1 = load <8 x i8>, <8 x i8>* %B
279	call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
280	ret void
281}
282
283;Check the (default) alignment value.  VST3 does not support alignment.
; The final intrinsic operand is 8, but the expected vst3.16 still has no
; alignment qualifier on [r0].
284define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind {
285; CHECK-LABEL: vst3lanei16:
286; CHECK:       @ %bb.0:
287; CHECK-NEXT:    vldr d16, [r1]
288; CHECK-NEXT:    vorr d17, d16, d16
289; CHECK-NEXT:    vorr d18, d16, d16
290; CHECK-NEXT:    vst3.16 {d16[1], d17[1], d18[1]}, [r0]
291; CHECK-NEXT:    mov pc, lr
292	%tmp0 = bitcast i16* %A to i8*
293	%tmp1 = load <4 x i16>, <4 x i16>* %B
294	call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
295	ret void
296}
297
; vst3lane on <2 x i32> with lane operand 1 and final (alignment) operand 1;
; the expected vst3.32 has no alignment qualifier.
298define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind {
299; CHECK-LABEL: vst3lanei32:
300; CHECK:       @ %bb.0:
301; CHECK-NEXT:    vldr d16, [r1]
302; CHECK-NEXT:    vorr d17, d16, d16
303; CHECK-NEXT:    vorr d18, d16, d16
304; CHECK-NEXT:    vst3.32 {d16[1], d17[1], d18[1]}, [r0]
305; CHECK-NEXT:    mov pc, lr
306	%tmp0 = bitcast i32* %A to i8*
307	%tmp1 = load <2 x i32>, <2 x i32>* %B
308	call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
309	ret void
310}
311
; Float variant of vst3lanei32: same lane (1) and final operand (1), and the
; same expected vst3.32 without an alignment qualifier.
312define void @vst3lanef(float* %A, <2 x float>* %B) nounwind {
313; CHECK-LABEL: vst3lanef:
314; CHECK:       @ %bb.0:
315; CHECK-NEXT:    vldr d16, [r1]
316; CHECK-NEXT:    vorr d17, d16, d16
317; CHECK-NEXT:    vorr d18, d16, d16
318; CHECK-NEXT:    vst3.32 {d16[1], d17[1], d18[1]}, [r0]
319; CHECK-NEXT:    mov pc, lr
320	%tmp0 = bitcast float* %A to i8*
321	%tmp1 = load <2 x float>, <2 x float>* %B
322	call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
323	ret void
324}
325
; 128-bit operands: lane 6 of each <8 x i16> shows up as lane 2 of the odd
; D registers (d17, d19, d21). The final intrinsic operand is 8, yet the
; expected vst3.16 has no alignment qualifier.
326define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
327; CHECK-LABEL: vst3laneQi16:
328; CHECK:       @ %bb.0:
329; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
330; CHECK-NEXT:    vorr q9, q8, q8
331; CHECK-NEXT:    vorr q10, q8, q8
332; CHECK-NEXT:    vst3.16 {d17[2], d19[2], d21[2]}, [r0]
333; CHECK-NEXT:    mov pc, lr
334;Check the (default) alignment value.  VST3 does not support alignment.
335	%tmp0 = bitcast i16* %A to i8*
336	%tmp1 = load <8 x i16>, <8 x i16>* %B
337	call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
338	ret void
339}
340
; Lane 0 of each <4 x i32> shows up as lane 0 of the even D registers
; (d16, d18, d20). The final intrinsic operand is 1 and no alignment qualifier
; is expected.
341define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
342; CHECK-LABEL: vst3laneQi32:
343; CHECK:       @ %bb.0:
344; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
345; CHECK-NEXT:    vorr q9, q8, q8
346; CHECK-NEXT:    vorr q10, q8, q8
347; CHECK-NEXT:    vst3.32 {d16[0], d18[0], d20[0]}, [r0]
348; CHECK-NEXT:    mov pc, lr
349	%tmp0 = bitcast i32* %A to i8*
350	%tmp1 = load <4 x i32>, <4 x i32>* %B
351	call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
352	ret void
353}
354
355;Check for a post-increment updating store.
; Same store as vst3laneQi32, with the pointer advanced by three i32 elements
; (the amount the vst3.32 lane store writes) and written back to %ptr; the
; expected output uses the writeback form "[r2]!".
356define void @vst3laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
357; CHECK-LABEL: vst3laneQi32_update:
358; CHECK:       @ %bb.0:
359; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
360; CHECK-NEXT:    vorr q9, q8, q8
361; CHECK-NEXT:    ldr r2, [r0]
362; CHECK-NEXT:    vorr q10, q8, q8
363; CHECK-NEXT:    vst3.32 {d16[0], d18[0], d20[0]}, [r2]!
364; CHECK-NEXT:    str r2, [r0]
365; CHECK-NEXT:    mov pc, lr
366	%A = load i32*, i32** %ptr
367	%tmp0 = bitcast i32* %A to i8*
368	%tmp1 = load <4 x i32>, <4 x i32>* %B
369	call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
370	%tmp2 = getelementptr i32, i32* %A, i32 3
371	store i32* %tmp2, i32** %ptr
372	ret void
373}
374
; Float 128-bit variant: lane 1 of each <4 x float> shows up as lane 1 of the
; even D registers (d16, d18, d20). No alignment qualifier is expected.
375define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
376; CHECK-LABEL: vst3laneQf:
377; CHECK:       @ %bb.0:
378; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
379; CHECK-NEXT:    vorr q9, q8, q8
380; CHECK-NEXT:    vorr q10, q8, q8
381; CHECK-NEXT:    vst3.32 {d16[1], d18[1], d20[1]}, [r0]
382; CHECK-NEXT:    mov pc, lr
383	%tmp0 = bitcast float* %A to i8*
384	%tmp1 = load <4 x float>, <4 x float>* %B
385	call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
386	ret void
387}
388
; Declarations of the vst3lane intrinsics called by the tests above. Each takes
; an i8* address, three vectors of the same type, and two i32 operands; judging
; by the call sites and expected output these are the lane index and the
; alignment. The first group covers 64-bit vector types, the second 128-bit.
389declare void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
390declare void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
391declare void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
392declare void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
393
394declare void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
395declare void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
396declare void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
397
398
399;Check the alignment value.  Max for this instruction is 32 bits:
; vst4lane on <8 x i8>, passing the same vector four times with lane operand 1
; and final (alignment) operand 8; the expected vst4.8 uses only ":32".
400define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
401; CHECK-LABEL: vst4lanei8:
402; CHECK:       @ %bb.0:
403; CHECK-NEXT:    vldr d16, [r1]
404; CHECK-NEXT:    vorr d17, d16, d16
405; CHECK-NEXT:    vorr d18, d16, d16
406; CHECK-NEXT:    vorr d19, d16, d16
407; CHECK-NEXT:    vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
408; CHECK-NEXT:    mov pc, lr
409	%tmp1 = load <8 x i8>, <8 x i8>* %B
410	call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
411	ret void
412}
413
414;Check for a post-increment updating store.
; Same store as vst4lanei8, with the pointer advanced by four i8 elements
; (the amount the vst4.8 lane store writes) and written back to %ptr; the
; expected output keeps the ":32" qualifier and uses writeback ("[r2:32]!").
415define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
416; CHECK-LABEL: vst4lanei8_update:
417; CHECK:       @ %bb.0:
418; CHECK-NEXT:    vldr d16, [r1]
419; CHECK-NEXT:    vorr d17, d16, d16
420; CHECK-NEXT:    ldr r2, [r0]
421; CHECK-NEXT:    vorr d18, d16, d16
422; CHECK-NEXT:    vorr d19, d16, d16
423; CHECK-NEXT:    vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r2:32]!
424; CHECK-NEXT:    str r2, [r0]
425; CHECK-NEXT:    mov pc, lr
426	%A = load i8*, i8** %ptr
427	%tmp1 = load <8 x i8>, <8 x i8>* %B
428	call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
429	%tmp2 = getelementptr i8, i8* %A, i32 4
430	store i8* %tmp2, i8** %ptr
431	ret void
432}
433
; vst4lane on <4 x i16> with lane operand 1 and final (alignment) operand 1;
; the expected vst4.16 has no alignment qualifier.
434define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
435; CHECK-LABEL: vst4lanei16:
436; CHECK:       @ %bb.0:
437; CHECK-NEXT:    vldr d16, [r1]
438; CHECK-NEXT:    vorr d17, d16, d16
439; CHECK-NEXT:    vorr d18, d16, d16
440; CHECK-NEXT:    vorr d19, d16, d16
441; CHECK-NEXT:    vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
442; CHECK-NEXT:    mov pc, lr
443	%tmp0 = bitcast i16* %A to i8*
444	%tmp1 = load <4 x i16>, <4 x i16>* %B
445	call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
446	ret void
447}
448
449;Check the alignment value.  Max for this instruction is 128 bits:
; Lane operand 1 and final (alignment) operand 16 are passed to the intrinsic;
; the expected vst4.32 uses the ":128" qualifier.
450define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
451; CHECK-LABEL: vst4lanei32:
452; CHECK:       @ %bb.0:
453; CHECK-NEXT:    vldr d16, [r1]
454; CHECK-NEXT:    vorr d17, d16, d16
455; CHECK-NEXT:    vorr d18, d16, d16
456; CHECK-NEXT:    vorr d19, d16, d16
457; CHECK-NEXT:    vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128]
458; CHECK-NEXT:    mov pc, lr
459	%tmp0 = bitcast i32* %A to i8*
460	%tmp1 = load <2 x i32>, <2 x i32>* %B
461	call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
462	ret void
463}
464
; Float variant of the 32-bit vst4lane store with final (alignment) operand 1;
; the expected vst4.32 has no alignment qualifier.
465define void @vst4lanef(float* %A, <2 x float>* %B) nounwind {
466; CHECK-LABEL: vst4lanef:
467; CHECK:       @ %bb.0:
468; CHECK-NEXT:    vldr d16, [r1]
469; CHECK-NEXT:    vorr d17, d16, d16
470; CHECK-NEXT:    vorr d18, d16, d16
471; CHECK-NEXT:    vorr d19, d16, d16
472; CHECK-NEXT:    vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
473; CHECK-NEXT:    mov pc, lr
474	%tmp0 = bitcast float* %A to i8*
475	%tmp1 = load <2 x float>, <2 x float>* %B
476	call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
477	ret void
478}
479
480;Check the alignment value.  Max for this instruction is 64 bits:
; 128-bit operands: lane 7 of each <8 x i16> shows up as lane 3 of the odd
; D registers (d17, d19, d21, d23). The final intrinsic operand is 16, while
; the expected vst4.16 uses only ":64".
481define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
482; CHECK-LABEL: vst4laneQi16:
483; CHECK:       @ %bb.0:
484; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
485; CHECK-NEXT:    vorr q9, q8, q8
486; CHECK-NEXT:    vorr q10, q8, q8
487; CHECK-NEXT:    vorr q11, q8, q8
488; CHECK-NEXT:    vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64]
489; CHECK-NEXT:    mov pc, lr
490	%tmp0 = bitcast i16* %A to i8*
491	%tmp1 = load <8 x i16>, <8 x i16>* %B
492	call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
493	ret void
494}
495
496;Check the (default) alignment.
; Lane 2 of each <4 x i32> shows up as lane 0 of the odd D registers
; (d17, d19, d21, d23). The final intrinsic operand is 1 and no alignment
; qualifier is expected.
497define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
498; CHECK-LABEL: vst4laneQi32:
499; CHECK:       @ %bb.0:
500; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
501; CHECK-NEXT:    vorr q9, q8, q8
502; CHECK-NEXT:    vorr q10, q8, q8
503; CHECK-NEXT:    vorr q11, q8, q8
504; CHECK-NEXT:    vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
505; CHECK-NEXT:    mov pc, lr
506	%tmp0 = bitcast i32* %A to i8*
507	%tmp1 = load <4 x i32>, <4 x i32>* %B
508	call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
509	ret void
510}
511
; Float 128-bit variant: lane 1 of each <4 x float> shows up as lane 1 of the
; even D registers (d16, d18, d20, d22). No alignment qualifier is expected.
512define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind {
513; CHECK-LABEL: vst4laneQf:
514; CHECK:       @ %bb.0:
515; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
516; CHECK-NEXT:    vorr q9, q8, q8
517; CHECK-NEXT:    vorr q10, q8, q8
518; CHECK-NEXT:    vorr q11, q8, q8
519; CHECK-NEXT:    vst4.32 {d16[1], d18[1], d20[1], d22[1]}, [r0]
520; CHECK-NEXT:    mov pc, lr
521	%tmp0 = bitcast float* %A to i8*
522	%tmp1 = load <4 x float>, <4 x float>* %B
523	call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
524	ret void
525}
526
527; Make sure this doesn't crash; PR10258
; insertelement with a run-time index %c. The expected output goes through a
; 16-byte-aligned stack slot: the vector is stored there, the index is masked
; to 0..7 and scaled to a byte offset (and #7, lsl #1), the i16 is written at
; that offset with strh, and the whole vector is reloaded and returned in r0-r3.
528define <8 x i16> @variable_insertelement(<8 x i16> %a, i16 %b, i32 %c) nounwind readnone {
529; CHECK-LABEL: variable_insertelement:
530; CHECK:       @ %bb.0:
531; CHECK-NEXT:    push {r11, lr}
532; CHECK-NEXT:    mov r11, sp
533; CHECK-NEXT:    sub sp, sp, #24
534; CHECK-NEXT:    bic sp, sp, #15
535; CHECK-NEXT:    ldr lr, [r11, #12]
536; CHECK-NEXT:    vmov d17, r2, r3
537; CHECK-NEXT:    vmov d16, r0, r1
538; CHECK-NEXT:    mov r1, sp
539; CHECK-NEXT:    and r0, lr, #7
540; CHECK-NEXT:    mov r2, r1
541; CHECK-NEXT:    ldrh r12, [r11, #8]
542; CHECK-NEXT:    lsl r0, r0, #1
543; CHECK-NEXT:    vst1.64 {d16, d17}, [r2:128], r0
544; CHECK-NEXT:    strh r12, [r2]
545; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
546; CHECK-NEXT:    vmov r0, r1, d16
547; CHECK-NEXT:    vmov r2, r3, d17
548; CHECK-NEXT:    mov sp, r11
549; CHECK-NEXT:    pop {r11, lr}
550; CHECK-NEXT:    mov pc, lr
551    %r = insertelement <8 x i16> %a, i16 %b, i32 %c
552    ret <8 x i16> %r
553}
554
; Declarations of the vst4lane intrinsics called by the tests above. Each takes
; an i8* address, four vectors of the same type, and two i32 operands; judging
; by the call sites and expected output these are the lane index and the
; alignment. The first group covers 64-bit vector types, the second 128-bit.
555declare void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
556declare void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
557declare void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
558declare void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
559
560declare void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
561declare void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
562declare void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
563