1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon | FileCheck %s --check-prefixes=CHECK,DEFAULT
3; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic | FileCheck %s --check-prefixes=CHECK,BASIC
4
; Check the (default) alignment value: an 8-bit lane load never carries an
; alignment suffix, so the vld1.8 uses a plain [r0] address.
define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vld1lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.8 {d16[3]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = load i8, i8* %A, align 8
  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
  ret <8 x i8> %tmp3
}
18
; Check the alignment value. Max for this instruction is 16 bits:
; the IR load is align 8, but the emitted suffix is clamped to :16.
define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vld1lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.16 {d16[2]}, [r0:16]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = load i16, i16* %A, align 8
  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
  ret <4 x i16> %tmp3
}
32
; Check the alignment value. Max for this instruction is 32 bits:
; the IR load is align 8, but the emitted suffix is clamped to :32.
define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vld1lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = load i32, i32* %A, align 8
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}
46
; Check the alignment value. Legal values are none or :32.
; Here the IR load is exactly align 4, which maps to the :32 suffix.
define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vld1lanei32a32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = load i32, i32* %A, align 4
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}
60
; Float lane load: a 32-bit float lane uses the same vld1.32 form with a
; :32 alignment suffix from the align 4 IR load.
define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vld1lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = load float, float* %A, align 4
  %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
  ret <2 x float> %tmp3
}
73
; Q-register variant: lane 9 of a <16 x i8> lives in the high D register
; (d17 lane 1) of the q8 pair.
define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vld1laneQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.8 {d17[1]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %B
  %tmp2 = load i8, i8* %A, align 8
  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
  ret <16 x i8> %tmp3
}
87
; Q-register variant: lane 5 of a <8 x i16> maps to d17 lane 1; the align 8
; load is clamped to the :16 suffix (max for vld1.16).
define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vld1laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.16 {d17[1]}, [r0:16]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = load i16, i16* %A, align 8
  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
  ret <8 x i16> %tmp3
}
101
; Q-register variant: lane 3 of a <4 x i32> maps to d17 lane 1; the align 8
; load is clamped to the :32 suffix (max for vld1.32).
define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vld1laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.32 {d17[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = load i32, i32* %A, align 8
  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
  ret <4 x i32> %tmp3
}
115
; Q-register float variant: lane 0 maps to d16 lane 0. The float load has no
; explicit alignment; the natural align 4 still yields a :32 suffix.
define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vld1laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = load float, float* %A
  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
  ret <4 x float> %tmp3
}
129
; Aggregate return types for the two-register (vld2lane) intrinsics:
; 64-bit D-register element types first, then 128-bit Q-register element types.
%struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
138
; Check the alignment value. Max for this instruction is 16 bits:
; the intrinsic's alignment operand is 4 bytes, emitted as the :16 suffix.
define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vld2lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.8 {d16[1], d17[1]}, [r0:16]
; CHECK-NEXT:    vadd.i8 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}
156
; Check the alignment value. Max for this instruction is 32 bits:
; the intrinsic's alignment operand is 8 bytes, clamped to the :32 suffix.
define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vld2lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.16 {d16[1], d17[1]}, [r0:32]
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}
175
; Alignment operand of 1 byte: no alignment suffix is emitted on the vld2.32.
define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vld2lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}
193
; Check for a post-increment updating load: the GEP advances the pointer by
; exactly the two i32 elements loaded, so the writeback form [rN]! is used.
define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
; DEFAULT-LABEL: vld2lanei32_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    ldr r3, [r0]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vld2.32 {d16[1], d17[1]}, [r3]!
; DEFAULT-NEXT:    vadd.i32 d16, d16, d17
; DEFAULT-NEXT:    str r3, [r0]
; DEFAULT-NEXT:    vmov r2, r1, d16
; DEFAULT-NEXT:    mov r0, r2
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld2lanei32_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    mov r2, r1
; BASIC-NEXT:    mov r1, r0
; BASIC-NEXT:    vldr d16, [r2]
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr d17, d16, d16
; BASIC-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]!
; BASIC-NEXT:    vadd.i32 d16, d16, d17
; BASIC-NEXT:    str r0, [r1]
; BASIC-NEXT:    vmov r2, r3, d16
; BASIC-NEXT:    mov r0, r2
; BASIC-NEXT:    mov r1, r3
; BASIC-NEXT:    mov pc, lr
  %A = load i32*, i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32, i32* %A, i32 2
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}
233
; Post-increment by a non-natural amount (3 elements = 12 bytes): the
; increment does not match the access size, so a register increment is used.
define <2 x i32> @vld2lanei32_odd_update(i32** %ptr, <2 x i32>* %B) nounwind {
; DEFAULT-LABEL: vld2lanei32_odd_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    mov r1, #12
; DEFAULT-NEXT:    ldr r3, [r0]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vld2.32 {d16[1], d17[1]}, [r3], r1
; DEFAULT-NEXT:    vadd.i32 d16, d16, d17
; DEFAULT-NEXT:    str r3, [r0]
; DEFAULT-NEXT:    vmov r2, r1, d16
; DEFAULT-NEXT:    mov r0, r2
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld2lanei32_odd_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    mov r2, r1
; BASIC-NEXT:    mov r1, r0
; BASIC-NEXT:    vldr d16, [r2]
; BASIC-NEXT:    mov r2, #12
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr d17, d16, d16
; BASIC-NEXT:    vld2.32 {d16[1], d17[1]}, [r0], r2
; BASIC-NEXT:    vadd.i32 d16, d16, d17
; BASIC-NEXT:    str r0, [r1]
; BASIC-NEXT:    vmov r2, r3, d16
; BASIC-NEXT:    mov r0, r2
; BASIC-NEXT:    mov r1, r3
; BASIC-NEXT:    mov pc, lr
  %A = load i32*, i32** %ptr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32, i32* %A, i32 3
  store i32* %tmp6, i32** %ptr
  ret <2 x i32> %tmp5
}
274
; Float variant of the vld2 lane load; alignment operand 1 means no suffix.
define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vld2lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]
; CHECK-NEXT:    vadd.f32 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}
292
; Check the (default) alignment. Q-register vld2 on lane 5 uses the odd
; D registers {d17, d19} (stride 2 between the two Q registers).
define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vld2laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.16 {d17[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}
312
; Check the alignment value. Max for this instruction is 64 bits:
; the intrinsic's alignment operand is 16 bytes, clamped to the :64 suffix.
define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vld2laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.32 {d17[0], d19[0]}, [r0:64]
; CHECK-NEXT:    vadd.i32 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
332
; Q-register float variant: lane 1 uses the even D registers {d16, d18}.
define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vld2laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.32 {d16[1], d18[1]}, [r0]
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}
351
; vld2lane intrinsic declarations (trailing i32 operands are lane index and
; alignment in bytes).
declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, i32, i32) nounwind readonly
360
; Aggregate return types for the three-register (vld3lane) intrinsics.
%struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
369
; Basic three-register lane load; the two register allocators produce
; different register assignments, hence separate DEFAULT/BASIC checks.
define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
; DEFAULT-LABEL: vld3lanei8:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.8 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i8 d20, d16, d17
; DEFAULT-NEXT:    vadd.i8 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei8:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.8 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i8 d16, d18, d19
; BASIC-NEXT:    vadd.i8 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
  %tmp6 = add <8 x i8> %tmp3, %tmp4
  %tmp7 = add <8 x i8> %tmp5, %tmp6
  ret <8 x i8> %tmp7
}
401
; Check the (default) alignment value. VLD3 does not support alignment:
; even with an 8-byte alignment operand, no suffix appears on the address.
define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
; DEFAULT-LABEL: vld3lanei16:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.16 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 d20, d16, d17
; DEFAULT-NEXT:    vadd.i16 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei16:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.16 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i16 d16, d18, d19
; BASIC-NEXT:    vadd.i16 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
  %tmp6 = add <4 x i16> %tmp3, %tmp4
  %tmp7 = add <4 x i16> %tmp5, %tmp6
  ret <4 x i16> %tmp7
}
435
; i32 variant of the three-register lane load (no alignment suffix).
define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
; DEFAULT-LABEL: vld3lanei32:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.32 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i32 d20, d16, d17
; DEFAULT-NEXT:    vadd.i32 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei32:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.32 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i32 d16, d18, d19
; BASIC-NEXT:    vadd.i32 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
  %tmp6 = add <2 x i32> %tmp3, %tmp4
  %tmp7 = add <2 x i32> %tmp5, %tmp6
  ret <2 x i32> %tmp7
}
468
; Float variant of the three-register lane load.
define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
; DEFAULT-LABEL: vld3lanef:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.32 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.f32 d20, d16, d17
; DEFAULT-NEXT:    vadd.f32 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanef:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.32 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.f32 d16, d18, d19
; BASIC-NEXT:    vadd.f32 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
  %tmp6 = fadd <2 x float> %tmp3, %tmp4
  %tmp7 = fadd <2 x float> %tmp5, %tmp6
  ret <2 x float> %tmp7
}
501
; Check the (default) alignment value. VLD3 does not support alignment;
; Q-register form uses every-other D register {d16, d18, d20}.
define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; DEFAULT-LABEL: vld3laneQi16:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi16:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}
537
; Check for a post-increment updating load with register increment: the
; variable %inc (scaled by element size via lsl #1) becomes the Rm operand.
define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
; DEFAULT-LABEL: vld3laneQi16_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    .save {r11, lr}
; DEFAULT-NEXT:    push {r11, lr}
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    lsl r1, r2, #1
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    ldr lr, [r0]
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [lr], r1
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    str lr, [r0]
; DEFAULT-NEXT:    vmov r12, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov r0, r12
; DEFAULT-NEXT:    pop {r11, lr}
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi16_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    .save {r11, lr}
; BASIC-NEXT:    push {r11, lr}
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    mov r3, r0
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    lsl r1, r2, #1
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0], r1
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    str r0, [r3]
; BASIC-NEXT:    vmov r1, lr, d16
; BASIC-NEXT:    vmov r2, r12, d17
; BASIC-NEXT:    mov r0, r1
; BASIC-NEXT:    mov r1, lr
; BASIC-NEXT:    mov r3, r12
; BASIC-NEXT:    pop {r11, lr}
; BASIC-NEXT:    mov pc, lr
  %A = load i16*, i16** %ptr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  %tmp8 = getelementptr i16, i16* %A, i32 %inc
  store i16* %tmp8, i16** %ptr
  ret <8 x i16> %tmp7
}
593
; Q-register i32 variant: lane 3 maps to the odd D registers {d17, d19, d21}.
define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; DEFAULT-LABEL: vld3laneQi32:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.32 {d17[1], d19[1], d21[1]}, [r0]
; DEFAULT-NEXT:    vadd.i32 q12, q8, q9
; DEFAULT-NEXT:    vadd.i32 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi32:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.32 {d19[1], d21[1], d23[1]}, [r0]
; BASIC-NEXT:    vadd.i32 q8, q9, q10
; BASIC-NEXT:    vadd.i32 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
  %tmp6 = add <4 x i32> %tmp3, %tmp4
  %tmp7 = add <4 x i32> %tmp5, %tmp6
  ret <4 x i32> %tmp7
}
628
; Q-register float variant of the three-register lane load.
define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
; DEFAULT-LABEL: vld3laneQf:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.32 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.f32 q12, q8, q9
; DEFAULT-NEXT:    vadd.f32 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQf:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.32 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.f32 q8, q9, q10
; BASIC-NEXT:    vadd.f32 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
  %tmp6 = fadd <4 x float> %tmp3, %tmp4
  %tmp7 = fadd <4 x float> %tmp5, %tmp6
  ret <4 x float> %tmp7
}
663
; vld3lane intrinsic declarations (trailing i32 operands are lane index and
; alignment in bytes).
declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
672
; Aggregate return types for the four-register (vld4lane) intrinsics.
%struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>,  <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
681
; Check the alignment value. Max for this instruction is 32 bits:
; the 8-byte alignment operand is clamped to the :32 suffix.
define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vld4lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
; CHECK-NEXT:    vadd.i8 d16, d16, d17
; CHECK-NEXT:    vadd.i8 d20, d18, d19
; CHECK-NEXT:    vadd.i8 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  ret <8 x i8> %tmp9
}
707
; Check for a post-increment updating load: the GEP advances by exactly the
; four i8 elements loaded, so the writeback form [rN:32]! is used.
define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
; DEFAULT-LABEL: vld4lanei8_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    ldr r3, [r0]
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vorr d19, d16, d16
; DEFAULT-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3:32]!
; DEFAULT-NEXT:    vadd.i8 d16, d16, d17
; DEFAULT-NEXT:    vadd.i8 d20, d18, d19
; DEFAULT-NEXT:    str r3, [r0]
; DEFAULT-NEXT:    vadd.i8 d16, d16, d20
; DEFAULT-NEXT:    vmov r2, r1, d16
; DEFAULT-NEXT:    mov r0, r2
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld4lanei8_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d16, [r1]
; BASIC-NEXT:    mov r3, r0
; BASIC-NEXT:    vorr d17, d16, d16
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr d18, d16, d16
; BASIC-NEXT:    vorr d19, d16, d16
; BASIC-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]!
; BASIC-NEXT:    vadd.i8 d16, d16, d17
; BASIC-NEXT:    vadd.i8 d20, d18, d19
; BASIC-NEXT:    str r0, [r3]
; BASIC-NEXT:    vadd.i8 d16, d16, d20
; BASIC-NEXT:    vmov r1, r2, d16
; BASIC-NEXT:    mov r0, r1
; BASIC-NEXT:    mov r1, r2
; BASIC-NEXT:    mov pc, lr
  %A = load i8*, i8** %ptr
  %tmp1 = load <8 x i8>, <8 x i8>* %B
  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
  %tmp7 = add <8 x i8> %tmp3, %tmp4
  %tmp8 = add <8 x i8> %tmp5, %tmp6
  %tmp9 = add <8 x i8> %tmp7, %tmp8
  %tmp10 = getelementptr i8, i8* %A, i32 4
  store i8* %tmp10, i8** %ptr
  ret <8 x i8> %tmp9
}
757
;Check that a power-of-two alignment smaller than the total size of the memory
;being loaded is ignored.
; The i32 4 alignment operand is below the 8-byte total access size of a
; vld4.16 lane load, so no address alignment qualifier is emitted
; (plain "[r0]" in the check below).
define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vld4lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vadd.i16 d20, d18, d19
; CHECK-NEXT:    vadd.i16 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <4 x i16>, <4 x i16>* %B
  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
  %tmp7 = add <4 x i16> %tmp3, %tmp4
  %tmp8 = add <4 x i16> %tmp5, %tmp6
  %tmp9 = add <4 x i16> %tmp7, %tmp8
  ret <4 x i16> %tmp9
}
785
;Check the alignment value.  An 8-byte alignment is allowed here even though
;it is smaller than the total size of the memory being loaded.
; A vld4.32 lane load touches 16 bytes in total, but the i32 8 alignment
; operand is still encodable and appears as the ":64" address qualifier.
define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vld4lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:64]
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vadd.i32 d20, d18, d19
; CHECK-NEXT:    vadd.i32 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <2 x i32>, <2 x i32>* %B
  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
  %tmp7 = add <2 x i32> %tmp3, %tmp4
  %tmp8 = add <2 x i32> %tmp5, %tmp6
  %tmp9 = add <2 x i32> %tmp7, %tmp8
  ret <2 x i32> %tmp9
}
813
; Same lane-load pattern for <2 x float>.  The i32 1 alignment operand is
; the default, so no address alignment qualifier is emitted ("[r0]").
define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vld4lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.f32 d16, d16, d17
; CHECK-NEXT:    vadd.f32 d20, d18, d19
; CHECK-NEXT:    vadd.f32 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <2 x float>, <2 x float>* %B
  %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
  %tmp7 = fadd <2 x float> %tmp3, %tmp4
  %tmp8 = fadd <2 x float> %tmp5, %tmp6
  %tmp9 = fadd <2 x float> %tmp7, %tmp8
  ret <2 x float> %tmp9
}
839
;Check the alignment value.  Max for this instruction is 64 bits:
; The i32 16 alignment operand is clamped down to the ":64" qualifier.
; Lane 1 of an <8 x i16> Q register lives in its low D register, hence the
; double-spaced {d16[1], d18[1], d20[1], d22[1]} register list.
define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vld4laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0:64]
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vadd.i16 q12, q10, q11
; CHECK-NEXT:    vadd.i16 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i16* %A to i8*
  %tmp1 = load <8 x i16>, <8 x i16>* %B
  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
  %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
  %tmp7 = add <8 x i16> %tmp3, %tmp4
  %tmp8 = add <8 x i16> %tmp5, %tmp6
  %tmp9 = add <8 x i16> %tmp7, %tmp8
  ret <8 x i16> %tmp9
}
867
;Check the (default) alignment.
; The i32 1 alignment operand yields no address qualifier.  Lane 2 of a
; <4 x i32> Q register is lane 0 of its high D register, hence the
; {d17[0], d19[0], d21[0], d23[0]} register list.
define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vld4laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
; CHECK-NEXT:    vadd.i32 q8, q8, q9
; CHECK-NEXT:    vadd.i32 q12, q10, q11
; CHECK-NEXT:    vadd.i32 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast i32* %A to i8*
  %tmp1 = load <4 x i32>, <4 x i32>* %B
  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
  %tmp7 = add <4 x i32> %tmp3, %tmp4
  %tmp8 = add <4 x i32> %tmp5, %tmp6
  %tmp9 = add <4 x i32> %tmp7, %tmp8
  ret <4 x i32> %tmp9
}
895
; Same Q-register lane-load pattern for <4 x float>, with default (i32 1)
; alignment, so no address qualifier is emitted.
define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vld4laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.32 {d16[1], d18[1], d20[1], d22[1]}, [r0]
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vadd.f32 q12, q10, q11
; CHECK-NEXT:    vadd.f32 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp0 = bitcast float* %A to i8*
  %tmp1 = load <4 x float>, <4 x float>* %B
  %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
  %tmp7 = fadd <4 x float> %tmp3, %tmp4
  %tmp8 = fadd <4 x float> %tmp5, %tmp6
  %tmp9 = fadd <4 x float> %tmp7, %tmp8
  ret <4 x float> %tmp9
}
922
923declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
924declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0i8(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
925declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0i8(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
926declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0i8(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
927
928declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
929declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0i8(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
930declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0i8(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
931
; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
; DEFAULT-LABEL: test_qqqq_regsequence_subreg:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    add r0, sp, #24
; DEFAULT-NEXT:    vld1.32 {d21[0]}, [r0:32]
; DEFAULT-NEXT:    add r0, sp, #28
; DEFAULT-NEXT:    vmov.i32 d20, #0x0
; DEFAULT-NEXT:    vld1.32 {d21[1]}, [r0:32]
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: test_qqqq_regsequence_subreg:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    add r0, sp, #24
; BASIC-NEXT:    vld1.32 {d23[0]}, [r0:32]
; BASIC-NEXT:    add r0, sp, #28
; BASIC-NEXT:    vmov.i32 d22, #0x0
; BASIC-NEXT:    vld1.32 {d23[1]}, [r0:32]
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
; Build an i128 whose low 64 bits are the constant 0 and whose high 64 bits
; hold the last element of %b, then reinterpret it as <8 x i16> to feed in
; as the third vld3lane operand; the other two inputs are undef.
  %tmp63 = extractvalue [6 x i64] %b, 5
  %tmp64 = zext i64 %tmp63 to i128
  %tmp65 = shl i128 %tmp64, 64
  %ins67 = or i128 %tmp65, 0
  %tmp78 = bitcast i128 %ins67 to <8 x i16>
  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}
977
978declare void @llvm.trap() nounwind
979