; RUN: llc -mtriple=arm-eabi -mattr=+neon -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON
; RUN: llc -mtriple=arm-eabi -mattr=-neon -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NONEON

; Factor-2 de-interleaving load of <16 x i8>.
; The even-lane and odd-lane shuffles of one wide load form a stride-2 access.
; The NEON run expects a single vld2.8; the run without NEON must not emit
; any vld2.
; NEON-LABEL: load_factor2:
; NEON: vld2.8 {d16, d17}, [r0]
; NONEON-LABEL: load_factor2:
; NONEON-NOT: vld2
define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
  ; Field 0 = lanes 0,2,...,14; field 1 = lanes 1,3,...,15 of the wide vector.
  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add nsw <8 x i8> %strided.v0, %strided.v1
  ret <8 x i8> %add
}

; Factor-3 de-interleaving load of <6 x i32>.
; Only fields 2 and 1 of the three stride-3 fields are extracted; the NEON run
; still expects one vld3.32. The run without NEON must not emit any vld3.
; NEON-LABEL: load_factor3:
; NEON: vld3.32 {d16, d17, d18}, [r0]
; NONEON-LABEL: load_factor3:
; NONEON-NOT: vld3
define <2 x i32> @load_factor3(i32* %ptr) {
  %base = bitcast i32* %ptr to <6 x i32>*
  %wide.vec = load <6 x i32>, <6 x i32>* %base, align 4
  ; Field 2 = lanes 2,5; field 1 = lanes 1,4. Field 0 is never requested.
  %strided.v2 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
  %strided.v1 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
  %add = add nsw <2 x i32> %strided.v2, %strided.v1
  ret <2 x i32> %add
}

; Factor-4 de-interleaving load of <16 x i32>.
; Each extracted field is a <4 x i32>; the NEON run expects a pair of vld4.32
; instructions, the first with address write-back. The run without NEON must
; not emit any vld4.
; NEON-LABEL: load_factor4:
; NEON: vld4.32 {d16, d18, d20, d22}, [r0]!
; NEON: vld4.32 {d17, d19, d21, d23}, [r0]
; NONEON-LABEL: load_factor4:
; NONEON-NOT: vld4
define <4 x i32> @load_factor4(i32* %ptr) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
  ; Field 0 = lanes 0,4,8,12; field 2 = lanes 2,6,10,14.
  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %add = add nsw <4 x i32> %strided.v0, %strided.v2
  ret <4 x i32> %add
}

; Factor-2 interleaving store of two <8 x i8> vectors.
; The shuffle alternates lanes of %v0 and %v1 before one wide store; the NEON
; run expects a single vst2.8. The run without NEON must not emit any vst2.
; NEON-LABEL: store_factor2:
; NEON: vst2.8 {d16, d17}, [r0]
; NONEON-LABEL: store_factor2:
; NONEON-NOT: vst2
define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
  ; Mask indices 0-7 select %v0 lanes, 8-15 select %v1 lanes: v0[0], v1[0], v0[1], v1[1], ...
  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
  ret void
}

; Factor-3 interleaving store of three <4 x i32> vectors.
; The inputs are first concatenated (with undef padding for %v2) and then
; interleaved with stride 3. The NEON run expects a pair of vst3.32
; instructions, the first with address write-back. The run without NEON must
; not emit any vst3 at all, so the negative check is not tied to one element
; size.
; NEON-LABEL: store_factor3:
; NEON: vst3.32 {d16, d18, d20}, [r0]!
; NEON: vst3.32 {d17, d19, d21}, [r0]
; NONEON-LABEL: store_factor3:
; NONEON-NOT: vst3
define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
  %base = bitcast i32* %ptr to <12 x i32>*
  ; Concatenate the operands so one two-input shuffle can address all of them.
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Indices 0-3 = %v0, 4-7 = %v1, 8-11 = %v2: v0[i], v1[i], v2[i] for i = 0..3.
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
  ret void
}

; Factor-4 interleaving store of four <4 x i32> vectors.
; The inputs are concatenated pairwise and then interleaved with stride 4. The
; NEON run expects a pair of vst4.32 instructions, the first with address
; write-back. The run without NEON must not emit any vst4.
; NEON-LABEL: store_factor4:
; NEON: vst4.32 {d16, d18, d20, d22}, [r0]!
; NEON: vst4.32 {d17, d19, d21, d23}, [r0]
; NONEON-LABEL: store_factor4:
; NONEON-NOT: vst4
define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
  %base = bitcast i32* %ptr to <16 x i32>*
  ; Concatenate the operands so one two-input shuffle can address all of them.
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Indices 0-3 = %v0, 4-7 = %v1, 8-11 = %v2, 12-15 = %v3.
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
  ret void
}

; The following cases test that interleaved accesses of pointer vectors can be
; matched to vldN/vstN instructions.

; Factor-2 de-interleaving load whose elements are pointers (<4 x i32*>).
; Only field 0 is used. The NEON run expects a vld2.32; the run without NEON
; must not emit any vld2.
; NEON-LABEL: load_ptrvec_factor2:
; NEON: vld2.32 {d16, d17}, [r0]
; NONEON-LABEL: load_ptrvec_factor2:
; NONEON-NOT: vld2
define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
  %base = bitcast i32** %ptr to <4 x i32*>*
  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
  ; Field 0 = lanes 0,2.
  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x i32*> %strided.v0
}

; Factor-3 de-interleaving load whose elements are pointers (<6 x i32*>).
; Fields 2 and 1 are extracted and stored to separate destinations. The NEON
; run expects a vld3.32; the run without NEON must not emit any vld3.
; NEON-LABEL: load_ptrvec_factor3:
; NEON: vld3.32 {d16, d17, d18}, [r0]
; NONEON-LABEL: load_ptrvec_factor3:
; NONEON-NOT: vld3
define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
  %base = bitcast i32** %ptr to <6 x i32*>*
  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
  ; Field 2 = lanes 2,5.
  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
  ; Field 1 = lanes 1,4.
  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
  ret void
}

; Factor-4 de-interleaving load whose elements are pointers (<8 x i32*>).
; Fields 1 and 3 are extracted and stored to separate destinations. The NEON
; run expects a vld4.32; the run without NEON must not emit any vld4.
; NEON-LABEL: load_ptrvec_factor4:
; NEON: vld4.32 {d16, d17, d18, d19}, [r0]
; NONEON-LABEL: load_ptrvec_factor4:
; NONEON-NOT: vld4
define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
  %base = bitcast i32** %ptr to <8 x i32*>*
  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
  ; Field 1 = lanes 1,5; field 3 = lanes 3,7.
  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
  ret void
}

; Factor-2 interleaving store of two pointer vectors (<2 x i32*> each).
; The NEON run expects a vst2.32; the run without NEON must not emit any vst2.
; NEON-LABEL: store_ptrvec_factor2:
; NEON: vst2.32 {d16, d17}, [r0]
; NONEON-LABEL: store_ptrvec_factor2:
; NONEON-NOT: vst2
define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
  %base = bitcast i32** %ptr to <4 x i32*>*
  ; Indices 0-1 = %v0, 2-3 = %v1: v0[0], v1[0], v0[1], v1[1].
  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
  ret void
}

; Factor-3 interleaving store of three pointer vectors (<2 x i32*> each).
; The NEON run expects a vst3.32; the run without NEON must not emit any vst3.
; NEON-LABEL: store_ptrvec_factor3:
; NEON: vst3.32 {d16, d17, d18}, [r0]
; NONEON-LABEL: store_ptrvec_factor3:
; NONEON-NOT: vst3
define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
  %base = bitcast i32** %ptr to <6 x i32*>*
  ; Concatenate the operands (padding %v2 with undef) for the final shuffle.
  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; Indices 0-1 = %v0, 2-3 = %v1, 4-5 = %v2.
  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
  ret void
}

; Factor-4 interleaving store of four pointer vectors (<2 x i32*> each).
; The NEON run expects a vst4.32; the run without NEON must not emit any vst4.
; NOTE(review): %ptr is typed i32* here while the sibling pointer-vector store
; tests take i32**; the bitcast below makes the difference irrelevant to the
; store, so the signature is left as it is.
; NEON-LABEL: store_ptrvec_factor4:
; NEON: vst4.32 {d16, d17, d18, d19}, [r0]
; NONEON-LABEL: store_ptrvec_factor4:
; NONEON-NOT: vst4
define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
  %base = bitcast i32* %ptr to <8 x i32*>*
  ; Concatenate the operands pairwise for the final shuffle.
  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Indices 0-1 = %v0, 2-3 = %v1, 4-5 = %v2, 6-7 = %v3.
  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
  ret void
}

; The following cases check that shuffle masks with undef indices can be
; matched to vldN/vstN instructions.

; Factor-2 de-interleaving load where some mask lanes are undef.
; The defined lanes still follow the stride-2 pattern. The NEON run expects a
; vld2.32; the run without NEON must not emit any vld2.
; NEON-LABEL: load_undef_mask_factor2:
; NEON: vld2.32 {d16, d17, d18, d19}, [r0]
; NONEON-LABEL: load_undef_mask_factor2:
; NONEON-NOT: vld2
define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
  %base = bitcast i32* %ptr to <8 x i32>*
  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
  ; Full masks would be <0,2,4,6> and <1,3,5,7>; lanes 0 and 2 are left undef.
  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
  %add = add nsw <4 x i32> %strided.v0, %strided.v1
  ret <4 x i32> %add
}

; Factor-3 de-interleaving load where one mask has only its first lane defined.
; The NEON run expects a pair of vld3.32 instructions, the first with address
; write-back. The run without NEON must not emit any vld3.
; NEON-LABEL: load_undef_mask_factor3:
; NEON: vld3.32 {d16, d18, d20}, [r0]!
; NEON: vld3.32 {d17, d19, d21}, [r0]
; NONEON-LABEL: load_undef_mask_factor3:
; NONEON-NOT: vld3
define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
  %base = bitcast i32* %ptr to <12 x i32>*
  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
  ; Field 2 with full mask <2,5,8,11>; only the leading index is defined.
  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  ; Field 1 = lanes 1,4,7,10 (fully defined).
  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %add = add nsw <4 x i32> %strided.v2, %strided.v1
  ret <4 x i32> %add
}

; Factor-4 de-interleaving load where the trailing mask lanes are undef.
; The NEON run expects a pair of vld4.32 instructions, the first with address
; write-back. The run without NEON must not emit any vld4.
; NEON-LABEL: load_undef_mask_factor4:
; NEON: vld4.32 {d16, d18, d20, d22}, [r0]!
; NEON: vld4.32 {d17, d19, d21, d23}, [r0]
; NONEON-LABEL: load_undef_mask_factor4:
; NONEON-NOT: vld4
define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
  %base = bitcast i32* %ptr to <16 x i32>*
  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
  ; Full masks would be <0,4,8,12> and <2,6,10,14>; the last two lanes are undef.
  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
  %add = add nsw <4 x i32> %strided.v0, %strided.v2
  ret <4 x i32> %add
}

; Factor-2 interleaving store where the first half of the mask is undef.
; The NEON run expects a vst2.32; the run without NEON must not emit any vst2.
; NEON-LABEL: store_undef_mask_factor2:
; NEON: vst2.32 {d16, d17, d18, d19}, [r0]
; NONEON-LABEL: store_undef_mask_factor2:
; NONEON-NOT: vst2
define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
  %base = bitcast i32* %ptr to <8 x i32>*
  ; Full mask would be <0,4,1,5,2,6,3,7>; the first four lanes are undef.
  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
  ret void
}

; Factor-3 interleaving store where two lanes of the interleave mask are undef.
; The NEON run expects a pair of vst3.32 instructions, the first with address
; write-back. The run without NEON must not emit any vst3.
; NEON-LABEL: store_undef_mask_factor3:
; NEON: vst3.32 {d16, d18, d20}, [r0]!
; NEON: vst3.32 {d17, d19, d21}, [r0]
; NONEON-LABEL: store_undef_mask_factor3:
; NONEON-NOT: vst3
define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
  %base = bitcast i32* %ptr to <12 x i32>*
  ; Concatenate the operands (padding %v2 with undef) for the final shuffle.
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Full mask would be <0,4,8,1,5,9,...>; the lanes that would hold 8 and 5 are undef.
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
  ret void
}

; Factor-4 interleaving store where two lanes of the interleave mask are undef.
; The NEON run expects a pair of vst4.32 instructions, the first with address
; write-back. The run without NEON must not emit any vst4.
; NEON-LABEL: store_undef_mask_factor4:
; NEON: vst4.32 {d16, d18, d20, d22}, [r0]!
; NEON: vst4.32 {d17, d19, d21, d23}, [r0]
; NONEON-LABEL: store_undef_mask_factor4:
; NONEON-NOT: vst4
define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
  %base = bitcast i32* %ptr to <16 x i32>*
  ; Concatenate the operands pairwise for the final shuffle.
  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Full mask would be <0,4,8,12,1,5,9,13,...>; the lanes that would hold 12 and 1 are undef.
  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
  ret void
}

; The following test cases check that address spaces are properly handled.

; De-interleaving load through a pointer in addrspace(1).
; The mask <0, 3> is a stride-3 pattern; the NEON run expects a vld3.32 even
; though the source pointer is not in the default address space. The run
; without NEON must not emit any vld3. The label checks carry the trailing
; colon like every other test in this file, so they anchor on the function's
; label line.
; NEON-LABEL: load_address_space:
; NEON: vld3.32
; NONEON-LABEL: load_address_space:
; NONEON-NOT: vld3
define void @load_address_space(<4 x i32> addrspace(1)* %A, <2 x i32>* %B) {
 %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %A
 %interleaved = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 3>
 store <2 x i32> %interleaved, <2 x i32>* %B
 ret void
}

; Factor-2 interleaving store through a pointer in addrspace(1).
; The NEON run expects a vst2.32 even though the destination pointer is not in
; the default address space. The run without NEON must not emit any vst2. The
; label checks carry the trailing colon like every other test in this file, so
; they anchor on the function's label line.
; NEON-LABEL: store_address_space:
; NEON: vst2.32
; NONEON-LABEL: store_address_space:
; NONEON-NOT: vst2
define void @store_address_space(<2 x i32>* %A, <2 x i32>* %B, <4 x i32> addrspace(1)* %C) {
 %tmp0 = load <2 x i32>, <2 x i32>* %A
 %tmp1 = load <2 x i32>, <2 x i32>* %B
 ; Indices 0-1 = %tmp0, 2-3 = %tmp1: tmp0[0], tmp1[0], tmp0[1], tmp1[1].
 %interleaved = shufflevector <2 x i32> %tmp0, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 store <4 x i32> %interleaved, <4 x i32> addrspace(1)* %C
 ret void
}

; Check that we do something sane with illegal types.

; Stride-2 shaped shuffle of a <3 x float> load.
; The checks pin the whole instruction sequence for both runs: with NEON an
; ordinary vld1.64 followed by vuzp.32 (no vld2), and without NEON two scalar
; loads of elements 0 and 2.
; NEON-LABEL: load_illegal_factor2:
; NEON: BB#0:
; NEON-NEXT: vld1.64 {d16, d17}, [r0:128]
; NEON-NEXT: vuzp.32 q8, {{.*}}
; NEON-NEXT: vmov r0, r1, d16
; NEON-NEXT: vmov r2, r3, {{.*}}
; NEON-NEXT: mov pc, lr
; NONEON-LABEL: load_illegal_factor2:
; NONEON: BB#0:
; NONEON-NEXT: ldr [[ELT0:r[0-9]+]], [r0]
; NONEON-NEXT: ldr r1, [r0, #8]
; NONEON-NEXT: mov r0, [[ELT0]]
; NONEON-NEXT: mov pc, lr
define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind {
  %tmp1 = load <3 x float>, <3 x float>* %p, align 16
  ; Selects elements 0 and 2; the third result lane is undef.
  %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
  ret <3 x float> %tmp2
}

; This lowering isn't great, but it's at least correct.

; Stride-2 shaped shuffle of a <3 x float> value followed by a store.
; The checks pin the whole instruction sequence for both runs: with NEON a
; vuzp.32 followed by a plain vstr (no vst2), and without NEON a single stm of
; the two selected elements.
; NEON-LABEL: store_illegal_factor2:
; NEON: BB#0:
; NEON-NEXT: vldr d17, [sp]
; NEON-NEXT: vmov d16, r2, r3
; NEON-NEXT: vuzp.32 q8, {{.*}}
; NEON-NEXT: vstr d16, [r0]
; NEON-NEXT: mov pc, lr
; NONEON-LABEL: store_illegal_factor2:
; NONEON: BB#0:
; NONEON-NEXT: stm r0, {r1, r3}
; NONEON-NEXT: mov pc, lr
define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
  ; Selects elements 0 and 2 of %v; the third stored lane is undef.
  %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
  store <3 x float> %tmp1, <3 x float>* %p, align 16
  ret void
}

; Factor-2 de-interleaving load whose wide value also has an extractelement
; user. The NEON run expects the load to still become a vld2.32, with the
; scalar read out of the de-interleaved result (d16[1]). The run without NEON
; must not emit any vld2.
; NEON-LABEL: load_factor2_with_extract_user:
; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64]
; NEON: vmov.32 r0, d16[1]
; NONEON-LABEL: load_factor2_with_extract_user:
; NONEON-NOT: vld2
define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
  %1 = load <8 x i32>, <8 x i32>* %a, align 8
  ; %2 has no direct IR user, but it is the stride-2 shuffle the vld2 check
  ; depends on; do not remove it as dead code.
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; Wide lane 2 is lane 1 of the field-0 shuffle above.
  %3 = extractelement <8 x i32> %1, i32 2
  ret i32 %3
}
