1; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON
2; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true -mattr=-neon < %s | FileCheck %s -check-prefix=NONEON
3
4; NEON-LABEL: load_factor2:
5; NEON: ld2 { v0.8b, v1.8b }, [x0]
6; NONEON-LABEL: load_factor2:
7; NONEON-NOT: ld2
; Factor-2 interleaved load: one <16 x i8> load is split by two shuffles into
; its even-indexed and odd-indexed elements, which are then added together.
; The checks above expect a single ld2 when NEON is available and no ld2 in
; the run that passes -mattr=-neon.
8define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
9  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
  ; Even elements (0, 2, ..., 14) of the wide vector.
10  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ; Odd elements (1, 3, ..., 15) of the wide vector.
11  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
12  %add = add nsw <8 x i8> %strided.v0, %strided.v1
13  ret <8 x i8> %add
14}
15
16; NEON-LABEL: load_factor3:
17; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
18; NONEON-LABEL: load_factor3:
19; NONEON-NOT: ld3
; Factor-3 interleaved load in which only two of the three strided vectors are
; used (elements 2,5,8,11 and 1,4,7,10 of a <12 x i32> load). The checks above
; still expect a full three-register ld3 when NEON is available.
20define <4 x i32> @load_factor3(i32* %ptr) {
21  %base = bitcast i32* %ptr to <12 x i32>*
22  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
  ; Every third element starting at index 2.
23  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  ; Every third element starting at index 1.
24  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
25  %add = add nsw <4 x i32> %strided.v2, %strided.v1
26  ret <4 x i32> %add
27}
28
29; NEON-LABEL: load_factor4:
30; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
31; NONEON-LABEL: load_factor4:
32; NONEON-NOT: ld4
; Factor-4 interleaved load in which only two of the four strided vectors are
; used (elements 0,4,8,12 and 2,6,10,14 of a <16 x i32> load). The checks
; above expect a four-register ld4 when NEON is available.
33define <4 x i32> @load_factor4(i32* %ptr) {
34  %base = bitcast i32* %ptr to <16 x i32>*
35  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
  ; Every fourth element starting at index 0.
36  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  ; Every fourth element starting at index 2.
37  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
38  %add = add nsw <4 x i32> %strided.v0, %strided.v2
39  ret <4 x i32> %add
40}
41
42; NEON-LABEL: store_factor2:
43; NEON: st2 { v0.8b, v1.8b }, [x0]
44; NONEON-LABEL: store_factor2:
45; NONEON-NOT: st2
; Factor-2 interleaved store: a single shuffle interleaves %v0 and %v1 element
; by element and the <16 x i8> result is stored. The checks above expect one
; st2 when NEON is available and none in the -mattr=-neon run.
46define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
  ; Mask indices 0-7 pick from %v0 and 8-15 from %v1, giving
  ; v0[0], v1[0], v0[1], v1[1], ...
47  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
48  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
49  ret void
50}
51
52; NEON-LABEL: store_factor3:
53; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0]
54; NONEON-LABEL: store_factor3:
55; NONEON-NOT: st3
; Factor-3 interleaved store: %v0, %v1 and %v2 are first concatenated into two
; <8 x i32> values (%v2 padded with undef lanes) and then interleaved into a
; <12 x i32> vector that is stored. The checks above expect a single st3 when
; NEON is available.
56define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
57  %base = bitcast i32* %ptr to <12 x i32>*
  ; Concatenation of %v0 and %v1.
58  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; %v2 widened to 8 lanes; the upper 4 lanes are undef.
59  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Indices 0-3 are %v0, 4-7 are %v1, 8-11 are %v2: v0[i], v1[i], v2[i] per group.
60  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
61  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
62  ret void
63}
64
65; NEON-LABEL: store_factor4:
66; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
67; NONEON-LABEL: store_factor4:
68; NONEON-NOT: st4
; Factor-4 interleaved store: the four <4 x i32> inputs are concatenated
; pairwise and then interleaved into a <16 x i32> vector that is stored. The
; checks above expect a single st4 when NEON is available.
69define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
70  %base = bitcast i32* %ptr to <16 x i32>*
  ; Concatenation of %v0 and %v1.
71  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Concatenation of %v2 and %v3.
72  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Indices 0-3/4-7/8-11/12-15 are %v0/%v1/%v2/%v3: v0[i], v1[i], v2[i], v3[i] per group.
73  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
74  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
75  ret void
76}
77
78; The following cases test that interleaved accesses of pointer vectors can be
79; matched to ldN/stN instructions.
80
81; NEON-LABEL: load_ptrvec_factor2:
82; NEON: ld2 { v0.2d, v1.2d }, [x0]
83; NONEON-LABEL: load_ptrvec_factor2:
84; NONEON-NOT: ld2
; Factor-2 interleaved load of pointer elements: only the even-indexed
; elements (0 and 2) of a <4 x i32*> load are used. The checks above expect an
; ld2 on .2d registers when NEON is available.
85define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
86  %base = bitcast i32** %ptr to <4 x i32*>*
87  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
88  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
89  ret <2 x i32*> %strided.v0
90}
91
92; NEON-LABEL: load_ptrvec_factor3:
93; NEON: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
94; NONEON-LABEL: load_ptrvec_factor3:
95; NONEON-NOT: ld3
; Factor-3 interleaved load of pointer elements: elements 2,5 and 1,4 of a
; <6 x i32*> load are extracted and written to %ptr1 and %ptr2 respectively.
; The checks above expect an ld3 on .2d registers when NEON is available.
96define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
97  %base = bitcast i32** %ptr to <6 x i32*>*
98  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
  ; Every third element starting at index 2.
99  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
100  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
  ; Every third element starting at index 1.
101  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
102  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
103  ret void
104}
105
106; NEON-LABEL: load_ptrvec_factor4:
107; NEON: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
108; NONEON-LABEL: load_ptrvec_factor4:
109; NONEON-NOT: ld4
; Factor-4 interleaved load of pointer elements: elements 1,5 and 3,7 of an
; <8 x i32*> load are extracted and written to %ptr1 and %ptr2 respectively.
; The checks above expect an ld4 on .2d registers when NEON is available.
110define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
111  %base = bitcast i32** %ptr to <8 x i32*>*
112  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
  ; Every fourth element starting at index 1.
113  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
  ; Every fourth element starting at index 3.
114  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
115  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
116  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
117  ret void
118}
119
120; NEON-LABEL: store_ptrvec_factor2:
121; NEON: st2 { v0.2d, v1.2d }, [x0]
122; NONEON-LABEL: store_ptrvec_factor2:
123; NONEON-NOT: st2
; Factor-2 interleaved store of pointer elements: %v0 and %v1 are interleaved
; into a <4 x i32*> vector and stored. The checks above expect an st2 on .2d
; registers when NEON is available.
124define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
125  %base = bitcast i32** %ptr to <4 x i32*>*
  ; Indices 0-1 are %v0 and 2-3 are %v1: v0[0], v1[0], v0[1], v1[1].
126  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
127  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
128  ret void
129}
130
131; NEON-LABEL: store_ptrvec_factor3:
132; NEON: st3 { v0.2d, v1.2d, v2.2d }, [x0]
133; NONEON-LABEL: store_ptrvec_factor3:
134; NONEON-NOT: st3
; Factor-3 interleaved store of pointer elements: %v0, %v1 and %v2 (padded
; with undef lanes) are concatenated and then interleaved into a <6 x i32*>
; vector that is stored. The checks above expect an st3 on .2d registers when
; NEON is available.
135define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
136  %base = bitcast i32** %ptr to <6 x i32*>*
  ; Concatenation of %v0 and %v1.
137  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; %v2 widened to 4 lanes; the upper 2 lanes are undef.
138  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ; Indices 0-1/2-3/4-5 are %v0/%v1/%v2: v0[i], v1[i], v2[i] per group.
139  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
140  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
141  ret void
142}
143
144; NEON-LABEL: store_ptrvec_factor4:
145; NEON: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
146; NONEON-LABEL: store_ptrvec_factor4:
147; NONEON-NOT: st4
; Factor-4 interleaved store of pointer elements: the four <2 x i32*> inputs
; are concatenated pairwise and then interleaved into an <8 x i32*> vector
; that is stored. The checks above expect an st4 on .2d registers when NEON is
; available.
; NOTE(review): %ptr is declared i32* here, while the other pointer-vector
; tests in this file take i32**; the bitcast below makes either form valid, so
; this looks harmless -- confirm it is intentional.
148define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
149  %base = bitcast i32* %ptr to <8 x i32*>*
  ; Concatenation of %v0 and %v1.
150  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Concatenation of %v2 and %v3.
151  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Indices 0-1/2-3/4-5/6-7 are %v0/%v1/%v2/%v3: v0[i], v1[i], v2[i], v3[i] per group.
152  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
153  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
154  ret void
155}
156
157; The following cases check that shuffle masks with undef indices can be matched
158; into ldN/stN instructions.
159
160; NEON-LABEL: load_undef_mask_factor2:
161; NEON: ld2 { v0.4s, v1.4s }, [x0]
162; NONEON-LABEL: load_undef_mask_factor2:
163; NONEON-NOT: ld2
; Factor-2 interleaved load whose two shuffle masks each have undef in lanes 0
; and 2; the defined lanes still select stride-2 elements (2,6 and 3,7). The
; checks above expect an ld2 when NEON is available.
164define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
165  %base = bitcast i32* %ptr to <8 x i32>*
166  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
167  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
168  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
169  %add = add nsw <4 x i32> %strided.v0, %strided.v1
170  ret <4 x i32> %add
171}
172
173; NEON-LABEL: load_undef_mask_factor3:
174; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
175; NONEON-LABEL: load_undef_mask_factor3:
176; NONEON-NOT: ld3
; Factor-3 interleaved load where the first shuffle mask defines only lane 0
; (index 2) and leaves the other three lanes undef, while the second mask is
; fully defined (1,4,7,10). The checks above expect an ld3 when NEON is
; available.
177define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
178  %base = bitcast i32* %ptr to <12 x i32>*
179  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
180  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
181  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
182  %add = add nsw <4 x i32> %strided.v2, %strided.v1
183  ret <4 x i32> %add
184}
185
186; NEON-LABEL: load_undef_mask_factor4:
187; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
188; NONEON-LABEL: load_undef_mask_factor4:
189; NONEON-NOT: ld4
; Factor-4 interleaved load whose two shuffle masks define only lanes 0 and 1
; (indices 0,4 and 2,6) and leave lanes 2 and 3 undef. The checks above expect
; an ld4 when NEON is available.
190define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
191  %base = bitcast i32* %ptr to <16 x i32>*
192  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
193  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
194  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
195  %add = add nsw <4 x i32> %strided.v0, %strided.v2
196  ret <4 x i32> %add
197}
198
199; NEON-LABEL: store_undef_mask_factor2:
200; NEON: st2 { v0.4s, v1.4s }, [x0]
201; NONEON-LABEL: store_undef_mask_factor2:
202; NONEON-NOT: st2
; Factor-2 interleaved store whose interleave mask leaves its first four lanes
; undef; the remaining lanes (2,6,3,7) select v0[2], v1[2], v0[3], v1[3]. The
; checks above expect an st2 when NEON is available.
203define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
204  %base = bitcast i32* %ptr to <8 x i32>*
205  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
206  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
207  ret void
208}
209
210; NEON-LABEL: store_undef_mask_factor3:
211; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0]
212; NONEON-LABEL: store_undef_mask_factor3:
213; NONEON-NOT: st3
; Factor-3 interleaved store built like store_factor3, except that the final
; interleave mask has undef at positions 2 and 4. The checks above expect an
; st3 when NEON is available.
214define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
215  %base = bitcast i32* %ptr to <12 x i32>*
  ; Concatenation of %v0 and %v1.
216  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; %v2 widened to 8 lanes; the upper 4 lanes are undef.
217  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Interleave mask with two undef entries (positions 2 and 4).
218  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
219  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
220  ret void
221}
222
223; NEON-LABEL: store_undef_mask_factor4:
224; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
225; NONEON-LABEL: store_undef_mask_factor4:
226; NONEON-NOT: st4
; Factor-4 interleaved store built like store_factor4, except that the final
; interleave mask has undef at positions 3 and 4. The checks above expect an
; st4 when NEON is available.
227define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
228  %base = bitcast i32* %ptr to <16 x i32>*
  ; Concatenation of %v0 and %v1.
229  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Concatenation of %v2 and %v3.
230  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Interleave mask with two undef entries (positions 3 and 4).
231  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
232  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
233  ret void
234}
235
236; Check that we do something sane with illegal types.
237
238; NEON-LABEL: load_illegal_factor2:
239; NEON: BB#0:
240; NEON-NEXT: ldr q[[V:[0-9]+]], [x0]
241; NEON-NEXT: uzp1 v0.4s, v[[V]].4s, v{{.*}}.4s
242; NEON-NEXT: ret
243; NONEON-LABEL: load_illegal_factor2:
244; NONEON: BB#0:
245; NONEON-NEXT: ldr s0, [x0]
246; NONEON-NEXT: ldr s1, [x0, #8]
247; NONEON-NEXT: ret
; Stride-2 shuffle (mask 0, 2, undef) of a <3 x float> load. The checks above
; pin the exact instruction sequence instead of an ldN: a plain ldr plus uzp1
; when NEON is available, and two scalar ldr instructions in the -mattr=-neon
; run.
248define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind {
249  %tmp1 = load <3 x float>, <3 x float>* %p, align 16
250  %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
251  ret <3 x float> %tmp2
252}
253
254; NEON-LABEL: store_illegal_factor2:
255; NEON: BB#0:
256; NEON-NEXT: uzp1 v0.4s, v0.4s, v{{.*}}.4s
257; NEON-NEXT: st1 { v0.d }[0], [x0]
258; NEON-NEXT: ret
259; NONEON-LABEL: store_illegal_factor2:
260; NONEON: BB#0:
261; NONEON-NEXT: fmov w[[ELT2:[0-9]+]], s2
262; NONEON-NEXT: fmov w[[RES:[0-9]+]], s0
263; NONEON-NEXT: bfi x[[RES]], x[[ELT2]], #32, #32
264; NONEON-NEXT: str x[[RES]], [x0]
265; NONEON-NEXT: ret
; Stride-2 shuffle (mask 0, 2, undef) of a <3 x float> argument followed by a
; store of the result. The checks above pin the exact instruction sequence
; instead of an stN: uzp1 plus st1 when NEON is available, and an fmov/bfi/str
; sequence in the -mattr=-neon run.
266define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
267  %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
268  store <3 x float> %tmp1, <3 x float>* %p, align 16
269  ret void
270}
271
272; NEON-LABEL: load_factor2_with_extract_user:
273; NEON: ld2 { v0.4s, v1.4s }, [x0]
274; NEON: mov w0, v0.s[1]
275; NONEON-LABEL: load_factor2_with_extract_user:
276; NONEON-NOT: ld2
; Factor-2 interleaved load where the wide load has an extractelement user in
; addition to the strided shuffle. The checks above expect an ld2 when NEON is
; available, with the extracted value read from lane 1 of v0.
277define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
278  %1 = load <8 x i32>, <8 x i32>* %a, align 8
  ; Even elements of the wide vector; this result is not otherwise used.
279  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; Element 2 of the wide vector sits at position 1 of the even-element mask above.
280  %3 = extractelement <8 x i32> %1, i32 2
281  ret i32 %3
282}
283