1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
3
4; i32
5
; Interleaving store of two <2 x i32> vectors: loads %src[0] and %src[1],
; zips their lanes (a0, b0, a1, b1) and stores the <4 x i32> result to %dst.
; The assertions below expect scalar ldrd loads, lane/register moves and one
; vstrw.32 for this width, not a vst20/vst21 pair.
6define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) {
7; CHECK-LABEL: vst2_v2i32:
8; CHECK:       @ %bb.0: @ %entry
9; CHECK-NEXT:    ldrd r2, r12, [r0]
10; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
11; CHECK-NEXT:    vmov.32 q0[0], r2
12; CHECK-NEXT:    vmov.32 q1[0], r3
13; CHECK-NEXT:    vmov.32 q0[2], r12
14; CHECK-NEXT:    vmov.f64 d4, d1
15; CHECK-NEXT:    vmov.32 q1[2], r0
16; CHECK-NEXT:    vmov.f32 s9, s3
17; CHECK-NEXT:    vmov.f32 s2, s4
18; CHECK-NEXT:    vmov.f32 s3, s5
19; CHECK-NEXT:    vmov.f32 s10, s6
20; CHECK-NEXT:    vmov.f32 s1, s2
21; CHECK-NEXT:    vmov.f32 s11, s7
22; CHECK-NEXT:    vmov.f32 s2, s8
23; CHECK-NEXT:    vmov.f32 s3, s10
24; CHECK-NEXT:    vstrw.32 q0, [r1]
25; CHECK-NEXT:    bx lr
26entry:
27  %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
28  %l1 = load <2 x i32>, <2 x i32>* %s1, align 4
29  %s2 = getelementptr <2 x i32>, <2 x i32>* %src, i32 1
30  %l2 = load <2 x i32>, <2 x i32>* %s2, align 4
  ; Mask indices 0-1 select from %l1 and 2-3 from %l2, alternating the sources.
31  %s = shufflevector <2 x i32> %l1, <2 x i32> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
32  store <4 x i32> %s, <4 x i32> *%dst
33  ret void
34}
35
; Interleaving store of two <4 x i32> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <8 x i32> result to %dst.
; The assertions below expect two vldrw.u32 loads feeding one
; vst20.32/vst21.32 pair.
36define void @vst2_v4i32(<4 x i32> *%src, <8 x i32> *%dst) {
37; CHECK-LABEL: vst2_v4i32:
38; CHECK:       @ %bb.0: @ %entry
39; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
40; CHECK-NEXT:    vldrw.u32 q0, [r0]
41; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
42; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
43; CHECK-NEXT:    bx lr
44entry:
45  %s1 = getelementptr <4 x i32>, <4 x i32>* %src, i32 0
46  %l1 = load <4 x i32>, <4 x i32>* %s1, align 4
47  %s2 = getelementptr <4 x i32>, <4 x i32>* %src, i32 1
48  %l2 = load <4 x i32>, <4 x i32>* %s2, align 4
  ; Mask indices 0-3 select from %l1 and 4-7 from %l2, alternating the sources.
49  %s = shufflevector <4 x i32> %l1, <4 x i32> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
50  store <8 x i32> %s, <8 x i32> *%dst
51  ret void
52}
53
; Interleaving store of two <8 x i32> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <16 x i32> result to %dst.
; The assertions below expect four vldrw.u32 loads and two vst20.32/vst21.32
; pairs, with the first vst21.32 using the writeback form ([r1]!).
54define void @vst2_v8i32(<8 x i32> *%src, <16 x i32> *%dst) {
55; CHECK-LABEL: vst2_v8i32:
56; CHECK:       @ %bb.0: @ %entry
57; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
58; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
59; CHECK-NEXT:    vldrw.u32 q0, [r0]
60; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
61; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
62; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]!
63; CHECK-NEXT:    vst20.32 {q2, q3}, [r1]
64; CHECK-NEXT:    vst21.32 {q2, q3}, [r1]
65; CHECK-NEXT:    bx lr
66entry:
67  %s1 = getelementptr <8 x i32>, <8 x i32>* %src, i32 0
68  %l1 = load <8 x i32>, <8 x i32>* %s1, align 4
69  %s2 = getelementptr <8 x i32>, <8 x i32>* %src, i32 1
70  %l2 = load <8 x i32>, <8 x i32>* %s2, align 4
  ; Mask indices 0-7 select from %l1 and 8-15 from %l2, alternating the sources.
71  %s = shufflevector <8 x i32> %l1, <8 x i32> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
72  store <16 x i32> %s, <16 x i32> *%dst
73  ret void
74}
75
; Interleaving store of two <16 x i32> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <32 x i32> result to %dst.
; The assertions below expect eight vldrw.u32 loads and four vst20.32/vst21.32
; pairs; all of q0-q7 are in use, so d8-d15 are saved and restored with
; vpush/vpop around the body.
76define void @vst2_v16i32(<16 x i32> *%src, <32 x i32> *%dst) {
77; CHECK-LABEL: vst2_v16i32:
78; CHECK:       @ %bb.0: @ %entry
79; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
80; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
81; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
82; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
83; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
84; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
85; CHECK-NEXT:    vldrw.u32 q6, [r0]
86; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
87; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
88; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
89; CHECK-NEXT:    vst20.32 {q6, q7}, [r1]
90; CHECK-NEXT:    add.w r0, r1, #96
91; CHECK-NEXT:    add.w r2, r1, #64
92; CHECK-NEXT:    vst21.32 {q6, q7}, [r1]!
93; CHECK-NEXT:    vst20.32 {q4, q5}, [r1]
94; CHECK-NEXT:    vst21.32 {q4, q5}, [r1]
95; CHECK-NEXT:    vst20.32 {q2, q3}, [r2]
96; CHECK-NEXT:    vst21.32 {q2, q3}, [r2]
97; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
98; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
99; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
100; CHECK-NEXT:    bx lr
101entry:
102  %s1 = getelementptr <16 x i32>, <16 x i32>* %src, i32 0
103  %l1 = load <16 x i32>, <16 x i32>* %s1, align 4
104  %s2 = getelementptr <16 x i32>, <16 x i32>* %src, i32 1
105  %l2 = load <16 x i32>, <16 x i32>* %s2, align 4
  ; Mask indices 0-15 select from %l1 and 16-31 from %l2, alternating the sources.
106  %s = shufflevector <16 x i32> %l1, <16 x i32> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
107  store <32 x i32> %s, <32 x i32> *%dst
108  ret void
109}
110
111; i16
112
; Interleaving store of two <2 x i16> vectors: loads %src[0] and %src[1],
; zips their lanes (a0, b0, a1, b1) and stores the <4 x i16> result to %dst.
; The assertions below expect four scalar ldrh loads, vmov.32 lane inserts and
; a single vstrh.32 store, not a vst20/vst21 pair.
113define void @vst2_v2i16(<2 x i16> *%src, <4 x i16> *%dst) {
114; CHECK-LABEL: vst2_v2i16:
115; CHECK:       @ %bb.0: @ %entry
116; CHECK-NEXT:    ldrh r3, [r0]
117; CHECK-NEXT:    ldrh r2, [r0, #4]
118; CHECK-NEXT:    vmov.32 q0[0], r3
119; CHECK-NEXT:    ldrh.w r12, [r0, #6]
120; CHECK-NEXT:    ldrh r0, [r0, #2]
121; CHECK-NEXT:    vmov.32 q0[1], r2
122; CHECK-NEXT:    vmov.32 q0[2], r0
123; CHECK-NEXT:    vmov.32 q0[3], r12
124; CHECK-NEXT:    vstrh.32 q0, [r1]
125; CHECK-NEXT:    bx lr
126entry:
127  %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
128  %l1 = load <2 x i16>, <2 x i16>* %s1, align 4
129  %s2 = getelementptr <2 x i16>, <2 x i16>* %src, i32 1
130  %l2 = load <2 x i16>, <2 x i16>* %s2, align 4
  ; Mask indices 0-1 select from %l1 and 2-3 from %l2, alternating the sources.
131  %s = shufflevector <2 x i16> %l1, <2 x i16> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
132  store <4 x i16> %s, <4 x i16> *%dst
133  ret void
134}
135
; Interleaving store of two <4 x i16> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <8 x i16> result to %dst.
; The assertions below expect two widening vldrh.u32 loads combined with
; vmovnt.i32 and written out with one vstrw.32, not a vst20/vst21 pair.
136define void @vst2_v4i16(<4 x i16> *%src, <8 x i16> *%dst) {
137; CHECK-LABEL: vst2_v4i16:
138; CHECK:       @ %bb.0: @ %entry
139; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
140; CHECK-NEXT:    vldrh.u32 q1, [r0]
141; CHECK-NEXT:    vmovnt.i32 q1, q0
142; CHECK-NEXT:    vstrw.32 q1, [r1]
143; CHECK-NEXT:    bx lr
144entry:
145  %s1 = getelementptr <4 x i16>, <4 x i16>* %src, i32 0
146  %l1 = load <4 x i16>, <4 x i16>* %s1, align 4
147  %s2 = getelementptr <4 x i16>, <4 x i16>* %src, i32 1
148  %l2 = load <4 x i16>, <4 x i16>* %s2, align 4
  ; Mask indices 0-3 select from %l1 and 4-7 from %l2, alternating the sources.
149  %s = shufflevector <4 x i16> %l1, <4 x i16> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
150  store <8 x i16> %s, <8 x i16> *%dst
151  ret void
152}
153
; Interleaving store of two <8 x i16> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <16 x i16> result to %dst.
; The assertions below expect two vldrw.u32 loads feeding one
; vst20.16/vst21.16 pair.
154define void @vst2_v8i16(<8 x i16> *%src, <16 x i16> *%dst) {
155; CHECK-LABEL: vst2_v8i16:
156; CHECK:       @ %bb.0: @ %entry
157; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
158; CHECK-NEXT:    vldrw.u32 q0, [r0]
159; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
160; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
161; CHECK-NEXT:    bx lr
162entry:
163  %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
164  %l1 = load <8 x i16>, <8 x i16>* %s1, align 4
165  %s2 = getelementptr <8 x i16>, <8 x i16>* %src, i32 1
166  %l2 = load <8 x i16>, <8 x i16>* %s2, align 4
  ; Mask indices 0-7 select from %l1 and 8-15 from %l2, alternating the sources.
167  %s = shufflevector <8 x i16> %l1, <8 x i16> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
168  store <16 x i16> %s, <16 x i16> *%dst
169  ret void
170}
171
; Interleaving store of two <16 x i16> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <32 x i16> result to %dst.
; The assertions below expect four vldrw.u32 loads and two vst20.16/vst21.16
; pairs, with the first vst21.16 using the writeback form ([r1]!).
172define void @vst2_v16i16(<16 x i16> *%src, <32 x i16> *%dst) {
173; CHECK-LABEL: vst2_v16i16:
174; CHECK:       @ %bb.0: @ %entry
175; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
176; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
177; CHECK-NEXT:    vldrw.u32 q0, [r0]
178; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
179; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
180; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]!
181; CHECK-NEXT:    vst20.16 {q2, q3}, [r1]
182; CHECK-NEXT:    vst21.16 {q2, q3}, [r1]
183; CHECK-NEXT:    bx lr
184entry:
185  %s1 = getelementptr <16 x i16>, <16 x i16>* %src, i32 0
186  %l1 = load <16 x i16>, <16 x i16>* %s1, align 4
187  %s2 = getelementptr <16 x i16>, <16 x i16>* %src, i32 1
188  %l2 = load <16 x i16>, <16 x i16>* %s2, align 4
  ; Mask indices 0-15 select from %l1 and 16-31 from %l2, alternating the sources.
189  %s = shufflevector <16 x i16> %l1, <16 x i16> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
190  store <32 x i16> %s, <32 x i16> *%dst
191  ret void
192}
193
194; i8
195
; Interleaving store of two <2 x i8> vectors: loads %src[0] and %src[1],
; zips their lanes (a0, b0, a1, b1) and stores the <4 x i8> result to %dst.
; The assertions below expect four scalar ldrb loads, vmov.32 lane inserts and
; a single vstrb.32 store, not a vst20/vst21 pair.
196define void @vst2_v2i8(<2 x i8> *%src, <4 x i8> *%dst) {
197; CHECK-LABEL: vst2_v2i8:
198; CHECK:       @ %bb.0: @ %entry
199; CHECK-NEXT:    ldrb r2, [r0]
200; CHECK-NEXT:    ldrb r3, [r0, #2]
201; CHECK-NEXT:    vmov.32 q0[0], r2
202; CHECK-NEXT:    ldrb.w r12, [r0, #1]
203; CHECK-NEXT:    vmov.32 q0[1], r3
204; CHECK-NEXT:    ldrb r0, [r0, #3]
205; CHECK-NEXT:    vmov.32 q0[2], r12
206; CHECK-NEXT:    vmov.32 q0[3], r0
207; CHECK-NEXT:    vstrb.32 q0, [r1]
208; CHECK-NEXT:    bx lr
209entry:
210  %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0
211  %l1 = load <2 x i8>, <2 x i8>* %s1, align 4
212  %s2 = getelementptr <2 x i8>, <2 x i8>* %src, i32 1
213  %l2 = load <2 x i8>, <2 x i8>* %s2, align 4
  ; Mask indices 0-1 select from %l1 and 2-3 from %l2, alternating the sources.
214  %s = shufflevector <2 x i8> %l1, <2 x i8> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
215  store <4 x i8> %s, <4 x i8> *%dst
216  ret void
217}
218
; Interleaving store of two <4 x i8> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <8 x i8> result to %dst.
; The assertions below expect two widening vldrb.u32 loads combined with
; vmovnt.i32 and written out with one vstrb.16, not a vst20/vst21 pair.
219define void @vst2_v4i8(<4 x i8> *%src, <8 x i8> *%dst) {
220; CHECK-LABEL: vst2_v4i8:
221; CHECK:       @ %bb.0: @ %entry
222; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
223; CHECK-NEXT:    vldrb.u32 q1, [r0]
224; CHECK-NEXT:    vmovnt.i32 q1, q0
225; CHECK-NEXT:    vstrb.16 q1, [r1]
226; CHECK-NEXT:    bx lr
227entry:
228  %s1 = getelementptr <4 x i8>, <4 x i8>* %src, i32 0
229  %l1 = load <4 x i8>, <4 x i8>* %s1, align 4
230  %s2 = getelementptr <4 x i8>, <4 x i8>* %src, i32 1
231  %l2 = load <4 x i8>, <4 x i8>* %s2, align 4
  ; Mask indices 0-3 select from %l1 and 4-7 from %l2, alternating the sources.
232  %s = shufflevector <4 x i8> %l1, <4 x i8> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
233  store <8 x i8> %s, <8 x i8> *%dst
234  ret void
235}
236
; Interleaving store of two <8 x i8> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <16 x i8> result to %dst.
; The assertions below expect two widening vldrb.u16 loads combined with
; vmovnt.i16 and written out with one vstrw.32, not a vst20/vst21 pair.
237define void @vst2_v8i8(<8 x i8> *%src, <16 x i8> *%dst) {
238; CHECK-LABEL: vst2_v8i8:
239; CHECK:       @ %bb.0: @ %entry
240; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
241; CHECK-NEXT:    vldrb.u16 q1, [r0]
242; CHECK-NEXT:    vmovnt.i16 q1, q0
243; CHECK-NEXT:    vstrw.32 q1, [r1]
244; CHECK-NEXT:    bx lr
245entry:
246  %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
247  %l1 = load <8 x i8>, <8 x i8>* %s1, align 4
248  %s2 = getelementptr <8 x i8>, <8 x i8>* %src, i32 1
249  %l2 = load <8 x i8>, <8 x i8>* %s2, align 4
  ; Mask indices 0-7 select from %l1 and 8-15 from %l2, alternating the sources.
250  %s = shufflevector <8 x i8> %l1, <8 x i8> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
251  store <16 x i8> %s, <16 x i8> *%dst
252  ret void
253}
254
; Interleaving store of two <16 x i8> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <32 x i8> result to %dst.
; The assertions below expect two vldrw.u32 loads feeding one
; vst20.8/vst21.8 pair.
255define void @vst2_v16i8(<16 x i8> *%src, <32 x i8> *%dst) {
256; CHECK-LABEL: vst2_v16i8:
257; CHECK:       @ %bb.0: @ %entry
258; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
259; CHECK-NEXT:    vldrw.u32 q0, [r0]
260; CHECK-NEXT:    vst20.8 {q0, q1}, [r1]
261; CHECK-NEXT:    vst21.8 {q0, q1}, [r1]
262; CHECK-NEXT:    bx lr
263entry:
264  %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0
265  %l1 = load <16 x i8>, <16 x i8>* %s1, align 4
266  %s2 = getelementptr <16 x i8>, <16 x i8>* %src, i32 1
267  %l2 = load <16 x i8>, <16 x i8>* %s2, align 4
  ; Mask indices 0-15 select from %l1 and 16-31 from %l2, alternating the sources.
268  %s = shufflevector <16 x i8> %l1, <16 x i8> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
269  store <32 x i8> %s, <32 x i8> *%dst
270  ret void
271}
272
273; i64
274
; Interleaving store of two <2 x i64> vectors: loads %src[0] and %src[1],
; zips their lanes (a0, b0, a1, b1) and stores the <4 x i64> result to %dst.
; The assertions below expect two vldrw.u32 loads, register moves
; (vmov.f64/vmov.f32) and two plain vector stores; no vst20/vst21 pair is
; expected for 64-bit elements.
275define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
276; CHECK-LABEL: vst2_v2i64:
277; CHECK:       @ %bb.0: @ %entry
278; CHECK-NEXT:    vldrw.u32 q1, [r0]
279; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
280; CHECK-NEXT:    vmov.f64 d4, d2
281; CHECK-NEXT:    vmov.f32 s9, s5
282; CHECK-NEXT:    vmov.f32 s10, s0
283; CHECK-NEXT:    vmov.f32 s11, s1
284; CHECK-NEXT:    vmov.f32 s0, s6
285; CHECK-NEXT:    vstrb.8 q2, [r1], #16
286; CHECK-NEXT:    vmov.f32 s1, s7
287; CHECK-NEXT:    vstrw.32 q0, [r1]
288; CHECK-NEXT:    bx lr
289entry:
290  %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
291  %l1 = load <2 x i64>, <2 x i64>* %s1, align 4
292  %s2 = getelementptr <2 x i64>, <2 x i64>* %src, i32 1
293  %l2 = load <2 x i64>, <2 x i64>* %s2, align 4
  ; Mask indices 0-1 select from %l1 and 2-3 from %l2, alternating the sources.
294  %s = shufflevector <2 x i64> %l1, <2 x i64> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
295  store <4 x i64> %s, <4 x i64> *%dst
296  ret void
297}
298
; Interleaving store of two <4 x i64> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <8 x i64> result to %dst.
; The assertions below expect four vldrw.u32 loads, register moves
; (vmov.f64/vmov.f32) and four plain vector stores, with d8-d11 saved and
; restored; no vst20/vst21 pair is expected for 64-bit elements.
299define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) {
300; CHECK-LABEL: vst2_v4i64:
301; CHECK:       @ %bb.0: @ %entry
302; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
303; CHECK-NEXT:    vpush {d8, d9, d10, d11}
304; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
305; CHECK-NEXT:    vldrw.u32 q1, [r0]
306; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
307; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
308; CHECK-NEXT:    vmov.f64 d6, d1
309; CHECK-NEXT:    vmov.f64 d10, d3
310; CHECK-NEXT:    vmov.f32 s13, s3
311; CHECK-NEXT:    vmov.f32 s21, s7
312; CHECK-NEXT:    vmov.f32 s2, s16
313; CHECK-NEXT:    vmov.f32 s6, s8
314; CHECK-NEXT:    vmov.f32 s14, s18
315; CHECK-NEXT:    vmov.f32 s22, s10
316; CHECK-NEXT:    vmov.f32 s3, s17
317; CHECK-NEXT:    vmov.f32 s7, s9
318; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
319; CHECK-NEXT:    vmov.f32 s15, s19
320; CHECK-NEXT:    vstrb.8 q1, [r1], #48
321; CHECK-NEXT:    vmov.f32 s23, s11
322; CHECK-NEXT:    vstrw.32 q3, [r1]
323; CHECK-NEXT:    vstrw.32 q5, [r1, #-32]
324; CHECK-NEXT:    vpop {d8, d9, d10, d11}
325; CHECK-NEXT:    bx lr
326entry:
327  %s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
328  %l1 = load <4 x i64>, <4 x i64>* %s1, align 4
329  %s2 = getelementptr <4 x i64>, <4 x i64>* %src, i32 1
330  %l2 = load <4 x i64>, <4 x i64>* %s2, align 4
  ; Mask indices 0-3 select from %l1 and 4-7 from %l2, alternating the sources.
331  %s = shufflevector <4 x i64> %l1, <4 x i64> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
332  store <8 x i64> %s, <8 x i64> *%dst
333  ret void
334}
335
336; f32
337
; Interleaving store of two <2 x float> vectors: loads %src[0] and %src[1],
; zips their lanes (a0, b0, a1, b1) and stores the <4 x float> result to %dst.
; The assertions below expect four scalar vldr loads, two vmov.f32 moves and
; one vstrw.32, not a vst20/vst21 pair.
338define void @vst2_v2f32(<2 x float> *%src, <4 x float> *%dst) {
339; CHECK-LABEL: vst2_v2f32:
340; CHECK:       @ %bb.0: @ %entry
341; CHECK-NEXT:    vldr s0, [r0]
342; CHECK-NEXT:    vldr s4, [r0, #4]
343; CHECK-NEXT:    vldr s1, [r0, #8]
344; CHECK-NEXT:    vldr s5, [r0, #12]
345; CHECK-NEXT:    vmov.f32 s2, s4
346; CHECK-NEXT:    vmov.f32 s3, s5
347; CHECK-NEXT:    vstrw.32 q0, [r1]
348; CHECK-NEXT:    bx lr
349entry:
350  %s1 = getelementptr <2 x float>, <2 x float>* %src, i32 0
351  %l1 = load <2 x float>, <2 x float>* %s1, align 4
352  %s2 = getelementptr <2 x float>, <2 x float>* %src, i32 1
353  %l2 = load <2 x float>, <2 x float>* %s2, align 4
  ; Mask indices 0-1 select from %l1 and 2-3 from %l2, alternating the sources.
354  %s = shufflevector <2 x float> %l1, <2 x float> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
355  store <4 x float> %s, <4 x float> *%dst
356  ret void
357}
358
; Interleaving store of two <4 x float> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <8 x float> result to %dst.
; The assertions below expect two vldrw.u32 loads feeding one
; vst20.32/vst21.32 pair.
359define void @vst2_v4f32(<4 x float> *%src, <8 x float> *%dst) {
360; CHECK-LABEL: vst2_v4f32:
361; CHECK:       @ %bb.0: @ %entry
362; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
363; CHECK-NEXT:    vldrw.u32 q0, [r0]
364; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
365; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]
366; CHECK-NEXT:    bx lr
367entry:
368  %s1 = getelementptr <4 x float>, <4 x float>* %src, i32 0
369  %l1 = load <4 x float>, <4 x float>* %s1, align 4
370  %s2 = getelementptr <4 x float>, <4 x float>* %src, i32 1
371  %l2 = load <4 x float>, <4 x float>* %s2, align 4
  ; Mask indices 0-3 select from %l1 and 4-7 from %l2, alternating the sources.
372  %s = shufflevector <4 x float> %l1, <4 x float> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
373  store <8 x float> %s, <8 x float> *%dst
374  ret void
375}
376
; Interleaving store of two <8 x float> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <16 x float> result to %dst.
; The assertions below expect four vldrw.u32 loads and two vst20.32/vst21.32
; pairs, with the first vst21.32 using the writeback form ([r1]!).
377define void @vst2_v8f32(<8 x float> *%src, <16 x float> *%dst) {
378; CHECK-LABEL: vst2_v8f32:
379; CHECK:       @ %bb.0: @ %entry
380; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
381; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
382; CHECK-NEXT:    vldrw.u32 q0, [r0]
383; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
384; CHECK-NEXT:    vst20.32 {q0, q1}, [r1]
385; CHECK-NEXT:    vst21.32 {q0, q1}, [r1]!
386; CHECK-NEXT:    vst20.32 {q2, q3}, [r1]
387; CHECK-NEXT:    vst21.32 {q2, q3}, [r1]
388; CHECK-NEXT:    bx lr
389entry:
390  %s1 = getelementptr <8 x float>, <8 x float>* %src, i32 0
391  %l1 = load <8 x float>, <8 x float>* %s1, align 4
392  %s2 = getelementptr <8 x float>, <8 x float>* %src, i32 1
393  %l2 = load <8 x float>, <8 x float>* %s2, align 4
  ; Mask indices 0-7 select from %l1 and 8-15 from %l2, alternating the sources.
394  %s = shufflevector <8 x float> %l1, <8 x float> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
395  store <16 x float> %s, <16 x float> *%dst
396  ret void
397}
398
; Interleaving store of two <16 x float> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <32 x float> result to %dst.
; The assertions below expect eight vldrw.u32 loads and four vst20.32/vst21.32
; pairs; all of q0-q7 are in use, so d8-d15 are saved and restored with
; vpush/vpop around the body.
399define void @vst2_v16f32(<16 x float> *%src, <32 x float> *%dst) {
400; CHECK-LABEL: vst2_v16f32:
401; CHECK:       @ %bb.0: @ %entry
402; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
403; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
404; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
405; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
406; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
407; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
408; CHECK-NEXT:    vldrw.u32 q6, [r0]
409; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
410; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
411; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
412; CHECK-NEXT:    vst20.32 {q6, q7}, [r1]
413; CHECK-NEXT:    add.w r0, r1, #96
414; CHECK-NEXT:    add.w r2, r1, #64
415; CHECK-NEXT:    vst21.32 {q6, q7}, [r1]!
416; CHECK-NEXT:    vst20.32 {q4, q5}, [r1]
417; CHECK-NEXT:    vst21.32 {q4, q5}, [r1]
418; CHECK-NEXT:    vst20.32 {q2, q3}, [r2]
419; CHECK-NEXT:    vst21.32 {q2, q3}, [r2]
420; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
421; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
422; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
423; CHECK-NEXT:    bx lr
424entry:
425  %s1 = getelementptr <16 x float>, <16 x float>* %src, i32 0
426  %l1 = load <16 x float>, <16 x float>* %s1, align 4
427  %s2 = getelementptr <16 x float>, <16 x float>* %src, i32 1
428  %l2 = load <16 x float>, <16 x float>* %s2, align 4
  ; Mask indices 0-15 select from %l1 and 16-31 from %l2, alternating the sources.
429  %s = shufflevector <16 x float> %l1, <16 x float> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
430  store <32 x float> %s, <32 x float> *%dst
431  ret void
432}
433
434; f16
435
; Interleaving store of two <2 x half> vectors: loads %src[0] and %src[1],
; zips their lanes (a0, b0, a1, b1) and stores the <4 x half> result to %dst.
; The assertions below expect a scalar ldrd load, per-lane extraction and
; insertion (vmovx.f16 / vmov.16) and a scalar strd store, not a vst20/vst21
; pair.
436define void @vst2_v2f16(<2 x half> *%src, <4 x half> *%dst) {
437; CHECK-LABEL: vst2_v2f16:
438; CHECK:       @ %bb.0: @ %entry
439; CHECK-NEXT:    ldrd r2, r0, [r0]
440; CHECK-NEXT:    vmov.32 q0[0], r2
441; CHECK-NEXT:    vmov.32 q1[0], r0
442; CHECK-NEXT:    vmov r2, s0
443; CHECK-NEXT:    vmovx.f16 s0, s0
444; CHECK-NEXT:    vmov r0, s4
445; CHECK-NEXT:    vmov.16 q2[0], r2
446; CHECK-NEXT:    vmov.16 q2[1], r0
447; CHECK-NEXT:    vmov r0, s0
448; CHECK-NEXT:    vmovx.f16 s0, s4
449; CHECK-NEXT:    vmov.16 q2[2], r0
450; CHECK-NEXT:    vmov r0, s0
451; CHECK-NEXT:    vmov.16 q2[3], r0
452; CHECK-NEXT:    vmov r2, s9
453; CHECK-NEXT:    vmov r0, s8
454; CHECK-NEXT:    strd r0, r2, [r1]
455; CHECK-NEXT:    bx lr
456entry:
457  %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
458  %l1 = load <2 x half>, <2 x half>* %s1, align 4
459  %s2 = getelementptr <2 x half>, <2 x half>* %src, i32 1
460  %l2 = load <2 x half>, <2 x half>* %s2, align 4
  ; Mask indices 0-1 select from %l1 and 2-3 from %l2, alternating the sources.
461  %s = shufflevector <2 x half> %l1, <2 x half> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
462  store <4 x half> %s, <4 x half> *%dst
463  ret void
464}
465
; Interleaving store of two <4 x half> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <8 x half> result to %dst.
; The assertions below expect scalar ldm/ldr loads, per-lane extraction and
; insertion (vmovx.f16 / vmov.16) into q1 and one vstrw.32, not a vst20/vst21
; pair.
466define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) {
467; CHECK-LABEL: vst2_v4f16:
468; CHECK:       @ %bb.0: @ %entry
469; CHECK-NEXT:    ldm.w r0, {r2, r3, r12}
470; CHECK-NEXT:    vmov.32 q0[0], r12
471; CHECK-NEXT:    ldr r0, [r0, #12]
472; CHECK-NEXT:    vmov.32 q2[0], r2
473; CHECK-NEXT:    vmov.32 q2[1], r3
474; CHECK-NEXT:    vmov.32 q0[1], r0
475; CHECK-NEXT:    vmov r2, s8
476; CHECK-NEXT:    vmovx.f16 s12, s8
477; CHECK-NEXT:    vmov r0, s0
478; CHECK-NEXT:    vmov.16 q1[0], r2
479; CHECK-NEXT:    vmov.16 q1[1], r0
480; CHECK-NEXT:    vmov r0, s12
481; CHECK-NEXT:    vmovx.f16 s12, s0
482; CHECK-NEXT:    vmov.16 q1[2], r0
483; CHECK-NEXT:    vmov r0, s12
484; CHECK-NEXT:    vmovx.f16 s8, s9
485; CHECK-NEXT:    vmov.16 q1[3], r0
486; CHECK-NEXT:    vmov r0, s9
487; CHECK-NEXT:    vmov.16 q1[4], r0
488; CHECK-NEXT:    vmov r0, s1
489; CHECK-NEXT:    vmov.16 q1[5], r0
490; CHECK-NEXT:    vmov r0, s8
491; CHECK-NEXT:    vmovx.f16 s0, s1
492; CHECK-NEXT:    vmov.16 q1[6], r0
493; CHECK-NEXT:    vmov r0, s0
494; CHECK-NEXT:    vmov.16 q1[7], r0
495; CHECK-NEXT:    vstrw.32 q1, [r1]
496; CHECK-NEXT:    bx lr
497entry:
498  %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
499  %l1 = load <4 x half>, <4 x half>* %s1, align 4
500  %s2 = getelementptr <4 x half>, <4 x half>* %src, i32 1
501  %l2 = load <4 x half>, <4 x half>* %s2, align 4
  ; Mask indices 0-3 select from %l1 and 4-7 from %l2, alternating the sources.
502  %s = shufflevector <4 x half> %l1, <4 x half> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
503  store <8 x half> %s, <8 x half> *%dst
504  ret void
505}
506
; Interleaving store of two <8 x half> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <16 x half> result to %dst.
; The assertions below expect two vldrw.u32 loads feeding one
; vst20.16/vst21.16 pair.
507define void @vst2_v8f16(<8 x half> *%src, <16 x half> *%dst) {
508; CHECK-LABEL: vst2_v8f16:
509; CHECK:       @ %bb.0: @ %entry
510; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
511; CHECK-NEXT:    vldrw.u32 q0, [r0]
512; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
513; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
514; CHECK-NEXT:    bx lr
515entry:
516  %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
517  %l1 = load <8 x half>, <8 x half>* %s1, align 4
518  %s2 = getelementptr <8 x half>, <8 x half>* %src, i32 1
519  %l2 = load <8 x half>, <8 x half>* %s2, align 4
  ; Mask indices 0-7 select from %l1 and 8-15 from %l2, alternating the sources.
520  %s = shufflevector <8 x half> %l1, <8 x half> %l2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
521  store <16 x half> %s, <16 x half> *%dst
522  ret void
523}
524
; Interleaving store of two <16 x half> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <32 x half> result to %dst.
; The assertions below expect four vldrw.u32 loads and two vst20.16/vst21.16
; pairs, with the first vst21.16 using the writeback form ([r1]!).
525define void @vst2_v16f16(<16 x half> *%src, <32 x half> *%dst) {
526; CHECK-LABEL: vst2_v16f16:
527; CHECK:       @ %bb.0: @ %entry
528; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
529; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
530; CHECK-NEXT:    vldrw.u32 q2, [r0]
531; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
532; CHECK-NEXT:    vst20.16 {q2, q3}, [r1]
533; CHECK-NEXT:    vst21.16 {q2, q3}, [r1]!
534; CHECK-NEXT:    vst20.16 {q0, q1}, [r1]
535; CHECK-NEXT:    vst21.16 {q0, q1}, [r1]
536; CHECK-NEXT:    bx lr
537entry:
538  %s1 = getelementptr <16 x half>, <16 x half>* %src, i32 0
539  %l1 = load <16 x half>, <16 x half>* %s1, align 4
540  %s2 = getelementptr <16 x half>, <16 x half>* %src, i32 1
541  %l2 = load <16 x half>, <16 x half>* %s2, align 4
  ; Mask indices 0-15 select from %l1 and 16-31 from %l2, alternating the sources.
542  %s = shufflevector <16 x half> %l1, <16 x half> %l2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
543  store <32 x half> %s, <32 x half> *%dst
544  ret void
545}
546
547; f64
548
; Interleaving store of two <2 x double> vectors: loads %src[0] and %src[1],
; zips their lanes (a0, b0, a1, b1) and stores the <4 x double> result to %dst.
; The assertions below expect two vldrw.u32 loads, three vmov.f64 D-register
; moves and two vstrw.32 stores; no vst20/vst21 pair is expected for 64-bit
; elements.
549define void @vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) {
550; CHECK-LABEL: vst2_v2f64:
551; CHECK:       @ %bb.0: @ %entry
552; CHECK-NEXT:    vldrw.u32 q1, [r0]
553; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
554; CHECK-NEXT:    vmov.f64 d4, d3
555; CHECK-NEXT:    vmov.f64 d5, d1
556; CHECK-NEXT:    vmov.f64 d3, d0
557; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
558; CHECK-NEXT:    vstrw.32 q1, [r1]
559; CHECK-NEXT:    bx lr
560entry:
561  %s1 = getelementptr <2 x double>, <2 x double>* %src, i32 0
562  %l1 = load <2 x double>, <2 x double>* %s1, align 4
563  %s2 = getelementptr <2 x double>, <2 x double>* %src, i32 1
564  %l2 = load <2 x double>, <2 x double>* %s2, align 4
  ; Mask indices 0-1 select from %l1 and 2-3 from %l2, alternating the sources.
565  %s = shufflevector <2 x double> %l1, <2 x double> %l2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
566  store <4 x double> %s, <4 x double> *%dst
567  ret void
568}
569
; Interleaving store of two <4 x double> vectors: loads %src[0] and %src[1],
; zips their lanes and stores the <8 x double> result to %dst.
; The assertions below expect four vldrw.u32 loads, vmov.f64 D-register moves
; and four vstrw.32 stores, with d8-d9 saved and restored; no vst20/vst21 pair
; is expected for 64-bit elements.
570define void @vst2_v4f64(<4 x double> *%src, <8 x double> *%dst) {
571; CHECK-LABEL: vst2_v4f64:
572; CHECK:       @ %bb.0: @ %entry
573; CHECK-NEXT:    .vsave {d8, d9}
574; CHECK-NEXT:    vpush {d8, d9}
575; CHECK-NEXT:    vldrw.u32 q2, [r0]
576; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
577; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
578; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
579; CHECK-NEXT:    vmov.f64 d8, d4
580; CHECK-NEXT:    vmov.f64 d9, d0
581; CHECK-NEXT:    vmov.f64 d0, d5
582; CHECK-NEXT:    vstrw.32 q4, [r1]
583; CHECK-NEXT:    vmov.f64 d4, d6
584; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
585; CHECK-NEXT:    vmov.f64 d5, d2
586; CHECK-NEXT:    vmov.f64 d2, d7
587; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
588; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
589; CHECK-NEXT:    vpop {d8, d9}
590; CHECK-NEXT:    bx lr
591entry:
592  %s1 = getelementptr <4 x double>, <4 x double>* %src, i32 0
593  %l1 = load <4 x double>, <4 x double>* %s1, align 4
594  %s2 = getelementptr <4 x double>, <4 x double>* %src, i32 1
595  %l2 = load <4 x double>, <4 x double>* %s2, align 4
  ; Mask indices 0-3 select from %l1 and 4-7 from %l2, alternating the sources.
596  %s = shufflevector <4 x double> %l1, <4 x double> %l2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
597  store <8 x double> %s, <8 x double> *%dst
598  ret void
599}
600