1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
3
4; i32
5
6define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) {
7; CHECK-LABEL: vld3_v2i32:
8; CHECK:       @ %bb.0: @ %entry
9; CHECK-NEXT:    vldrw.u32 q0, [r0]
10; CHECK-NEXT:    ldrd r12, r3, [r0, #16]
11; CHECK-NEXT:    vmov r0, s1
12; CHECK-NEXT:    vmov r2, s0
13; CHECK-NEXT:    vmov.f64 d2, d0
14; CHECK-NEXT:    vmov.f32 s6, s3
15; CHECK-NEXT:    add r0, r2
16; CHECK-NEXT:    vmov r2, s2
17; CHECK-NEXT:    add r0, r2
18; CHECK-NEXT:    vmov r2, s6
19; CHECK-NEXT:    add r2, r12
20; CHECK-NEXT:    add r2, r3
21; CHECK-NEXT:    strd r0, r2, [r1]
22; CHECK-NEXT:    bx lr
23entry:
24  %l1 = load <6 x i32>, <6 x i32>* %src, align 4
25  %s1 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
26  %s2 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
27  %s3 = shufflevector <6 x i32> %l1, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
28  %a1 = add <2 x i32> %s1, %s2
29  %a = add <2 x i32> %a1, %s3
30  store <2 x i32> %a, <2 x i32> *%dst
31  ret void
32}
33
34define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) {
35; CHECK-LABEL: vld3_v4i32:
36; CHECK:       @ %bb.0: @ %entry
37; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
38; CHECK-NEXT:    vpush {d8, d9, d10, d11}
39; CHECK-NEXT:    vldrw.u32 q1, [r0]
40; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
41; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
42; CHECK-NEXT:    vmov.f32 s12, s5
43; CHECK-NEXT:    vmov.f32 s13, s0
44; CHECK-NEXT:    vmov r0, s10
45; CHECK-NEXT:    vdup.32 q4, r0
46; CHECK-NEXT:    vmov.f32 s14, s3
47; CHECK-NEXT:    vmov.f32 s15, s19
48; CHECK-NEXT:    vmov.f64 d8, d2
49; CHECK-NEXT:    vmov.f32 s17, s7
50; CHECK-NEXT:    vmov r0, s9
51; CHECK-NEXT:    vmov.f32 s18, s2
52; CHECK-NEXT:    vdup.32 q5, r0
53; CHECK-NEXT:    vmov.f32 s0, s6
54; CHECK-NEXT:    vmov.f32 s19, s23
55; CHECK-NEXT:    vmov.f32 s10, s8
56; CHECK-NEXT:    vadd.i32 q3, q4, q3
57; CHECK-NEXT:    vmov.f32 s2, s8
58; CHECK-NEXT:    vmov.f32 s3, s11
59; CHECK-NEXT:    vadd.i32 q0, q3, q0
60; CHECK-NEXT:    vstrw.32 q0, [r1]
61; CHECK-NEXT:    vpop {d8, d9, d10, d11}
62; CHECK-NEXT:    bx lr
63entry:
64  %l1 = load <12 x i32>, <12 x i32>* %src, align 4
65  %s1 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
66  %s2 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
67  %s3 = shufflevector <12 x i32> %l1, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
68  %a1 = add <4 x i32> %s1, %s2
69  %a = add <4 x i32> %a1, %s3
70  store <4 x i32> %a, <4 x i32> *%dst
71  ret void
72}
73
74define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) {
75; CHECK-LABEL: vld3_v8i32:
76; CHECK:       @ %bb.0: @ %entry
77; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
78; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
79; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
80; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
81; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
82; CHECK-NEXT:    vmov.f32 s12, s5
83; CHECK-NEXT:    vmov.f32 s13, s0
84; CHECK-NEXT:    vmov r2, s10
85; CHECK-NEXT:    vdup.32 q4, r2
86; CHECK-NEXT:    vmov.f32 s14, s3
87; CHECK-NEXT:    vmov.f32 s15, s19
88; CHECK-NEXT:    vmov.f64 d8, d2
89; CHECK-NEXT:    vmov.f32 s17, s7
90; CHECK-NEXT:    vmov r2, s9
91; CHECK-NEXT:    vmov.f32 s18, s2
92; CHECK-NEXT:    vdup.32 q5, r2
93; CHECK-NEXT:    vmov.f32 s0, s6
94; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
95; CHECK-NEXT:    vmov.f32 s19, s23
96; CHECK-NEXT:    vmov.f32 s10, s8
97; CHECK-NEXT:    vadd.i32 q3, q4, q3
98; CHECK-NEXT:    vmov.f32 s2, s8
99; CHECK-NEXT:    vmov.f32 s3, s11
100; CHECK-NEXT:    vldrw.u32 q2, [r0]
101; CHECK-NEXT:    vadd.i32 q0, q3, q0
102; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
103; CHECK-NEXT:    vmov.f32 s16, s9
104; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
105; CHECK-NEXT:    vmov.f32 s17, s4
106; CHECK-NEXT:    vmov r0, s14
107; CHECK-NEXT:    vdup.32 q5, r0
108; CHECK-NEXT:    vmov.f32 s18, s7
109; CHECK-NEXT:    vmov.f32 s19, s23
110; CHECK-NEXT:    vmov.f64 d10, d4
111; CHECK-NEXT:    vmov.f32 s21, s11
112; CHECK-NEXT:    vmov r0, s13
113; CHECK-NEXT:    vmov.f32 s22, s6
114; CHECK-NEXT:    vdup.32 q6, r0
115; CHECK-NEXT:    vmov.f32 s4, s10
116; CHECK-NEXT:    vmov.f32 s23, s27
117; CHECK-NEXT:    vmov.f32 s14, s12
118; CHECK-NEXT:    vadd.i32 q4, q5, q4
119; CHECK-NEXT:    vmov.f32 s6, s12
120; CHECK-NEXT:    vmov.f32 s7, s15
121; CHECK-NEXT:    vadd.i32 q1, q4, q1
122; CHECK-NEXT:    vstrw.32 q1, [r1]
123; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
124; CHECK-NEXT:    bx lr
125entry:
126  %l1 = load <24 x i32>, <24 x i32>* %src, align 4
127  %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
128  %s2 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
129  %s3 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
130  %a1 = add <8 x i32> %s1, %s2
131  %a = add <8 x i32> %a1, %s3
132  store <8 x i32> %a, <8 x i32> *%dst
133  ret void
134}
135
136define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) {
137; CHECK-LABEL: vld3_v16i32:
138; CHECK:       @ %bb.0: @ %entry
139; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
140; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
141; CHECK-NEXT:    .pad #16
142; CHECK-NEXT:    sub sp, #16
143; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
144; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
145; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
146; CHECK-NEXT:    vmov.f32 s12, s5
147; CHECK-NEXT:    vmov.f32 s13, s0
148; CHECK-NEXT:    vmov r2, s10
149; CHECK-NEXT:    vdup.32 q4, r2
150; CHECK-NEXT:    vmov.f32 s14, s3
151; CHECK-NEXT:    vmov.f32 s15, s19
152; CHECK-NEXT:    vmov.f64 d8, d2
153; CHECK-NEXT:    vmov.f32 s17, s7
154; CHECK-NEXT:    vmov r2, s9
155; CHECK-NEXT:    vmov.f32 s18, s2
156; CHECK-NEXT:    vdup.32 q5, r2
157; CHECK-NEXT:    vmov.f32 s0, s6
158; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
159; CHECK-NEXT:    vmov.f32 s19, s23
160; CHECK-NEXT:    vmov.f32 s10, s8
161; CHECK-NEXT:    vadd.i32 q3, q4, q3
162; CHECK-NEXT:    vmov.f32 s2, s8
163; CHECK-NEXT:    vmov.f32 s3, s11
164; CHECK-NEXT:    vldrw.u32 q2, [r0]
165; CHECK-NEXT:    vadd.i32 q0, q3, q0
166; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
167; CHECK-NEXT:    vmov.f32 s16, s9
168; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
169; CHECK-NEXT:    vmov.f32 s17, s4
170; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
171; CHECK-NEXT:    vmov r2, s14
172; CHECK-NEXT:    vdup.32 q5, r2
173; CHECK-NEXT:    vmov.f32 s18, s7
174; CHECK-NEXT:    vmov.f32 s19, s23
175; CHECK-NEXT:    vmov.f64 d10, d4
176; CHECK-NEXT:    vmov.f32 s21, s11
177; CHECK-NEXT:    vmov r2, s13
178; CHECK-NEXT:    vmov.f32 s22, s6
179; CHECK-NEXT:    vdup.32 q6, r2
180; CHECK-NEXT:    vmov.f32 s4, s10
181; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
182; CHECK-NEXT:    vmov.f32 s23, s27
183; CHECK-NEXT:    vmov.f32 s14, s12
184; CHECK-NEXT:    vadd.i32 q4, q5, q4
185; CHECK-NEXT:    vmov.f32 s6, s12
186; CHECK-NEXT:    vmov.f32 s7, s15
187; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
188; CHECK-NEXT:    vadd.i32 q1, q4, q1
189; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
190; CHECK-NEXT:    vmov.f32 s20, s13
191; CHECK-NEXT:    vmov.f32 s21, s8
192; CHECK-NEXT:    vmov r2, s18
193; CHECK-NEXT:    vdup.32 q6, r2
194; CHECK-NEXT:    vmov.f32 s22, s11
195; CHECK-NEXT:    vmov.f32 s23, s27
196; CHECK-NEXT:    vmov.f64 d12, d6
197; CHECK-NEXT:    vmov.f32 s25, s15
198; CHECK-NEXT:    vmov r2, s17
199; CHECK-NEXT:    vmov.f32 s26, s10
200; CHECK-NEXT:    vdup.32 q7, r2
201; CHECK-NEXT:    vmov.f32 s8, s14
202; CHECK-NEXT:    vmov.f32 s27, s31
203; CHECK-NEXT:    vmov.f32 s18, s16
204; CHECK-NEXT:    vadd.i32 q5, q6, q5
205; CHECK-NEXT:    vmov.f32 s10, s16
206; CHECK-NEXT:    vmov.f32 s11, s19
207; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
208; CHECK-NEXT:    vadd.i32 q2, q5, q2
209; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
210; CHECK-NEXT:    vmov.f32 s24, s17
211; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
212; CHECK-NEXT:    vmov.f32 s25, s20
213; CHECK-NEXT:    vstrw.32 q1, [r1]
214; CHECK-NEXT:    vmov.f64 d6, d8
215; CHECK-NEXT:    vmov r0, s2
216; CHECK-NEXT:    vmov.f32 s13, s19
217; CHECK-NEXT:    vdup.32 q7, r0
218; CHECK-NEXT:    vmov.f32 s26, s23
219; CHECK-NEXT:    vmov r0, s1
220; CHECK-NEXT:    vmov.f32 s27, s31
221; CHECK-NEXT:    vdup.32 q7, r0
222; CHECK-NEXT:    vmov.f32 s14, s22
223; CHECK-NEXT:    vmov.f32 s20, s18
224; CHECK-NEXT:    vmov.f32 s15, s31
225; CHECK-NEXT:    vmov.f32 s2, s0
226; CHECK-NEXT:    vadd.i32 q6, q3, q6
227; CHECK-NEXT:    vmov.f32 s22, s0
228; CHECK-NEXT:    vmov.f32 s23, s3
229; CHECK-NEXT:    vadd.i32 q0, q6, q5
230; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
231; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
232; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
233; CHECK-NEXT:    add sp, #16
234; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
235; CHECK-NEXT:    bx lr
236entry:
237  %l1 = load <48 x i32>, <48 x i32>* %src, align 4
238  %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
239  %s2 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
240  %s3 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
241  %a1 = add <16 x i32> %s1, %s2
242  %a = add <16 x i32> %a1, %s3
243  store <16 x i32> %a, <16 x i32> *%dst
244  ret void
245}
246
247; i16
248
249define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) {
250; CHECK-LABEL: vld3_v2i16:
251; CHECK:       @ %bb.0: @ %entry
252; CHECK-NEXT:    .pad #8
253; CHECK-NEXT:    sub sp, #8
254; CHECK-NEXT:    vldrh.u32 q0, [r0]
255; CHECK-NEXT:    ldr r2, [r0, #8]
256; CHECK-NEXT:    mov r3, sp
257; CHECK-NEXT:    str r2, [sp]
258; CHECK-NEXT:    vmov.f64 d2, d0
259; CHECK-NEXT:    vmov.f32 s6, s3
260; CHECK-NEXT:    vmov.f32 s8, s1
261; CHECK-NEXT:    vmov.f64 d6, d1
262; CHECK-NEXT:    vmov r0, s6
263; CHECK-NEXT:    vldrh.u32 q1, [r3]
264; CHECK-NEXT:    vmov.f32 s10, s4
265; CHECK-NEXT:    vmov.f32 s14, s5
266; CHECK-NEXT:    vmov r2, s10
267; CHECK-NEXT:    add r0, r2
268; CHECK-NEXT:    vmov r2, s14
269; CHECK-NEXT:    add r0, r2
270; CHECK-NEXT:    strh r0, [r1, #2]
271; CHECK-NEXT:    vmov r0, s8
272; CHECK-NEXT:    vmov r2, s0
273; CHECK-NEXT:    add r0, r2
274; CHECK-NEXT:    vmov r2, s12
275; CHECK-NEXT:    add r0, r2
276; CHECK-NEXT:    strh r0, [r1]
277; CHECK-NEXT:    add sp, #8
278; CHECK-NEXT:    bx lr
279entry:
280  %l1 = load <6 x i16>, <6 x i16>* %src, align 4
281  %s1 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
282  %s2 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
283  %s3 = shufflevector <6 x i16> %l1, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
284  %a1 = add <2 x i16> %s1, %s2
285  %a = add <2 x i16> %a1, %s3
286  store <2 x i16> %a, <2 x i16> *%dst
287  ret void
288}
289
290define void @vld3_v4i16(<12 x i16> *%src, <4 x i16> *%dst) {
291; CHECK-LABEL: vld3_v4i16:
292; CHECK:       @ %bb.0: @ %entry
293; CHECK-NEXT:    vldrw.u32 q0, [r0]
294; CHECK-NEXT:    vldrh.u32 q3, [r0, #16]
295; CHECK-NEXT:    vmov.u16 r2, q0[0]
296; CHECK-NEXT:    vmov r0, s14
297; CHECK-NEXT:    vmov.32 q1[0], r2
298; CHECK-NEXT:    vmov.u16 r2, q0[3]
299; CHECK-NEXT:    vmov.32 q1[1], r2
300; CHECK-NEXT:    vmov.u16 r2, q0[6]
301; CHECK-NEXT:    vmov.32 q1[2], r2
302; CHECK-NEXT:    vmov.u16 r2, q0[1]
303; CHECK-NEXT:    vmov.32 q2[0], r2
304; CHECK-NEXT:    vmov.u16 r2, q0[4]
305; CHECK-NEXT:    vmov.32 q2[1], r2
306; CHECK-NEXT:    vmov.u16 r2, q0[7]
307; CHECK-NEXT:    vmov.32 q2[2], r2
308; CHECK-NEXT:    vmov.32 q2[3], r0
309; CHECK-NEXT:    vmov r0, s13
310; CHECK-NEXT:    vmov.32 q1[3], r0
311; CHECK-NEXT:    vmov.u16 r0, q0[2]
312; CHECK-NEXT:    vadd.i32 q1, q1, q2
313; CHECK-NEXT:    vmov.32 q2[0], r0
314; CHECK-NEXT:    vmov.u16 r0, q0[5]
315; CHECK-NEXT:    vmov.32 q2[1], r0
316; CHECK-NEXT:    vmov r0, s12
317; CHECK-NEXT:    vmov.32 q2[2], r0
318; CHECK-NEXT:    vmov r0, s15
319; CHECK-NEXT:    vmov.32 q2[3], r0
320; CHECK-NEXT:    vadd.i32 q0, q1, q2
321; CHECK-NEXT:    vstrh.32 q0, [r1]
322; CHECK-NEXT:    bx lr
323entry:
324  %l1 = load <12 x i16>, <12 x i16>* %src, align 4
325  %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
326  %s2 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
327  %s3 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
328  %a1 = add <4 x i16> %s1, %s2
329  %a = add <4 x i16> %a1, %s3
330  store <4 x i16> %a, <4 x i16> *%dst
331  ret void
332}
333
334define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
335; CHECK-LABEL: vld3_v8i16:
336; CHECK:       @ %bb.0: @ %entry
337; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
338; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
339; CHECK-NEXT:    vldrw.u32 q1, [r0]
340; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
341; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
342; CHECK-NEXT:    vmov.u16 r2, q1[2]
343; CHECK-NEXT:    vmov.u16 r0, q3[4]
344; CHECK-NEXT:    vmov.16 q4[0], r2
345; CHECK-NEXT:    vmov.u16 r2, q1[5]
346; CHECK-NEXT:    vmov.16 q4[1], r2
347; CHECK-NEXT:    vmov.u16 r2, q2[0]
348; CHECK-NEXT:    vmov.16 q5[6], r0
349; CHECK-NEXT:    vmov.u16 r0, q3[7]
350; CHECK-NEXT:    vmov.16 q5[7], r0
351; CHECK-NEXT:    vmov.16 q4[2], r2
352; CHECK-NEXT:    vmov.u16 r2, q2[3]
353; CHECK-NEXT:    vmov.f32 s22, s12
354; CHECK-NEXT:    vmov.16 q4[3], r2
355; CHECK-NEXT:    vmov q6, q5
356; CHECK-NEXT:    vmov.f32 s18, s11
357; CHECK-NEXT:    vmov r2, s16
358; CHECK-NEXT:    vmovnb.i32 q6, q4
359; CHECK-NEXT:    vmov.32 q0[0], r2
360; CHECK-NEXT:    vmov r2, s17
361; CHECK-NEXT:    vmov.32 q0[1], r2
362; CHECK-NEXT:    vmov r0, s26
363; CHECK-NEXT:    vmov.32 q0[2], r0
364; CHECK-NEXT:    vmov r0, s23
365; CHECK-NEXT:    vmov.32 q0[3], r0
366; CHECK-NEXT:    vmov.u16 r0, q1[0]
367; CHECK-NEXT:    vmov.16 q4[0], r0
368; CHECK-NEXT:    vmov.u16 r0, q1[3]
369; CHECK-NEXT:    vmov.16 q4[1], r0
370; CHECK-NEXT:    vmov.u16 r0, q1[6]
371; CHECK-NEXT:    vmov.16 q4[2], r0
372; CHECK-NEXT:    vmov.u16 r0, q2[1]
373; CHECK-NEXT:    vmov.16 q4[3], r0
374; CHECK-NEXT:    vmov.u16 r0, q2[4]
375; CHECK-NEXT:    vmov.16 q4[4], r0
376; CHECK-NEXT:    vmov.u16 r0, q3[2]
377; CHECK-NEXT:    vmov.16 q5[6], r0
378; CHECK-NEXT:    vmov.u16 r0, q3[5]
379; CHECK-NEXT:    vmov.16 q5[7], r0
380; CHECK-NEXT:    vmov.u16 r0, q2[7]
381; CHECK-NEXT:    vmov.16 q4[5], r0
382; CHECK-NEXT:    vmov.u16 r0, q1[1]
383; CHECK-NEXT:    vmov.f32 s19, s23
384; CHECK-NEXT:    vmov.16 q5[0], r0
385; CHECK-NEXT:    vmov.u16 r0, q1[4]
386; CHECK-NEXT:    vmov.16 q5[1], r0
387; CHECK-NEXT:    vmov.u16 r0, q1[7]
388; CHECK-NEXT:    vmov.16 q5[2], r0
389; CHECK-NEXT:    vmov.u16 r0, q2[2]
390; CHECK-NEXT:    vmov.16 q5[3], r0
391; CHECK-NEXT:    vmov.u16 r0, q2[5]
392; CHECK-NEXT:    vmov.16 q5[4], r0
393; CHECK-NEXT:    vmov.u16 r0, q3[0]
394; CHECK-NEXT:    vmov.16 q1[5], r0
395; CHECK-NEXT:    vmov.u16 r0, q3[3]
396; CHECK-NEXT:    vmov.16 q1[6], r0
397; CHECK-NEXT:    vmov.u16 r0, q3[6]
398; CHECK-NEXT:    vmov.16 q1[7], r0
399; CHECK-NEXT:    vmov r0, s20
400; CHECK-NEXT:    vmov q2, q1
401; CHECK-NEXT:    vmov.32 q3[0], r0
402; CHECK-NEXT:    vmov r0, s21
403; CHECK-NEXT:    vmovnb.i32 q2, q5
404; CHECK-NEXT:    vmov.32 q3[1], r0
405; CHECK-NEXT:    vmov r0, s10
406; CHECK-NEXT:    vmov.32 q3[2], r0
407; CHECK-NEXT:    vmov r0, s7
408; CHECK-NEXT:    vmov.32 q3[3], r0
409; CHECK-NEXT:    vadd.i16 q1, q4, q3
410; CHECK-NEXT:    vadd.i16 q0, q1, q0
411; CHECK-NEXT:    vstrw.32 q0, [r1]
412; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
413; CHECK-NEXT:    bx lr
414entry:
415  %l1 = load <24 x i16>, <24 x i16>* %src, align 4
416  %s1 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
417  %s2 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
418  %s3 = shufflevector <24 x i16> %l1, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
419  %a1 = add <8 x i16> %s1, %s2
420  %a = add <8 x i16> %a1, %s3
421  store <8 x i16> %a, <8 x i16> *%dst
422  ret void
423}
424
425define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
426; CHECK-LABEL: vld3_v16i16:
427; CHECK:       @ %bb.0: @ %entry
428; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
429; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
430; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
431; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
432; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
433; CHECK-NEXT:    vmov.u16 r2, q1[2]
434; CHECK-NEXT:    vmov.16 q4[0], r2
435; CHECK-NEXT:    vmov.u16 r2, q1[5]
436; CHECK-NEXT:    vmov.16 q4[1], r2
437; CHECK-NEXT:    vmov.u16 r2, q2[0]
438; CHECK-NEXT:    vmov.16 q4[2], r2
439; CHECK-NEXT:    vmov.u16 r2, q2[3]
440; CHECK-NEXT:    vmov.16 q4[3], r2
441; CHECK-NEXT:    vmov.f32 s18, s11
442; CHECK-NEXT:    vmov r2, s16
443; CHECK-NEXT:    vmov.32 q0[0], r2
444; CHECK-NEXT:    vmov r2, s17
445; CHECK-NEXT:    vmov.32 q0[1], r2
446; CHECK-NEXT:    vmov.u16 r2, q3[4]
447; CHECK-NEXT:    vmov.16 q5[6], r2
448; CHECK-NEXT:    vmov.u16 r2, q3[7]
449; CHECK-NEXT:    vmov.16 q5[7], r2
450; CHECK-NEXT:    vmov.f32 s22, s12
451; CHECK-NEXT:    vmov q6, q5
452; CHECK-NEXT:    vmovnb.i32 q6, q4
453; CHECK-NEXT:    vmov r2, s26
454; CHECK-NEXT:    vmov.32 q0[2], r2
455; CHECK-NEXT:    vmov r2, s23
456; CHECK-NEXT:    vmov.32 q0[3], r2
457; CHECK-NEXT:    vmov.u16 r2, q1[0]
458; CHECK-NEXT:    vmov.16 q4[0], r2
459; CHECK-NEXT:    vmov.u16 r2, q1[3]
460; CHECK-NEXT:    vmov.16 q4[1], r2
461; CHECK-NEXT:    vmov.u16 r2, q1[6]
462; CHECK-NEXT:    vmov.16 q4[2], r2
463; CHECK-NEXT:    vmov.u16 r2, q2[1]
464; CHECK-NEXT:    vmov.16 q4[3], r2
465; CHECK-NEXT:    vmov.u16 r2, q2[4]
466; CHECK-NEXT:    vmov.16 q4[4], r2
467; CHECK-NEXT:    vmov.u16 r2, q3[2]
468; CHECK-NEXT:    vmov.16 q5[6], r2
469; CHECK-NEXT:    vmov.u16 r2, q3[5]
470; CHECK-NEXT:    vmov.16 q5[7], r2
471; CHECK-NEXT:    vmov.u16 r2, q2[7]
472; CHECK-NEXT:    vmov.16 q4[5], r2
473; CHECK-NEXT:    vmov.u16 r2, q1[1]
474; CHECK-NEXT:    vmov.f32 s19, s23
475; CHECK-NEXT:    vmov.16 q5[0], r2
476; CHECK-NEXT:    vmov.u16 r2, q1[4]
477; CHECK-NEXT:    vmov.16 q5[1], r2
478; CHECK-NEXT:    vmov.u16 r2, q1[7]
479; CHECK-NEXT:    vmov.16 q5[2], r2
480; CHECK-NEXT:    vmov.u16 r2, q2[2]
481; CHECK-NEXT:    vmov.16 q5[3], r2
482; CHECK-NEXT:    vmov.u16 r2, q2[5]
483; CHECK-NEXT:    vmov.16 q5[4], r2
484; CHECK-NEXT:    vmov.u16 r2, q3[0]
485; CHECK-NEXT:    vmov.16 q1[5], r2
486; CHECK-NEXT:    vmov.u16 r2, q3[3]
487; CHECK-NEXT:    vmov.16 q1[6], r2
488; CHECK-NEXT:    vmov.u16 r2, q3[6]
489; CHECK-NEXT:    vmov.16 q1[7], r2
490; CHECK-NEXT:    vmov r2, s20
491; CHECK-NEXT:    vmov q2, q1
492; CHECK-NEXT:    vmov.32 q3[0], r2
493; CHECK-NEXT:    vmov r2, s21
494; CHECK-NEXT:    vmovnb.i32 q2, q5
495; CHECK-NEXT:    vmov.32 q3[1], r2
496; CHECK-NEXT:    vmov r2, s10
497; CHECK-NEXT:    vmov.32 q3[2], r2
498; CHECK-NEXT:    vmov r2, s7
499; CHECK-NEXT:    vmov.32 q3[3], r2
500; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
501; CHECK-NEXT:    vadd.i16 q1, q4, q3
502; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
503; CHECK-NEXT:    vadd.i16 q0, q1, q0
504; CHECK-NEXT:    vldrw.u32 q1, [r0]
505; CHECK-NEXT:    vmov.u16 r0, q3[4]
506; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
507; CHECK-NEXT:    vmov.u16 r2, q1[2]
508; CHECK-NEXT:    vmov.16 q6[6], r0
509; CHECK-NEXT:    vmov.16 q5[0], r2
510; CHECK-NEXT:    vmov.u16 r2, q1[5]
511; CHECK-NEXT:    vmov.16 q5[1], r2
512; CHECK-NEXT:    vmov.u16 r2, q2[0]
513; CHECK-NEXT:    vmov.u16 r0, q3[7]
514; CHECK-NEXT:    vmov.16 q5[2], r2
515; CHECK-NEXT:    vmov.16 q6[7], r0
516; CHECK-NEXT:    vmov.u16 r2, q2[3]
517; CHECK-NEXT:    vmov.16 q5[3], r2
518; CHECK-NEXT:    vmov.f32 s26, s12
519; CHECK-NEXT:    vmov.f32 s22, s11
520; CHECK-NEXT:    vmov q7, q6
521; CHECK-NEXT:    vmov r0, s20
522; CHECK-NEXT:    vmovnb.i32 q7, q5
523; CHECK-NEXT:    vmov.32 q4[0], r0
524; CHECK-NEXT:    vmov r0, s21
525; CHECK-NEXT:    vmov.32 q4[1], r0
526; CHECK-NEXT:    vmov r0, s30
527; CHECK-NEXT:    vmov.32 q4[2], r0
528; CHECK-NEXT:    vmov r0, s27
529; CHECK-NEXT:    vmov.32 q4[3], r0
530; CHECK-NEXT:    vmov.u16 r0, q1[0]
531; CHECK-NEXT:    vmov.16 q5[0], r0
532; CHECK-NEXT:    vmov.u16 r0, q1[3]
533; CHECK-NEXT:    vmov.16 q5[1], r0
534; CHECK-NEXT:    vmov.u16 r0, q1[6]
535; CHECK-NEXT:    vmov.16 q5[2], r0
536; CHECK-NEXT:    vmov.u16 r0, q2[1]
537; CHECK-NEXT:    vmov.16 q5[3], r0
538; CHECK-NEXT:    vmov.u16 r0, q2[4]
539; CHECK-NEXT:    vmov.16 q5[4], r0
540; CHECK-NEXT:    vmov.u16 r0, q3[2]
541; CHECK-NEXT:    vmov.16 q6[6], r0
542; CHECK-NEXT:    vmov.u16 r0, q3[5]
543; CHECK-NEXT:    vmov.16 q6[7], r0
544; CHECK-NEXT:    vmov.u16 r0, q2[7]
545; CHECK-NEXT:    vmov.16 q5[5], r0
546; CHECK-NEXT:    vmov.u16 r0, q1[1]
547; CHECK-NEXT:    vmov.f32 s23, s27
548; CHECK-NEXT:    vmov.16 q6[0], r0
549; CHECK-NEXT:    vmov.u16 r0, q1[4]
550; CHECK-NEXT:    vmov.16 q6[1], r0
551; CHECK-NEXT:    vmov.u16 r0, q1[7]
552; CHECK-NEXT:    vmov.16 q6[2], r0
553; CHECK-NEXT:    vmov.u16 r0, q2[2]
554; CHECK-NEXT:    vmov.16 q6[3], r0
555; CHECK-NEXT:    vmov.u16 r0, q2[5]
556; CHECK-NEXT:    vmov.16 q6[4], r0
557; CHECK-NEXT:    vmov r0, s24
558; CHECK-NEXT:    vmov.32 q1[0], r0
559; CHECK-NEXT:    vmov r0, s25
560; CHECK-NEXT:    vmov.32 q1[1], r0
561; CHECK-NEXT:    vmov.u16 r0, q3[0]
562; CHECK-NEXT:    vmov.16 q2[5], r0
563; CHECK-NEXT:    vmov.u16 r0, q3[3]
564; CHECK-NEXT:    vmov.16 q2[6], r0
565; CHECK-NEXT:    vmov.u16 r0, q3[6]
566; CHECK-NEXT:    vmov.16 q2[7], r0
567; CHECK-NEXT:    vmov q3, q2
568; CHECK-NEXT:    vmovnb.i32 q3, q6
569; CHECK-NEXT:    vmov r0, s14
570; CHECK-NEXT:    vmov.32 q1[2], r0
571; CHECK-NEXT:    vmov r0, s11
572; CHECK-NEXT:    vmov.32 q1[3], r0
573; CHECK-NEXT:    vadd.i16 q1, q5, q1
574; CHECK-NEXT:    vadd.i16 q1, q1, q4
575; CHECK-NEXT:    vstrw.32 q1, [r1]
576; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
577; CHECK-NEXT:    bx lr
578entry:
579  %l1 = load <48 x i16>, <48 x i16>* %src, align 4
580  %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
581  %s2 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
582  %s3 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
583  %a1 = add <16 x i16> %s1, %s2
584  %a = add <16 x i16> %a1, %s3
585  store <16 x i16> %a, <16 x i16> *%dst
586  ret void
587}
588
589; i8
590
591define void @vld3_v2i8(<6 x i8> *%src, <2 x i8> *%dst) {
592; CHECK-LABEL: vld3_v2i8:
593; CHECK:       @ %bb.0: @ %entry
594; CHECK-NEXT:    .pad #8
595; CHECK-NEXT:    sub sp, #8
596; CHECK-NEXT:    ldrd r2, r0, [r0]
597; CHECK-NEXT:    strd r2, r0, [sp]
598; CHECK-NEXT:    mov r0, sp
599; CHECK-NEXT:    vldrb.u16 q0, [r0]
600; CHECK-NEXT:    vmov.u16 r0, q0[4]
601; CHECK-NEXT:    vmov.u16 r2, q0[3]
602; CHECK-NEXT:    add r0, r2
603; CHECK-NEXT:    vmov.u16 r2, q0[5]
604; CHECK-NEXT:    add r0, r2
605; CHECK-NEXT:    strb r0, [r1, #1]
606; CHECK-NEXT:    vmov.u16 r0, q0[1]
607; CHECK-NEXT:    vmov.u16 r2, q0[0]
608; CHECK-NEXT:    add r0, r2
609; CHECK-NEXT:    vmov.u16 r2, q0[2]
610; CHECK-NEXT:    add r0, r2
611; CHECK-NEXT:    strb r0, [r1]
612; CHECK-NEXT:    add sp, #8
613; CHECK-NEXT:    bx lr
614entry:
615  %l1 = load <6 x i8>, <6 x i8>* %src, align 4
616  %s1 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
617  %s2 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
618  %s3 = shufflevector <6 x i8> %l1, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
619  %a1 = add <2 x i8> %s1, %s2
620  %a = add <2 x i8> %a1, %s3
621  store <2 x i8> %a, <2 x i8> *%dst
622  ret void
623}
624
625define void @vld3_v4i8(<12 x i8> *%src, <4 x i8> *%dst) {
626; CHECK-LABEL: vld3_v4i8:
627; CHECK:       @ %bb.0: @ %entry
628; CHECK-NEXT:    .pad #8
629; CHECK-NEXT:    sub sp, #8
630; CHECK-NEXT:    vldrb.u16 q2, [r0]
631; CHECK-NEXT:    ldr r3, [r0, #8]
632; CHECK-NEXT:    mov r2, sp
633; CHECK-NEXT:    str r3, [sp]
634; CHECK-NEXT:    vmov.u16 r0, q2[2]
635; CHECK-NEXT:    vmov.32 q0[0], r0
636; CHECK-NEXT:    vmov.u16 r0, q2[5]
637; CHECK-NEXT:    vmov.32 q0[1], r0
638; CHECK-NEXT:    vmov.u16 r0, q2[0]
639; CHECK-NEXT:    vmov.32 q1[0], r0
640; CHECK-NEXT:    vmov.u16 r0, q2[3]
641; CHECK-NEXT:    vmov.32 q1[1], r0
642; CHECK-NEXT:    vmov.u16 r0, q2[6]
643; CHECK-NEXT:    vmov.32 q1[2], r0
644; CHECK-NEXT:    vmov.u16 r0, q2[1]
645; CHECK-NEXT:    vmov.32 q3[0], r0
646; CHECK-NEXT:    vmov.u16 r0, q2[4]
647; CHECK-NEXT:    vmov.32 q3[1], r0
648; CHECK-NEXT:    vmov.u16 r0, q2[7]
649; CHECK-NEXT:    vldrb.u16 q2, [r2]
650; CHECK-NEXT:    vmov.32 q3[2], r0
651; CHECK-NEXT:    vmov.u16 r0, q2[2]
652; CHECK-NEXT:    vmov.32 q3[3], r0
653; CHECK-NEXT:    vmov.u16 r0, q2[1]
654; CHECK-NEXT:    vmov.32 q1[3], r0
655; CHECK-NEXT:    vmov.u16 r0, q2[0]
656; CHECK-NEXT:    vmov.32 q0[2], r0
657; CHECK-NEXT:    vmov.u16 r0, q2[3]
658; CHECK-NEXT:    vadd.i32 q1, q1, q3
659; CHECK-NEXT:    vmov.32 q0[3], r0
660; CHECK-NEXT:    vadd.i32 q0, q1, q0
661; CHECK-NEXT:    vstrb.32 q0, [r1]
662; CHECK-NEXT:    add sp, #8
663; CHECK-NEXT:    bx lr
664entry:
665  %l1 = load <12 x i8>, <12 x i8>* %src, align 4
666  %s1 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
667  %s2 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
668  %s3 = shufflevector <12 x i8> %l1, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
669  %a1 = add <4 x i8> %s1, %s2
670  %a = add <4 x i8> %a1, %s3
671  store <4 x i8> %a, <4 x i8> *%dst
672  ret void
673}
674
675define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
676; CHECK-LABEL: vld3_v8i8:
677; CHECK:       @ %bb.0: @ %entry
678; CHECK-NEXT:    vldrw.u32 q0, [r0]
679; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
680; CHECK-NEXT:    vmov.u8 r2, q0[0]
681; CHECK-NEXT:    vmov.u16 r0, q1[2]
682; CHECK-NEXT:    vmov.16 q2[0], r2
683; CHECK-NEXT:    vmov.u8 r2, q0[3]
684; CHECK-NEXT:    vmov.16 q2[1], r2
685; CHECK-NEXT:    vmov.u8 r2, q0[6]
686; CHECK-NEXT:    vmov.16 q2[2], r2
687; CHECK-NEXT:    vmov.u8 r2, q0[9]
688; CHECK-NEXT:    vmov.16 q2[3], r2
689; CHECK-NEXT:    vmov.u8 r2, q0[12]
690; CHECK-NEXT:    vmov.16 q2[4], r2
691; CHECK-NEXT:    vmov.u8 r2, q0[15]
692; CHECK-NEXT:    vmov.16 q2[5], r2
693; CHECK-NEXT:    vmov.16 q2[6], r0
694; CHECK-NEXT:    vmov.u8 r0, q0[1]
695; CHECK-NEXT:    vmov.16 q3[0], r0
696; CHECK-NEXT:    vmov.u8 r0, q0[4]
697; CHECK-NEXT:    vmov.16 q3[1], r0
698; CHECK-NEXT:    vmov.u8 r0, q0[7]
699; CHECK-NEXT:    vmov.16 q3[2], r0
700; CHECK-NEXT:    vmov.u8 r0, q0[10]
701; CHECK-NEXT:    vmov.16 q3[3], r0
702; CHECK-NEXT:    vmov.u8 r0, q0[13]
703; CHECK-NEXT:    vmov.16 q3[4], r0
704; CHECK-NEXT:    vmov.u16 r0, q1[0]
705; CHECK-NEXT:    vmov.16 q3[5], r0
706; CHECK-NEXT:    vmov.u16 r0, q1[3]
707; CHECK-NEXT:    vmov.16 q3[6], r0
708; CHECK-NEXT:    vmov.u16 r0, q1[6]
709; CHECK-NEXT:    vmov.16 q3[7], r0
710; CHECK-NEXT:    vmov.u16 r0, q1[5]
711; CHECK-NEXT:    vmov.16 q2[7], r0
712; CHECK-NEXT:    vmov.u8 r0, q0[2]
713; CHECK-NEXT:    vadd.i16 q2, q2, q3
714; CHECK-NEXT:    vmov.16 q3[0], r0
715; CHECK-NEXT:    vmov.u8 r0, q0[5]
716; CHECK-NEXT:    vmov.16 q3[1], r0
717; CHECK-NEXT:    vmov.u8 r0, q0[8]
718; CHECK-NEXT:    vmov.16 q3[2], r0
719; CHECK-NEXT:    vmov.u8 r0, q0[11]
720; CHECK-NEXT:    vmov.16 q3[3], r0
721; CHECK-NEXT:    vmov.u8 r0, q0[14]
722; CHECK-NEXT:    vmov.16 q3[4], r0
723; CHECK-NEXT:    vmov.u16 r0, q1[1]
724; CHECK-NEXT:    vmov.16 q3[5], r0
725; CHECK-NEXT:    vmov.u16 r0, q1[4]
726; CHECK-NEXT:    vmov.16 q3[6], r0
727; CHECK-NEXT:    vmov.u16 r0, q1[7]
728; CHECK-NEXT:    vmov.16 q3[7], r0
729; CHECK-NEXT:    vadd.i16 q0, q2, q3
730; CHECK-NEXT:    vstrb.16 q0, [r1]
731; CHECK-NEXT:    bx lr
732entry:
733  %l1 = load <24 x i8>, <24 x i8>* %src, align 4
734  %s1 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
735  %s2 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
736  %s3 = shufflevector <24 x i8> %l1, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
737  %a1 = add <8 x i8> %s1, %s2
738  %a = add <8 x i8> %a1, %s3
739  store <8 x i8> %a, <8 x i8> *%dst
740  ret void
741}
742
743define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) {
744; CHECK-LABEL: vld3_v16i8:
745; CHECK:       @ %bb.0: @ %entry
746; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
747; CHECK-NEXT:    vpush {d8, d9, d10, d11}
748; CHECK-NEXT:    vldrw.u32 q2, [r0]
749; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
750; CHECK-NEXT:    vmov.u8 r2, q2[0]
751; CHECK-NEXT:    vmov.8 q1[0], r2
752; CHECK-NEXT:    vmov.u8 r2, q2[3]
753; CHECK-NEXT:    vmov.8 q1[1], r2
754; CHECK-NEXT:    vmov.u8 r2, q2[6]
755; CHECK-NEXT:    vmov.8 q1[2], r2
756; CHECK-NEXT:    vmov.u8 r2, q2[9]
757; CHECK-NEXT:    vmov.8 q1[3], r2
758; CHECK-NEXT:    vmov.u8 r2, q2[12]
759; CHECK-NEXT:    vmov.8 q1[4], r2
760; CHECK-NEXT:    vmov.u8 r2, q2[15]
761; CHECK-NEXT:    vmov.8 q1[5], r2
762; CHECK-NEXT:    vmov.u8 r2, q0[2]
763; CHECK-NEXT:    vmov.8 q1[6], r2
764; CHECK-NEXT:    vmov.u8 r2, q0[5]
765; CHECK-NEXT:    vmov.8 q1[7], r2
766; CHECK-NEXT:    vmov r2, s4
767; CHECK-NEXT:    vmov.32 q3[0], r2
768; CHECK-NEXT:    vmov r2, s5
769; CHECK-NEXT:    vmov.32 q3[1], r2
770; CHECK-NEXT:    vmov.u8 r2, q0[8]
771; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
772; CHECK-NEXT:    vmov.8 q4[8], r2
773; CHECK-NEXT:    vmov.u8 r2, q0[11]
774; CHECK-NEXT:    vmov.8 q4[9], r2
775; CHECK-NEXT:    vmov.u8 r2, q0[14]
776; CHECK-NEXT:    vmov.8 q4[10], r2
777; CHECK-NEXT:    vmov.u8 r0, q1[1]
778; CHECK-NEXT:    vmov.8 q4[11], r0
779; CHECK-NEXT:    vmov r0, s18
780; CHECK-NEXT:    vmov.32 q3[2], r0
781; CHECK-NEXT:    vmov.u8 r0, q2[1]
782; CHECK-NEXT:    vmov.8 q5[0], r0
783; CHECK-NEXT:    vmov.u8 r0, q2[4]
784; CHECK-NEXT:    vmov.8 q5[1], r0
785; CHECK-NEXT:    vmov.u8 r0, q2[7]
786; CHECK-NEXT:    vmov.8 q5[2], r0
787; CHECK-NEXT:    vmov.u8 r0, q2[10]
788; CHECK-NEXT:    vmov.8 q5[3], r0
789; CHECK-NEXT:    vmov.u8 r0, q2[13]
790; CHECK-NEXT:    vmov.8 q5[4], r0
791; CHECK-NEXT:    vmov.u8 r0, q0[0]
792; CHECK-NEXT:    vmov.8 q5[5], r0
793; CHECK-NEXT:    vmov.u8 r0, q0[3]
794; CHECK-NEXT:    vmov.8 q5[6], r0
795; CHECK-NEXT:    vmov.u8 r0, q0[6]
796; CHECK-NEXT:    vmov.8 q5[7], r0
797; CHECK-NEXT:    vmov r0, s20
798; CHECK-NEXT:    vmov.32 q4[0], r0
799; CHECK-NEXT:    vmov r0, s21
800; CHECK-NEXT:    vmov.32 q4[1], r0
801; CHECK-NEXT:    vmov.u8 r0, q0[9]
802; CHECK-NEXT:    vmov.8 q5[8], r0
803; CHECK-NEXT:    vmov.u8 r0, q0[12]
804; CHECK-NEXT:    vmov.8 q5[9], r0
805; CHECK-NEXT:    vmov.u8 r0, q0[15]
806; CHECK-NEXT:    vmov.8 q5[10], r0
807; CHECK-NEXT:    vmov.u8 r0, q1[2]
808; CHECK-NEXT:    vmov.8 q5[11], r0
809; CHECK-NEXT:    vmov r0, s22
810; CHECK-NEXT:    vmov.32 q4[2], r0
811; CHECK-NEXT:    vmov.u8 r0, q1[5]
812; CHECK-NEXT:    vmov.8 q5[12], r0
813; CHECK-NEXT:    vmov.u8 r0, q1[8]
814; CHECK-NEXT:    vmov.8 q5[13], r0
815; CHECK-NEXT:    vmov.u8 r0, q1[11]
816; CHECK-NEXT:    vmov.8 q5[14], r0
817; CHECK-NEXT:    vmov.u8 r0, q1[14]
818; CHECK-NEXT:    vmov.8 q5[15], r0
819; CHECK-NEXT:    vmov r0, s23
820; CHECK-NEXT:    vmov.32 q4[3], r0
821; CHECK-NEXT:    vmov.u8 r0, q1[4]
822; CHECK-NEXT:    vmov.8 q5[12], r0
823; CHECK-NEXT:    vmov.u8 r0, q1[7]
824; CHECK-NEXT:    vmov.8 q5[13], r0
825; CHECK-NEXT:    vmov.u8 r0, q1[10]
826; CHECK-NEXT:    vmov.8 q5[14], r0
827; CHECK-NEXT:    vmov.u8 r0, q1[13]
828; CHECK-NEXT:    vmov.8 q5[15], r0
829; CHECK-NEXT:    vmov r0, s23
830; CHECK-NEXT:    vmov.32 q3[3], r0
831; CHECK-NEXT:    vmov.u8 r0, q2[2]
832; CHECK-NEXT:    vadd.i8 q3, q3, q4
833; CHECK-NEXT:    vmov.8 q4[0], r0
834; CHECK-NEXT:    vmov.u8 r0, q2[5]
835; CHECK-NEXT:    vmov.8 q4[1], r0
836; CHECK-NEXT:    vmov.u8 r0, q2[8]
837; CHECK-NEXT:    vmov.8 q4[2], r0
838; CHECK-NEXT:    vmov.u8 r0, q2[11]
839; CHECK-NEXT:    vmov.8 q4[3], r0
840; CHECK-NEXT:    vmov.u8 r0, q2[14]
841; CHECK-NEXT:    vmov.8 q4[4], r0
842; CHECK-NEXT:    vmov.u8 r0, q0[1]
843; CHECK-NEXT:    vmov.8 q4[5], r0
844; CHECK-NEXT:    vmov.u8 r0, q0[4]
845; CHECK-NEXT:    vmov.8 q4[6], r0
846; CHECK-NEXT:    vmov.u8 r0, q0[7]
847; CHECK-NEXT:    vmov.8 q4[7], r0
848; CHECK-NEXT:    vmov r0, s16
849; CHECK-NEXT:    vmov.32 q2[0], r0
850; CHECK-NEXT:    vmov r0, s17
851; CHECK-NEXT:    vmov.32 q2[1], r0
852; CHECK-NEXT:    vmov.u8 r0, q0[10]
853; CHECK-NEXT:    vmov.8 q4[8], r0
854; CHECK-NEXT:    vmov.u8 r0, q0[13]
855; CHECK-NEXT:    vmov.8 q4[9], r0
856; CHECK-NEXT:    vmov.u8 r0, q1[0]
857; CHECK-NEXT:    vmov.8 q4[10], r0
858; CHECK-NEXT:    vmov.u8 r0, q1[3]
859; CHECK-NEXT:    vmov.8 q4[11], r0
860; CHECK-NEXT:    vmov r0, s18
861; CHECK-NEXT:    vmov.32 q2[2], r0
862; CHECK-NEXT:    vmov.u8 r0, q1[6]
863; CHECK-NEXT:    vmov.8 q0[12], r0
864; CHECK-NEXT:    vmov.u8 r0, q1[9]
865; CHECK-NEXT:    vmov.8 q0[13], r0
866; CHECK-NEXT:    vmov.u8 r0, q1[12]
867; CHECK-NEXT:    vmov.8 q0[14], r0
868; CHECK-NEXT:    vmov.u8 r0, q1[15]
869; CHECK-NEXT:    vmov.8 q0[15], r0
870; CHECK-NEXT:    vmov r0, s3
871; CHECK-NEXT:    vmov.32 q2[3], r0
872; CHECK-NEXT:    vadd.i8 q0, q3, q2
873; CHECK-NEXT:    vstrw.32 q0, [r1]
874; CHECK-NEXT:    vpop {d8, d9, d10, d11}
875; CHECK-NEXT:    bx lr
876entry:
877  %l1 = load <48 x i8>, <48 x i8>* %src, align 4
878  %s1 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
879  %s2 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
880  %s3 = shufflevector <48 x i8> %l1, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
881  %a1 = add <16 x i8> %s1, %s2
882  %a = add <16 x i8> %a1, %s3
883  store <16 x i8> %a, <16 x i8> *%dst
884  ret void
885}
886
887; i64
888
889define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) {
890; CHECK-LABEL: vld3_v2i64:
891; CHECK:       @ %bb.0: @ %entry
892; CHECK-NEXT:    .save {r4, lr}
893; CHECK-NEXT:    push {r4, lr}
894; CHECK-NEXT:    .vsave {d8, d9}
895; CHECK-NEXT:    vpush {d8, d9}
896; CHECK-NEXT:    vldrw.u32 q1, [r0]
897; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
898; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
899; CHECK-NEXT:    vmov.f64 d6, d3
900; CHECK-NEXT:    vmov.f32 s13, s7
901; CHECK-NEXT:    vmov.f32 s14, s16
902; CHECK-NEXT:    vmov.f32 s6, s10
903; CHECK-NEXT:    vmov.f32 s7, s11
904; CHECK-NEXT:    vmov.f32 s15, s17
905; CHECK-NEXT:    vmov r3, s14
906; CHECK-NEXT:    vmov r0, s6
907; CHECK-NEXT:    vmov.f64 d0, d4
908; CHECK-NEXT:    vmov.f32 s1, s9
909; CHECK-NEXT:    vmov.f32 s2, s18
910; CHECK-NEXT:    vmov.f32 s3, s19
911; CHECK-NEXT:    vmov r12, s15
912; CHECK-NEXT:    vmov r2, s7
913; CHECK-NEXT:    vmov r4, s4
914; CHECK-NEXT:    adds.w lr, r0, r3
915; CHECK-NEXT:    vmov r0, s2
916; CHECK-NEXT:    vmov r3, s3
917; CHECK-NEXT:    adc.w r2, r2, r12
918; CHECK-NEXT:    adds.w lr, lr, r0
919; CHECK-NEXT:    vmov r0, s12
920; CHECK-NEXT:    adc.w r12, r2, r3
921; CHECK-NEXT:    vmov r3, s13
922; CHECK-NEXT:    vmov r2, s5
923; CHECK-NEXT:    adds r0, r0, r4
924; CHECK-NEXT:    vmov r4, s0
925; CHECK-NEXT:    adcs r2, r3
926; CHECK-NEXT:    vmov r3, s1
927; CHECK-NEXT:    adds r0, r0, r4
928; CHECK-NEXT:    vmov.32 q0[0], r0
929; CHECK-NEXT:    adcs r2, r3
930; CHECK-NEXT:    vmov.32 q0[1], r2
931; CHECK-NEXT:    vmov.32 q0[2], lr
932; CHECK-NEXT:    vmov.32 q0[3], r12
933; CHECK-NEXT:    vstrw.32 q0, [r1]
934; CHECK-NEXT:    vpop {d8, d9}
935; CHECK-NEXT:    pop {r4, pc}
936entry:
937  %l1 = load <6 x i64>, <6 x i64>* %src, align 4
938  %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
939  %s2 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
940  %s3 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
941  %a1 = add <2 x i64> %s1, %s2
942  %a = add <2 x i64> %a1, %s3
943  store <2 x i64> %a, <2 x i64> *%dst
944  ret void
945}
946
947define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) {
948; CHECK-LABEL: vld3_v4i64:
949; CHECK:       @ %bb.0: @ %entry
950; CHECK-NEXT:    .save {r4, lr}
951; CHECK-NEXT:    push {r4, lr}
952; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
953; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
954; CHECK-NEXT:    .pad #24
955; CHECK-NEXT:    sub sp, #24
956; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
957; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
958; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
959; CHECK-NEXT:    vldrw.u32 q1, [r0]
960; CHECK-NEXT:    vmov.f64 d4, d0
961; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
962; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
963; CHECK-NEXT:    vmov.f32 s9, s1
964; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
965; CHECK-NEXT:    vmov.f64 d14, d11
966; CHECK-NEXT:    vmov.f32 s29, s23
967; CHECK-NEXT:    vmov.f32 s30, s0
968; CHECK-NEXT:    vmov.f32 s22, s26
969; CHECK-NEXT:    vmov.f32 s23, s27
970; CHECK-NEXT:    vmov.f32 s31, s1
971; CHECK-NEXT:    vmov r3, s30
972; CHECK-NEXT:    vmov r0, s22
973; CHECK-NEXT:    vmov.f64 d6, d3
974; CHECK-NEXT:    vmov.f32 s13, s7
975; CHECK-NEXT:    vmov.f32 s10, s18
976; CHECK-NEXT:    vmov.f32 s14, s16
977; CHECK-NEXT:    vmov.f32 s11, s19
978; CHECK-NEXT:    vmov.f32 s15, s17
979; CHECK-NEXT:    vmov.f64 d8, d12
980; CHECK-NEXT:    vmov.f32 s17, s25
981; CHECK-NEXT:    vmov.f32 s18, s2
982; CHECK-NEXT:    vmov.f32 s19, s3
983; CHECK-NEXT:    vmov r12, s31
984; CHECK-NEXT:    vmov r2, s23
985; CHECK-NEXT:    adds.w lr, r0, r3
986; CHECK-NEXT:    vmov r0, s18
987; CHECK-NEXT:    vmov r4, s20
988; CHECK-NEXT:    vmov r3, s19
989; CHECK-NEXT:    adc.w r2, r2, r12
990; CHECK-NEXT:    adds.w lr, lr, r0
991; CHECK-NEXT:    vmov r0, s28
992; CHECK-NEXT:    adc.w r12, r2, r3
993; CHECK-NEXT:    vmov r3, s29
994; CHECK-NEXT:    vmov r2, s21
995; CHECK-NEXT:    adds r0, r0, r4
996; CHECK-NEXT:    vmov r4, s16
997; CHECK-NEXT:    adcs r2, r3
998; CHECK-NEXT:    vmov r3, s17
999; CHECK-NEXT:    adds r0, r0, r4
1000; CHECK-NEXT:    vmov.32 q0[0], r0
1001; CHECK-NEXT:    vmov r0, s15
1002; CHECK-NEXT:    adcs r2, r3
1003; CHECK-NEXT:    vmov r3, s14
1004; CHECK-NEXT:    vmov.32 q0[1], r2
1005; CHECK-NEXT:    vmov.32 q0[2], lr
1006; CHECK-NEXT:    vmov.32 q0[3], r12
1007; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
1008; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
1009; CHECK-NEXT:    vmov.f32 s6, s2
1010; CHECK-NEXT:    vmov.f32 s7, s3
1011; CHECK-NEXT:    vmov r4, s6
1012; CHECK-NEXT:    vmov r2, s7
1013; CHECK-NEXT:    adds r3, r3, r4
1014; CHECK-NEXT:    vmov r4, s10
1015; CHECK-NEXT:    adcs r0, r2
1016; CHECK-NEXT:    vmov r2, s11
1017; CHECK-NEXT:    adds.w lr, r3, r4
1018; CHECK-NEXT:    vmov r3, s4
1019; CHECK-NEXT:    vmov r4, s5
1020; CHECK-NEXT:    adc.w r12, r0, r2
1021; CHECK-NEXT:    vmov r0, s12
1022; CHECK-NEXT:    vmov r2, s13
1023; CHECK-NEXT:    adds r0, r0, r3
1024; CHECK-NEXT:    vmov r3, s9
1025; CHECK-NEXT:    adcs r2, r4
1026; CHECK-NEXT:    vmov r4, s8
1027; CHECK-NEXT:    adds r0, r0, r4
1028; CHECK-NEXT:    adcs r2, r3
1029; CHECK-NEXT:    vmov.32 q0[0], r0
1030; CHECK-NEXT:    vmov.32 q0[1], r2
1031; CHECK-NEXT:    vmov.32 q0[2], lr
1032; CHECK-NEXT:    vmov.32 q0[3], r12
1033; CHECK-NEXT:    vstrw.32 q0, [r1]
1034; CHECK-NEXT:    add sp, #24
1035; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1036; CHECK-NEXT:    pop {r4, pc}
1037entry:
1038  %l1 = load <12 x i64>, <12 x i64>* %src, align 4
1039  %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
1040  %s2 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
1041  %s3 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
1042  %a1 = add <4 x i64> %s1, %s2
1043  %a = add <4 x i64> %a1, %s3
1044  store <4 x i64> %a, <4 x i64> *%dst
1045  ret void
1046}
1047
1048; f32
1049
1050define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) {
1051; CHECK-LABEL: vld3_v2f32:
1052; CHECK:       @ %bb.0: @ %entry
1053; CHECK-NEXT:    vldrw.u32 q2, [r0]
1054; CHECK-NEXT:    vldr s1, [r0, #16]
1055; CHECK-NEXT:    vldr s5, [r0, #20]
1056; CHECK-NEXT:    vmov.f64 d6, d4
1057; CHECK-NEXT:    vmov.f32 s13, s11
1058; CHECK-NEXT:    vmov.f32 s0, s9
1059; CHECK-NEXT:    vadd.f32 q0, q3, q0
1060; CHECK-NEXT:    vmov.f32 s4, s10
1061; CHECK-NEXT:    vadd.f32 q0, q0, q1
1062; CHECK-NEXT:    vstmia r1, {s0, s1}
1063; CHECK-NEXT:    bx lr
1064entry:
1065  %l1 = load <6 x float>, <6 x float>* %src, align 4
1066  %s1 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 0, i32 3>
1067  %s2 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 1, i32 4>
1068  %s3 = shufflevector <6 x float> %l1, <6 x float> undef, <2 x i32> <i32 2, i32 5>
1069  %a1 = fadd <2 x float> %s1, %s2
1070  %a = fadd <2 x float> %a1, %s3
1071  store <2 x float> %a, <2 x float> *%dst
1072  ret void
1073}
1074
1075define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) {
1076; CHECK-LABEL: vld3_v4f32:
1077; CHECK:       @ %bb.0: @ %entry
1078; CHECK-NEXT:    .vsave {d8, d9}
1079; CHECK-NEXT:    vpush {d8, d9}
1080; CHECK-NEXT:    vldrw.u32 q1, [r0]
1081; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
1082; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
1083; CHECK-NEXT:    vmov.f32 s12, s5
1084; CHECK-NEXT:    vmov.f64 d8, d2
1085; CHECK-NEXT:    vmov.f32 s13, s0
1086; CHECK-NEXT:    vmov.f32 s17, s7
1087; CHECK-NEXT:    vmov.f32 s14, s3
1088; CHECK-NEXT:    vmov.f32 s18, s2
1089; CHECK-NEXT:    vmov.f32 s0, s6
1090; CHECK-NEXT:    vmov.f32 s15, s10
1091; CHECK-NEXT:    vmov.f32 s19, s9
1092; CHECK-NEXT:    vmov.f32 s10, s8
1093; CHECK-NEXT:    vadd.f32 q3, q4, q3
1094; CHECK-NEXT:    vmov.f32 s2, s8
1095; CHECK-NEXT:    vmov.f32 s3, s11
1096; CHECK-NEXT:    vadd.f32 q0, q3, q0
1097; CHECK-NEXT:    vstrw.32 q0, [r1]
1098; CHECK-NEXT:    vpop {d8, d9}
1099; CHECK-NEXT:    bx lr
1100entry:
1101  %l1 = load <12 x float>, <12 x float>* %src, align 4
1102  %s1 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
1103  %s2 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
1104  %s3 = shufflevector <12 x float> %l1, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
1105  %a1 = fadd <4 x float> %s1, %s2
1106  %a = fadd <4 x float> %a1, %s3
1107  store <4 x float> %a, <4 x float> *%dst
1108  ret void
1109}
1110
1111define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) {
1112; CHECK-LABEL: vld3_v8f32:
1113; CHECK:       @ %bb.0: @ %entry
1114; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
1115; CHECK-NEXT:    vpush {d8, d9, d10, d11}
1116; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
1117; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
1118; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
1119; CHECK-NEXT:    vmov.f32 s12, s5
1120; CHECK-NEXT:    vmov.f64 d8, d2
1121; CHECK-NEXT:    vmov.f32 s13, s0
1122; CHECK-NEXT:    vmov.f32 s17, s7
1123; CHECK-NEXT:    vmov.f32 s14, s3
1124; CHECK-NEXT:    vmov.f32 s18, s2
1125; CHECK-NEXT:    vmov.f32 s0, s6
1126; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
1127; CHECK-NEXT:    vmov.f32 s15, s10
1128; CHECK-NEXT:    vmov.f32 s19, s9
1129; CHECK-NEXT:    vmov.f32 s10, s8
1130; CHECK-NEXT:    vadd.f32 q3, q4, q3
1131; CHECK-NEXT:    vmov.f32 s2, s8
1132; CHECK-NEXT:    vmov.f32 s3, s11
1133; CHECK-NEXT:    vldrw.u32 q2, [r0]
1134; CHECK-NEXT:    vadd.f32 q0, q3, q0
1135; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
1136; CHECK-NEXT:    vmov.f32 s16, s9
1137; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
1138; CHECK-NEXT:    vmov.f64 d10, d4
1139; CHECK-NEXT:    vmov.f32 s17, s4
1140; CHECK-NEXT:    vmov.f32 s21, s11
1141; CHECK-NEXT:    vmov.f32 s18, s7
1142; CHECK-NEXT:    vmov.f32 s22, s6
1143; CHECK-NEXT:    vmov.f32 s4, s10
1144; CHECK-NEXT:    vmov.f32 s19, s14
1145; CHECK-NEXT:    vmov.f32 s23, s13
1146; CHECK-NEXT:    vmov.f32 s14, s12
1147; CHECK-NEXT:    vadd.f32 q4, q5, q4
1148; CHECK-NEXT:    vmov.f32 s6, s12
1149; CHECK-NEXT:    vmov.f32 s7, s15
1150; CHECK-NEXT:    vadd.f32 q1, q4, q1
1151; CHECK-NEXT:    vstrw.32 q1, [r1]
1152; CHECK-NEXT:    vpop {d8, d9, d10, d11}
1153; CHECK-NEXT:    bx lr
1154entry:
1155  %l1 = load <24 x float>, <24 x float>* %src, align 4
1156  %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1157  %s2 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1158  %s3 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
1159  %a1 = fadd <8 x float> %s1, %s2
1160  %a = fadd <8 x float> %a1, %s3
1161  store <8 x float> %a, <8 x float> *%dst
1162  ret void
1163}
1164
1165define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) {
1166; CHECK-LABEL: vld3_v16f32:
1167; CHECK:       @ %bb.0: @ %entry
1168; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1169; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1170; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
1171; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
1172; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
1173; CHECK-NEXT:    vmov.f32 s12, s5
1174; CHECK-NEXT:    vmov.f64 d8, d2
1175; CHECK-NEXT:    vmov.f32 s13, s0
1176; CHECK-NEXT:    vmov.f32 s17, s7
1177; CHECK-NEXT:    vmov.f32 s14, s3
1178; CHECK-NEXT:    vmov.f32 s18, s2
1179; CHECK-NEXT:    vmov.f32 s0, s6
1180; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
1181; CHECK-NEXT:    vmov.f32 s15, s10
1182; CHECK-NEXT:    vmov.f32 s19, s9
1183; CHECK-NEXT:    vmov.f32 s10, s8
1184; CHECK-NEXT:    vadd.f32 q3, q4, q3
1185; CHECK-NEXT:    vmov.f32 s2, s8
1186; CHECK-NEXT:    vmov.f32 s3, s11
1187; CHECK-NEXT:    vldrw.u32 q2, [r0]
1188; CHECK-NEXT:    vadd.f32 q0, q3, q0
1189; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
1190; CHECK-NEXT:    vmov.f32 s16, s9
1191; CHECK-NEXT:    vmov.f64 d10, d4
1192; CHECK-NEXT:    vmov.f32 s17, s4
1193; CHECK-NEXT:    vmov.f32 s21, s11
1194; CHECK-NEXT:    vmov.f32 s18, s7
1195; CHECK-NEXT:    vmov.f32 s22, s6
1196; CHECK-NEXT:    vmov.f32 s4, s10
1197; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
1198; CHECK-NEXT:    vmov.f32 s19, s14
1199; CHECK-NEXT:    vmov.f32 s23, s13
1200; CHECK-NEXT:    vmov.f32 s14, s12
1201; CHECK-NEXT:    vadd.f32 q4, q5, q4
1202; CHECK-NEXT:    vmov.f32 s6, s12
1203; CHECK-NEXT:    vmov.f32 s7, s15
1204; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
1205; CHECK-NEXT:    vadd.f32 q1, q4, q1
1206; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
1207; CHECK-NEXT:    vmov.f32 s20, s13
1208; CHECK-NEXT:    vmov.f64 d12, d6
1209; CHECK-NEXT:    vmov.f32 s21, s8
1210; CHECK-NEXT:    vmov.f32 s25, s15
1211; CHECK-NEXT:    vmov.f32 s22, s11
1212; CHECK-NEXT:    vmov.f32 s26, s10
1213; CHECK-NEXT:    vmov.f32 s8, s14
1214; CHECK-NEXT:    vldrw.u32 q3, [r0, #112]
1215; CHECK-NEXT:    vmov.f32 s23, s18
1216; CHECK-NEXT:    vmov.f32 s27, s17
1217; CHECK-NEXT:    vmov.f32 s18, s16
1218; CHECK-NEXT:    vadd.f32 q5, q6, q5
1219; CHECK-NEXT:    vmov.f32 s10, s16
1220; CHECK-NEXT:    vmov.f32 s11, s19
1221; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
1222; CHECK-NEXT:    vadd.f32 q2, q5, q2
1223; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
1224; CHECK-NEXT:    vmov.f32 s24, s17
1225; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
1226; CHECK-NEXT:    vmov.f64 d14, d8
1227; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
1228; CHECK-NEXT:    vstrw.32 q1, [r1]
1229; CHECK-NEXT:    vmov.f32 s25, s12
1230; CHECK-NEXT:    vmov.f32 s29, s19
1231; CHECK-NEXT:    vmov.f32 s26, s15
1232; CHECK-NEXT:    vmov.f32 s30, s14
1233; CHECK-NEXT:    vmov.f32 s12, s18
1234; CHECK-NEXT:    vmov.f32 s27, s22
1235; CHECK-NEXT:    vmov.f32 s31, s21
1236; CHECK-NEXT:    vmov.f32 s22, s20
1237; CHECK-NEXT:    vadd.f32 q6, q7, q6
1238; CHECK-NEXT:    vmov.f32 s14, s20
1239; CHECK-NEXT:    vmov.f32 s15, s23
1240; CHECK-NEXT:    vadd.f32 q3, q6, q3
1241; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
1242; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1243; CHECK-NEXT:    bx lr
1244entry:
1245  %l1 = load <48 x float>, <48 x float>* %src, align 4
1246  %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
1247  %s2 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
1248  %s3 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
1249  %a1 = fadd <16 x float> %s1, %s2
1250  %a = fadd <16 x float> %a1, %s3
1251  store <16 x float> %a, <16 x float> *%dst
1252  ret void
1253}
1254
1255; f16
1256
1257define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) {
1258; CHECK-LABEL: vld3_v2f16:
1259; CHECK:       @ %bb.0: @ %entry
1260; CHECK-NEXT:    ldrd r2, r3, [r0]
1261; CHECK-NEXT:    ldr r0, [r0, #8]
1262; CHECK-NEXT:    vmov.32 q0[0], r2
1263; CHECK-NEXT:    vmov.32 q0[1], r3
1264; CHECK-NEXT:    vmov.32 q0[2], r0
1265; CHECK-NEXT:    vmovx.f16 s4, s0
1266; CHECK-NEXT:    vmov r2, s2
1267; CHECK-NEXT:    vmov r0, s4
1268; CHECK-NEXT:    vmovx.f16 s8, s1
1269; CHECK-NEXT:    vmov.16 q1[0], r0
1270; CHECK-NEXT:    vmov r0, s8
1271; CHECK-NEXT:    vmov.16 q1[1], r2
1272; CHECK-NEXT:    vmov r2, s0
1273; CHECK-NEXT:    vmov.16 q2[0], r2
1274; CHECK-NEXT:    vmov r2, s1
1275; CHECK-NEXT:    vmov.16 q2[1], r0
1276; CHECK-NEXT:    vadd.f16 q1, q2, q1
1277; CHECK-NEXT:    vmovx.f16 s8, s2
1278; CHECK-NEXT:    vmov r0, s8
1279; CHECK-NEXT:    vmov.16 q0[0], r2
1280; CHECK-NEXT:    vmov.16 q0[1], r0
1281; CHECK-NEXT:    vadd.f16 q0, q1, q0
1282; CHECK-NEXT:    vmov r0, s0
1283; CHECK-NEXT:    str r0, [r1]
1284; CHECK-NEXT:    bx lr
1285entry:
1286  %l1 = load <6 x half>, <6 x half>* %src, align 4
1287  %s1 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 0, i32 3>
1288  %s2 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 1, i32 4>
1289  %s3 = shufflevector <6 x half> %l1, <6 x half> undef, <2 x i32> <i32 2, i32 5>
1290  %a1 = fadd <2 x half> %s1, %s2
1291  %a = fadd <2 x half> %a1, %s3
1292  store <2 x half> %a, <2 x half> *%dst
1293  ret void
1294}
1295
1296define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
1297; CHECK-LABEL: vld3_v4f16:
1298; CHECK:       @ %bb.0: @ %entry
1299; CHECK-NEXT:    .vsave {d8}
1300; CHECK-NEXT:    vpush {d8}
1301; CHECK-NEXT:    vldrw.u32 q0, [r0]
1302; CHECK-NEXT:    vmovx.f16 s4, s0
1303; CHECK-NEXT:    vmov r2, s2
1304; CHECK-NEXT:    vmov r3, s4
1305; CHECK-NEXT:    vmovx.f16 s4, s3
1306; CHECK-NEXT:    vmov.16 q2[0], r3
1307; CHECK-NEXT:    vmovx.f16 s12, s1
1308; CHECK-NEXT:    vmov.16 q2[1], r2
1309; CHECK-NEXT:    vmov r2, s4
1310; CHECK-NEXT:    vmov.16 q2[2], r2
1311; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
1312; CHECK-NEXT:    vmov.32 q1[0], r2
1313; CHECK-NEXT:    vmov r2, s0
1314; CHECK-NEXT:    vmov.32 q1[1], r0
1315; CHECK-NEXT:    vmovx.f16 s0, s2
1316; CHECK-NEXT:    vmov r0, s5
1317; CHECK-NEXT:    vmovx.f16 s16, s4
1318; CHECK-NEXT:    vmov.16 q2[3], r0
1319; CHECK-NEXT:    vmov r0, s12
1320; CHECK-NEXT:    vmov.16 q3[0], r2
1321; CHECK-NEXT:    vmov r2, s0
1322; CHECK-NEXT:    vmov.16 q3[1], r0
1323; CHECK-NEXT:    vmov r0, s3
1324; CHECK-NEXT:    vmov.16 q3[2], r0
1325; CHECK-NEXT:    vmov r0, s16
1326; CHECK-NEXT:    vmov.16 q3[3], r0
1327; CHECK-NEXT:    vmov r0, s1
1328; CHECK-NEXT:    vmov.16 q0[0], r0
1329; CHECK-NEXT:    vmov r0, s4
1330; CHECK-NEXT:    vmov.16 q0[1], r2
1331; CHECK-NEXT:    vmovx.f16 s4, s5
1332; CHECK-NEXT:    vmov.16 q0[2], r0
1333; CHECK-NEXT:    vmov r0, s4
1334; CHECK-NEXT:    vadd.f16 q2, q3, q2
1335; CHECK-NEXT:    vmov.16 q0[3], r0
1336; CHECK-NEXT:    vadd.f16 q0, q2, q0
1337; CHECK-NEXT:    vmov r2, s1
1338; CHECK-NEXT:    vmov r0, s0
1339; CHECK-NEXT:    strd r0, r2, [r1]
1340; CHECK-NEXT:    vpop {d8}
1341; CHECK-NEXT:    bx lr
1342entry:
1343  %l1 = load <12 x half>, <12 x half>* %src, align 4
1344  %s1 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
1345  %s2 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
1346  %s3 = shufflevector <12 x half> %l1, <12 x half> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
1347  %a1 = fadd <4 x half> %s1, %s2
1348  %a = fadd <4 x half> %a1, %s3
1349  store <4 x half> %a, <4 x half> *%dst
1350  ret void
1351}
1352
1353define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
1354; CHECK-LABEL: vld3_v8f16:
1355; CHECK:       @ %bb.0: @ %entry
1356; CHECK-NEXT:    .save {r4, lr}
1357; CHECK-NEXT:    push {r4, lr}
1358; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
1359; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
1360; CHECK-NEXT:    vldrw.u32 q1, [r0]
1361; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
1362; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
1363; CHECK-NEXT:    vmovx.f16 s0, s6
1364; CHECK-NEXT:    vmov r3, s5
1365; CHECK-NEXT:    vmov r2, s0
1366; CHECK-NEXT:    vmov.16 q0[0], r3
1367; CHECK-NEXT:    vmov.16 q0[1], r2
1368; CHECK-NEXT:    vmov r2, s8
1369; CHECK-NEXT:    vmovx.f16 s12, s9
1370; CHECK-NEXT:    vmov.16 q0[2], r2
1371; CHECK-NEXT:    vmov r2, s12
1372; CHECK-NEXT:    vmovx.f16 s12, s19
1373; CHECK-NEXT:    vmov.16 q0[3], r2
1374; CHECK-NEXT:    vmov r3, s18
1375; CHECK-NEXT:    vmov.f32 s2, s11
1376; CHECK-NEXT:    vmovx.f16 s20, s16
1377; CHECK-NEXT:    vmov r0, s12
1378; CHECK-NEXT:    vmov.16 q3[6], r3
1379; CHECK-NEXT:    vmov.16 q3[7], r0
1380; CHECK-NEXT:    vmov r0, s20
1381; CHECK-NEXT:    vmov.f32 s14, s16
1382; CHECK-NEXT:    vmovx.f16 s24, s8
1383; CHECK-NEXT:    vmov r4, s0
1384; CHECK-NEXT:    vmov r2, s2
1385; CHECK-NEXT:    vmov.16 q5[4], r2
1386; CHECK-NEXT:    vmov r2, s17
1387; CHECK-NEXT:    vmov.16 q5[5], r0
1388; CHECK-NEXT:    vmov r0, s19
1389; CHECK-NEXT:    vmov lr, s22
1390; CHECK-NEXT:    vmovx.f16 s20, s17
1391; CHECK-NEXT:    vmov r3, s20
1392; CHECK-NEXT:    vmov.16 q5[6], r3
1393; CHECK-NEXT:    vmov.16 q5[7], r0
1394; CHECK-NEXT:    vmov r0, s16
1395; CHECK-NEXT:    vmov r12, s23
1396; CHECK-NEXT:    vmovx.f16 s20, s10
1397; CHECK-NEXT:    vmov r3, s20
1398; CHECK-NEXT:    vmov.16 q5[4], r3
1399; CHECK-NEXT:    vmov.16 q5[5], r0
1400; CHECK-NEXT:    vmov r3, s22
1401; CHECK-NEXT:    vmovx.f16 s20, s18
1402; CHECK-NEXT:    vmov r0, s20
1403; CHECK-NEXT:    vmov.16 q4[6], r2
1404; CHECK-NEXT:    vmov.16 q4[7], r0
1405; CHECK-NEXT:    vmovx.f16 s20, s5
1406; CHECK-NEXT:    vmov r0, s4
1407; CHECK-NEXT:    vmov r2, s20
1408; CHECK-NEXT:    vmov.16 q5[0], r0
1409; CHECK-NEXT:    vmov.16 q5[1], r2
1410; CHECK-NEXT:    vmov r0, s7
1411; CHECK-NEXT:    vmov.16 q5[2], r0
1412; CHECK-NEXT:    vmov r0, s24
1413; CHECK-NEXT:    vmov.16 q5[3], r0
1414; CHECK-NEXT:    vmov r0, s10
1415; CHECK-NEXT:    vmovx.f16 s24, s11
1416; CHECK-NEXT:    vmov.16 q5[4], r0
1417; CHECK-NEXT:    vmov r0, s24
1418; CHECK-NEXT:    vmovx.f16 s24, s4
1419; CHECK-NEXT:    vmov r2, s24
1420; CHECK-NEXT:    vmov.16 q5[5], r0
1421; CHECK-NEXT:    vmov r0, s6
1422; CHECK-NEXT:    vmov.16 q6[0], r2
1423; CHECK-NEXT:    vmovx.f16 s4, s7
1424; CHECK-NEXT:    vmov.16 q6[1], r0
1425; CHECK-NEXT:    vmov r0, s4
1426; CHECK-NEXT:    vmov.32 q1[0], r4
1427; CHECK-NEXT:    vmov.16 q6[2], r0
1428; CHECK-NEXT:    vmov r0, s9
1429; CHECK-NEXT:    vmov.16 q6[3], r0
1430; CHECK-NEXT:    vmov r4, s1
1431; CHECK-NEXT:    vmov r2, s24
1432; CHECK-NEXT:    vmov.32 q1[1], r4
1433; CHECK-NEXT:    vmov r0, s25
1434; CHECK-NEXT:    vmov.32 q0[0], r2
1435; CHECK-NEXT:    vmov.32 q0[1], r0
1436; CHECK-NEXT:    vmov.32 q1[2], lr
1437; CHECK-NEXT:    vmov.32 q0[2], r3
1438; CHECK-NEXT:    vmov r4, s15
1439; CHECK-NEXT:    vmov.f32 s23, s19
1440; CHECK-NEXT:    vmov.32 q0[3], r12
1441; CHECK-NEXT:    vmov.32 q1[3], r4
1442; CHECK-NEXT:    vadd.f16 q0, q5, q0
1443; CHECK-NEXT:    vadd.f16 q0, q0, q1
1444; CHECK-NEXT:    vstrw.32 q0, [r1]
1445; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
1446; CHECK-NEXT:    pop {r4, pc}
1447entry:
1448  %l1 = load <24 x half>, <24 x half>* %src, align 4
1449  %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
1450  %s2 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
1451  %s3 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
1452  %a1 = fadd <8 x half> %s1, %s2
1453  %a = fadd <8 x half> %a1, %s3
1454  store <8 x half> %a, <8 x half> *%dst
1455  ret void
1456}
1457
1458define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
1459; CHECK-LABEL: vld3_v16f16:
1460; CHECK:       @ %bb.0: @ %entry
1461; CHECK-NEXT:    .save {r4, r5, r7, lr}
1462; CHECK-NEXT:    push {r4, r5, r7, lr}
1463; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
1464; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
1465; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
1466; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
1467; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
1468; CHECK-NEXT:    vmovx.f16 s0, s19
1469; CHECK-NEXT:    vmovx.f16 s4, s16
1470; CHECK-NEXT:    vmov r2, s18
1471; CHECK-NEXT:    vmovx.f16 s20, s13
1472; CHECK-NEXT:    vmov r3, s0
1473; CHECK-NEXT:    vmov.16 q0[6], r2
1474; CHECK-NEXT:    vmov r12, s4
1475; CHECK-NEXT:    vmovx.f16 s4, s10
1476; CHECK-NEXT:    vmov r2, s9
1477; CHECK-NEXT:    vmov.16 q0[7], r3
1478; CHECK-NEXT:    vmov r3, s4
1479; CHECK-NEXT:    vmov.16 q1[0], r2
1480; CHECK-NEXT:    vmov.16 q1[1], r3
1481; CHECK-NEXT:    vmov r2, s12
1482; CHECK-NEXT:    vmov.16 q1[2], r2
1483; CHECK-NEXT:    vmov r2, s20
1484; CHECK-NEXT:    vmov.16 q1[3], r2
1485; CHECK-NEXT:    vmov r3, s16
1486; CHECK-NEXT:    vmov.f32 s6, s15
1487; CHECK-NEXT:    vmovx.f16 s24, s12
1488; CHECK-NEXT:    vmov.f32 s2, s16
1489; CHECK-NEXT:    vmovx.f16 s16, s18
1490; CHECK-NEXT:    vmov r4, s16
1491; CHECK-NEXT:    vmov r5, s4
1492; CHECK-NEXT:    vmov r2, s6
1493; CHECK-NEXT:    vmov.16 q5[4], r2
1494; CHECK-NEXT:    vmov.16 q5[5], r12
1495; CHECK-NEXT:    vmov lr, s22
1496; CHECK-NEXT:    vmovx.f16 s20, s14
1497; CHECK-NEXT:    vmov r2, s20
1498; CHECK-NEXT:    vmov.16 q5[4], r2
1499; CHECK-NEXT:    vmov r2, s19
1500; CHECK-NEXT:    vmov.16 q5[5], r3
1501; CHECK-NEXT:    vmov r12, s22
1502; CHECK-NEXT:    vmovx.f16 s20, s17
1503; CHECK-NEXT:    vmov r3, s20
1504; CHECK-NEXT:    vmov.16 q5[6], r3
1505; CHECK-NEXT:    vmov r3, s17
1506; CHECK-NEXT:    vmov.16 q5[7], r2
1507; CHECK-NEXT:    vmov.16 q4[6], r3
1508; CHECK-NEXT:    vmov r2, s23
1509; CHECK-NEXT:    vmov.16 q4[7], r4
1510; CHECK-NEXT:    vmovx.f16 s20, s9
1511; CHECK-NEXT:    vmov r4, s8
1512; CHECK-NEXT:    vmov r3, s20
1513; CHECK-NEXT:    vmov.16 q5[0], r4
1514; CHECK-NEXT:    vmov.16 q5[1], r3
1515; CHECK-NEXT:    vmov r3, s11
1516; CHECK-NEXT:    vmov.16 q5[2], r3
1517; CHECK-NEXT:    vmov r3, s24
1518; CHECK-NEXT:    vmov.16 q5[3], r3
1519; CHECK-NEXT:    vmov r3, s14
1520; CHECK-NEXT:    vmovx.f16 s24, s15
1521; CHECK-NEXT:    vmov.16 q5[4], r3
1522; CHECK-NEXT:    vmov r3, s24
1523; CHECK-NEXT:    vmovx.f16 s24, s8
1524; CHECK-NEXT:    vmov.16 q5[5], r3
1525; CHECK-NEXT:    vmov r3, s24
1526; CHECK-NEXT:    vmov r4, s10
1527; CHECK-NEXT:    vmov.16 q6[0], r3
1528; CHECK-NEXT:    vmovx.f16 s8, s11
1529; CHECK-NEXT:    vmov.16 q6[1], r4
1530; CHECK-NEXT:    vmov r3, s8
1531; CHECK-NEXT:    vmov.32 q2[0], r5
1532; CHECK-NEXT:    vmov.16 q6[2], r3
1533; CHECK-NEXT:    vmov r3, s13
1534; CHECK-NEXT:    vmov.16 q6[3], r3
1535; CHECK-NEXT:    vmov r5, s5
1536; CHECK-NEXT:    vmov r3, s24
1537; CHECK-NEXT:    vmov.32 q2[1], r5
1538; CHECK-NEXT:    vmov r5, s3
1539; CHECK-NEXT:    vmov.32 q0[0], r3
1540; CHECK-NEXT:    vmov r4, s25
1541; CHECK-NEXT:    vmov.32 q2[2], lr
1542; CHECK-NEXT:    vmov.32 q0[1], r4
1543; CHECK-NEXT:    vmov.f32 s23, s19
1544; CHECK-NEXT:    vmov.32 q0[2], r12
1545; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
1546; CHECK-NEXT:    vmov.32 q0[3], r2
1547; CHECK-NEXT:    vmov.32 q2[3], r5
1548; CHECK-NEXT:    vadd.f16 q0, q5, q0
1549; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
1550; CHECK-NEXT:    vadd.f16 q0, q0, q2
1551; CHECK-NEXT:    vldrw.u32 q2, [r0]
1552; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
1553; CHECK-NEXT:    vmovx.f16 s0, s19
1554; CHECK-NEXT:    vmov r0, s18
1555; CHECK-NEXT:    vmovx.f16 s12, s16
1556; CHECK-NEXT:    vmov r2, s0
1557; CHECK-NEXT:    vmov.16 q0[6], r0
1558; CHECK-NEXT:    vmov.16 q0[7], r2
1559; CHECK-NEXT:    vmov r0, s12
1560; CHECK-NEXT:    vmovx.f16 s12, s10
1561; CHECK-NEXT:    vmov r2, s9
1562; CHECK-NEXT:    vmov r3, s12
1563; CHECK-NEXT:    vmov.16 q3[0], r2
1564; CHECK-NEXT:    vmov.16 q3[1], r3
1565; CHECK-NEXT:    vmov r2, s4
1566; CHECK-NEXT:    vmovx.f16 s20, s5
1567; CHECK-NEXT:    vmov.16 q3[2], r2
1568; CHECK-NEXT:    vmov r2, s20
1569; CHECK-NEXT:    vmovx.f16 s24, s4
1570; CHECK-NEXT:    vmov.16 q3[3], r2
1571; CHECK-NEXT:    vmov r3, s16
1572; CHECK-NEXT:    vmov.f32 s14, s7
1573; CHECK-NEXT:    vmov.f32 s2, s16
1574; CHECK-NEXT:    vmovx.f16 s16, s18
1575; CHECK-NEXT:    vmov r4, s16
1576; CHECK-NEXT:    vmov r2, s14
1577; CHECK-NEXT:    vmov.16 q5[4], r2
1578; CHECK-NEXT:    vmov.16 q5[5], r0
1579; CHECK-NEXT:    vmov r2, s22
1580; CHECK-NEXT:    vmovx.f16 s20, s6
1581; CHECK-NEXT:    vmov r0, s20
1582; CHECK-NEXT:    vmov.16 q5[4], r0
1583; CHECK-NEXT:    vmov r0, s12
1584; CHECK-NEXT:    vmov.16 q5[5], r3
1585; CHECK-NEXT:    vmov r3, s19
1586; CHECK-NEXT:    vmov r12, s22
1587; CHECK-NEXT:    vmovx.f16 s20, s17
1588; CHECK-NEXT:    vmov r5, s20
1589; CHECK-NEXT:    vmov.16 q5[6], r5
1590; CHECK-NEXT:    vmov r5, s17
1591; CHECK-NEXT:    vmov.16 q5[7], r3
1592; CHECK-NEXT:    vmov.16 q4[6], r5
1593; CHECK-NEXT:    vmov r3, s23
1594; CHECK-NEXT:    vmov.16 q4[7], r4
1595; CHECK-NEXT:    vmovx.f16 s20, s9
1596; CHECK-NEXT:    vmov r4, s8
1597; CHECK-NEXT:    vmov r5, s20
1598; CHECK-NEXT:    vmov.16 q5[0], r4
1599; CHECK-NEXT:    vmov.16 q5[1], r5
1600; CHECK-NEXT:    vmov r5, s11
1601; CHECK-NEXT:    vmov.16 q5[2], r5
1602; CHECK-NEXT:    vmov r5, s24
1603; CHECK-NEXT:    vmov.16 q5[3], r5
1604; CHECK-NEXT:    vmov r5, s6
1605; CHECK-NEXT:    vmovx.f16 s24, s7
1606; CHECK-NEXT:    vmov.16 q5[4], r5
1607; CHECK-NEXT:    vmov r5, s24
1608; CHECK-NEXT:    vmovx.f16 s24, s8
1609; CHECK-NEXT:    vmov.16 q5[5], r5
1610; CHECK-NEXT:    vmov r5, s24
1611; CHECK-NEXT:    vmov r4, s10
1612; CHECK-NEXT:    vmov.16 q6[0], r5
1613; CHECK-NEXT:    vmovx.f16 s8, s11
1614; CHECK-NEXT:    vmov.16 q6[1], r4
1615; CHECK-NEXT:    vmov r5, s8
1616; CHECK-NEXT:    vmov.16 q6[2], r5
1617; CHECK-NEXT:    vmov r5, s5
1618; CHECK-NEXT:    vmov.16 q6[3], r5
1619; CHECK-NEXT:    vmov.32 q1[0], r0
1620; CHECK-NEXT:    vmov r0, s13
1621; CHECK-NEXT:    vmov r5, s24
1622; CHECK-NEXT:    vmov.32 q1[1], r0
1623; CHECK-NEXT:    vmov r0, s3
1624; CHECK-NEXT:    vmov.32 q0[0], r5
1625; CHECK-NEXT:    vmov r4, s25
1626; CHECK-NEXT:    vmov.32 q1[2], r2
1627; CHECK-NEXT:    vmov.32 q0[1], r4
1628; CHECK-NEXT:    vmov.f32 s23, s19
1629; CHECK-NEXT:    vmov.32 q0[2], r12
1630; CHECK-NEXT:    vmov.32 q1[3], r0
1631; CHECK-NEXT:    vmov.32 q0[3], r3
1632; CHECK-NEXT:    vadd.f16 q0, q5, q0
1633; CHECK-NEXT:    vadd.f16 q0, q0, q1
1634; CHECK-NEXT:    vstrw.32 q0, [r1]
1635; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
1636; CHECK-NEXT:    pop {r4, r5, r7, pc}
1637entry:
1638  %l1 = load <48 x half>, <48 x half>* %src, align 4
1639  %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
1640  %s2 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
1641  %s3 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
1642  %a1 = fadd <16 x half> %s1, %s2
1643  %a = fadd <16 x half> %a1, %s3
1644  store <16 x half> %a, <16 x half> *%dst
1645  ret void
1646}
1647
1648; f64
1649
1650define void @vld3_v2f64(<6 x double> *%src, <2 x double> *%dst) {
1651; CHECK-LABEL: vld3_v2f64:
1652; CHECK:       @ %bb.0: @ %entry
1653; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
1654; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
1655; CHECK-NEXT:    vldrw.u32 q3, [r0]
1656; CHECK-NEXT:    vadd.f64 d4, d3, d0
1657; CHECK-NEXT:    vadd.f64 d5, d6, d7
1658; CHECK-NEXT:    vadd.f64 d1, d4, d1
1659; CHECK-NEXT:    vadd.f64 d0, d5, d2
1660; CHECK-NEXT:    vstrw.32 q0, [r1]
1661; CHECK-NEXT:    bx lr
1662entry:
1663  %l1 = load <6 x double>, <6 x double>* %src, align 4
1664  %s1 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 0, i32 3>
1665  %s2 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 1, i32 4>
1666  %s3 = shufflevector <6 x double> %l1, <6 x double> undef, <2 x i32> <i32 2, i32 5>
1667  %a1 = fadd <2 x double> %s1, %s2
1668  %a = fadd <2 x double> %a1, %s3
1669  store <2 x double> %a, <2 x double> *%dst
1670  ret void
1671}
1672
1673define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) {
1674; CHECK-LABEL: vld3_v4f64:
1675; CHECK:       @ %bb.0: @ %entry
1676; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
1677; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
1678; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
1679; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
1680; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
1681; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
1682; CHECK-NEXT:    vadd.f64 d5, d6, d7
1683; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
1684; CHECK-NEXT:    vldrw.u32 q6, [r0]
1685; CHECK-NEXT:    vadd.f64 d4, d1, d2
1686; CHECK-NEXT:    vadd.f64 d10, d9, d6
1687; CHECK-NEXT:    vadd.f64 d11, d12, d13
1688; CHECK-NEXT:    vadd.f64 d3, d4, d3
1689; CHECK-NEXT:    vadd.f64 d2, d5, d0
1690; CHECK-NEXT:    vadd.f64 d1, d10, d7
1691; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
1692; CHECK-NEXT:    vadd.f64 d0, d11, d8
1693; CHECK-NEXT:    vstrw.32 q0, [r1]
1694; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
1695; CHECK-NEXT:    bx lr
1696entry:
1697  %l1 = load <12 x double>, <12 x double>* %src, align 4
1698  %s1 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
1699  %s2 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
1700  %s3 = shufflevector <12 x double> %l1, <12 x double> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
1701  %a1 = fadd <4 x double> %s1, %s2
1702  %a = fadd <4 x double> %a1, %s3
1703  store <4 x double> %a, <4 x double> *%dst
1704  ret void
1705}
1706