1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s
3
; v_dup8: overwrites all eight lanes of an <8 x i8> with %A through a chain of
; insertelements. The assertions expect the chain to collapse into one vdup.8
; from r0, followed by a move of the D register back to r0/r1 for the return.
4define <8 x i8> @v_dup8(i8 %A) nounwind {
5; CHECK-LABEL: v_dup8:
6; CHECK:       @ %bb.0:
7; CHECK-NEXT:    vdup.8 d16, r0
8; CHECK-NEXT:    vmov r0, r1, d16
9; CHECK-NEXT:    mov pc, lr
10	%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
11	%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
12	%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
13	%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
14	%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
15	%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
16	%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
17	%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
18	ret <8 x i8> %tmp8
19}
20
; v_dup16: writes %A into all four lanes of a <4 x i16> with insertelements.
; Expected lowering is a single vdup.16 from r0 into a D register.
21define <4 x i16> @v_dup16(i16 %A) nounwind {
22; CHECK-LABEL: v_dup16:
23; CHECK:       @ %bb.0:
24; CHECK-NEXT:    vdup.16 d16, r0
25; CHECK-NEXT:    vmov r0, r1, d16
26; CHECK-NEXT:    mov pc, lr
27	%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
28	%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
29	%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
30	%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
31	ret <4 x i16> %tmp4
32}
33
; v_dup32: writes %A into both lanes of a <2 x i32> with insertelements.
; Expected lowering is a single vdup.32 from r0 into a D register.
34define <2 x i32> @v_dup32(i32 %A) nounwind {
35; CHECK-LABEL: v_dup32:
36; CHECK:       @ %bb.0:
37; CHECK-NEXT:    vdup.32 d16, r0
38; CHECK-NEXT:    vmov r0, r1, d16
39; CHECK-NEXT:    mov pc, lr
40	%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
41	%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
42	ret <2 x i32> %tmp2
43}
44
; v_dupfloat: float counterpart of v_dup32. The assertions show the float
; argument being duplicated straight from r0 with vdup.32, i.e. the same
; lowering as the integer case.
45define <2 x float> @v_dupfloat(float %A) nounwind {
46; CHECK-LABEL: v_dupfloat:
47; CHECK:       @ %bb.0:
48; CHECK-NEXT:    vdup.32 d16, r0
49; CHECK-NEXT:    vmov r0, r1, d16
50; CHECK-NEXT:    mov pc, lr
51	%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
52	%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
53	ret <2 x float> %tmp2
54}
55
; v_dupQ8: 128-bit variant of v_dup8. All sixteen lanes of a <16 x i8> are
; overwritten with %A; the expected lowering is one vdup.8 into a Q register,
; whose two D halves are then moved to r0-r3 for the return.
56define <16 x i8> @v_dupQ8(i8 %A) nounwind {
57; CHECK-LABEL: v_dupQ8:
58; CHECK:       @ %bb.0:
59; CHECK-NEXT:    vdup.8 q8, r0
60; CHECK-NEXT:    vmov r0, r1, d16
61; CHECK-NEXT:    vmov r2, r3, d17
62; CHECK-NEXT:    mov pc, lr
63	%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
64	%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
65	%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
66	%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
67	%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
68	%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
69	%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
70	%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
71	%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
72	%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
73	%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
74	%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
75	%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
76	%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
77	%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
78	%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
79	ret <16 x i8> %tmp16
80}
81
; v_dupQ16: 128-bit variant of v_dup16. All eight i16 lanes receive %A; the
; expected lowering is one vdup.16 into a Q register.
82define <8 x i16> @v_dupQ16(i16 %A) nounwind {
83; CHECK-LABEL: v_dupQ16:
84; CHECK:       @ %bb.0:
85; CHECK-NEXT:    vdup.16 q8, r0
86; CHECK-NEXT:    vmov r0, r1, d16
87; CHECK-NEXT:    vmov r2, r3, d17
88; CHECK-NEXT:    mov pc, lr
89	%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
90	%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
91	%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
92	%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
93	%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
94	%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
95	%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
96	%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
97	ret <8 x i16> %tmp8
98}
99
; v_dupQ32: 128-bit variant of v_dup32. All four i32 lanes receive %A; the
; expected lowering is one vdup.32 into a Q register.
100define <4 x i32> @v_dupQ32(i32 %A) nounwind {
101; CHECK-LABEL: v_dupQ32:
102; CHECK:       @ %bb.0:
103; CHECK-NEXT:    vdup.32 q8, r0
104; CHECK-NEXT:    vmov r0, r1, d16
105; CHECK-NEXT:    vmov r2, r3, d17
106; CHECK-NEXT:    mov pc, lr
107	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
108	%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
109	%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
110	%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
111	ret <4 x i32> %tmp4
112}
113
; v_dupQfloat: float counterpart of v_dupQ32. All four float lanes receive %A;
; the assertions expect the same vdup.32 from r0 into a Q register.
114define <4 x float> @v_dupQfloat(float %A) nounwind {
115; CHECK-LABEL: v_dupQfloat:
116; CHECK:       @ %bb.0:
117; CHECK-NEXT:    vdup.32 q8, r0
118; CHECK-NEXT:    vmov r0, r1, d16
119; CHECK-NEXT:    vmov r2, r3, d17
120; CHECK-NEXT:    mov pc, lr
121	%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
122	%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
123	%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
124	%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
125	ret <4 x float> %tmp4
126}
127
128; Check to make sure it works with shuffles, too.
129
; v_shuffledup8: splat written as insertelement into lane 0 followed by a
; shufflevector with an all-zero mask. Expected lowering is the same single
; vdup.8 from r0 that the insertelement-chain form produces.
130define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
131; CHECK-LABEL: v_shuffledup8:
132; CHECK:       @ %bb.0:
133; CHECK-NEXT:    vdup.8 d16, r0
134; CHECK-NEXT:    vmov r0, r1, d16
135; CHECK-NEXT:    mov pc, lr
136	%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
137	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
138	ret <8 x i8> %tmp2
139}
140
; v_shuffledup16: insertelement + zero-mask shufflevector splat of an i16.
; Expected lowering is a single vdup.16 from r0 into a D register.
141define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
142; CHECK-LABEL: v_shuffledup16:
143; CHECK:       @ %bb.0:
144; CHECK-NEXT:    vdup.16 d16, r0
145; CHECK-NEXT:    vmov r0, r1, d16
146; CHECK-NEXT:    mov pc, lr
147	%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
148	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
149	ret <4 x i16> %tmp2
150}
151
; v_shuffledup32: insertelement + zero-mask shufflevector splat of an i32.
; Expected lowering is a single vdup.32 from r0 into a D register.
152define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
153; CHECK-LABEL: v_shuffledup32:
154; CHECK:       @ %bb.0:
155; CHECK-NEXT:    vdup.32 d16, r0
156; CHECK-NEXT:    vmov r0, r1, d16
157; CHECK-NEXT:    mov pc, lr
158	%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
159	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
160	ret <2 x i32> %tmp2
161}
162
; v_shuffledupfloat: insertelement + zero-mask shufflevector splat of a float.
; The assertions expect vdup.32 directly from r0, as in the integer case.
163define <2 x float> @v_shuffledupfloat(float %A) nounwind {
164; CHECK-LABEL: v_shuffledupfloat:
165; CHECK:       @ %bb.0:
166; CHECK-NEXT:    vdup.32 d16, r0
167; CHECK-NEXT:    vmov r0, r1, d16
168; CHECK-NEXT:    mov pc, lr
169	%tmp1 = insertelement <2 x float> undef, float %A, i32 0
170	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
171	ret <2 x float> %tmp2
172}
173
; v_shuffledupQ8: 128-bit insertelement + zero-mask shufflevector splat of an
; i8. Expected lowering is a single vdup.8 from r0 into a Q register.
174define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
175; CHECK-LABEL: v_shuffledupQ8:
176; CHECK:       @ %bb.0:
177; CHECK-NEXT:    vdup.8 q8, r0
178; CHECK-NEXT:    vmov r0, r1, d16
179; CHECK-NEXT:    vmov r2, r3, d17
180; CHECK-NEXT:    mov pc, lr
181	%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
182	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
183	ret <16 x i8> %tmp2
184}
185
; v_shuffledupQ16: 128-bit insertelement + zero-mask shufflevector splat of an
; i16. Expected lowering is a single vdup.16 from r0 into a Q register.
186define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
187; CHECK-LABEL: v_shuffledupQ16:
188; CHECK:       @ %bb.0:
189; CHECK-NEXT:    vdup.16 q8, r0
190; CHECK-NEXT:    vmov r0, r1, d16
191; CHECK-NEXT:    vmov r2, r3, d17
192; CHECK-NEXT:    mov pc, lr
193	%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
194	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
195	ret <8 x i16> %tmp2
196}
197
; v_shuffledupQ32: 128-bit insertelement + zero-mask shufflevector splat of an
; i32. Expected lowering is a single vdup.32 from r0 into a Q register.
198define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
199; CHECK-LABEL: v_shuffledupQ32:
200; CHECK:       @ %bb.0:
201; CHECK-NEXT:    vdup.32 q8, r0
202; CHECK-NEXT:    vmov r0, r1, d16
203; CHECK-NEXT:    vmov r2, r3, d17
204; CHECK-NEXT:    mov pc, lr
205	%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
206	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
207	ret <4 x i32> %tmp2
208}
209
; v_shuffledupQfloat: 128-bit insertelement + zero-mask shufflevector splat of
; a float. The assertions expect vdup.32 directly from r0 into a Q register.
210define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
211; CHECK-LABEL: v_shuffledupQfloat:
212; CHECK:       @ %bb.0:
213; CHECK-NEXT:    vdup.32 q8, r0
214; CHECK-NEXT:    vmov r0, r1, d16
215; CHECK-NEXT:    vmov r2, r3, d17
216; CHECK-NEXT:    mov pc, lr
217	%tmp1 = insertelement <4 x float> undef, float %A, i32 0
218	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
219	ret <4 x float> %tmp2
220}
221
; vduplane8: loads an <8 x i8> from %A and broadcasts lane 1 with a
; shufflevector whose mask is all ones. Expected lowering is a vldr followed
; by the lane form of vdup.8 reading d16[1].
222define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
223; CHECK-LABEL: vduplane8:
224; CHECK:       @ %bb.0:
225; CHECK-NEXT:    vldr d16, [r0]
226; CHECK-NEXT:    vdup.8 d16, d16[1]
227; CHECK-NEXT:    vmov r0, r1, d16
228; CHECK-NEXT:    mov pc, lr
229	%tmp1 = load <8 x i8>, <8 x i8>* %A
230	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
231	ret <8 x i8> %tmp2
232}
233
; vduplane16: loads a <4 x i16> and broadcasts lane 1 via shufflevector.
; Expected lowering is vldr followed by vdup.16 reading d16[1].
234define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
235; CHECK-LABEL: vduplane16:
236; CHECK:       @ %bb.0:
237; CHECK-NEXT:    vldr d16, [r0]
238; CHECK-NEXT:    vdup.16 d16, d16[1]
239; CHECK-NEXT:    vmov r0, r1, d16
240; CHECK-NEXT:    mov pc, lr
241	%tmp1 = load <4 x i16>, <4 x i16>* %A
242	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
243	ret <4 x i16> %tmp2
244}
245
; vduplane32: loads a <2 x i32> and broadcasts lane 1 via shufflevector.
; Expected lowering is vldr followed by vdup.32 reading d16[1].
246define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
247; CHECK-LABEL: vduplane32:
248; CHECK:       @ %bb.0:
249; CHECK-NEXT:    vldr d16, [r0]
250; CHECK-NEXT:    vdup.32 d16, d16[1]
251; CHECK-NEXT:    vmov r0, r1, d16
252; CHECK-NEXT:    mov pc, lr
253	%tmp1 = load <2 x i32>, <2 x i32>* %A
254	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
255	ret <2 x i32> %tmp2
256}
257
; vduplanefloat: loads a <2 x float> and broadcasts lane 1 via shufflevector.
; The assertions expect the same vldr + vdup.32 d16[1] as the i32 case.
258define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
259; CHECK-LABEL: vduplanefloat:
260; CHECK:       @ %bb.0:
261; CHECK-NEXT:    vldr d16, [r0]
262; CHECK-NEXT:    vdup.32 d16, d16[1]
263; CHECK-NEXT:    vmov r0, r1, d16
264; CHECK-NEXT:    mov pc, lr
265	%tmp1 = load <2 x float>, <2 x float>* %A
266	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
267	ret <2 x float> %tmp2
268}
269
; vduplaneQ8: loads a 64-bit <8 x i8> and widens it to <16 x i8> with a
; sixteen-entry shuffle mask that selects lane 1 everywhere. Expected lowering
; is vldr followed by vdup.8 from d16[1] into a Q register.
270define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
271; CHECK-LABEL: vduplaneQ8:
272; CHECK:       @ %bb.0:
273; CHECK-NEXT:    vldr d16, [r0]
274; CHECK-NEXT:    vdup.8 q8, d16[1]
275; CHECK-NEXT:    vmov r0, r1, d16
276; CHECK-NEXT:    vmov r2, r3, d17
277; CHECK-NEXT:    mov pc, lr
278	%tmp1 = load <8 x i8>, <8 x i8>* %A
279	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
280	ret <16 x i8> %tmp2
281}
282
; vduplaneQ16: loads a 64-bit <4 x i16> and widens it to <8 x i16>, selecting
; lane 1 for every result lane. Expected lowering is vldr followed by vdup.16
; from d16[1] into a Q register.
283define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
284; CHECK-LABEL: vduplaneQ16:
285; CHECK:       @ %bb.0:
286; CHECK-NEXT:    vldr d16, [r0]
287; CHECK-NEXT:    vdup.16 q8, d16[1]
288; CHECK-NEXT:    vmov r0, r1, d16
289; CHECK-NEXT:    vmov r2, r3, d17
290; CHECK-NEXT:    mov pc, lr
291	%tmp1 = load <4 x i16>, <4 x i16>* %A
292	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
293	ret <8 x i16> %tmp2
294}
295
; vduplaneQ32: loads a 64-bit <2 x i32> and widens it to <4 x i32>, selecting
; lane 1 for every result lane. Expected lowering is vldr followed by vdup.32
; from d16[1] into a Q register.
296define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
297; CHECK-LABEL: vduplaneQ32:
298; CHECK:       @ %bb.0:
299; CHECK-NEXT:    vldr d16, [r0]
300; CHECK-NEXT:    vdup.32 q8, d16[1]
301; CHECK-NEXT:    vmov r0, r1, d16
302; CHECK-NEXT:    vmov r2, r3, d17
303; CHECK-NEXT:    mov pc, lr
304	%tmp1 = load <2 x i32>, <2 x i32>* %A
305	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
306	ret <4 x i32> %tmp2
307}
308
; vduplaneQfloat: loads a 64-bit <2 x float> and widens it to <4 x float>,
; selecting lane 1 for every result lane. The assertions expect the same
; vldr + vdup.32 from d16[1] as the i32 case.
309define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
310; CHECK-LABEL: vduplaneQfloat:
311; CHECK:       @ %bb.0:
312; CHECK-NEXT:    vldr d16, [r0]
313; CHECK-NEXT:    vdup.32 q8, d16[1]
314; CHECK-NEXT:    vmov r0, r1, d16
315; CHECK-NEXT:    vmov r2, r3, d17
316; CHECK-NEXT:    mov pc, lr
317	%tmp1 = load <2 x float>, <2 x float>* %A
318	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
319	ret <4 x float> %tmp2
320}
321
; foo: broadcasts lane 1 of a <2 x i64> argument. The assertions show no NEON
; instruction at all: the vector arrives in r0-r3, and the upper pair r2/r3 is
; simply copied over r0/r1.
322define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
323; CHECK-LABEL: foo:
324; CHECK:       @ %bb.0: @ %entry
325; CHECK-NEXT:    mov r0, r2
326; CHECK-NEXT:    mov r1, r3
327; CHECK-NEXT:    mov pc, lr
328entry:
329  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
330  ret <2 x i64> %0
331}
332
; bar: broadcasts lane 0 of a <2 x i64> argument. Mirror image of the lane-1
; case: the assertions expect only core-register moves, copying r0/r1 into
; r2/r3.
333define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
334; CHECK-LABEL: bar:
335; CHECK:       @ %bb.0: @ %entry
336; CHECK-NEXT:    mov r2, r0
337; CHECK-NEXT:    mov r3, r1
338; CHECK-NEXT:    mov pc, lr
339entry:
340  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
341  ret <2 x i64> %0
342}
343
; baz: broadcasts lane 1 of a <2 x double> argument. The assertions expect
; the same core-register copy (r2/r3 into r0/r1) as the <2 x i64> lane-1 case.
344define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
345; CHECK-LABEL: baz:
346; CHECK:       @ %bb.0: @ %entry
347; CHECK-NEXT:    mov r0, r2
348; CHECK-NEXT:    mov r1, r3
349; CHECK-NEXT:    mov pc, lr
350entry:
351  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
352  ret <2 x double> %0
353}
354
; qux: broadcasts lane 0 of a <2 x double> argument. The assertions expect
; the same core-register copy (r0/r1 into r2/r3) as the <2 x i64> lane-0 case.
355define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
356; CHECK-LABEL: qux:
357; CHECK:       @ %bb.0: @ %entry
358; CHECK-NEXT:    mov r2, r0
359; CHECK-NEXT:    mov r3, r1
360; CHECK-NEXT:    mov pc, lr
361entry:
362  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
363  ret <2 x double> %0
364}
365
366; Radar 7373643
; redundantVdup: splats the constant i8 -128 and stores the result to %ptr.
; The assertions expect the constant to be materialised with a vector
; immediate move (vmov.i8 #0x80) and stored directly, with no vdup emitted.
367define void @redundantVdup(<8 x i8>* %ptr) nounwind {
368; CHECK-LABEL: redundantVdup:
369; CHECK:       @ %bb.0:
370; CHECK-NEXT:    vmov.i8 d16, #0x80
371; CHECK-NEXT:    vstr d16, [r0]
372; CHECK-NEXT:    mov pc, lr
373  %1 = insertelement <8 x i8> undef, i8 -128, i32 0
374  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
375  store <8 x i8> %2, <8 x i8>* %ptr, align 8
376  ret void
377}
378
; tdupi: builds a <4 x i32> whose lanes 0-2 hold %x and whose lane 3 holds %y.
; Expected lowering duplicates %x into the whole Q register with vdup.32 and
; then patches only the last lane (d17[1]) with %y.
379define <4 x i32> @tdupi(i32 %x, i32 %y) {
380; CHECK-LABEL: tdupi:
381; CHECK:       @ %bb.0:
382; CHECK-NEXT:    vdup.32 q8, r0
383; CHECK-NEXT:    vmov.32 d17[1], r1
384; CHECK-NEXT:    vmov r0, r1, d16
385; CHECK-NEXT:    vmov r2, r3, d17
386; CHECK-NEXT:    mov pc, lr
387  %1 = insertelement <4 x i32> undef, i32 %x, i32 0
388  %2 = insertelement <4 x i32> %1, i32 %x, i32 1
389  %3 = insertelement <4 x i32> %2, i32 %x, i32 2
390  %4 = insertelement <4 x i32> %3, i32 %y, i32 3
391  ret <4 x i32> %4
392}
393
; tdupf: float counterpart of the mostly-splat case: lanes 0-2 hold %x and
; lane 3 holds %y. Expected lowering is vdup.32 of %x into q0, after which the
; last lane is written through its S-register alias (s3).
394define <4 x float> @tdupf(float %x, float %y) {
395; CHECK-LABEL: tdupf:
396; CHECK:       @ %bb.0:
397; CHECK-NEXT:    vdup.32 q0, r0
398; CHECK-NEXT:    vmov s3, r1
399; CHECK-NEXT:    vmov r0, r1, d0
400; CHECK-NEXT:    vmov r2, r3, d1
401; CHECK-NEXT:    mov pc, lr
402  %1 = insertelement <4 x float> undef, float %x, i32 0
403  %2 = insertelement <4 x float> %1, float %x, i32 1
404  %3 = insertelement <4 x float> %2, float %x, i32 2
405  %4 = insertelement <4 x float> %3, float %y, i32 3
406  ret <4 x float> %4
407}
408
409; This test checks that when splatting an element from a vector into another,
410; the value isn't moved out to GPRs first.
; tduplane: extracts lane 1 of %invec, writes it to lanes 0-2 of the result,
; and puts the constant 255 in lane 3. Expected lowering uses the lane form of
; vdup.32 (reading d16[1]) and then overwrites d17[1] with 255 from r0.
411define <4 x i32> @tduplane(<4 x i32> %invec) {
412; CHECK-LABEL: tduplane:
413; CHECK:       @ %bb.0:
414; CHECK-NEXT:    vmov d16, r0, r1
415; CHECK-NEXT:    mov r0, #255
416; CHECK-NEXT:    vdup.32 q8, d16[1]
417; CHECK-NEXT:    vmov.32 d17[1], r0
418; CHECK-NEXT:    vmov r0, r1, d16
419; CHECK-NEXT:    vmov r2, r3, d17
420; CHECK-NEXT:    mov pc, lr
421  %in = extractelement <4 x i32> %invec, i32 1
422  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
423  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
424  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
425  %4 = insertelement <4 x i32> %3, i32 255, i32 3
426  ret <4 x i32> %4
427}
428
; check_f32: extracts lane 3 of a <4 x float> and splats it into a
; <2 x float>. Lane 3 lives in the upper half of the argument (r2/r3), so the
; assertions expect that half to be moved into d16 and vdup.32 to read d16[1].
429define <2 x float> @check_f32(<4 x float> %v) nounwind {
430; CHECK-LABEL: check_f32:
431; CHECK:       @ %bb.0:
432; CHECK-NEXT:    vmov d16, r2, r3
433; CHECK-NEXT:    vdup.32 d16, d16[1]
434; CHECK-NEXT:    vmov r0, r1, d16
435; CHECK-NEXT:    mov pc, lr
436  %x = extractelement <4 x float> %v, i32 3
437  %1 = insertelement  <2 x float> undef, float %x, i32 0
438  %2 = insertelement  <2 x float> %1, float %x, i32 1
439  ret <2 x float> %2
440}
441
; check_i32: extracts lane 3 of a <4 x i32> and splats it into a <2 x i32>.
; The assertions expect the upper half of the argument (r2/r3) to be moved
; into d16 and vdup.32 to read d16[1].
442define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
443; CHECK-LABEL: check_i32:
444; CHECK:       @ %bb.0:
445; CHECK-NEXT:    vmov d16, r2, r3
446; CHECK-NEXT:    vdup.32 d16, d16[1]
447; CHECK-NEXT:    vmov r0, r1, d16
448; CHECK-NEXT:    mov pc, lr
449  %x = extractelement <4 x i32> %v, i32 3
450  %1 = insertelement  <2 x i32> undef, i32 %x, i32 0
451  %2 = insertelement  <2 x i32> %1, i32 %x, i32 1
452  ret <2 x i32> %2
453}
454
; check_i16: extracts lane 3 of an <8 x i16> and inserts it into lanes 0 and 1
; of a <4 x i16>; lanes 2 and 3 are left undef. The assertions expect a full
; vdup.16 from d16[3], with lane 3 taken from the lower half (r0/r1).
455define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
456; CHECK-LABEL: check_i16:
457; CHECK:       @ %bb.0:
458; CHECK-NEXT:    vmov d16, r0, r1
459; CHECK-NEXT:    vdup.16 d16, d16[3]
460; CHECK-NEXT:    vmov r0, r1, d16
461; CHECK-NEXT:    mov pc, lr
462  %x = extractelement <8 x i16> %v, i32 3
463  %1 = insertelement  <4 x i16> undef, i16 %x, i32 0
464  %2 = insertelement  <4 x i16> %1, i16 %x, i32 1
465  ret <4 x i16> %2
466}
467
; check_i8: extracts lane 3 of a <16 x i8> and inserts it into lanes 0 and 1
; of an <8 x i8>; the remaining lanes are left undef. The assertions expect a
; full vdup.8 from d16[3], with lane 3 taken from the lower half (r0/r1).
468define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
469; CHECK-LABEL: check_i8:
470; CHECK:       @ %bb.0:
471; CHECK-NEXT:    vmov d16, r0, r1
472; CHECK-NEXT:    vdup.8 d16, d16[3]
473; CHECK-NEXT:    vmov r0, r1, d16
474; CHECK-NEXT:    mov pc, lr
475  %x = extractelement <16 x i8> %v, i32 3
476  %1 = insertelement  <8  x i8> undef, i8 %x, i32 0
477  %2 = insertelement  <8  x i8> %1, i8 %x, i32 1
478  ret <8 x i8> %2
479}
480
481; Check that an SPR splat produces a vdup.
482
; check_spr_splat2: converts the i16 %q to float, splats it into a
; <2 x float>, and subtracts %p from the splat. In the expected output the
; converted value sits in s0 and is broadcast with the lane form of vdup.32
; (reading d0[0]) before the vsub.f32.
483define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
484; CHECK-LABEL: check_spr_splat2:
485; CHECK:       @ %bb.0:
486; CHECK-NEXT:    lsl r2, r2, #16
487; CHECK-NEXT:    vmov d16, r0, r1
488; CHECK-NEXT:    asr r2, r2, #16
489; CHECK-NEXT:    vmov s0, r2
490; CHECK-NEXT:    vcvt.f32.s32 s0, s0
491; CHECK-NEXT:    vdup.32 d17, d0[0]
492; CHECK-NEXT:    vsub.f32 d16, d17, d16
493; CHECK-NEXT:    vmov r0, r1, d16
494; CHECK-NEXT:    mov pc, lr
495  %conv = sitofp i16 %q to float
496  %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
497  %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
498  %sub = fsub <2 x float> %splat.splat, %p
499  ret <2 x float> %sub
500}
501
; check_spr_splat4: 128-bit version of the SPR splat test. %p occupies r0-r3,
; so the assertions show %q being loaded from the stack with ldrsh; it is then
; converted in s0 and broadcast with vdup.32 from d0[0] into a Q register
; before the vsub.f32.
502define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
503; CHECK-LABEL: check_spr_splat4:
504; CHECK:       @ %bb.0:
505; CHECK-NEXT:    ldrsh r12, [sp]
506; CHECK-NEXT:    vmov d17, r2, r3
507; CHECK-NEXT:    vmov d16, r0, r1
508; CHECK-NEXT:    vmov s0, r12
509; CHECK-NEXT:    vcvt.f32.s32 s0, s0
510; CHECK-NEXT:    vdup.32 q9, d0[0]
511; CHECK-NEXT:    vsub.f32 q8, q9, q8
512; CHECK-NEXT:    vmov r0, r1, d16
513; CHECK-NEXT:    vmov r2, r3, d17
514; CHECK-NEXT:    mov pc, lr
515  %conv = sitofp i16 %q to float
516  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
517  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
518  %sub = fsub <4 x float> %splat.splat, %p
519  ret <4 x float> %sub
520}
521; Same codegen as above test; the scalar is splatted with vdup.32 from d0[0], so the shuffle index is irrelevant.
; check_spr_splat4_lane1: the converted scalar is inserted into lane 1 and the
; shuffle mask selects lane 1 for every result lane. The expected instruction
; sequence still broadcasts from d0[0], because the scalar itself lives in s0.
522define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
523; CHECK-LABEL: check_spr_splat4_lane1:
524; CHECK:       @ %bb.0:
525; CHECK-NEXT:    ldrsh r12, [sp]
526; CHECK-NEXT:    vmov d17, r2, r3
527; CHECK-NEXT:    vmov d16, r0, r1
528; CHECK-NEXT:    vmov s0, r12
529; CHECK-NEXT:    vcvt.f32.s32 s0, s0
530; CHECK-NEXT:    vdup.32 q9, d0[0]
531; CHECK-NEXT:    vsub.f32 q8, q9, q8
532; CHECK-NEXT:    vmov r0, r1, d16
533; CHECK-NEXT:    vmov r2, r3, d17
534; CHECK-NEXT:    mov pc, lr
535  %conv = sitofp i16 %q to float
536  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
537  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
538  %sub = fsub <4 x float> %splat.splat, %p
539  ret <4 x float> %sub
540}
541
542; Also make sure we don't barf on variable-index extractelts, where we almost
543; could have generated a vdup.
544
; check_i8_varidx: extracts an element of a <16 x i8> at the run-time index
; %idx and inserts it into lanes 0 and 1 of an <8 x i8>. No lane-form vdup
; appears in the expected output. Instead the vector is stored to a 16-byte
; aligned stack slot, the index is masked to 0-15 and added to the slot
; address by the post-indexed store, and vld1.8 {d16[]} reloads that byte
; into all lanes.
545define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
546; CHECK-LABEL: check_i8_varidx:
547; CHECK:       @ %bb.0:
548; CHECK-NEXT:    .save {r11}
549; CHECK-NEXT:    push {r11}
550; CHECK-NEXT:    .setfp r11, sp
551; CHECK-NEXT:    mov r11, sp
552; CHECK-NEXT:    .pad #28
553; CHECK-NEXT:    sub sp, sp, #28
554; CHECK-NEXT:    bic sp, sp, #15
555; CHECK-NEXT:    ldr r12, [r11, #4]
556; CHECK-NEXT:    vmov d17, r2, r3
557; CHECK-NEXT:    vmov d16, r0, r1
558; CHECK-NEXT:    mov r1, sp
559; CHECK-NEXT:    and r0, r12, #15
560; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128], r0
561; CHECK-NEXT:    vld1.8 {d16[]}, [r1]
562; CHECK-NEXT:    vmov r0, r1, d16
563; CHECK-NEXT:    mov sp, r11
564; CHECK-NEXT:    pop {r11}
565; CHECK-NEXT:    mov pc, lr
566  %x = extractelement <16 x i8> %v, i32 %idx
567  %1 = insertelement  <8 x i8> undef, i8 %x, i32 0
568  %2 = insertelement  <8 x i8> %1, i8 %x, i32 1
569  ret <8 x i8> %2
570}
571