; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs %s -o - \
; RUN:	| FileCheck %s

; Scalar splat built lane by lane: %A is inserted into all eight lanes (0-7)
; of a zero <8 x i8> vector. FileCheck requires a vdup.8 in the output.
define <8 x i8> @v_dup8(i8 %A) nounwind {
;CHECK-LABEL: v_dup8:
;CHECK: vdup.8
	%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
	%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
	%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
	%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
	%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
	%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
	%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
	%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
	ret <8 x i8> %tmp8
}

; Scalar splat built lane by lane: %A is inserted into all four lanes (0-3)
; of a zero <4 x i16> vector. FileCheck requires a vdup.16 in the output.
define <4 x i16> @v_dup16(i16 %A) nounwind {
;CHECK-LABEL: v_dup16:
;CHECK: vdup.16
	%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
	%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
	%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
	%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
	ret <4 x i16> %tmp4
}

; Scalar splat built lane by lane: %A is inserted into both lanes of a zero
; <2 x i32> vector. FileCheck requires a vdup.32 in the output.
define <2 x i32> @v_dup32(i32 %A) nounwind {
;CHECK-LABEL: v_dup32:
;CHECK: vdup.32
	%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
	%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
	ret <2 x i32> %tmp2
}

; Floating-point counterpart of v_dup32: %A is inserted into both lanes of a
; zero <2 x float> vector. FileCheck requires a vdup.32 in the output.
define <2 x float> @v_dupfloat(float %A) nounwind {
;CHECK-LABEL: v_dupfloat:
;CHECK: vdup.32
	%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
	%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
	ret <2 x float> %tmp2
}

; 128-bit version of v_dup8: %A is inserted into all sixteen lanes (0-15) of
; a zero <16 x i8> vector. FileCheck requires a vdup.8 in the output.
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
;CHECK-LABEL: v_dupQ8:
;CHECK: vdup.8
	%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
	%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
	%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
	%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
	%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
	%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
	%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
	%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
	%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
	%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
	%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
	%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
	%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
	%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
	%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
	%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
	ret <16 x i8> %tmp16
}

; 128-bit version of v_dup16: %A is inserted into all eight lanes (0-7) of a
; zero <8 x i16> vector. FileCheck requires a vdup.16 in the output.
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
;CHECK-LABEL: v_dupQ16:
;CHECK: vdup.16
	%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
	%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
	%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
	%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
	%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
	%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
	%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
	%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
	ret <8 x i16> %tmp8
}

; 128-bit version of v_dup32: %A is inserted into all four lanes (0-3) of a
; zero <4 x i32> vector. FileCheck requires a vdup.32 in the output.
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
;CHECK-LABEL: v_dupQ32:
;CHECK: vdup.32
	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
	%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
	%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
	%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
	ret <4 x i32> %tmp4
}

; 128-bit version of v_dupfloat: %A is inserted into all four lanes (0-3) of
; a zero <4 x float> vector. FileCheck requires a vdup.32 in the output.
define <4 x float> @v_dupQfloat(float %A) nounwind {
;CHECK-LABEL: v_dupQfloat:
;CHECK: vdup.32
	%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
	%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
	%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
	%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
	ret <4 x float> %tmp4
}

; Check to make sure vdup is also selected when the splat is written as a shuffle.

; Splat written as a shuffle: %A is placed in lane 0 of an undef <8 x i8> and
; an all-zero shuffle mask broadcasts that lane. FileCheck requires a vdup.8.
define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
;CHECK-LABEL: v_shuffledup8:
;CHECK: vdup.8
	%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
	ret <8 x i8> %tmp2
}

; Splat written as a shuffle: %A is placed in lane 0 of an undef <4 x i16> and
; an all-zero shuffle mask broadcasts that lane. FileCheck requires a vdup.16.
define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
;CHECK-LABEL: v_shuffledup16:
;CHECK: vdup.16
	%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
	ret <4 x i16> %tmp2
}

; Splat written as a shuffle: %A is placed in lane 0 of an undef <2 x i32> and
; an all-zero shuffle mask broadcasts that lane. FileCheck requires a vdup.32.
define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
;CHECK-LABEL: v_shuffledup32:
;CHECK: vdup.32
	%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
	ret <2 x i32> %tmp2
}

; Splat written as a shuffle: %A is placed in lane 0 of an undef <2 x float>
; and an all-zero shuffle mask broadcasts that lane. FileCheck requires a
; vdup.32.
define <2 x float> @v_shuffledupfloat(float %A) nounwind {
;CHECK-LABEL: v_shuffledupfloat:
;CHECK: vdup.32
	%tmp1 = insertelement <2 x float> undef, float %A, i32 0
	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
	ret <2 x float> %tmp2
}

; 128-bit shuffle splat: %A is placed in lane 0 of an undef <16 x i8> and an
; all-zero shuffle mask broadcasts that lane. FileCheck requires a vdup.8.
define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
;CHECK-LABEL: v_shuffledupQ8:
;CHECK: vdup.8
	%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
	ret <16 x i8> %tmp2
}

; 128-bit shuffle splat: %A is placed in lane 0 of an undef <8 x i16> and an
; all-zero shuffle mask broadcasts that lane. FileCheck requires a vdup.16.
define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
;CHECK-LABEL: v_shuffledupQ16:
;CHECK: vdup.16
	%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
	ret <8 x i16> %tmp2
}

; 128-bit shuffle splat: %A is placed in lane 0 of an undef <4 x i32> and an
; all-zero shuffle mask broadcasts that lane. FileCheck requires a vdup.32.
define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
;CHECK-LABEL: v_shuffledupQ32:
;CHECK: vdup.32
	%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
	ret <4 x i32> %tmp2
}

; 128-bit shuffle splat: %A is placed in lane 0 of an undef <4 x float> and an
; all-zero shuffle mask broadcasts that lane. FileCheck requires a vdup.32.
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
;CHECK-LABEL: v_shuffledupQfloat:
;CHECK: vdup.32
	%tmp1 = insertelement <4 x float> undef, float %A, i32 0
	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
	ret <4 x float> %tmp2
}

; Lane splat: loads an <8 x i8> from %A and broadcasts its lane 1 to every
; lane of the result. FileCheck requires a vdup.8 in the output.
define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vduplane8:
;CHECK: vdup.8
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
	ret <8 x i8> %tmp2
}

; Lane splat: loads a <4 x i16> from %A and broadcasts its lane 1 to every
; lane of the result. FileCheck requires a vdup.16 in the output.
define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vduplane16:
;CHECK: vdup.16
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
	ret <4 x i16> %tmp2
}

; Lane splat: loads a <2 x i32> from %A and broadcasts its lane 1 to both
; lanes of the result. FileCheck requires a vdup.32 in the output.
define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vduplane32:
;CHECK: vdup.32
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
	ret <2 x i32> %tmp2
}

; Lane splat: loads a <2 x float> from %A and broadcasts its lane 1 to both
; lanes of the result. FileCheck requires a vdup.32 in the output.
define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
;CHECK-LABEL: vduplanefloat:
;CHECK: vdup.32
	%tmp1 = load <2 x float>, <2 x float>* %A
	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
	ret <2 x float> %tmp2
}

; Widening lane splat: loads an <8 x i8> from %A and broadcasts its lane 1
; into a sixteen-lane <16 x i8> result. FileCheck requires a vdup.8.
define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vduplaneQ8:
;CHECK: vdup.8
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
	ret <16 x i8> %tmp2
}

; Widening lane splat: loads a <4 x i16> from %A and broadcasts its lane 1
; into an eight-lane <8 x i16> result. FileCheck requires a vdup.16.
define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vduplaneQ16:
;CHECK: vdup.16
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
	ret <8 x i16> %tmp2
}

; Widening lane splat: loads a <2 x i32> from %A and broadcasts its lane 1
; into a four-lane <4 x i32> result. FileCheck requires a vdup.32.
define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vduplaneQ32:
;CHECK: vdup.32
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
	ret <4 x i32> %tmp2
}

; Widening lane splat: loads a <2 x float> from %A and broadcasts its lane 1
; into a four-lane <4 x float> result. FileCheck requires a vdup.32.
define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
;CHECK-LABEL: vduplaneQfloat:
;CHECK: vdup.32
	%tmp1 = load <2 x float>, <2 x float>* %A
	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
	ret <4 x float> %tmp2
}

; Splat of lane 1 of a <2 x i64> argument. No particular instruction is
; required here; the function only has to make it through llc (the RUN line
; passes -verify-machineinstrs). The label check anchors it in the output so
; the neighbouring functions' checks cannot match inside it.
define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
;CHECK-LABEL: foo:
entry:
  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %0
}

; Splat of lane 0 of a <2 x i64> argument. No particular instruction is
; required here; the function only has to make it through llc (the RUN line
; passes -verify-machineinstrs). The label check anchors it in the output so
; the neighbouring functions' checks cannot match inside it.
define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
;CHECK-LABEL: bar:
entry:
  %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x i64> %0
}

; Splat of lane 1 of a <2 x double> argument. No particular instruction is
; required here; the function only has to make it through llc (the RUN line
; passes -verify-machineinstrs). The label check anchors it in the output so
; the neighbouring functions' checks cannot match inside it.
define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
;CHECK-LABEL: baz:
entry:
  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %0
}

; Splat of lane 0 of a <2 x double> argument. No particular instruction is
; required here; the function only has to make it through llc (the RUN line
; passes -verify-machineinstrs). The label check anchors it in the output so
; the neighbouring functions' checks cannot match inside it.
define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
;CHECK-LABEL: qux:
entry:
  %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %0
}

; Radar 7373643
; Splat of the constant -128 into an <8 x i8> that is then stored to %ptr.
; The output must contain a vmov.i8 followed by a vstr, with no vdup.8 in
; between: the constant splat should not need a separate duplicate step.
define void @redundantVdup(<8 x i8>* %ptr) nounwind {
;CHECK-LABEL: redundantVdup:
;CHECK: vmov.i8
;CHECK-NOT: vdup.8
;CHECK: vstr
  %1 = insertelement <8 x i8> undef, i8 -128, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  store <8 x i8> %2, <8 x i8>* %ptr, align 8
  ret void
}

; Near-splat: lanes 0-2 of the <4 x i32> result hold %x and lane 3 holds %y.
; FileCheck still requires a vdup.32 in the output.
define <4 x i32> @tdupi(i32 %x, i32 %y) {
;CHECK-LABEL: tdupi:
;CHECK: vdup.32
  %1 = insertelement <4 x i32> undef, i32 %x, i32 0
  %2 = insertelement <4 x i32> %1, i32 %x, i32 1
  %3 = insertelement <4 x i32> %2, i32 %x, i32 2
  %4 = insertelement <4 x i32> %3, i32 %y, i32 3
  ret <4 x i32> %4
}

; Near-splat: lanes 0-2 of the <4 x float> result hold %x and lane 3 holds
; %y. FileCheck still requires a vdup.32 in the output.
define <4 x float> @tdupf(float %x, float %y) {
;CHECK-LABEL: tdupf:
;CHECK: vdup.32
  %1 = insertelement <4 x float> undef, float %x, i32 0
  %2 = insertelement <4 x float> %1, float %x, i32 1
  %3 = insertelement <4 x float> %2, float %x, i32 2
  %4 = insertelement <4 x float> %3, float %y, i32 3
  ret <4 x float> %4
}

; This test checks that when splatting an element from a vector into another,
; the value isn't moved out to GPRs first.
; Lane 1 of %invec fills lanes 0-2 of the result; lane 3 is the constant 255.
; The output must contain a vdup.32 reading d16[1], and no vmov reading
; d16[1] before it.
define <4 x i32> @tduplane(<4 x i32> %invec) {
;CHECK-LABEL: tduplane:
;CHECK-NOT: vmov {{.*}}, d16[1]
;CHECK: vdup.32 {{.*}}, d16[1]
  %in = extractelement <4 x i32> %invec, i32 1
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = insertelement <4 x i32> %1, i32 %in, i32 1
  %3 = insertelement <4 x i32> %2, i32 %in, i32 2
  %4 = insertelement <4 x i32> %3, i32 255, i32 3
  ret <4 x i32> %4
}

; Narrowing lane splat: lane 3 of the <4 x float> input is written to both
; lanes of a <2 x float>. FileCheck requires a vdup.32 that reads lane [1] of
; a d register (lane 3 of the 128-bit input is lane 1 of its upper 64 bits).
define <2 x float> @check_f32(<4 x float> %v) nounwind {
;CHECK-LABEL: check_f32:
;CHECK: vdup.32 {{.*}}, d{{..}}[1]
  %x = extractelement <4 x float> %v, i32 3
  %1 = insertelement  <2 x float> undef, float %x, i32 0
  %2 = insertelement  <2 x float> %1, float %x, i32 1
  ret <2 x float> %2
}

; Narrowing lane splat: lane 3 of the <4 x i32> input is written to both
; lanes of a <2 x i32>. FileCheck requires a vdup.32 that reads lane [1] of a
; d register (lane 3 of the 128-bit input is lane 1 of its upper 64 bits).
define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
;CHECK-LABEL: check_i32:
;CHECK: vdup.32 {{.*}}, d{{..}}[1]
  %x = extractelement <4 x i32> %v, i32 3
  %1 = insertelement  <2 x i32> undef, i32 %x, i32 0
  %2 = insertelement  <2 x i32> %1, i32 %x, i32 1
  ret <2 x i32> %2
}

; Narrowing lane splat: lane 3 of the <8 x i16> input is written to lanes 0
; and 1 of a <4 x i16>; lanes 2 and 3 stay undef. FileCheck requires a
; vdup.16 that reads lane [3] of a d register.
define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
;CHECK-LABEL: check_i16:
;CHECK: vdup.16 {{.*}}, d{{..}}[3]
  %x = extractelement <8 x i16> %v, i32 3
  %1 = insertelement  <4 x i16> undef, i16 %x, i32 0
  %2 = insertelement  <4 x i16> %1, i16 %x, i32 1
  ret <4 x i16> %2
}

; Narrowing lane splat: lane 3 of the <16 x i8> input is written to lanes 0
; and 1 of an <8 x i8>; the remaining lanes stay undef. FileCheck requires a
; vdup.8 that reads lane [3] of a d register.
define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
;CHECK-LABEL: check_i8:
;CHECK: vdup.8 {{.*}}, d{{..}}[3]
  %x = extractelement <16 x i8> %v, i32 3
  %1 = insertelement  <8  x i8> undef, i8 %x, i32 0
  %2 = insertelement  <8  x i8> %1, i8 %x, i32 1
  ret <8 x i8> %2
}

; Check that an SPR splat produces a vdup.

; %q is converted from i16 to float, splatted across a <2 x float>, and %p is
; subtracted from the splat. FileCheck requires a vdup.32 whose first operand
; is a d register.
define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
;CHECK-LABEL: check_spr_splat2:
;CHECK: vdup.32 d
  %conv = sitofp i16 %q to float
  %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
  %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
  %sub = fsub <2 x float> %splat.splat, %p
  ret <2 x float> %sub
}

; Four-lane version of check_spr_splat2: %q is converted from i16 to float,
; splatted across a <4 x float>, and %p is subtracted from the splat. Here
; FileCheck requires a vld1.16 rather than a vdup.
define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
;CHECK-LABEL: check_spr_splat4:
;CHECK: vld1.16
  %conv = sitofp i16 %q to float
  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %sub = fsub <4 x float> %splat.splat, %p
  ret <4 x float> %sub
}
; Same codegen as above test; scalar is splatted using vld1, so shuffle index is irrelevant.
; The converted scalar is inserted at lane 1 and the shuffle mask selects
; lane 1 for every result lane; FileCheck requires a vld1.16.
define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
;CHECK-LABEL: check_spr_splat4_lane1:
;CHECK: vld1.16
  %conv = sitofp i16 %q to float
  %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %sub = fsub <4 x float> %splat.splat, %p
  ret <4 x float> %sub
}

; Also make sure we don't crash on variable-index extractelts, where we almost
; could have generated a vdup.
; The element at the runtime index %idx of the <16 x i8> input is written to
; lanes 0 and 1 of an <8 x i8>. The expected sequence goes through memory:
; the index is loaded relative to a copy of sp, the vector is stored with
; vst1.64 at another copy of sp, and the byte is reloaded with a vld1.8 using
; the all-lanes ([]) register form.
define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
; CHECK-LABEL: check_i8_varidx:
; CHECK: mov r[[FP:[0-9]+]], sp
; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4]
; CHECK: mov r[[SPCOPY:[0-9]+]], sp
; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[IDX]]
; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]]
  %x = extractelement <16 x i8> %v, i32 %idx
  %1 = insertelement  <8 x i8> undef, i8 %x, i32 0
  %2 = insertelement  <8 x i8> %1, i8 %x, i32 1
  ret <8 x i8> %2
}
