; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s

; mul of two loaded <8 x i8> (64-bit) vectors is expected to select vmul.i8.
define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vmuli8:
; CHECK: vmul.i8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = mul <8 x i8> %tmp1, %tmp2
  ret <8 x i8> %tmp3
}

; mul of two loaded <4 x i16> (64-bit) vectors is expected to select vmul.i16.
define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vmuli16:
; CHECK: vmul.i16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = mul <4 x i16> %tmp1, %tmp2
  ret <4 x i16> %tmp3
}

; mul of two loaded <2 x i32> (64-bit) vectors is expected to select vmul.i32.
define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vmuli32:
; CHECK: vmul.i32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = mul <2 x i32> %tmp1, %tmp2
  ret <2 x i32> %tmp3
}

; fmul of two loaded <2 x float> (64-bit) vectors is expected to select vmul.f32.
define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vmulf32:
; CHECK: vmul.f32
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = fmul <2 x float> %tmp1, %tmp2
  ret <2 x float> %tmp3
}

; The llvm.arm.neon.vmulp.v8i8 intrinsic is expected to select vmul.p8.
define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vmulp8:
; CHECK: vmul.p8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}

; mul of two loaded <16 x i8> (128-bit) vectors is expected to select vmul.i8.
define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vmulQi8:
; CHECK: vmul.i8
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = mul <16 x i8> %tmp1, %tmp2
  ret <16 x i8> %tmp3
}

; mul of two loaded <8 x i16> (128-bit) vectors is expected to select vmul.i16.
define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vmulQi16:
; CHECK: vmul.i16
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = mul <8 x i16> %tmp1, %tmp2
  ret <8 x i16> %tmp3
}

; mul of two loaded <4 x i32> (128-bit) vectors is expected to select vmul.i32.
define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vmulQi32:
; CHECK: vmul.i32
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = mul <4 x i32> %tmp1, %tmp2
  ret <4 x i32> %tmp3
}

; fmul of two loaded <4 x float> (128-bit) vectors is expected to select vmul.f32.
define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vmulQf32:
; CHECK: vmul.f32
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = fmul <4 x float> %tmp1, %tmp2
  ret <4 x float> %tmp3
}

; The llvm.arm.neon.vmulp.v16i8 intrinsic is expected to select vmul.p8.
define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vmulQp8:
; CHECK: vmul.p8
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}

; vmulp intrinsics called by the vmulp8 and vmulQp8 tests.
declare <8 x i8>  @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8>  @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

; fmul by a splat of lane 0 of the second argument is expected to use the
; by-lane operand form of vmul.f32 instead of materialising the splat.
define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmul_lanef32:
; CHECK: vmul.f32 d0, d0, d1[0]
  %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1]
  %1 = fmul <2 x float> %0, %arg0_float32x2_t     ; <<2 x float>> [#uses=1]
  ret <2 x float> %1
}

; mul by a splat of lane 1 of the second argument is expected to use the
; by-lane operand form of vmul.i16.
define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmul_lanes16:
; CHECK: vmul.i16 d0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = mul <4 x i16> %0, %arg0_int16x4_t          ; <<4 x i16>> [#uses=1]
  ret <4 x i16> %1
}

; mul by a splat of lane 1 of the second argument is expected to use the
; by-lane operand form of vmul.i32.
define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmul_lanes32:
; CHECK: vmul.i32 d0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = mul <2 x i32> %0, %arg0_int32x2_t          ; <<2 x i32>> [#uses=1]
  ret <2 x i32> %1
}

; 128-bit fmul by a lane-1 splat widened from a <2 x float> argument is
; expected to use the by-lane operand form of vmul.f32.
define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmulQ_lanef32:
; CHECK: vmul.f32 q0, q0, d2[1]
  %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
  %1 = fmul <4 x float> %0, %arg0_float32x4_t     ; <<4 x float>> [#uses=1]
  ret <4 x float> %1
}

; 128-bit mul by a lane-1 splat widened from a <4 x i16> argument is
; expected to use the by-lane operand form of vmul.i16.
define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmulQ_lanes16:
; CHECK: vmul.i16 q0, q0, d2[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = mul <8 x i16> %0, %arg0_int16x8_t          ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %1
}

; 128-bit mul by a lane-1 splat widened from a <2 x i32> argument is
; expected to use the by-lane operand form of vmul.i32.
define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmulQ_lanes32:
; CHECK: vmul.i32 q0, q0, d2[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
  %1 = mul <4 x i32> %0, %arg0_int32x4_t          ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %1
}

; sext of both <8 x i8> operands followed by an <8 x i16> mul is expected to
; select the widening vmull.s8.
define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vmulls8:
; CHECK: vmull.s8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; The llvm.arm.neon.vmulls.v8i16 intrinsic is expected to select vmull.s8.
define <8 x i16> @vmulls8_int(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vmulls8_int:
; CHECK: vmull.s8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

; sext of both <4 x i16> operands followed by a <4 x i32> mul is expected to
; select the widening vmull.s16.
define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vmulls16:
; CHECK: vmull.s16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; The llvm.arm.neon.vmulls.v4i32 intrinsic is expected to select vmull.s16.
define <4 x i32> @vmulls16_int(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vmulls16_int:
; CHECK: vmull.s16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

; sext of both <2 x i32> operands followed by a <2 x i64> mul is expected to
; select the widening vmull.s32.
define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vmulls32:
; CHECK: vmull.s32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; The llvm.arm.neon.vmulls.v2i64 intrinsic is expected to select vmull.s32.
define <2 x i64> @vmulls32_int(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vmulls32_int:
; CHECK: vmull.s32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

; zext of both <8 x i8> operands followed by an <8 x i16> mul is expected to
; select the widening vmull.u8.
define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vmullu8:
; CHECK: vmull.u8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

; The llvm.arm.neon.vmullu.v8i16 intrinsic is expected to select vmull.u8.
define <8 x i16> @vmullu8_int(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vmullu8_int:
; CHECK: vmull.u8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

; zext of both <4 x i16> operands followed by a <4 x i32> mul is expected to
; select the widening vmull.u16.
define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vmullu16:
; CHECK: vmull.u16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

; The llvm.arm.neon.vmullu.v4i32 intrinsic is expected to select vmull.u16.
define <4 x i32> @vmullu16_int(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vmullu16_int:
; CHECK: vmull.u16
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i32> %tmp3
}

; zext of both <2 x i32> operands followed by a <2 x i64> mul is expected to
; select the widening vmull.u32.
define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vmullu32:
; CHECK: vmull.u32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

; The llvm.arm.neon.vmullu.v2i64 intrinsic is expected to select vmull.u32.
define <2 x i64> @vmullu32_int(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vmullu32_int:
; CHECK: vmull.u32
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i64> %tmp3
}

; The llvm.arm.neon.vmullp.v8i16 intrinsic is expected to select vmull.p8.
define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vmullp8:
; CHECK: vmull.p8
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i16> %tmp3
}

; Sign-extended widening multiply by a lane-1 splat, written with sext + mul,
; is expected to use the by-lane operand form of vmull.s16.
define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmull_lanes16:
; CHECK: vmull.s16 q0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = sext <4 x i16> %arg0_int16x4_t to <4 x i32>
  %2 = sext <4 x i16> %0 to <4 x i32>
  %3 = mul <4 x i32> %1, %2
  ret <4 x i32> %3
}

; llvm.arm.neon.vmulls.v4i32 applied to a lane-1 splat is expected to use the
; by-lane operand form of vmull.s16.
define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16_int(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmull_lanes16_int:
; CHECK: vmull.s16 q0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %1
}

; Sign-extended widening multiply by a lane-1 splat, written with sext + mul,
; is expected to use the by-lane operand form of vmull.s32.
define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmull_lanes32:
; CHECK: vmull.s32 q0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = sext <2 x i32> %arg0_int32x2_t to <2 x i64>
  %2 = sext <2 x i32> %0 to <2 x i64>
  %3 = mul <2 x i64> %1, %2
  ret <2 x i64> %3
}

; llvm.arm.neon.vmulls.v2i64 applied to a lane-1 splat is expected to use the
; by-lane operand form of vmull.s32.
define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32_int(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmull_lanes32_int:
; CHECK: vmull.s32 q0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %1
}

; Zero-extended widening multiply by a lane-1 splat, written with zext + mul,
; is expected to use the by-lane operand form of vmull.u16.
define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmull_laneu16:
; CHECK: vmull.u16 q0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = zext <4 x i16> %arg0_uint16x4_t to <4 x i32>
  %2 = zext <4 x i16> %0 to <4 x i32>
  %3 = mul <4 x i32> %1, %2
  ret <4 x i32> %3
}

; llvm.arm.neon.vmullu.v4i32 applied to a lane-1 splat is expected to use the
; by-lane operand form of vmull.u16.
define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16_int(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmull_laneu16_int:
; CHECK: vmull.u16 q0, d0, d1[1]
  %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
  %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %1
}

; Zero-extended widening multiply by a lane-1 splat, written with zext + mul,
; is expected to use the by-lane operand form of vmull.u32.
define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmull_laneu32:
; CHECK: vmull.u32 q0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = zext <2 x i32> %arg0_uint32x2_t to <2 x i64>
  %2 = zext <2 x i32> %0 to <2 x i64>
  %3 = mul <2 x i64> %1, %2
  ret <2 x i64> %3
}

; llvm.arm.neon.vmullu.v2i64 applied to a lane-1 splat is expected to use the
; by-lane operand form of vmull.u32.
define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32_int(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
entry:
; CHECK-LABEL: test_vmull_laneu32_int:
; CHECK: vmull.u32 q0, d0, d1[1]
  %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
  %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %1
}

; Widening-multiply intrinsics called by the vmull tests in this file:
; vmulls, vmullu and vmullp variants.
declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

declare <8 x i16>  @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone


; Radar 8687140
; VMULL needs to recognize BUILD_VECTORs with sign/zero-extended elements.

; sext'd <8 x i8> times a constant splat of -12 (representable as a signed
; i8) is expected to select vmull.s8.
define <8 x i16> @vmull_extvec_s8(<8 x i8> %arg) nounwind {
; CHECK-LABEL: vmull_extvec_s8:
; CHECK: vmull.s8
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}

; zext'd <8 x i8> times a constant splat of 12 (representable as an unsigned
; i8) is expected to select vmull.u8.
define <8 x i16> @vmull_extvec_u8(<8 x i8> %arg) nounwind {
; CHECK-LABEL: vmull_extvec_u8:
; CHECK: vmull.u8
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

; Do not use VMULL if the BUILD_VECTOR element values are too big: -999 is
; outside the signed i8 range, so an explicit vmovl.s8 followed by a
; full-width vmul.i16 is expected instead.
define <8 x i16> @vmull_noextvec_s8(<8 x i8> %arg) nounwind {
; CHECK-LABEL: vmull_noextvec_s8:
; CHECK: vmovl.s8
; CHECK: vmul.i16
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}

; Do not use VMULL if the BUILD_VECTOR element values are too big: 999 is
; outside the unsigned i8 range, so an explicit vmovl.u8 followed by a
; full-width vmul.i16 is expected instead.
define <8 x i16> @vmull_noextvec_u8(<8 x i8> %arg) nounwind {
; CHECK-LABEL: vmull_noextvec_u8:
; CHECK: vmovl.u8
; CHECK: vmul.i16
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}

; sext'd <4 x i16> times a constant splat of -12 (representable as a signed
; i16) is expected to select vmull.s16.
define <4 x i32> @vmull_extvec_s16(<4 x i16> %arg) nounwind {
; CHECK-LABEL: vmull_extvec_s16:
; CHECK: vmull.s16
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

; zext'd <4 x i16> times a constant splat of 1234 (representable as an
; unsigned i16) is expected to select vmull.u16.
define <4 x i32> @vmull_extvec_u16(<4 x i16> %arg) nounwind {
; CHECK-LABEL: vmull_extvec_u16:
; CHECK: vmull.u16
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

; sext'd <2 x i32> times a constant splat of -1234 (representable as a signed
; i32) is expected to select vmull.s32.
define <2 x i64> @vmull_extvec_s32(<2 x i32> %arg) nounwind {
; CHECK-LABEL: vmull_extvec_s32:
; CHECK: vmull.s32
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}

; zext'd <2 x i32> times a constant splat of 1234 (representable as an
; unsigned i32) is expected to select vmull.u32.
define <2 x i64> @vmull_extvec_u32(<2 x i32> %arg) nounwind {
; CHECK-LABEL: vmull_extvec_u32:
; CHECK: vmull.u32
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}

; rdar://9197392
; (zext(hi) + zext(lo)) * zext(splat) is expected to be distributed into a
; vmull.u8 followed by a vmlal.u8 that accumulates into the same q register
; and reuses the same splat d register.
define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute:
; CHECK: vmull.u8 [[REG1:(q[0-9]+)]], d{{.*}}, [[REG2:(d[0-9]+)]]
; CHECK: vmlal.u8 [[REG1]], d{{.*}}, [[REG2]]
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  %14 = bitcast i16* %dst to i8*
  tail call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %14, <8 x i16> %13, i32 2)
  ret void
}

; NEON load/store intrinsics called by the distribute* and vmull_buildvector tests.
declare <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8*, i32) nounwind readonly

declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind

; Take advantage of the Cortex-A8 multiplier-accumulator forwarding.

; Single-member wrapper around <8 x i8>, used as the destination type of the
; distribute2 tests.
%struct.uint8x8_t = type { <8 x i8> }

; (hi + lo) * splat on <8 x i8> is expected to be distributed into vmul.i8
; followed by vmla.i8, with no vadd.i8 emitted before the multiply.
define void @distribute2(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute2:
; CHECK-NOT: vadd.i8
; CHECK: vmul.i8
; CHECK: vmla.i8
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = extractelement <2 x double> %4, i32 0
  %8 = bitcast double %7 to <8 x i8>
  %9 = add <8 x i8> %6, %8
  %10 = mul <8 x i8> %9, %2
  %11 = getelementptr inbounds %struct.uint8x8_t, %struct.uint8x8_t* %dst, i32 0, i32 0
  store <8 x i8> %10, <8 x i8>* %11, align 8
  ret void
}

; splat * (hi + lo), i.e. the add on the right-hand side of the mul, is
; expected to be distributed into vmul.i8 followed by vmla.i8 as well, with
; no vadd.i8 emitted before the multiply.
define void @distribute2_commutative(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute2_commutative:
; CHECK-NOT: vadd.i8
; CHECK: vmul.i8
; CHECK: vmla.i8
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = extractelement <2 x double> %4, i32 0
  %8 = bitcast double %7 to <8 x i8>
  %9 = add <8 x i8> %6, %8
  %10 = mul <8 x i8> %2, %9
  %11 = getelementptr inbounds %struct.uint8x8_t, %struct.uint8x8_t* %dst, i32 0, i32 0
  store <8 x i8> %10, <8 x i8>* %11, align 8
  ret void
}

; When both mul operands are the same add result, (a + b) * (a + b), the
; multiply must not be distributed: a vadd.i8 then a vmul.i8 is expected and
; no vmla.i8 afterwards.
define <8 x i8> @no_distribute(<8 x i8> %a, <8 x i8> %b) nounwind {
entry:
; CHECK-LABEL: no_distribute:
; CHECK: vadd.i8
; CHECK: vmul.i8
; CHECK-NOT: vmla.i8
  %0 = add <8 x i8> %a, %b
  %1 = mul <8 x i8> %0, %0
  ret <8 x i8> %1
}

; If one operand has a zero-extend and the other a sign-extend, vmull
; cannot be used. Here %vec is sign-extended while the constant 255 does not
; fit in a signed i8 and could only be narrowed as a zero-extended value.
define i16 @vmullWithInconsistentExtensions(<8 x i8> %vec) {
; CHECK-LABEL: vmullWithInconsistentExtensions:
; CHECK-NOT: vmull.s8
  %1 = sext <8 x i8> %vec to <8 x i16>
  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = extractelement <8 x i16> %2, i32 0
  ret i16 %3
}

; A constant build_vector created for a vmull with half-width elements must
; not introduce illegal types. <rdar://problem/11324364>
; Reduced reproducer: the only expectation is that the function is emitted;
; the instruction stream is intentionally full of undef operands and unused
; values and should not be tidied up.
define void @vmull_buildvector() nounwind optsize ssp align 2 {
; CHECK-LABEL: vmull_buildvector:
entry:
  br i1 undef, label %for.end179, label %for.body.lr.ph

for.body.lr.ph:                                   ; preds = %entry
  br label %for.body

for.cond.loopexit:                                ; preds = %for.body33, %for.body
  br i1 undef, label %for.end179, label %for.body

for.body:                                         ; preds = %for.cond.loopexit, %for.body.lr.ph
  br i1 undef, label %for.cond.loopexit, label %for.body33.lr.ph

for.body33.lr.ph:                                 ; preds = %for.body
  %.sub = select i1 undef, i32 0, i32 undef
  br label %for.body33

for.body33:                                       ; preds = %for.body33, %for.body33.lr.ph
  %add45 = add i32 undef, undef
  %vld155 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* undef, i32 1)
  %0 = load i32*, i32** undef, align 4
  %shuffle.i250 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
  %1 = bitcast <1 x i64> %shuffle.i250 to <8 x i8>
  %vmovl.i249 = zext <8 x i8> %1 to <8 x i16>
  %shuffle.i246 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
  %shuffle.i240 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> <i32 1>
  %2 = bitcast <1 x i64> %shuffle.i240 to <8 x i8>
  %3 = bitcast <16 x i8> undef to <2 x i64>
  %vmovl.i237 = zext <8 x i8> undef to <8 x i16>
  %shuffle.i234 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
  %shuffle.i226 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
  %vmovl.i225 = zext <8 x i8> undef to <8 x i16>
  %mul.i223 = mul <8 x i16> %vmovl.i249, %vmovl.i249
  %vshl_n = shl <8 x i16> %mul.i223, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  %vqsub2.i216 = tail call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vshl_n) nounwind
  %mul.i209 = mul <8 x i16> undef, <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>
  %vshr_n130 = lshr <8 x i16> undef, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vshr_n134 = lshr <8 x i16> %mul.i209, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %sub.i205 = sub <8 x i16> <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>, %vshr_n130
  %sub.i203 = sub <8 x i16> <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>, %vshr_n134
  %add.i200 = add <8 x i16> %sub.i205, <i16 96, i16 96, i16 96, i16 96, i16 96, i16 96, i16 96, i16 96>
  %add.i198 = add <8 x i16> %add.i200, %sub.i203
  %mul.i194 = mul <8 x i16> %add.i198, %vmovl.i237
  %mul.i191 = mul <8 x i16> %vshr_n130, undef
  %add.i192 = add <8 x i16> %mul.i191, %mul.i194
  %mul.i187 = mul <8 x i16> %vshr_n134, undef
  %add.i188 = add <8 x i16> %mul.i187, %add.i192
  %mul.i185 = mul <8 x i16> undef, undef
  %add.i186 = add <8 x i16> %mul.i185, undef
  %vrshr_n160 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i188, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
  %vrshr_n163 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i186, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
  %mul.i184 = mul <8 x i16> undef, %vrshr_n160
  %mul.i181 = mul <8 x i16> undef, %vmovl.i225
  %add.i182 = add <8 x i16> %mul.i181, %mul.i184
  %vrshr_n170 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i182, <8 x i16> <i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7>)
  %vqmovn1.i180 = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %vrshr_n170) nounwind
  %4 = bitcast <8 x i8> %vqmovn1.i180 to <1 x i64>
  %shuffle.i = shufflevector <1 x i64> %4, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
  %5 = bitcast <2 x i64> %shuffle.i to <16 x i8>
  store <16 x i8> %5, <16 x i8>* undef, align 16
  %add177 = add nsw i32 undef, 16
  br i1 undef, label %for.body33, label %for.cond.loopexit

for.end179:                                       ; preds = %for.cond.loopexit, %entry
  ret void
}

; Intrinsics called by the vmull_buildvector reproducer.
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone

; vmull lowering would create a zext(v4i8 load()) instead of a zextload(v4i8),
; creating an illegal type during legalization and causing an assert.
; PR15970
; Sign-extending variant. No instruction pattern is checked; the test passes
; as long as llc compiles the function and emits its label.
define void @no_illegal_types_vmull_sext(<4 x i32> %a) {
; CHECK-LABEL: no_illegal_types_vmull_sext:
entry:
  %wide.load283.i = load <4 x i8>, <4 x i8>* undef, align 1
  %0 = sext <4 x i8> %wide.load283.i to <4 x i32>
  %1 = sub nsw <4 x i32> %0, %a
  %2 = mul nsw <4 x i32> %1, %1
  %predphi290.v.i = select <4 x i1> undef, <4 x i32> undef, <4 x i32> %2
  store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4
  ret void
}
; Zero-extending counterpart of the illegal-type reproducer: a <4 x i8> load
; is zext'd to <4 x i32>, adjusted by %a and squared. No instruction pattern
; is checked; the test passes as long as llc compiles the function and emits
; its label.
define void @no_illegal_types_vmull_zext(<4 x i32> %a) {
; CHECK-LABEL: no_illegal_types_vmull_zext:
entry:
  %wide.load283.i = load <4 x i8>, <4 x i8>* undef, align 1
  %0 = zext <4 x i8> %wide.load283.i to <4 x i32>
  %1 = sub nsw <4 x i32> %0, %a
  %2 = mul nsw <4 x i32> %1, %1
  %predphi290.v.i = select <4 x i1> undef, <4 x i32> undef, <4 x i32> %2
  store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4
  ret void
}

; fmul of a loaded <4 x float> by a vector built from four inserts of the
; same scalar argument.
define void @fmul_splat(<4 x float> * %a, <4 x float>* nocapture %dst, float %tmp) nounwind {
; Look for a scalar float rather than a splat, then a vector*scalar multiply.
; CHECK-LABEL: fmul_splat:
; CHECK: vmov s0, r2
; CHECK: vmul.f32  q8, q8, d0[0]
  %tmp5 = load <4 x float>, <4 x float>* %a, align 4
  %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
  %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
  %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
  %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
  %tmp10 = fmul <4 x float> %tmp9, %tmp5
  store <4 x float> %tmp10, <4 x float>* %dst, align 4
  ret void
}

; fmul of a loaded <4 x float> by a vector built from four inserts of the
; same scalar loaded from %src.
define void @fmul_splat_load(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
; Look for doing a normal scalar FP load rather than a to-all-lanes load,
; then a vector*scalar multiply.
; FIXME: Temporarily broken due to splat representation changes; the lines
; below pin the current to-all-lanes load plus full vector multiply.
; CHECK-LABEL: fmul_splat_load:
; CHECK: vld1.32 {d18[], d19[]}, [r2:32]
; CHECK: vmul.f32  q8, q9, q8
  %tmp = load float, float* %src, align 4
  %tmp5 = load <4 x float>, <4 x float>* %a, align 4
  %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
  %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
  %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
  %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
  %tmp10 = fmul <4 x float> %tmp9, %tmp5
  store <4 x float> %tmp10, <4 x float>* %dst, align 4
  ret void
}
