1; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
2
3
; smull8h: both <8 x i8> operands come from memory and are passed to the
; smull.v8i16 intrinsic; llc output must contain smull.8h.
define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: smull8h:
;CHECK: smull.8h
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %wide = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i16> %wide
}
12
; smull4s: both <4 x i16> operands come from memory and are passed to the
; smull.v4i32 intrinsic; llc output must contain smull.4s.
define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: smull4s:
;CHECK: smull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %wide = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i32> %wide
}
21
; smull2d: both <2 x i32> operands come from memory and are passed to the
; smull.v2i64 intrinsic; llc output must contain smull.2d.
define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: smull2d:
;CHECK: smull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %wide = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i64> %wide
}
30
; smull intrinsic declarations used by the tests in this file.
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) readnone nounwind
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) readnone nounwind
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) readnone nounwind
34
; umull8h: both <8 x i8> operands come from memory and are passed to the
; umull.v8i16 intrinsic; llc output must contain umull.8h.
define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: umull8h:
;CHECK: umull.8h
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %wide = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i16> %wide
}
43
; umull4s: both <4 x i16> operands come from memory and are passed to the
; umull.v4i32 intrinsic; llc output must contain umull.4s.
define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: umull4s:
;CHECK: umull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %wide = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i32> %wide
}
52
; umull2d: both <2 x i32> operands come from memory and are passed to the
; umull.v2i64 intrinsic; llc output must contain umull.2d.
define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: umull2d:
;CHECK: umull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %wide = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i64> %wide
}
61
; umull intrinsic declarations used by the tests in this file.
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) readnone nounwind
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) readnone nounwind
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) readnone nounwind
65
; sqdmull4s: both <4 x i16> operands come from memory and are passed to the
; sqdmull.v4i32 intrinsic; llc output must contain sqdmull.4s.
define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull4s:
;CHECK: sqdmull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i32> %wide
}
74
; sqdmull2d: both <2 x i32> operands come from memory and are passed to the
; sqdmull.v2i64 intrinsic; llc output must contain sqdmull.2d.
define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull2d:
;CHECK: sqdmull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %wide = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i64> %wide
}
83
; sqdmull2_4s: elements 4..7 of each loaded <8 x i16> are extracted with a
; shufflevector and passed to sqdmull.v4i32; llc output must contain
; sqdmull2.4s.
define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull2_4s:
;CHECK: sqdmull2.4s
  %fullA = load <8 x i16>, <8 x i16>* %A
  %fullB = load <8 x i16>, <8 x i16>* %B
  %highA = shufflevector <8 x i16> %fullA, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %highB = shufflevector <8 x i16> %fullB, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %highA, <4 x i16> %highB)
  ret <4 x i32> %wide
}
94
; sqdmull2_2d: elements 2..3 of each loaded <4 x i32> are extracted with a
; shufflevector and passed to sqdmull.v2i64; llc output must contain
; sqdmull2.2d.
define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull2_2d:
;CHECK: sqdmull2.2d
  %fullA = load <4 x i32>, <4 x i32>* %A
  %fullB = load <4 x i32>, <4 x i32>* %B
  %highA = shufflevector <4 x i32> %fullA, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %highB = shufflevector <4 x i32> %fullB, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %wide = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %highA, <2 x i32> %highB)
  ret <2 x i64> %wide
}
105
106
; sqdmull intrinsic declarations used by the tests in this file.
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) readnone nounwind
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) readnone nounwind
109
; pmull8h: both <8 x i8> operands come from memory and are passed to the
; pmull.v8i16 intrinsic; llc output must contain pmull.8h.
define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: pmull8h:
;CHECK: pmull.8h
  %lhs = load <8 x i8>, <8 x i8>* %A
  %rhs = load <8 x i8>, <8 x i8>* %B
  %wide = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i16> %wide
}
118
; pmull intrinsic declaration used by pmull8h.
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) readnone nounwind
120
; sqdmulh_4h: two loaded <4 x i16> operands go through sqdmulh.v4i16;
; llc output must contain sqdmulh.4h.
define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_4h:
;CHECK: sqdmulh.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %high = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i16> %high
}
129
; sqdmulh_8h: two loaded <8 x i16> operands go through sqdmulh.v8i16;
; llc output must contain sqdmulh.8h.
define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_8h:
;CHECK: sqdmulh.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %rhs = load <8 x i16>, <8 x i16>* %B
  %high = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %high
}
138
; sqdmulh_2s: two loaded <2 x i32> operands go through sqdmulh.v2i32;
; llc output must contain sqdmulh.2s.
define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmulh_2s:
;CHECK: sqdmulh.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %high = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %high
}
147
; sqdmulh_4s: two loaded <4 x i32> operands go through sqdmulh.v4i32;
; llc output must contain sqdmulh.4s.
define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmulh_4s:
;CHECK: sqdmulh.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %rhs = load <4 x i32>, <4 x i32>* %B
  %high = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %high
}
156
; sqdmulh_1s: scalar i32 form; two loaded i32 values go through sqdmulh.i32.
; The expected instruction writes s0 and reads two s-registers.
define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
;CHECK-LABEL: sqdmulh_1s:
;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
  %lhs = load i32, i32* %A
  %rhs = load i32, i32* %B
  %high = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %lhs, i32 %rhs)
  ret i32 %high
}
165
; sqdmulh intrinsic declarations (vector and scalar i32 forms) used by the
; tests in this file.
declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) readnone nounwind
declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) readnone nounwind
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) readnone nounwind
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) readnone nounwind
declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) readnone nounwind
171
; sqrdmulh_4h: two loaded <4 x i16> operands go through sqrdmulh.v4i16;
; llc output must contain sqrdmulh.4h.
define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_4h:
;CHECK: sqrdmulh.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %high = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i16> %high
}
180
; sqrdmulh_8h: two loaded <8 x i16> operands go through sqrdmulh.v8i16;
; llc output must contain sqrdmulh.8h.
define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_8h:
;CHECK: sqrdmulh.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %rhs = load <8 x i16>, <8 x i16>* %B
  %high = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %high
}
189
; sqrdmulh_2s: two loaded <2 x i32> operands go through sqrdmulh.v2i32;
; llc output must contain sqrdmulh.2s.
define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_2s:
;CHECK: sqrdmulh.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %high = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %high
}
198
; sqrdmulh_4s: two loaded <4 x i32> operands go through sqrdmulh.v4i32;
; llc output must contain sqrdmulh.4s.
define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_4s:
;CHECK: sqrdmulh.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %rhs = load <4 x i32>, <4 x i32>* %B
  %high = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %high
}
207
; sqrdmulh_1s: scalar i32 form; two loaded i32 values go through
; sqrdmulh.i32. The expected instruction writes s0 and reads two s-registers.
define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
;CHECK-LABEL: sqrdmulh_1s:
;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
  %lhs = load i32, i32* %A
  %rhs = load i32, i32* %B
  %high = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %lhs, i32 %rhs)
  ret i32 %high
}
216
; sqrdmulh intrinsic declarations (vector and scalar i32 forms) used by the
; tests in this file.
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) readnone nounwind
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) readnone nounwind
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) readnone nounwind
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) readnone nounwind
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) readnone nounwind
222
; fmulx_2s: two loaded <2 x float> operands go through fmulx.v2f32;
; llc output must contain fmulx.2s.
define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_2s:
;CHECK: fmulx.2s
  %lhs = load <2 x float>, <2 x float>* %A
  %rhs = load <2 x float>, <2 x float>* %B
  %prod = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
  ret <2 x float> %prod
}
231
; fmulx_4s: two loaded <4 x float> operands go through fmulx.v4f32;
; llc output must contain fmulx.4s.
define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_4s:
;CHECK: fmulx.4s
  %lhs = load <4 x float>, <4 x float>* %A
  %rhs = load <4 x float>, <4 x float>* %B
  %prod = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
  ret <4 x float> %prod
}
240
; fmulx_2d: two loaded <2 x double> operands go through fmulx.v2f64;
; llc output must contain fmulx.2d.
define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fmulx_2d:
;CHECK: fmulx.2d
  %lhs = load <2 x double>, <2 x double>* %A
  %rhs = load <2 x double>, <2 x double>* %B
  %prod = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
  ret <2 x double> %prod
}
249
; fmulx intrinsic declarations used by the tests in this file.
declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) readnone nounwind
declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) readnone nounwind
declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) readnone nounwind
253
; smlal4s: the smull.v4i32 result is added (plain `add`) to the accumulator
; loaded from %C; llc output must contain smlal.4s.
define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlal4s:
;CHECK: smlal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %wide = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %sum = add <4 x i32> %acc, %wide
  ret <4 x i32> %sum
}
264
; smlal2d: the smull.v2i64 result is added (plain `add`) to the accumulator
; loaded from %C; llc output must contain smlal.2d.
define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: smlal2d:
;CHECK: smlal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %wide = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %sum = add <2 x i64> %acc, %wide
  ret <2 x i64> %sum
}
275
; smlsl4s: the smull.v4i32 result is subtracted (plain `sub`) from the
; accumulator loaded from %C; llc output must contain smlsl.4s.
define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlsl4s:
;CHECK: smlsl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %wide = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %diff = sub <4 x i32> %acc, %wide
  ret <4 x i32> %diff
}
286
; smlsl2d: the smull.v2i64 result is subtracted (plain `sub`) from the
; accumulator loaded from %C; llc output must contain smlsl.2d.
define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: smlsl2d:
;CHECK: smlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %wide = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %diff = sub <2 x i64> %acc, %wide
  ret <2 x i64> %diff
}
297
; sqadd / sqsub intrinsic declarations, grouped by element type; the
; sqdmlal* / sqdmlsl* tests in this file call them on the sqdmull result.
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
302
; sqdmlal4s: sqdmull.v4i32 of the two loaded operands is combined with the
; accumulator from %C through sqadd.v4i32; llc output must contain sqdmlal.4s.
define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal4s:
;CHECK: sqdmlal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %sum = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %wide)
  ret <4 x i32> %sum
}
313
; sqdmlal2d: sqdmull.v2i64 of the two loaded operands is combined with the
; accumulator from %C through sqadd.v2i64; llc output must contain sqdmlal.2d.
define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlal2d:
;CHECK: sqdmlal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %wide = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %wide)
  ret <2 x i64> %sum
}
324
; sqdmlal2_4s: elements 4..7 of each loaded <8 x i16> feed sqdmull.v4i32,
; whose result is combined with the accumulator from %C via sqadd.v4i32;
; llc output must contain sqdmlal2.4s.
define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal2_4s:
;CHECK: sqdmlal2.4s
  %fullA = load <8 x i16>, <8 x i16>* %A
  %fullB = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %highA = shufflevector <8 x i16> %fullA, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %highB = shufflevector <8 x i16> %fullB, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %highA, <4 x i16> %highB)
  %sum = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %wide)
  ret <4 x i32> %sum
}
337
; sqdmlal2_2d: elements 2..3 of each loaded <4 x i32> feed sqdmull.v2i64,
; whose result is combined with the accumulator from %C via sqadd.v2i64;
; llc output must contain sqdmlal2.2d.
define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlal2_2d:
;CHECK: sqdmlal2.2d
  %fullA = load <4 x i32>, <4 x i32>* %A
  %fullB = load <4 x i32>, <4 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %highA = shufflevector <4 x i32> %fullA, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %highB = shufflevector <4 x i32> %fullB, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %wide = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %highA, <2 x i32> %highB)
  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %wide)
  ret <2 x i64> %sum
}
350
; sqdmlsl4s: sqdmull.v4i32 of the two loaded operands is combined with the
; accumulator from %C through sqsub.v4i32; llc output must contain sqdmlsl.4s.
define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlsl4s:
;CHECK: sqdmlsl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %diff = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %wide)
  ret <4 x i32> %diff
}
361
; sqdmlsl2d: sqdmull.v2i64 of the two loaded operands is combined with the
; accumulator from %C through sqsub.v2i64; llc output must contain sqdmlsl.2d.
define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2d:
;CHECK: sqdmlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %wide = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %diff = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %wide)
  ret <2 x i64> %diff
}
372
; sqdmlsl2_4s: elements 4..7 of each loaded <8 x i16> feed sqdmull.v4i32,
; whose result is combined with the accumulator from %C via sqsub.v4i32;
; llc output must contain sqdmlsl2.4s.
define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2_4s:
;CHECK: sqdmlsl2.4s
  %fullA = load <8 x i16>, <8 x i16>* %A
  %fullB = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %highA = shufflevector <8 x i16> %fullA, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %highB = shufflevector <8 x i16> %fullB, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %highA, <4 x i16> %highB)
  %diff = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %wide)
  ret <4 x i32> %diff
}
385
; sqdmlsl2_2d: elements 2..3 of each loaded <4 x i32> feed sqdmull.v2i64,
; whose result is combined with the accumulator from %C via sqsub.v2i64;
; llc output must contain sqdmlsl2.2d.
define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2_2d:
;CHECK: sqdmlsl2.2d
  %fullA = load <4 x i32>, <4 x i32>* %A
  %fullB = load <4 x i32>, <4 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %highA = shufflevector <4 x i32> %fullA, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %highB = shufflevector <4 x i32> %fullB, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %wide = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %highA, <2 x i32> %highB)
  %diff = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %wide)
  ret <2 x i64> %diff
}
398
; umlal4s: the umull.v4i32 result is added (plain `add`) to the accumulator
; loaded from %C; llc output must contain umlal.4s.
define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlal4s:
;CHECK: umlal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %wide = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %sum = add <4 x i32> %acc, %wide
  ret <4 x i32> %sum
}
409
; umlal2d: the umull.v2i64 result is added (plain `add`) to the accumulator
; loaded from %C; llc output must contain umlal.2d.
define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlal2d:
;CHECK: umlal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %wide = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %sum = add <2 x i64> %acc, %wide
  ret <2 x i64> %sum
}
420
; umlsl4s: the umull.v4i32 result is subtracted (plain `sub`) from the
; accumulator loaded from %C; llc output must contain umlsl.4s.
define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlsl4s:
;CHECK: umlsl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %rhs = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %wide = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
  %diff = sub <4 x i32> %acc, %wide
  ret <4 x i32> %diff
}
431
; umlsl2d: the umull.v2i64 result is subtracted (plain `sub`) from the
; accumulator loaded from %C; llc output must contain umlsl.2d.
define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlsl2d:
;CHECK: umlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %rhs = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %wide = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
  %diff = sub <2 x i64> %acc, %wide
  ret <2 x i64> %diff
}
442
; fmla_2s: llvm.fma.v2f32 on three loaded vectors (A * B + C operand order);
; llc output must contain fmla.2s.
define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
;CHECK-LABEL: fmla_2s:
;CHECK: fmla.2s
  %mulA = load <2 x float>, <2 x float>* %A
  %mulB = load <2 x float>, <2 x float>* %B
  %addend = load <2 x float>, <2 x float>* %C
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %mulA, <2 x float> %mulB, <2 x float> %addend)
  ret <2 x float> %fused
}
452
; fmla_4s: llvm.fma.v4f32 on three loaded vectors (A * B + C operand order);
; llc output must contain fmla.4s.
define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
;CHECK-LABEL: fmla_4s:
;CHECK: fmla.4s
  %mulA = load <4 x float>, <4 x float>* %A
  %mulB = load <4 x float>, <4 x float>* %B
  %addend = load <4 x float>, <4 x float>* %C
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %mulA, <4 x float> %mulB, <4 x float> %addend)
  ret <4 x float> %fused
}
462
; fmla_2d: llvm.fma.v2f64 on three loaded vectors (A * B + C operand order);
; llc output must contain fmla.2d.
define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
;CHECK-LABEL: fmla_2d:
;CHECK: fmla.2d
  %mulA = load <2 x double>, <2 x double>* %A
  %mulB = load <2 x double>, <2 x double>* %B
  %addend = load <2 x double>, <2 x double>* %C
  %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %mulA, <2 x double> %mulB, <2 x double> %addend)
  ret <2 x double> %fused
}
472
; Generic llvm.fma declarations used by the fmla / fmls tests in this file.
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) readnone nounwind
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) readnone nounwind
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) readnone nounwind
476
; fmls_2s: operand B is negated (fsub from -0.0) and used as the second fma
; multiplicand; llc output must contain fmls.2s.
define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
;CHECK-LABEL: fmls_2s:
;CHECK: fmls.2s
  %mulA = load <2 x float>, <2 x float>* %A
  %mulB = load <2 x float>, <2 x float>* %B
  %addend = load <2 x float>, <2 x float>* %C
  %negB = fsub <2 x float> <float -0.0, float -0.0>, %mulB
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %mulA, <2 x float> %negB, <2 x float> %addend)
  ret <2 x float> %fused
}
487
; fmls_4s: operand B is negated (fsub from -0.0) and used as the second fma
; multiplicand; llc output must contain fmls.4s.
define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
;CHECK-LABEL: fmls_4s:
;CHECK: fmls.4s
  %mulA = load <4 x float>, <4 x float>* %A
  %mulB = load <4 x float>, <4 x float>* %B
  %addend = load <4 x float>, <4 x float>* %C
  %negB = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mulB
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %mulA, <4 x float> %negB, <4 x float> %addend)
  ret <4 x float> %fused
}
498
; fmls_2d: operand B is negated (fsub from -0.0) and used as the second fma
; multiplicand; llc output must contain fmls.2d.
define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
;CHECK-LABEL: fmls_2d:
;CHECK: fmls.2d
  %mulA = load <2 x double>, <2 x double>* %A
  %mulB = load <2 x double>, <2 x double>* %B
  %addend = load <2 x double>, <2 x double>* %C
  %negB = fsub <2 x double> <double -0.0, double -0.0>, %mulB
  %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %mulA, <2 x double> %negB, <2 x double> %addend)
  ret <2 x double> %fused
}
509
; fmls_commuted_neg_2s: same as fmls_2s except the negated operand is the
; FIRST fma multiplicand; llc output must still contain fmls.2s.
define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
;CHECK-LABEL: fmls_commuted_neg_2s:
;CHECK: fmls.2s
  %mulA = load <2 x float>, <2 x float>* %A
  %mulB = load <2 x float>, <2 x float>* %B
  %addend = load <2 x float>, <2 x float>* %C
  %negB = fsub <2 x float> <float -0.0, float -0.0>, %mulB
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %negB, <2 x float> %mulA, <2 x float> %addend)
  ret <2 x float> %fused
}
520
; fmls_commuted_neg_4s: same as fmls_4s except the negated operand is the
; FIRST fma multiplicand; llc output must still contain fmls.4s.
define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
;CHECK-LABEL: fmls_commuted_neg_4s:
;CHECK: fmls.4s
  %mulA = load <4 x float>, <4 x float>* %A
  %mulB = load <4 x float>, <4 x float>* %B
  %addend = load <4 x float>, <4 x float>* %C
  %negB = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mulB
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %negB, <4 x float> %mulA, <4 x float> %addend)
  ret <4 x float> %fused
}
531
; fmls_commuted_neg_2d: same as fmls_2d except the negated operand is the
; FIRST fma multiplicand; llc output must still contain fmls.2d.
define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
;CHECK-LABEL: fmls_commuted_neg_2d:
;CHECK: fmls.2d
  %mulA = load <2 x double>, <2 x double>* %A
  %mulB = load <2 x double>, <2 x double>* %B
  %addend = load <2 x double>, <2 x double>* %C
  %negB = fsub <2 x double> <double -0.0, double -0.0>, %mulB
  %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %negB, <2 x double> %mulA, <2 x double> %addend)
  ret <2 x double> %fused
}
542
; fmls_indexed_2s: negates %c, splats element 0 of %b (zeroinitializer mask)
; and computes fma(-c, splat(b[0]), a); llc output must contain fmls.2s.
define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
;CHECK-LABEL: fmls_indexed_2s:
;CHECK: fmls.2s
entry:
  %negc = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
  %splat = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
  %fused = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %negc, <2 x float> %splat, <2 x float> %a)
  ret <2 x float> %fused
}
552
; fmls_indexed_4s: negates %c, splats element 0 of %b (zeroinitializer mask)
; and computes fma(-c, splat(b[0]), a); llc output must contain fmls.4s.
define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
;CHECK-LABEL: fmls_indexed_4s:
;CHECK: fmls.4s
entry:
  %negc = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
  %splat = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
  %fused = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %negc, <4 x float> %splat, <4 x float> %a)
  ret <4 x float> %fused
}
562
; fmls_indexed_2d: negates %c, splats element 0 of %b (zeroinitializer mask)
; and computes fma(-c, splat(b[0]), a); llc output must contain fmls.2d.
define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
;CHECK-LABEL: fmls_indexed_2d:
;CHECK: fmls.2d
entry:
  %negc = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
  %splat = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  %fused = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %negc, <2 x double> %splat, <2 x double> %a)
  ret <2 x double> %fused
}
572
; fmla_indexed_scalar_2s: scalar %c is inserted into both lanes of a
; <2 x float> and that splat is the first fma multiplicand, with %b as the
; second multiplicand and %a as the addend. The directives require fmla.2s on
; the line right after the function label, immediately followed by ret.
define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fmla_indexed_scalar_2s:
; CHECK-NEXT: fmla.2s
; CHECK-NEXT: ret
  %v1 = insertelement <2 x float> undef, float %c, i32 0
  %v2 = insertelement <2 x float> %v1, float %c, i32 1
  ; Pass the fully populated splat %v2 (both lanes set), as the 4s and 2d
  ; variants do; %v1 alone leaves lane 1 undef and makes %v2 dead code.
  %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a) nounwind
  ret <2 x float> %fmla1
}
583
; fmla_indexed_scalar_4s: scalar %c is inserted into all four lanes and the
; resulting splat is the first fma multiplicand (%b second, %a addend). The
; directives require fmla.4s right after the label, immediately followed by
; ret.
define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fmla_indexed_scalar_4s:
; CHECK-NEXT: fmla.4s
; CHECK-NEXT: ret
  %ins0 = insertelement <4 x float> undef, float %c, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %c, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %c, i32 2
  %splat = insertelement <4 x float> %ins2, float %c, i32 3
  %fused = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %splat, <4 x float> %b, <4 x float> %a) nounwind
  ret <4 x float> %fused
}
596
; fmla_indexed_scalar_2d: scalar %c is inserted into both lanes and the
; resulting splat is the first fma multiplicand (%b second, %a addend). The
; directives require fmla.2d right after the label, immediately followed by
; ret.
define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
; CHECK-LABEL: fmla_indexed_scalar_2d:
; CHECK-NEXT: fmla.2d
; CHECK-NEXT: ret
entry:
  %ins0 = insertelement <2 x double> undef, double %c, i32 0
  %splat = insertelement <2 x double> %ins0, double %c, i32 1
  %fused = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %splat, <2 x double> %b, <2 x double> %a) nounwind
  ret <2 x double> %fused
}
607
; mul_4h: multiplies A by a splat of element 1 of B. No `dup` may appear
; between the function label and the matched mul.4h.
define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: mul_4h:
;CHECK-NOT: dup
;CHECK: mul.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %src = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %src, <4 x i16> %src, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = mul <4 x i16> %lhs, %splat
  ret <4 x i16> %prod
}
618
; mul_8h: multiplies A by a splat of element 1 of B. No `dup` may appear
; between the function label and the matched mul.8h.
define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: mul_8h:
;CHECK-NOT: dup
;CHECK: mul.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %src = load <8 x i16>, <8 x i16>* %B
  %splat = shufflevector <8 x i16> %src, <8 x i16> %src, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %prod = mul <8 x i16> %lhs, %splat
  ret <8 x i16> %prod
}
629
; mul_2s: multiplies A by a splat of element 1 of B. No `dup` may appear
; between the function label and the matched mul.2s.
define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: mul_2s:
;CHECK-NOT: dup
;CHECK: mul.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %src = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %src, <2 x i32> %src, <2 x i32> <i32 1, i32 1>
  %prod = mul <2 x i32> %lhs, %splat
  ret <2 x i32> %prod
}
640
; mul_4s: multiplies A by a splat of element 1 of B. No `dup` may appear
; between the function label and the matched mul.4s.
define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: mul_4s:
;CHECK-NOT: dup
;CHECK: mul.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %src = load <4 x i32>, <4 x i32>* %B
  %splat = shufflevector <4 x i32> %src, <4 x i32> %src, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = mul <4 x i32> %lhs, %splat
  ret <4 x i32> %prod
}
651
; mul_2d: plain <2 x i64> multiply of the two by-value arguments. The
; directives expect the text `mul` to be matched twice in the output.
define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
; CHECK-LABEL: mul_2d:
; CHECK: mul
; CHECK: mul
  %prod = mul <2 x i64> %A, %B
  ret <2 x i64> %prod
}
659
; fmul_lane_2s: fmul of A with a splat of element 1 of B. No `dup` may
; appear between the function label and the matched fmul.2s.
define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmul_lane_2s:
;CHECK-NOT: dup
;CHECK: fmul.2s
  %lhs = load <2 x float>, <2 x float>* %A
  %src = load <2 x float>, <2 x float>* %B
  %splat = shufflevector <2 x float> %src, <2 x float> %src, <2 x i32> <i32 1, i32 1>
  %prod = fmul <2 x float> %lhs, %splat
  ret <2 x float> %prod
}
670
; fmul_lane_4s: fmul of A with a splat of element 1 of B. No `dup` may
; appear between the function label and the matched fmul.4s.
define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fmul_lane_4s:
;CHECK-NOT: dup
;CHECK: fmul.4s
  %lhs = load <4 x float>, <4 x float>* %A
  %src = load <4 x float>, <4 x float>* %B
  %splat = shufflevector <4 x float> %src, <4 x float> %src, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = fmul <4 x float> %lhs, %splat
  ret <4 x float> %prod
}
681
; fmul_lane_2d: fmul of A with a splat of element 1 of B. No `dup` may
; appear between the function label and the matched fmul.2d.
define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fmul_lane_2d:
;CHECK-NOT: dup
;CHECK: fmul.2d
  %lhs = load <2 x double>, <2 x double>* %A
  %src = load <2 x double>, <2 x double>* %B
  %splat = shufflevector <2 x double> %src, <2 x double> %src, <2 x i32> <i32 1, i32 1>
  %prod = fmul <2 x double> %lhs, %splat
  ret <2 x double> %prod
}
692
; fmul_lane_s: scalar float multiply by element 3 extracted from %vec. The
; expected instruction reads the lane directly (v1[3]) with no preceding dup.
define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
;CHECK-LABEL: fmul_lane_s:
;CHECK-NOT: dup
;CHECK: fmul.s s0, s0, v1[3]
  %elt = extractelement <4 x float> %vec, i32 3
  %prod = fmul float %A, %elt
  ret float %prod
}
701
; fmul_lane_d: scalar double multiply by element 1 extracted from %vec. The
; expected instruction reads the lane directly (v1[1]) with no preceding dup.
define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
;CHECK-LABEL: fmul_lane_d:
;CHECK-NOT: dup
;CHECK: fmul.d d0, d0, v1[1]
  %elt = extractelement <2 x double> %vec, i32 1
  %prod = fmul double %A, %elt
  ret double %prod
}
710
711
712
; fmulx_lane_2s: fmulx.v2f32 of A with a splat of element 1 of B. No `dup`
; may appear between the function label and the matched fmulx.2s.
define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_lane_2s:
;CHECK-NOT: dup
;CHECK: fmulx.2s
  %lhs = load <2 x float>, <2 x float>* %A
  %src = load <2 x float>, <2 x float>* %B
  %splat = shufflevector <2 x float> %src, <2 x float> %src, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %splat)
  ret <2 x float> %prod
}
723
; fmulx_lane_4s: fmulx.v4f32 of A with a splat of element 1 of B. No `dup`
; may appear between the function label and the matched fmulx.4s.
define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fmulx_lane_4s:
;CHECK-NOT: dup
;CHECK: fmulx.4s
  %lhs = load <4 x float>, <4 x float>* %A
  %src = load <4 x float>, <4 x float>* %B
  %splat = shufflevector <4 x float> %src, <4 x float> %src, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %splat)
  ret <4 x float> %prod
}
734
; fmulx_lane_2d: fmulx.v2f64 of A with a splat of element 1 of B. No `dup`
; may appear between the function label and the matched fmulx.2d.
define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fmulx_lane_2d:
;CHECK-NOT: dup
;CHECK: fmulx.2d
  %lhs = load <2 x double>, <2 x double>* %A
  %src = load <2 x double>, <2 x double>* %B
  %splat = shufflevector <2 x double> %src, <2 x double> %src, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %splat)
  ret <2 x double> %prod
}
745
; sqdmulh_lane_4h: sqdmulh.v4i16 of A with a splat of element 1 of B. No
; `dup` may appear between the function label and the matched sqdmulh.4h.
define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_4h:
;CHECK-NOT: dup
;CHECK: sqdmulh.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %src = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %src, <4 x i16> %src, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %high = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %splat)
  ret <4 x i16> %high
}
756
; sqdmulh_lane_8h: sqdmulh.v8i16 of A with a splat of element 1 of B. No
; `dup` may appear between the function label and the matched sqdmulh.8h.
define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_8h:
;CHECK-NOT: dup
;CHECK: sqdmulh.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %src = load <8 x i16>, <8 x i16>* %B
  %splat = shufflevector <8 x i16> %src, <8 x i16> %src, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %high = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %splat)
  ret <8 x i16> %high
}
767
; sqdmulh of %A's <2 x i32> with lane 1 of %B's vector splatted to both lanes.
; Expected output: sqdmulh.2s with no dup before it.
define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_2s:
;CHECK-NOT: dup
;CHECK: sqdmulh.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i32> %res
}
778
; sqdmulh of %A's <4 x i32> with lane 1 of %B's vector splatted to all lanes.
; Expected output: sqdmulh.4s with no dup before it.
define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmulh.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %vec = load <4 x i32>, <4 x i32>* %B
  %splat = shufflevector <4 x i32> %vec, <4 x i32> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %splat)
  ret <4 x i32> %res
}
789
; Scalar i32 sqdmulh of %A with element 1 extracted from %B. Expected output:
; the by-element scalar form (sqdmulh.s ... v<N>[1]) with no dup before it.
define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
;CHECK-LABEL: sqdmulh_lane_1s:
;CHECK-NOT: dup
;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
  %elt = extractelement <4 x i32> %B, i32 1
  %res = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %elt)
  ret i32 %res
}
798
; sqrdmulh of %A's <4 x i16> with lane 1 of %B's vector splatted to all lanes.
; Expected output: sqrdmulh.4h with no dup before it.
define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_4h:
;CHECK-NOT: dup
;CHECK: sqrdmulh.4h
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %splat)
  ret <4 x i16> %res
}
809
; sqrdmulh of %A's <8 x i16> with lane 1 of %B's vector splatted to all lanes.
; Expected output: sqrdmulh.8h with no dup before it.
define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_8h:
;CHECK-NOT: dup
;CHECK: sqrdmulh.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %vec = load <8 x i16>, <8 x i16>* %B
  %splat = shufflevector <8 x i16> %vec, <8 x i16> %vec, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %splat)
  ret <8 x i16> %res
}
820
; sqrdmulh of %A's <2 x i32> with lane 1 of %B's vector splatted to both lanes.
; Expected output: sqrdmulh.2s with no dup before it.
define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_2s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i32> %res
}
831
; sqrdmulh of %A's <4 x i32> with lane 1 of %B's vector splatted to all lanes.
; Expected output: sqrdmulh.4s with no dup before it.
define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_4s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %vec = load <4 x i32>, <4 x i32>* %B
  %splat = shufflevector <4 x i32> %vec, <4 x i32> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %splat)
  ret <4 x i32> %res
}
842
; Scalar i32 sqrdmulh of %A with element 1 extracted from %B. Expected output:
; the by-element scalar form (sqrdmulh.s ... v<N>[1]) with no dup before it.
define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_1s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
  %elt = extractelement <4 x i32> %B, i32 1
  %res = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %elt)
  ret i32 %res
}
851
; Widening sqdmull of %A's <4 x i16> with a lane-1 splat of %B's vector.
; Expected output: sqdmull.4s with no dup before it.
define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  ret <4 x i32> %res
}
862
; Widening sqdmull of %A's <2 x i32> with a lane-1 splat of %B's vector.
; Expected output: sqdmull.2d with no dup before it.
define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i64> %res
}
873
; sqdmull of lanes 4-7 of %A's <8 x i16> with a four-wide splat of lane 1 of
; %B's <8 x i16>. Expected output: sqdmull2.4s with no dup before it.
define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull2_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmull2.4s
  %a.full = load <8 x i16>, <8 x i16>* %A
  %b.full = load <8 x i16>, <8 x i16>* %B
  %a.hi = shufflevector <8 x i16> %a.full, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.splat = shufflevector <8 x i16> %b.full, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.splat)
  ret <4 x i32> %res
}
885
; sqdmull of lanes 2-3 of %A's <4 x i32> with a two-wide splat of lane 1 of
; %B's <4 x i32>. Expected output: sqdmull2.2d with no dup before it.
define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull2_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmull2.2d
  %a.full = load <4 x i32>, <4 x i32>* %A
  %b.full = load <4 x i32>, <4 x i32>* %B
  %a.hi = shufflevector <4 x i32> %a.full, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.splat = shufflevector <4 x i32> %b.full, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.splat)
  ret <2 x i64> %res
}
897
; Widening umull of %A's <4 x i16> with a lane-1 splat of %B's vector.
; Expected output: umull.4s with no dup before it.
define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: umull_lane_4s:
;CHECK-NOT: dup
;CHECK: umull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  ret <4 x i32> %res
}
908
; Widening umull of %A's <2 x i32> with a lane-1 splat of %B's vector.
; Expected output: umull.2d with no dup before it.
define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: umull_lane_2d:
;CHECK-NOT: dup
;CHECK: umull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i64> %res
}
919
; Widening smull of %A's <4 x i16> with a lane-1 splat of %B's vector.
; Expected output: smull.4s with no dup before it.
define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: smull_lane_4s:
;CHECK-NOT: dup
;CHECK: smull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  ret <4 x i32> %res
}
930
; Widening smull of %A's <2 x i32> with a lane-1 splat of %B's vector.
; Expected output: smull.2d with no dup before it.
define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: smull_lane_2d:
;CHECK-NOT: dup
;CHECK: smull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i64> %res
}
941
; smull of %A's vector with a lane-1 splat of %B's vector, added to the
; accumulator loaded from %C. Expected output: smlal.4s with no dup before it.
define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlal_lane_4s:
;CHECK-NOT: dup
;CHECK: smlal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %sum = add <4 x i32> %acc, %prod
  ret <4 x i32> %sum
}
954
; smull of %A's vector with a lane-1 splat of %B's vector, added to the
; accumulator loaded from %C. Expected output: smlal.2d with no dup before it.
define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: smlal_lane_2d:
;CHECK-NOT: dup
;CHECK: smlal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %sum = add <2 x i64> %acc, %prod
  ret <2 x i64> %sum
}
967
; sqdmull of %A's vector with a lane-1 splat of %B's vector, combined with the
; accumulator from %C through sqadd. Expected output: sqdmlal.4s, no dup before it.
define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmlal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}
980
; sqdmull of %A's vector with a lane-1 splat of %B's vector, combined with the
; accumulator from %C through sqadd. Expected output: sqdmlal.2d, no dup before it.
define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmlal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %res = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %prod)
  ret <2 x i64> %res
}
993
; sqdmull of lanes 4-7 of %A's vector with a lane-1 splat of %B's vector, then
; sqadd with the accumulator from %C. Expected output: sqdmlal2.4s, no dup before it.
define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal2_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmlal2.4s
  %a.full = load <8 x i16>, <8 x i16>* %A
  %b.full = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %a.hi = shufflevector <8 x i16> %a.full, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.splat = shufflevector <8 x i16> %b.full, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}
1007
; sqdmull of lanes 2-3 of %A's vector with a lane-1 splat of %B's vector, then
; sqadd with the accumulator from %C. Expected output: sqdmlal2.2d, no dup before it.
define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlal2_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmlal2.2d
  %a.full = load <4 x i32>, <4 x i32>* %A
  %b.full = load <4 x i32>, <4 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %a.hi = shufflevector <4 x i32> %a.full, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.splat = shufflevector <4 x i32> %b.full, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.splat)
  %res = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %prod)
  ret <2 x i64> %res
}
1021
; Scalar i32 saturating accumulate built from vector pieces: %B is inserted in
; lane 0, lane 1 of %C is shuffled into lane 0, the two are multiplied with the
; vector sqdmull, and lane 0 of the product is sqadd-ed to %A.
; Expected output contains sqdmlal.4s.
define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_1s:
;CHECK: sqdmlal.4s
  %b.vec = insertelement <4 x i16> undef, i16 %B, i32 0
  %c.lane = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %mul.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b.vec, <4 x i16> %c.lane)
  %mul = extractelement <4 x i32> %mul.vec, i32 0
  %acc = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %mul)
  ret i32 %acc
}
1032declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
1033
; Scalar i32 saturating subtract-accumulate built from vector pieces: %B is
; inserted in lane 0, lane 1 of %C is shuffled into lane 0, the two are
; multiplied with the vector sqdmull, and lane 0 of the product is sqsub-ed
; from %A. Expected output contains sqdmlsl.4s.
define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_1s:
;CHECK: sqdmlsl.4s
  %b.vec = insertelement <4 x i16> undef, i16 %B, i32 0
  %c.lane = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %mul.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b.vec, <4 x i16> %c.lane)
  %mul = extractelement <4 x i32> %mul.vec, i32 0
  %acc = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %mul)
  ret i32 %acc
}
1044declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
1045
; Scalar i64 accumulate: element 1 of %C is multiplied with %B through the
; scalar sqdmulls intrinsic and the product is sqadd-ed to %A.
; Expected output contains sqdmlal.s.
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_1d:
;CHECK: sqdmlal.s
  %elt = extractelement <2 x i32> %C, i32 1
  %mul = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %elt)
  %acc = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %mul)
  ret i64 %acc
}
1054declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
1055declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
1056
; Scalar i64 subtract-accumulate: element 1 of %C is multiplied with %B through
; the scalar sqdmulls intrinsic and the product is sqsub-ed from %A.
; Expected output contains sqdmlsl.s.
define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_1d:
;CHECK: sqdmlsl.s
  %elt = extractelement <2 x i32> %C, i32 1
  %mul = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %elt)
  %acc = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %mul)
  ret i64 %acc
}
1065declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
1066
1067
; umull of %A's vector with a lane-1 splat of %B's vector, added to the
; accumulator loaded from %C. Expected output: umlal.4s with no dup before it.
define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlal_lane_4s:
;CHECK-NOT: dup
;CHECK: umlal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %sum = add <4 x i32> %acc, %prod
  ret <4 x i32> %sum
}
1080
; umull of %A's vector with a lane-1 splat of %B's vector, added to the
; accumulator loaded from %C. Expected output: umlal.2d with no dup before it.
define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlal_lane_2d:
;CHECK-NOT: dup
;CHECK: umlal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %sum = add <2 x i64> %acc, %prod
  ret <2 x i64> %sum
}
1093
1094
; smull of %A's vector with a lane-1 splat of %B's vector, subtracted from the
; accumulator loaded from %C. Expected output: smlsl.4s with no dup before it.
define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlsl_lane_4s:
;CHECK-NOT: dup
;CHECK: smlsl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %diff = sub <4 x i32> %acc, %prod
  ret <4 x i32> %diff
}
1107
; smull of %A's vector with a lane-1 splat of %B's vector, subtracted from the
; accumulator loaded from %C. Expected output: smlsl.2d with no dup before it.
define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: smlsl_lane_2d:
;CHECK-NOT: dup
;CHECK: smlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %diff = sub <2 x i64> %acc, %prod
  ret <2 x i64> %diff
}
1120
; sqdmull of %A's vector with a lane-1 splat of %B's vector, combined with the
; accumulator from %C through sqsub. Expected output: sqdmlsl.4s, no dup before it.
define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmlsl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}
1133
; sqdmull of %A's vector with a lane-1 splat of %B's vector, combined with the
; accumulator from %C through sqsub. Expected output: sqdmlsl.2d, no dup before it.
define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %res = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %prod)
  ret <2 x i64> %res
}
1146
; sqdmull of lanes 4-7 of %A's vector with a lane-1 splat of %B's vector, then
; sqsub from the accumulator in %C. Expected output: sqdmlsl2.4s, no dup before it.
define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmlsl2.4s
  %a.full = load <8 x i16>, <8 x i16>* %A
  %b.full = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %a.hi = shufflevector <8 x i16> %a.full, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.splat = shufflevector <8 x i16> %b.full, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}
1160
; sqdmull of lanes 2-3 of %A's vector with a lane-1 splat of %B's vector, then
; sqsub from the accumulator in %C. Expected output: sqdmlsl2.2d, no dup before it.
define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmlsl2.2d
  %a.full = load <4 x i32>, <4 x i32>* %A
  %b.full = load <4 x i32>, <4 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %a.hi = shufflevector <4 x i32> %a.full, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.splat = shufflevector <4 x i32> %b.full, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.splat)
  %res = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %prod)
  ret <2 x i64> %res
}
1174
; umull of %A's vector with a lane-1 splat of %B's vector, subtracted from the
; accumulator loaded from %C. Expected output: umlsl.4s with no dup before it.
define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlsl_lane_4s:
;CHECK-NOT: dup
;CHECK: umlsl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %diff = sub <4 x i32> %acc, %prod
  ret <4 x i32> %diff
}
1187
; umull of %A's vector with a lane-1 splat of %B's vector, subtracted from the
; accumulator loaded from %C. Expected output: umlsl.2d with no dup before it.
define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlsl_lane_2d:
;CHECK-NOT: dup
;CHECK: umlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %diff = sub <2 x i64> %acc, %prod
  ret <2 x i64> %diff
}
1200
1201; Scalar FMULX
; Scalar float fmulx of the two arguments. The expected output is the
; register-register form "fmulx s0, s0, s1" on the line right after the label,
; followed directly by "ret". The directives previously lacked the hyphen in
; the NEXT suffix, so FileCheck silently ignored them and only the label was
; verified.
define float @fmulxs(float %a, float %b) nounwind {
; CHECK-LABEL: fmulxs:
; CHECK-NEXT: fmulx s0, s0, s1
  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; CHECK-NEXT: ret
  ret float %fmulx.i
}
1209
; Scalar double fmulx of the two arguments. The expected output is the
; register-register form "fmulx d0, d0, d1" on the line right after the label,
; followed directly by "ret". The directives previously lacked the hyphen in
; the NEXT suffix, so FileCheck silently ignored them and only the label was
; verified.
define double @fmulxd(double %a, double %b) nounwind {
; CHECK-LABEL: fmulxd:
; CHECK-NEXT: fmulx d0, d0, d1
  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
; CHECK-NEXT: ret
  ret double %fmulx.i
}
1217
; Scalar float fmulx of %a with element 3 extracted from %vec. The expected
; output is the by-element form "fmulx.s s0, s0, v1[3]" right after the label,
; followed directly by "ret". The directives previously lacked the hyphen in
; the NEXT suffix, so FileCheck silently ignored them and only the label was
; verified.
define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
; CHECK-LABEL: fmulxs_lane:
; CHECK-NEXT: fmulx.s s0, s0, v1[3]
  %b = extractelement <4 x float> %vec, i32 3
  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; CHECK-NEXT: ret
  ret float %fmulx.i
}
1226
; Scalar double fmulx of %a with element 1 extracted from %vec. The expected
; output is the by-element form right after the label, followed directly by
; "ret". The directives previously lacked the hyphen in the NEXT suffix, so
; FileCheck silently ignored them; the dead pattern also omitted the ".d"
; suffix that the Apple-syntax by-element form carries (compare "fmulx.s" in
; fmulxs_lane), so it is corrected at the same time.
define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
; CHECK-LABEL: fmulxd_lane:
; CHECK-NEXT: fmulx.d d0, d0, v1[1]
  %b = extractelement <2 x double> %vec, i32 1
  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
; CHECK-NEXT: ret
  ret double %fmulx.i
}
1235
1236declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
1237declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
1238
1239
; smull of lanes 8-15 of %a with lanes 8-15 of %b, both selected by
; shufflevector. Expected output: a single smull2.8h on the full registers,
; followed directly by ret.
define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: smull2_8h_simple:
; CHECK-NEXT: smull2.8h v0, v0, v1
; CHECK-NEXT: ret
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi) #2
  ret <8 x i16> %prod
}
1249
; smull where each operand is element 1 of the <2 x i64> view of its input,
; bitcast back to <8 x i8>. Expected output: smull2.8h v0, v0, v1.
define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: foo0:
; CHECK: smull2.8h v0, v0, v1
  %a.i64 = bitcast <16 x i8> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <8 x i8>
  %b.i64 = bitcast <16 x i8> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi) nounwind
  ret <8 x i16> %prod
}
1262
; smull where each operand is element 1 of the <2 x i64> view of its input,
; bitcast back to <4 x i16>. Expected output: smull2.4s v0, v0, v1.
define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: foo1:
; CHECK: smull2.4s v0, v0, v1
  %a.i64 = bitcast <8 x i16> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <4 x i16>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.hi) nounwind
  ret <4 x i32> %prod
}
1275
; smull where each operand is element 1 of the <2 x i64> view of its input,
; bitcast back to <2 x i32>. Expected output: smull2.2d v0, v0, v1.
define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: foo2:
; CHECK: smull2.2d v0, v0, v1
  %a.i64 = bitcast <4 x i32> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <2 x i32>
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.hi) nounwind
  ret <2 x i64> %prod
}
1288
; umull where each operand is element 1 of the <2 x i64> view of its input,
; bitcast back to <8 x i8>. Expected output: umull2.8h v0, v0, v1.
define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: foo3:
; CHECK: umull2.8h v0, v0, v1
  %a.i64 = bitcast <16 x i8> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <8 x i8>
  %b.i64 = bitcast <16 x i8> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi) nounwind
  ret <8 x i16> %prod
}
1301
; umull where each operand is element 1 of the <2 x i64> view of its input,
; bitcast back to <4 x i16>. Expected output: umull2.4s v0, v0, v1.
define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: foo4:
; CHECK: umull2.4s v0, v0, v1
  %a.i64 = bitcast <8 x i16> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <4 x i16>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.hi) nounwind
  ret <4 x i32> %prod
}
1314
; umull where each operand is element 1 of the <2 x i64> view of its input,
; bitcast back to <2 x i32>. Expected output: umull2.2d v0, v0, v1.
define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: foo5:
; CHECK: umull2.2d v0, v0, v1
  %a.i64 = bitcast <4 x i32> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <2 x i32>
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.hi) nounwind
  ret <2 x i64> %prod
}
1327
; smull of element 1 of %b's <2 x i64> view (bitcast to <4 x i16>) with a
; lane-1 splat of %c; %a is not referenced. Expected output: a single
; smull2.4s v0, v1, v2[1] followed directly by ret.
define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo6:
; CHECK-NEXT: smull2.4s v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
  %c.splat = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.splat) nounwind
  ret <4 x i32> %prod
}
1340
; smull of element 1 of %b's <2 x i64> view (bitcast to <2 x i32>) with a
; lane-1 splat of %c; %a is not referenced. Expected output: a single
; smull2.2d v0, v1, v2[1] followed directly by ret.
define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo7:
; CHECK-NEXT: smull2.2d v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
  %c.splat = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.splat) nounwind
  ret <2 x i64> %prod
}
1353
; umull of element 1 of %b's <2 x i64> view (bitcast to <4 x i16>) with a
; lane-1 splat of %c; %a is not referenced. Expected output: a single
; umull2.4s v0, v1, v2[1] followed directly by ret.
define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo8:
; CHECK-NEXT: umull2.4s v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
  %c.splat = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.splat) nounwind
  ret <4 x i32> %prod
}
1366
; umull of element 1 of %b's <2 x i64> view (bitcast to <2 x i32>) with a
; lane-1 splat of %c; %a is not referenced. Expected output: a single
; umull2.2d v0, v1, v2[1] followed directly by ret.
define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo9:
; CHECK-NEXT: umull2.2d v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
  %c.splat = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.splat) nounwind
  ret <2 x i64> %prod
}
1379
; smull of element 1 of the <2 x i64> views of %b and %c (as <8 x i8>), with
; the product added to %a. Expected output: smlal2.8h v0, v1, v2 followed
; directly by ret.
define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar0:
; CHECK: smlal2.8h v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <16 x i8> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
  %c.i64 = bitcast <16 x i8> %c to <2 x i64>
  %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi = bitcast <1 x i64> %c.hi64 to <8 x i8>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b.hi, <8 x i8> %c.hi) nounwind
  %sum = add <8 x i16> %prod, %a
  ret <8 x i16> %sum
}
1395
; smull of element 1 of the <2 x i64> views of %b and %c (as <4 x i16>), with
; the product added to %a. Expected output: smlal2.4s v0, v1, v2 followed
; directly by ret.
define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
; CHECK-LABEL: bar1:
; CHECK: smlal2.4s v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
  %c.i64 = bitcast <8 x i16> %c to <2 x i64>
  %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi = bitcast <1 x i64> %c.hi64 to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi) nounwind
  %sum = add <4 x i32> %prod, %a
  ret <4 x i32> %sum
}
1411
; smull of element 1 of the <2 x i64> views of %b and %c (as <2 x i32>), with
; the product added to %a. Expected output: smlal2.2d v0, v1, v2 followed
; directly by ret.
define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
; CHECK-LABEL: bar2:
; CHECK: smlal2.2d v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
  %c.i64 = bitcast <4 x i32> %c to <2 x i64>
  %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi = bitcast <1 x i64> %c.hi64 to <2 x i32>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi) nounwind
  %sum = add <2 x i64> %prod, %a
  ret <2 x i64> %sum
}
1427
; umull of element 1 of the <2 x i64> views of %b and %c (as <8 x i8>), with
; the product added to %a. Expected output: umlal2.8h v0, v1, v2 followed
; directly by ret.
define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar3:
; CHECK: umlal2.8h v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <16 x i8> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
  %c.i64 = bitcast <16 x i8> %c to <2 x i64>
  %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi = bitcast <1 x i64> %c.hi64 to <8 x i8>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b.hi, <8 x i8> %c.hi) nounwind
  %sum = add <8 x i16> %prod, %a
  ret <8 x i16> %sum
}
1443
; umull of element 1 of the <2 x i64> views of %b and %c (as <4 x i16>), with
; the product added to %a. Expected output: umlal2.4s v0, v1, v2 followed
; directly by ret.
define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
; CHECK-LABEL: bar4:
; CHECK: umlal2.4s v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
  %c.i64 = bitcast <8 x i16> %c to <2 x i64>
  %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi = bitcast <1 x i64> %c.hi64 to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi) nounwind
  %sum = add <4 x i32> %prod, %a
  ret <4 x i32> %sum
}
1459
; umull of element 1 of the <2 x i64> views of %b and %c (as <2 x i32>), with
; the product added to %a. Expected output: umlal2.2d v0, v1, v2 followed
; directly by ret.
define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
; CHECK-LABEL: bar5:
; CHECK: umlal2.2d v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
  %c.i64 = bitcast <4 x i32> %c to <2 x i64>
  %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi = bitcast <1 x i64> %c.hi64 to <2 x i32>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi) nounwind
  %sum = add <2 x i64> %prod, %a
  ret <2 x i64> %sum
}
1475
; smull.v4i32 of the upper half of %b with a splat of lane 3 of %c, added to
; %a. Expected to select the by-element form smlal2.4s ... v2[3].
define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
; CHECK-LABEL: mlal2_1:
; CHECK: smlal2.4s v0, v1, v2[3]
; CHECK-NEXT: ret
  ; Broadcast lane 3 of %c across eight lanes.
  %c.splat = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi.i64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi.i64 to <4 x i16>
  ; Upper half of the broadcast vector.
  %splat.i64 = bitcast <8 x i16> %c.splat to <2 x i64>
  %splat.hi.i64 = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %splat.hi = bitcast <1 x i64> %splat.hi.i64 to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %splat.hi) nounwind
  %acc = add <4 x i32> %prod, %a
  ret <4 x i32> %acc
}
1491
; smull.v2i64 of the upper half of %b with a splat of lane 1 of %c, added to
; %a. Expected to select the by-element form smlal2.2d ... v2[1].
define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
; CHECK-LABEL: mlal2_2:
; CHECK: smlal2.2d v0, v1, v2[1]
; CHECK-NEXT: ret
  ; Broadcast lane 1 of %c across four lanes.
  %c.splat = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi.i64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi.i64 to <2 x i32>
  ; Upper half of the broadcast vector.
  %splat.i64 = bitcast <4 x i32> %c.splat to <2 x i64>
  %splat.hi.i64 = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %splat.hi = bitcast <1 x i64> %splat.hi.i64 to <2 x i32>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %splat.hi) nounwind
  %acc = add <2 x i64> %prod, %a
  ret <2 x i64> %acc
}
1507
; umull.v4i32 of the upper half of %b with a splat of lane 2 of %c, added to
; %a. Expected to select the by-element form umlal2.4s ... v2[2].
define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
; CHECK-LABEL: mlal2_4:
; CHECK: umlal2.4s v0, v1, v2[2]
; CHECK-NEXT: ret
  ; Broadcast lane 2 of %c across eight lanes.
  %c.splat = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi.i64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi.i64 to <4 x i16>
  ; Upper half of the broadcast vector.
  %splat.i64 = bitcast <8 x i16> %c.splat to <2 x i64>
  %splat.hi.i64 = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %splat.hi = bitcast <1 x i64> %splat.hi.i64 to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %splat.hi) nounwind
  %acc = add <4 x i32> %prod, %a
  ret <4 x i32> %acc
}
1524
; umull.v2i64 of the upper half of %b with a splat of lane 0 of %c, added to
; %a. Expected to select the by-element form umlal2.2d ... v2[0].
define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
; CHECK-LABEL: mlal2_5:
; CHECK: umlal2.2d v0, v1, v2[0]
; CHECK-NEXT: ret
  ; An all-zero mask broadcasts lane 0 of %c across four lanes.
  %c.splat = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi.i64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi.i64 to <2 x i32>
  ; Upper half of the broadcast vector.
  %splat.i64 = bitcast <4 x i32> %c.splat to <2 x i64>
  %splat.hi.i64 = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %splat.hi = bitcast <1 x i64> %splat.hi.i64 to <2 x i32>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %splat.hi) nounwind
  %acc = add <2 x i64> %prod, %a
  ret <2 x i64> %acc
}
1540
; rdar://12328502
; Multiply %x by a vector built by inserting the scalar %y into both lanes.
; Expected to use the by-element fmul on lane 0 rather than a separate dup.
define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmulq_n_f64:
; CHECK-NOT: dup.2d
; CHECK: fmul.2d v0, v0, v1[0]
  %y.lane0 = insertelement <2 x double> undef, double %y, i32 0
  %y.splat = insertelement <2 x double> %y.lane0, double %y, i32 1
  %prod = fmul <2 x double> %y.splat, %x
  ret <2 x double> %prod
}
1552
; Multiply %x by a vector built by inserting the scalar %y into all four lanes.
; Expected to use the by-element fmul on lane 0 rather than a separate dup.
define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmulq_n_f32:
; CHECK-NOT: dup.4s
; CHECK: fmul.4s v0, v0, v1[0]
  %y.lane0 = insertelement <4 x float> undef, float %y, i32 0
  %y.lane1 = insertelement <4 x float> %y.lane0, float %y, i32 1
  %y.lane2 = insertelement <4 x float> %y.lane1, float %y, i32 2
  %y.splat = insertelement <4 x float> %y.lane2, float %y, i32 3
  %prod = fmul <4 x float> %y.splat, %x
  ret <4 x float> %prod
}
1565
; Multiply %x by a vector built by inserting the scalar %y into both lanes.
; Expected to use the by-element fmul on lane 0 rather than a separate dup.
define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
entry:
; CHECK-LABEL: vmul_n_f32:
; CHECK-NOT: dup.2s
; CHECK: fmul.2s v0, v0, v1[0]
  %y.lane0 = insertelement <2 x float> undef, float %y, i32 0
  %y.splat = insertelement <2 x float> %y.lane0, float %y, i32 1
  %prod = fmul <2 x float> %y.splat, %x
  ret <2 x float> %prod
}
1576
; Multiply %b by a splat of lane 6 of the 128-bit vector %c and add %a. The
; lane is expected to be used directly as the indexed operand of mla, with no
; ext instruction emitted to move it first.
define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmla_laneq_s16_test:
; CHECK-NOT: ext
; CHECK: mla.4h v0, v1, v2[6]
; CHECK-NEXT: ret
  %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %mul = mul <4 x i16> %shuffle, %b
  %add = add <4 x i16> %mul, %a
  ret <4 x i16> %add
}
1588
; Multiply %b by a splat of lane 3 of the 128-bit vector %c and add %a. The
; lane is expected to be used directly as the indexed operand of mla, with no
; ext instruction emitted to move it first.
define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmla_laneq_s32_test:
; CHECK-NOT: ext
; CHECK: mla.2s v0, v1, v2[3]
; CHECK-NEXT: ret
  %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
  %mul = mul <2 x i32> %shuffle, %b
  %add = add <2 x i32> %mul, %a
  ret <2 x i32> %add
}
1600
; The splat is built in two steps: lanes 4..7 of %c are extracted, then lane 1
; of that half is broadcast. That is lane 5 of the original %c, so the expected
; output indexes v2[5] directly with no ext instruction.
define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
; CHECK-NOT: ext
; CHECK: mla.8h v0, v1, v2[5]
; CHECK-NEXT: ret
  %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %mul = mul <8 x i16> %shuffle2, %b
  %add = add <8 x i16> %mul, %a
  ret <8 x i16> %add
}
1613
; The splat is built in two steps: lanes 2..3 of %c are extracted, then lane 1
; of that half is broadcast. That is lane 3 of the original %c, so the expected
; output indexes v2[3] directly with no ext instruction.
define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
; CHECK-NOT: ext
; CHECK: mla.4s v0, v1, v2[3]
; CHECK-NEXT: ret
  %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %mul = mul <4 x i32> %shuffle2, %b
  %add = add <4 x i32> %mul, %a
  ret <4 x i32> %add
}
1626
; smull.v4i32 of %a with a splat of lane 6 of the 128-bit vector %b. Expected
; to select the by-element smull on v1[6] with no ext instruction.
define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmull_laneq_s16_test:
; CHECK-NOT: ext
; CHECK: smull.4s v0, v0, v1[6]
; CHECK-NEXT: ret
  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
  ret <4 x i32> %vmull2.i
}
1637
; smull.v2i64 of %a with a splat of lane 2 of the 128-bit vector %b. Expected
; to select the by-element smull on v1[2] with no ext instruction.
define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmull_laneq_s32_test:
; CHECK-NOT: ext
; CHECK: smull.2d v0, v0, v1[2]
; CHECK-NEXT: ret
  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
  ret <2 x i64> %vmull2.i
}
; umull.v4i32 of %a with a splat of lane 6 of the 128-bit vector %b. Expected
; to select the by-element umull on v1[6] with no ext instruction.
define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmull_laneq_u16_test:
; CHECK-NOT: ext
; CHECK: umull.4s v0, v0, v1[6]
; CHECK-NEXT: ret
  %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
  ret <4 x i32> %vmull2.i
}
1658
; umull.v2i64 of %a with a splat of lane 2 of the 128-bit vector %b. Expected
; to select the by-element umull on v1[2] with no ext instruction.
define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmull_laneq_u32_test:
; CHECK-NOT: ext
; CHECK: umull.2d v0, v0, v1[2]
; CHECK-NEXT: ret
  %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
  ret <2 x i64> %vmull2.i
}
1669
; smull.v4i32 of the upper half of %b with a four-lane splat of %d truncated to
; i16. Expected to select smull2.4s with no ext instruction for the high half.
; %a and %c are not used by the body.
define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmull_high_n_s16_test:
; CHECK-NOT: ext
; CHECK: smull2.4s
; CHECK-NEXT: ret
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}
1687
; smull.v2i64 of the upper half of %b with a two-lane splat of %d. Expected to
; select smull2.2d with no ext instruction for the high half.
; %a and %c are not used by the body.
define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmull_high_n_s32_test:
; CHECK-NOT: ext
; CHECK: smull2.2d
; CHECK-NEXT: ret
  %0 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
  ret <2 x i64> %vmull2.i.i
}
1702
; umull.v4i32 of the upper half of %b with a four-lane splat of %d truncated to
; i16. Expected to select umull2.4s with no ext instruction for the high half.
; %a and %c are not used by the body.
define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmull_high_n_u16_test:
; CHECK-NOT: ext
; CHECK: umull2.4s
; CHECK-NEXT: ret
  %conv = trunc i32 %d to i16
  %0 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
  %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
  ret <4 x i32> %vmull2.i.i
}
1720
; umull.v2i64 of the upper half of %b with a two-lane splat of %d. Expected to
; select umull2.2d with no ext instruction for the high half.
; %a and %c are not used by the body.
define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
entry:
; The function name is matched as a label so the checks below are confined to
; this function's body.
; CHECK-LABEL: vmull_high_n_u32_test:
; CHECK-NOT: ext
; CHECK: umull2.2d
; CHECK-NEXT: ret
  %0 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
  ret <2 x i64> %vmull2.i.i
}
1735
; Lane 1 of %b is extracted and re-inserted into every lane of a fresh vector,
; which then multiplies %a. Expected to fold into a by-element mul on lane 1
; with no ins or dup instructions.
define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vmul_built_dup_test:
; CHECK-NOT: ins
; CHECK-NOT: dup
; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1]
  %lane = extractelement <4 x i32> %b, i32 1
  %dup0 = insertelement <4 x i32> undef, i32 %lane, i32 0
  %dup1 = insertelement <4 x i32> %dup0, i32 %lane, i32 1
  %dup2 = insertelement <4 x i32> %dup1, i32 %lane, i32 2
  %dup3 = insertelement <4 x i32> %dup2, i32 %lane, i32 3
  %res = mul <4 x i32> %a, %dup3
  ret <4 x i32> %res
}
1749
; Lane 3 of the 64-bit vector %b is extracted and re-inserted into every lane
; of a fresh vector, which then multiplies %a. Expected to fold into a
; by-element mul on lane 3 with no ins or dup instructions.
define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmul_built_dup_fromsmall_test:
; CHECK-NOT: ins
; CHECK-NOT: dup
; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3]
  %lane = extractelement <4 x i16> %b, i32 3
  %dup0 = insertelement <4 x i16> undef, i16 %lane, i32 0
  %dup1 = insertelement <4 x i16> %dup0, i16 %lane, i32 1
  %dup2 = insertelement <4 x i16> %dup1, i16 %lane, i32 2
  %dup3 = insertelement <4 x i16> %dup2, i16 %lane, i32 3
  %res = mul <4 x i16> %a, %dup3
  ret <4 x i16> %res
}
1763
; Lane 0 of the 64-bit vector %b is extracted and inserted into all eight lanes
; of a 128-bit vector, which then multiplies %a. Expected to fold into a
; by-element mul on lane 0 with no ins or dup instructions.
define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
; CHECK-NOT: ins
; CHECK-NOT: dup
; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
  %lane = extractelement <4 x i16> %b, i32 0
  %dup0 = insertelement <8 x i16> undef, i16 %lane, i32 0
  %dup1 = insertelement <8 x i16> %dup0, i16 %lane, i32 1
  %dup2 = insertelement <8 x i16> %dup1, i16 %lane, i32 2
  %dup3 = insertelement <8 x i16> %dup2, i16 %lane, i32 3
  %dup4 = insertelement <8 x i16> %dup3, i16 %lane, i32 4
  %dup5 = insertelement <8 x i16> %dup4, i16 %lane, i32 5
  %dup6 = insertelement <8 x i16> %dup5, i16 %lane, i32 6
  %dup7 = insertelement <8 x i16> %dup6, i16 %lane, i32 7
  %res = mul <8 x i16> %a, %dup7
  ret <8 x i16> %res
}
1781
; sqdmull.v2i64 applied to lanes 2..3 of both inputs. Expected to select
; sqdmull2.2d with no ext instruction for the high halves.
define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mull_from_two_extracts:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
  %l.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %r.hi = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %l.hi, <2 x i32> %r.hi) nounwind
  ret <2 x i64> %prod
}
1793
; sqdmull.v2i64 applied to lanes 2..3 of both inputs, then combined with %accum
; through sqadd. Expected to select sqdmlal2.2d with no ext instruction.
define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: mlal_from_two_extracts:
; CHECK-NOT: ext
; CHECK: sqdmlal2.2d
  %l.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %r.hi = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %l.hi, <2 x i32> %r.hi) nounwind
  %total = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %prod)
  ret <2 x i64> %total
}
1806
; sqdmull.v2i64 of lanes 2..3 of %lhs with a two-lane splat of the scalar %rhs.
; Expected to select sqdmull2.2d with no ext instruction.
define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: mull_from_extract_dup:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
  %r.lane0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %r.splat = insertelement <2 x i32> %r.lane0, i32 %rhs, i32 1
  %l.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %l.hi, <2 x i32> %r.splat) nounwind
  ret <2 x i64> %prod
}
1819
; pmull.v8i16 of lanes 8..15 of %lhs with an eight-lane splat of the scalar
; %rhs. Expected to select pmull2.8h with no ext instruction.
define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
; CHECK-LABEL: pmull_from_extract_dup:
; CHECK-NOT: ext
; CHECK: pmull2.8h
  ; Insert the scalar into lane 0, then broadcast that lane.
  %r.lane0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
  %r.splat = shufflevector <8 x i8> %r.lane0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %l.hi = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %l.hi, <8 x i8> %r.splat) nounwind
  ret <8 x i16> %prod
}
1832
; pmull.v8i16 of lanes 8..15 of %lhs with a broadcast of lane 0 of %rhs.
; Expected to select pmull2.8h with no ext instruction.
define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
; CHECK-LABEL: pmull_from_extract_duplane:
; CHECK-NOT: ext
; CHECK: pmull2.8h
  %l.hi = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %r.splat = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %l.hi, <8 x i8> %r.splat) nounwind
  ret <8 x i16> %prod
}
1844
; sqdmull.v2i64 of lanes 2..3 of %lhs with a two-lane broadcast of lane 0 of
; %rhs. Expected to select sqdmull2.2d with no ext instruction.
define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmull_from_extract_duplane:
; CHECK-NOT: ext
; CHECK: sqdmull2.2d
  %l.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %r.splat = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %l.hi, <2 x i32> %r.splat) nounwind
  ret <2 x i64> %prod
}
1856
; sqdmull.v2i64 of lanes 2..3 of %lhs with a two-lane broadcast of lane 0 of
; %rhs, combined with %accum through sqadd. Expected to select sqdmlal2.2d with
; no ext instruction.
define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: sqdmlal_from_extract_duplane:
; CHECK-NOT: ext
; CHECK: sqdmlal2.2d
  %l.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %r.splat = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %l.hi, <2 x i32> %r.splat) nounwind
  %total = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %prod)
  ret <2 x i64> %total
}
1869
; umull.v2i64 of lanes 2..3 of %lhs with a two-lane broadcast of lane 0 of
; %rhs, added to %accum. Expected to select umlal2.2d with no ext instruction.
define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: umlal_from_extract_duplane:
; CHECK-NOT: ext
; CHECK: umlal2.2d
  %l.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %r.splat = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %l.hi, <2 x i32> %r.splat) nounwind
  %total = add <2 x i64> %accum, %prod
  ret <2 x i64> %total
}
1882
; Scalar fma whose multiplier is lane 3 of a <4 x float>. Expected to select
; the by-element scalar fmla on v2[3].
define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
; CHECK: fmla.s s0, s1, v2[3]
  %lane = extractelement <4 x float> %rvec, i32 3
  %fused = call float @llvm.fma.f32(float %lhs, float %lane, float %accum)
  ret float %fused
}
1890
; Scalar fma whose multiplier is lane 1 of a <2 x float>. Expected to select
; the by-element scalar fmla on v2[1].
define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
; CHECK: fmla.s s0, s1, v2[1]
  %lane = extractelement <2 x float> %rvec, i32 1
  %fused = call float @llvm.fma.f32(float %lhs, float %lane, float %accum)
  ret float %fused
}
1898
; Scalar fma whose multiplier is the negation of lane 3 of a <4 x float>.
; Expected to select the by-element scalar fmls on v2[3].
define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
; CHECK: fmls.s s0, s1, v2[3]
  %lane = extractelement <4 x float> %rvec, i32 3
  ; The negation is written as a subtraction from -0.0.
  %lane.neg = fsub float -0.0, %lane
  %fused = call float @llvm.fma.f32(float %lhs, float %lane.neg, float %accum)
  ret float %fused
}
1907
; Scalar fma whose multiplier is the negation of lane 1 of a <2 x float>.
; Expected to select the by-element scalar fmls on v2[1].
define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
; CHECK: fmls.s s0, s1, v2[1]
  %lane = extractelement <2 x float> %rvec, i32 1
  ; The negation is written as a subtraction from -0.0.
  %lane.neg = fsub float -0.0, %lane
  %fused = call float @llvm.fma.f32(float %lhs, float %lane.neg, float %accum)
  ret float %fused
}
1916
1917declare float @llvm.fma.f32(float, float, float)
1918
; Scalar fma whose multiplier is lane 1 of a <2 x double>. Expected to select
; the by-element scalar fmla on v2[1].
define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
; CHECK: fmla.d d0, d1, v2[1]
  %lane = extractelement <2 x double> %rvec, i32 1
  %fused = call double @llvm.fma.f64(double %lhs, double %lane, double %accum)
  ret double %fused
}
1926
; Scalar fma whose multiplier is the negation of lane 1 of a <2 x double>.
; Expected to select the by-element scalar fmls on v2[1].
define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
; CHECK: fmls.d d0, d1, v2[1]
  %lane = extractelement <2 x double> %rvec, i32 1
  ; The negation is written as a subtraction from -0.0.
  %lane.neg = fsub double -0.0, %lane
  %fused = call double @llvm.fma.f64(double %lhs, double %lane.neg, double %accum)
  ret double %fused
}
1935
1936declare double @llvm.fma.f64(double, double, double)
1937
; The whole <4 x float> %rhs is negated before lane 3 is broadcast to two lanes
; and fed to fma. Expected to select the by-element fmls.2s on v2[3].
define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
; CHECK: fmls.2s v0, v1, v2[3]
  %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %neg.lane3 = shufflevector <4 x float> %neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %neg.lane3, <2 x float> %accum)
  ret <2 x float> %fused
}
1946
; The whole <2 x float> %rhs is negated before lane 1 is broadcast to two lanes
; and fed to fma. Expected to select the by-element fmls.2s on v2[1].
define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
; CHECK: fmls.2s v0, v1, v2[1]
  %neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %neg.lane1 = shufflevector <2 x float> %neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
  %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %neg.lane1, <2 x float> %accum)
  ret <2 x float> %fused
}
1955
; The whole <4 x float> %rhs is negated before lane 3 is broadcast to four
; lanes and fed to fma. Expected to select the by-element fmls.4s on v2[3].
define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
; CHECK: fmls.4s v0, v1, v2[3]
  %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
  %neg.lane3 = shufflevector <4 x float> %neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %neg.lane3, <4 x float> %accum)
  ret <4 x float> %fused
}
1964
; The whole <2 x float> %rhs is negated before lane 1 is broadcast to four
; lanes and fed to fma. Expected to select the by-element fmls.4s on v2[1].
define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
; CHECK: fmls.4s v0, v1, v2[1]
  %neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
  %neg.lane1 = shufflevector <2 x float> %neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %neg.lane1, <4 x float> %accum)
  ret <4 x float> %fused
}
1973
; The whole <2 x double> %rhs is negated before lane 1 is broadcast to two
; lanes and fed to fma. Expected to select the by-element fmls.2d on v2[1].
define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
; CHECK: fmls.2d v0, v1, v2[1]
  %neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
  %neg.lane1 = shufflevector <2 x double> %neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %neg.lane1, <2 x double> %accum)
  ret <2 x double> %fused
}
1982
; fmul on the single-element vector type <1 x double>; only requires that some
; fmul instruction is emitted.
define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; CHECK-LABEL: test_fmul_v1f64:
; CHECK: fmul
  %product = fmul <1 x double> %L, %R
  ret <1 x double> %product
}
1989
; fdiv on the single-element vector type <1 x double>; only requires that some
; fdiv instruction is emitted.
define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
; The instruction is matched with a plain directive; the label directive is
; reserved for the function name on the line below.
; CHECK-LABEL: test_fdiv_v1f64:
; CHECK: fdiv
  %prod = fdiv <1 x double> %L, %R
  ret <1 x double> %prod
}
1996
; Scalar sqdmulls of %A and %B, combined with %C through the i64 sqadd
; intrinsic. Expected to select a sqdmlal instruction.
define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
;CHECK-LABEL: sqdmlal_d:
;CHECK: sqdmlal
  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %total = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %prod)
  ret i64 %total
}
2004
; Scalar sqdmulls of %A and %B, combined with %C through the i64 sqsub
; intrinsic. Expected to select a sqdmlsl instruction.
define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
;CHECK-LABEL: sqdmlsl_d:
;CHECK: sqdmlsl
  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
  %diff = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %prod)
  ret i64 %diff
}
2012
; pmull64 intrinsic on two i64 scalars. Expected to select pmull.1q.
define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
; CHECK-LABEL: test_pmull_64:
; CHECK: pmull.1q
  %poly = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
  ret <16 x i8> %poly
}
2019
; pmull64 intrinsic on element 1 of each <2 x i64> input. Expected to select
; the high-half form pmull2.1q.
define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
; CHECK-LABEL: test_pmull_high_64:
; CHECK: pmull2.1q
  %l.top = extractelement <2 x i64> %l, i32 1
  %r.top = extractelement <2 x i64> %r, i32 1
  %poly = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l.top, i64 %r.top)
  ret <16 x i8> %poly
}
2028
2029declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
2030
; Integer mul on the single-element vector type <1 x i64>; only requires that
; some mul instruction is emitted.
define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
; CHECK-LABEL: test_mul_v1i64:
; CHECK: mul
  %product = mul <1 x i64> %lhs, %rhs
  ret <1 x i64> %product
}
2037