; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast \
; RUN:     < %s -verify-machineinstrs -asm-verbose=false | FileCheck %s

; Signed widening multiply of the upper four i16 lanes of %a by %b splatted
; across every lane. The expected output is a single full-width dup of w0
; feeding smull2, with no separate extraction of the high half.
define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
  %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  ret <4 x i32> %vmull15.i.i
}

; Same as the scalar s16 case but the multiplier is the constant splat 29,
; which is expected to be materialised with a full-width movi (.8h) so that
; smull2 can use it directly.
define <4 x i32> @test_vmull_high_n_s16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  ret <4 x i32> %vmull15.i.i
}

; Signed widening multiply of the upper two i32 lanes of %a by %b splatted
; across every lane. Expected output: dup .4s from w0, then smull2.
define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
  %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  ret <2 x i64> %vmull9.i.i
}

; Constant-multiplier variant using 511 (0x1ff). That value is expected to be
; built with the shifting-ones form of movi: #1, msl #8 == (1 << 8) | 0xff.
define <2 x i64> @test_vmull_high_n_s32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 511, i32 511>)
  ret <2 x i64> %vmull9.i.i
}

; Unsigned widening multiply of the upper four i16 lanes of %a by %b splatted
; across every lane. Expected output: dup .8h from w0, then umull2.
define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
  %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  ret <4 x i32> %vmull15.i.i
}

; Constant-multiplier variant using 4352 (0x1100), i.e. 17 shifted left by 8,
; which is expected to be encoded as movi .8h, #17, lsl #8.
define <4 x i32> @test_vmull_high_n_u16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
; CHECK-NEXT: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull15.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 4352, i16 4352, i16 4352, i16 4352>)
  ret <4 x i32> %vmull15.i.i
}

; Unsigned widening multiply of the upper two i32 lanes of %a by %b splatted
; across every lane. Expected output: dup .4s from w0, then umull2.
define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vmull_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
  %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  ret <2 x i64> %vmull9.i.i
}

; Constant-multiplier variant using 4294966784 (0xFFFFFE00), the bitwise
; inverse of 0x1ff, so the expected materialisation is mvni #1, msl #8.
define <2 x i64> @test_vmull_high_n_u32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vmull_high_n_u32_imm:
; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].4s, #1, msl #8
; CHECK-NEXT: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull9.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 4294966784, i32 4294966784>)
  ret <2 x i64> %vmull9.i.i
}

; Saturating doubling widening multiply (sqdmull intrinsic) of the upper four
; i16 lanes of %a by %b splatted across every lane. Expected output: dup .8h
; from w0, then sqdmull2.
define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
  %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  ret <4 x i32> %vqdmull15.i.i
}

; Constant-multiplier variant using 61183 (0xEEFF), the 16-bit inverse of
; 0x1100, so the expected materialisation is mvni .8h, #17, lsl #8.
define <4 x i32> @test_vqdmull_high_n_s16_imm(<8 x i16> %a) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s16_imm:
; CHECK-NEXT: mvni [[REPLICATE:v[0-9]+]].8h, #17, lsl #8
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vqdmull15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 61183, i16 61183, i16 61183, i16 61183>)
  ret <4 x i32> %vqdmull15.i.i
}

; Saturating doubling widening multiply of the upper two i32 lanes of %a by
; %b splatted across every lane. Expected output: dup .4s from w0, then
; sqdmull2.
define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
  %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  ret <2 x i64> %vqdmull9.i.i
}

; Constant-multiplier variant: the splat of 29 is expected to be built with a
; full-width movi (.4s) and consumed directly by sqdmull2.
define <2 x i64> @test_vqdmull_high_n_s32_imm(<4 x i32> %a) #0 {
; CHECK-LABEL: test_vqdmull_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vqdmull9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  ret <2 x i64> %vqdmull9.i.i
}

; Signed widening multiply-accumulate: the upper four i16 lanes of %b times
; splat(%c), added to %a. The multiply and the add are expected to fuse into a
; single smlal2 fed by a full-width dup of w0.
define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

; Constant-multiplier variant of the signed s16 multiply-accumulate: splat of
; 29 via movi .8h, then a single smlal2.
define <4 x i32> @test_vmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

; Signed widening multiply-accumulate: the upper two i32 lanes of %b times
; splat(%c), added to %a. Expected output: dup .4s from w0, then smlal2.
define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

; Constant-multiplier variant of the signed s32 multiply-accumulate: splat of
; 29 via movi .4s, then a single smlal2.
define <2 x i64> @test_vmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

; Unsigned widening multiply-accumulate: the upper four i16 lanes of %b times
; splat(%c), added to %a. Expected output: dup .8h from w0, then umlal2.
define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

; Constant-multiplier variant of the unsigned u16 multiply-accumulate: splat
; of 29 via movi .8h, then a single umlal2.
define <4 x i32> @test_vmlal_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

; Unsigned widening multiply-accumulate: the upper two i32 lanes of %b times
; splat(%c), added to %a. Expected output: dup .4s from w0, then umlal2.
define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlal_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

; Constant-multiplier variant of the unsigned u32 multiply-accumulate: splat
; of 29 via movi .4s, then a single umlal2.
define <2 x i64> @test_vmlal_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlal_high_n_u32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

; Saturating doubling multiply-accumulate: sqdmull of the upper four i16 lanes
; of %b by splat(%c), then a saturating add (sqadd) onto %a. The two intrinsic
; calls are expected to combine into a single sqdmlal2.
define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %vqdmlal17.i.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
  ret <4 x i32> %vqdmlal17.i.i
}

; Constant-multiplier variant of the s16 saturating doubling
; multiply-accumulate: splat of 29 via movi .8h, then a single sqdmlal2.
define <4 x i32> @test_vqdmlal_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vqdmlal15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %vqdmlal17.i.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
  ret <4 x i32> %vqdmlal17.i.i
}

; Saturating doubling multiply-accumulate: sqdmull of the upper two i32 lanes
; of %b by splat(%c), then sqadd onto %a. Expected output: dup .4s from w0,
; then sqdmlal2.
define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %vqdmlal11.i.i = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
  ret <2 x i64> %vqdmlal11.i.i
}

; Constant-multiplier variant of the s32 saturating doubling
; multiply-accumulate: splat of 29 via movi .4s, then a single sqdmlal2.
define <2 x i64> @test_vqdmlal_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vqdmlal_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vqdmlal9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %vqdmlal11.i.i = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
  ret <2 x i64> %vqdmlal11.i.i
}

; Signed widening multiply-subtract: the upper four i16 lanes of %b times
; splat(%c), subtracted from %a. The multiply and the sub are expected to fuse
; into a single smlsl2 fed by a full-width dup of w0.
define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
  ret <4 x i32> %sub.i.i
}

; Constant-multiplier variant of the signed s16 multiply-subtract: splat of 29
; via movi .8h, then a single smlsl2.
define <4 x i32> @test_vmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
  ret <4 x i32> %sub.i.i
}

; Signed widening multiply-subtract: the upper two i32 lanes of %b times
; splat(%c), subtracted from %a. Expected output: dup .4s from w0, then smlsl2.
define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
  ret <2 x i64> %sub.i.i
}

; Constant-multiplier variant of the signed s32 multiply-subtract: splat of 29
; via movi .4s, then a single smlsl2.
define <2 x i64> @test_vmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
  ret <2 x i64> %sub.i.i
}

; Unsigned widening multiply-subtract: the upper four i16 lanes of %b times
; splat(%c), subtracted from %a. Expected output: dup .8h from w0, then umlsl2.
define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
  ret <4 x i32> %sub.i.i
}

; Constant-multiplier variant of the unsigned u16 multiply-subtract: splat of
; 29 via movi .8h, then a single umlsl2.
define <4 x i32> @test_vmlsl_high_n_u16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
  ret <4 x i32> %sub.i.i
}

; Unsigned widening multiply-subtract: the upper two i32 lanes of %b times
; splat(%c), subtracted from %a. Expected output: dup .4s from w0, then umlsl2.
define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
  ret <2 x i64> %sub.i.i
}

; Constant-multiplier variant of the unsigned u32 multiply-subtract: splat of
; 29 via movi .4s, then a single umlsl2.
define <2 x i64> @test_vmlsl_high_n_u32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vmlsl_high_n_u32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
  ret <2 x i64> %sub.i.i
}

; Saturating doubling multiply-subtract: sqdmull of the upper four i16 lanes
; of %b by splat(%c), then a saturating subtract (sqsub) from %a. The two
; intrinsic calls are expected to combine into a single sqdmlsl2.
define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s16:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].8h, w0
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
  %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
  %vqdmlsl17.i.i = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
  ret <4 x i32> %vqdmlsl17.i.i
}

; Constant-multiplier variant of the s16 saturating doubling
; multiply-subtract: splat of 29 via movi .8h, then a single sqdmlsl2.
define <4 x i32> @test_vqdmlsl_high_n_s16_imm(<4 x i32> %a, <8 x i16> %b) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s16_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].8h, #29
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vqdmlsl15.i.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> <i16 29, i16 29, i16 29, i16 29>)
  %vqdmlsl17.i.i = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
  ret <4 x i32> %vqdmlsl17.i.i
}

; Saturating doubling multiply-subtract: sqdmull of the upper two i32 lanes of
; %b by splat(%c), then sqsub from %a. Expected output: dup .4s from w0, then
; sqdmlsl2.
define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s32:
; CHECK-NEXT: dup [[REPLICATE:v[0-9]+]].4s, w0
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
  %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
  %vqdmlsl11.i.i = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
  ret <2 x i64> %vqdmlsl11.i.i
}

; Constant-multiplier variant of the s32 saturating doubling
; multiply-subtract: splat of 29 via movi .4s, then a single sqdmlsl2.
define <2 x i64> @test_vqdmlsl_high_n_s32_imm(<2 x i64> %a, <4 x i32> %b) #0 {
; CHECK-LABEL: test_vqdmlsl_high_n_s32_imm:
; CHECK-NEXT: movi [[REPLICATE:v[0-9]+]].4s, #29
; CHECK-NEXT: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
; CHECK-NEXT: ret
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vqdmlsl9.i.i = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> <i32 29, i32 29>)
  %vqdmlsl11.i.i = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
  ret <2 x i64> %vqdmlsl11.i.i
}

; Multiply a <2 x float> vector by the scalar %b splatted to both lanes. The
; expected output is the by-element form of fmul reading lane 0 as the very
; first instruction, i.e. no separate dup for the splat.
define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
; CHECK-LABEL: test_vmul_n_f32:
; CHECK-NEXT: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <2 x float> undef, float %b, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1
  %mul.i = fmul <2 x float> %vecinit1.i, %a
  ret <2 x float> %mul.i
}

; Multiply a <4 x float> vector by the scalar %b splatted to all four lanes.
; Expected output: a single by-element fmul (.4s) reading lane 0, with no
; separate dup.
define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
; CHECK-LABEL: test_vmulq_n_f32:
; CHECK-NEXT: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <4 x float> undef, float %b, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3
  %mul.i = fmul <4 x float> %vecinit3.i, %a
  ret <4 x float> %mul.i
}

; Multiply a <2 x double> vector by the scalar %b splatted to both lanes.
; Expected output: a single by-element fmul (.2d) reading lane d[0], with no
; separate dup.
define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
; CHECK-LABEL: test_vmulq_n_f64:
; CHECK-NEXT: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <2 x double> undef, double %b, i32 0
  %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1
  %mul.i = fmul <2 x double> %vecinit1.i, %a
  ret <2 x double> %mul.i
}

; Fused multiply-add %a + %b * splat(%n) on <2 x float> via llvm.fma. Expected
; output: a single by-element fmla; the pattern accepts any lane index.
define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfma_n_f32:
; CHECK-NEXT: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
  %0 = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a)
  ret <2 x float> %0
}

; Fused multiply-add %a + %b * splat(%n) on <4 x float> via llvm.fma. Expected
; output: a single by-element fmla (.4s); the pattern accepts any lane index.
define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfmaq_n_f32:
; CHECK-NEXT: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
  %0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a)
  ret <4 x float> %0
}

; Fused multiply-subtract on <2 x float>: %b is negated (fsub from -0.0) and
; then fed to llvm.fma with splat(%n) and %a. The negation is expected to fold
; into a single by-element fmls; the pattern accepts any lane index.
define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfms_n_f32:
; CHECK-NEXT: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
  %1 = call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a)
  ret <2 x float> %1
}

; Fused multiply-subtract on <4 x float>: %b is negated (fsub from -0.0) and
; then fed to llvm.fma with splat(%n) and %a. The negation is expected to fold
; into a single by-element fmls (.4s); the pattern accepts any lane index.
define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
; CHECK-LABEL: test_vfmsq_n_f32:
; CHECK-NEXT: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
; CHECK-NEXT: ret
entry:
  %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a)
  ret <4 x float> %1
}

; Attribute group referenced as #0 by the test functions.
attributes #0 = { nounwind }

; Intrinsics called by the tests: the AArch64 NEON widening multiplies
; (smull / umull / sqdmull), the saturating add and subtract used to build the
; sqdmlal / sqdmlsl patterns, and the generic vector fma.
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
