1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
3
; Unpredicated vector FMA: the accumulator %a is the fma addend, so a single
; accumulating vfma.f16 into q0 is expected (no vmov).
define arm_aapcs_vfpcc <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_vfmaq_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfma.f16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
  ret <8 x half> %0
}
13
; Same as above for f32 lanes: fma with %a as addend selects vfma.f32 into q0.
define arm_aapcs_vfpcc <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_vfmaq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfma.f32 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a)
  ret <4 x float> %0
}
23
; FMA by scalar (f16): the half value arrives coerced inside a float argument
; (bitcast + trunc + bitcast), is splatted, and should select the vector/scalar
; vfma.f16 form with the scalar moved into a GPR.
define arm_aapcs_vfpcc <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) {
; CHECK-LABEL: test_vfmaq_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vfma.f16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %c.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %.splat, <8 x half> %a)
  ret <8 x half> %2
}
39
; FMA by scalar (f32): splatted %c should select the vector/scalar vfma.f32
; form, transferring the float lane to r0 first.
define arm_aapcs_vfpcc <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) {
; CHECK-LABEL: test_vfmaq_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vfma.f32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %.splat, <4 x float> %a)
  ret <4 x float> %0
}
52
; vfmas (f16): here the splatted scalar is the fma *addend* (third operand)
; and %a/%b are the multiplicands, selecting vfmas.f16 instead of vfma.f16.
define arm_aapcs_vfpcc <8 x half> @test_vfmasq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) {
; CHECK-LABEL: test_vfmasq_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vfmas.f16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %c.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %.splat)
  ret <8 x half> %2
}
68
; vfmas (f32): splatted scalar as the fma addend selects vfmas.f32.
define arm_aapcs_vfpcc <4 x float> @test_vfmasq_n_f32(<4 x float> %a, <4 x float> %b, float %c) {
; CHECK-LABEL: test_vfmasq_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vfmas.f32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %.splat)
  ret <4 x float> %0
}
81
; vfms (f16): the fneg on one multiplicand should fold into vfms.f16; note the
; selected operand order is q2, q1 (multiplication is commutative).
define arm_aapcs_vfpcc <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_vfmsq_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfms.f16 q0, q2, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = fneg <8 x half> %c
  %1 = tail call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %0, <8 x half> %a)
  ret <8 x half> %1
}
92
; vfms (f32): fneg folded into vfms.f32, operands swapped as above.
define arm_aapcs_vfpcc <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; CHECK-LABEL: test_vfmsq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vfms.f32 q0, q2, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = fneg <4 x float> %c
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %0, <4 x float> %a)
  ret <4 x float> %1
}
103
; Integer multiply-accumulate by scalar: splat(%c) * %b + %a selects vmla.u8.
; The same .u8-suffixed encoding serves both signed and unsigned variants
; (compare test_vmlaq_n_u8 below, which expects the identical instruction).
define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vmlaq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = mul <16 x i8> %.splat, %b
  %1 = add <16 x i8> %0, %a
  ret <16 x i8> %1
}
116
; As above with 16-bit lanes: selects vmla.u16.
define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vmlaq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = mul <8 x i16> %.splat, %b
  %1 = add <8 x i16> %0, %a
  ret <8 x i16> %1
}
129
; As above with 32-bit lanes: selects vmla.u32.
define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vmlaq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = mul <4 x i32> %.splat, %b
  %1 = add <4 x i32> %0, %a
  ret <4 x i32> %1
}
142
; Unsigned variant: same vmla.u8 as the signed test (only the argument
; extension attribute differs).
define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c) {
; CHECK-LABEL: test_vmlaq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = mul <16 x i8> %.splat, %b
  %1 = add <16 x i8> %0, %a
  ret <16 x i8> %1
}
155
; Unsigned 16-bit variant: vmla.u16.
define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) {
; CHECK-LABEL: test_vmlaq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = mul <8 x i16> %.splat, %b
  %1 = add <8 x i16> %0, %a
  ret <8 x i16> %1
}
168
; Unsigned 32-bit variant: vmla.u32.
define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vmlaq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmla.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = mul <4 x i32> %.splat, %b
  %1 = add <4 x i32> %0, %a
  ret <4 x i32> %1
}
181
; vmlas: %b * %a plus the splatted scalar. vmlas.u8 accumulates into its first
; q operand (q1 here), so a vmov is needed to return the result in q0.
define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vmlasq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u8 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <16 x i8> %b, %a
  %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = add <16 x i8> %.splat, %0
  ret <16 x i8> %1
}
195
; vmlas, 16-bit lanes: vmlas.u16 into q1 plus a vmov back to q0.
define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vmlasq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u16 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <8 x i16> %b, %a
  %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = add <8 x i16> %.splat, %0
  ret <8 x i16> %1
}
209
; vmlas, 32-bit lanes: vmlas.u32 into q1 plus a vmov back to q0.
define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vmlasq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u32 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <4 x i32> %b, %a
  %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = add <4 x i32> %.splat, %0
  ret <4 x i32> %1
}
223
; Unsigned vmlas variant: identical vmlas.u8 selection to the signed test.
define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c) {
; CHECK-LABEL: test_vmlasq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u8 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <16 x i8> %b, %a
  %.splatinsert = insertelement <16 x i8> undef, i8 %c, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = add <16 x i8> %.splat, %0
  ret <16 x i8> %1
}
237
; Unsigned vmlas, 16-bit lanes: vmlas.u16.
define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) {
; CHECK-LABEL: test_vmlasq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u16 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <8 x i16> %b, %a
  %.splatinsert = insertelement <8 x i16> undef, i16 %c, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = add <8 x i16> %.splat, %0
  ret <8 x i16> %1
}
251
; Unsigned vmlas, 32-bit lanes: vmlas.u32.
define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vmlasq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlas.u32 q1, q0, r0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <4 x i32> %b, %a
  %.splatinsert = insertelement <4 x i32> undef, i32 %c, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = add <4 x i32> %.splat, %0
  ret <4 x i32> %1
}
265
; Saturating doubling multiply-accumulate intrinsic maps directly to
; vqdmlah.s8. The narrow scalar is zero-extended to i32 to match the
; intrinsic's i32 scalar operand.
define arm_aapcs_vfpcc <16 x i8> @test_vqdmlahq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vqdmlahq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlah.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = tail call <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0)
  ret <16 x i8> %1
}
276
; vqdmlah, 16-bit lanes: intrinsic maps to vqdmlah.s16.
define arm_aapcs_vfpcc <8 x i16> @test_vqdmlahq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vqdmlahq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlah.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = tail call <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0)
  ret <8 x i16> %1
}
287
; vqdmlah, 32-bit lanes: scalar is already i32, no extension needed.
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlahq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vqdmlahq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlah.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c)
  ret <4 x i32> %0
}
297
; vqdmlash: the scalar is the addend (%add); intrinsic maps to vqdmlash.s8.
define arm_aapcs_vfpcc <16 x i8> @test_vqdmlashq_n_s8(<16 x i8> %m1, <16 x i8> %m2, i8 signext %add) {
; CHECK-LABEL: test_vqdmlashq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlash.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %add to i32
  %1 = tail call <16 x i8> @llvm.arm.mve.vqdmlash.v16i8(<16 x i8> %m1, <16 x i8> %m2, i32 %0)
  ret <16 x i8> %1
}
308
; vqdmlash, 16-bit lanes: maps to vqdmlash.s16.
define arm_aapcs_vfpcc <8 x i16> @test_vqdmlashq_n_s16(<8 x i16> %m1, <8 x i16> %m2, i16 signext %add) {
; CHECK-LABEL: test_vqdmlashq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlash.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %add to i32
  %1 = tail call <8 x i16> @llvm.arm.mve.vqdmlash.v8i16(<8 x i16> %m1, <8 x i16> %m2, i32 %0)
  ret <8 x i16> %1
}
319
; vqdmlash, 32-bit lanes: maps to vqdmlash.s32.
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlashq_n_s32(<4 x i32> %m1, <4 x i32> %m2, i32 %add) {
; CHECK-LABEL: test_vqdmlashq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmlash.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlash.v4i32(<4 x i32> %m1, <4 x i32> %m2, i32 %add)
  ret <4 x i32> %0
}
329
; Rounding variant of vqdmlah: intrinsic maps to vqrdmlah.s8.
define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlahq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vqrdmlahq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlah.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = tail call <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0)
  ret <16 x i8> %1
}
340
; vqrdmlah, 16-bit lanes: maps to vqrdmlah.s16.
define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vqrdmlahq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlah.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = tail call <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0)
  ret <8 x i16> %1
}
351
; vqrdmlah, 32-bit lanes: maps to vqrdmlah.s32.
define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vqrdmlahq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlah.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c)
  ret <4 x i32> %0
}
361
; Rounding variant of vqdmlash: intrinsic maps to vqrdmlash.s8.
define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlashq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) {
; CHECK-LABEL: test_vqrdmlashq_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlash.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = tail call <16 x i8> @llvm.arm.mve.vqrdmlash.v16i8(<16 x i8> %a, <16 x i8> %b, i32 %0)
  ret <16 x i8> %1
}
372
; vqrdmlash, 16-bit lanes: maps to vqrdmlash.s16.
define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlashq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) {
; CHECK-LABEL: test_vqrdmlashq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlash.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = tail call <8 x i16> @llvm.arm.mve.vqrdmlash.v8i16(<8 x i16> %a, <8 x i16> %b, i32 %0)
  ret <8 x i16> %1
}
383
; vqrdmlash, 32-bit lanes: maps to vqrdmlash.s32.
define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlashq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) {
; CHECK-LABEL: test_vqrdmlashq_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqrdmlash.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vqrdmlash.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %c)
  ret <4 x i32> %0
}
393
; Predicated FMA: the i16 predicate mask is moved into p0 via vmsr, then the
; accumulating instruction executes under a vpst block as vfmat.f16.
define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmaq_m_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %c, <8 x half> %a, <8 x i1> %1)
  ret <8 x half> %2
}
407
; Predicated FMA, f32 lanes: vmsr + vpst + vfmat.f32.
define arm_aapcs_vfpcc <4 x float> @test_vfmaq_m_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmaq_m_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %c, <4 x float> %a, <4 x i1> %1)
  ret <4 x float> %2
}
421
; Predicated FMA by scalar (f16): coerced half extracted from the float arg,
; splatted, and selected as a predicated vector/scalar vfmat.f16.
define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) {
; CHECK-LABEL: test_vfmaq_m_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f16 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %c.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = zext i16 %p to i32
  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
  %4 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %.splat, <8 x half> %a, <8 x i1> %3)
  ret <8 x half> %4
}
441
; Predicated FMA by scalar (f32): splat of %c selects vfmat.f32 with the
; scalar in r1.
define arm_aapcs_vfpcc <4 x float> @test_vfmaq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmaq_m_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %.splat, <4 x float> %a, <4 x i1> %1)
  ret <4 x float> %2
}
458
; Predicated vfmas (f16): the splatted scalar is the addend operand of the
; predicated fma intrinsic, selecting vfmast.f16.
define arm_aapcs_vfpcc <8 x half> @test_vfmasq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) {
; CHECK-LABEL: test_vfmasq_m_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmast.f16 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %c.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = zext i16 %p to i32
  %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
  %4 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x half> %.splat, <8 x i1> %3)
  ret <8 x half> %4
}
478
; Predicated vfmas (f32): splatted scalar as addend selects vfmast.f32.
define arm_aapcs_vfpcc <4 x float> @test_vfmasq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmasq_m_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmast.f32 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %c, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x float> %.splat, <4 x i1> %1)
  ret <4 x float> %2
}
495
; Predicated vfms (f16): the fneg on %c folds into the predicated fma,
; selecting vfmst.f16 under the vpst block.
define arm_aapcs_vfpcc <8 x half> @test_vfmsq_m_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmsq_m_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmst.f16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = fneg <8 x half> %c
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %0, <8 x half> %a, <8 x i1> %2)
  ret <8 x half> %3
}
510
; Predicated vfms (f32): fneg folded into vfmst.f32.
define arm_aapcs_vfpcc <4 x float> @test_vfmsq_m_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, i16 zeroext %p) {
; CHECK-LABEL: test_vfmsq_m_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmst.f32 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = fneg <4 x float> %c
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %0, <4 x float> %a, <4 x i1> %2)
  ret <4 x float> %3
}
525
; Predicated vmla by scalar: predicated intrinsic maps to vpst + vmlat.u8
; (the mask arrives in r1 since r0 carries the scalar).
define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}
540
; Predicated vmla by scalar, 16-bit lanes: vpst + vmlat.u16.
define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}
555
; Predicated vmla by scalar, 32-bit lanes: vpst + vmlat.u32 (no scalar zext).
define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}
569
; Unsigned predicated vmla: same vmlat.u8 selection as the signed test.
define arm_aapcs_vfpcc <16 x i8> @test_vmlaq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}
584
; Unsigned predicated vmla, 16-bit lanes: vmlat.u16.
define arm_aapcs_vfpcc <8 x i16> @test_vmlaq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}
599
; Unsigned predicated vmla, 32-bit lanes: vmlat.u32.
define arm_aapcs_vfpcc <4 x i32> @test_vmlaq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlaq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlat.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}
613
; Predicated vmlas by scalar: maps to vpst + vmlast.u8. Unlike the
; unpredicated vmlas tests, the intrinsic form accumulates in place so no
; trailing vmov is needed.
define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}
628
; Predicated vmlas, 16-bit lanes: vpst + vmlast.u16.
define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}
643
; Predicated vmlas, 32-bit lanes: vpst + vmlast.u32.
define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}
657
; Unsigned predicated vmlas: same vmlast.u8 as the signed test.
define arm_aapcs_vfpcc <16 x i8> @test_vmlasq_m_n_u8(<16 x i8> %a, <16 x i8> %b, i8 zeroext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}
672
; Unsigned predicated vmlas, 16-bit lanes: vmlast.u16.
define arm_aapcs_vfpcc <8 x i16> @test_vmlasq_m_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}
687
; Unsigned predicated vmlas, 32-bit lanes: vmlast.u32.
define arm_aapcs_vfpcc <4 x i32> @test_vmlasq_m_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vmlasq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmlast.u32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}
701
; Predicated vqdmlah: intrinsic maps to vpst + vqdmlaht.s8.
define arm_aapcs_vfpcc <16 x i8> @test_vqdmlahq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlahq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlaht.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}
716
; Predicated vqdmlah, 16-bit lanes: vqdmlaht.s16.
define arm_aapcs_vfpcc <8 x i16> @test_vqdmlahq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlahq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlaht.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}
731
; Predicated vqdmlah, 32-bit lanes: vqdmlaht.s32.
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlahq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlahq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlaht.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}
745
; Predicated vqdmlash: intrinsic maps to vpst + vqdmlasht.s8.
define arm_aapcs_vfpcc <16 x i8> @test_vqdmlashq_m_n_s8(<16 x i8> %m1, <16 x i8> %m2, i8 signext %add, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlashq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlasht.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %add to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vqdmlash.predicated.v16i8.v16i1(<16 x i8> %m1, <16 x i8> %m2, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}
760
; Predicated vqdmlash, 16-bit lanes: vqdmlasht.s16.
define arm_aapcs_vfpcc <8 x i16> @test_vqdmlashq_m_n_s16(<8 x i16> %m1, <8 x i16> %m2, i16 signext %add, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlashq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlasht.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %add to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vqdmlash.predicated.v8i16.v8i1(<8 x i16> %m1, <8 x i16> %m2, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}
775
; Predicated vqdmlash, 32-bit lanes: vqdmlasht.s32.
define arm_aapcs_vfpcc <4 x i32> @test_vqdmlashq_m_n_s32(<4 x i32> %m1, <4 x i32> %m2, i32 %add, i16 zeroext %p) {
; CHECK-LABEL: test_vqdmlashq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqdmlasht.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlash.predicated.v4i32.v4i1(<4 x i32> %m1, <4 x i32> %m2, i32 %add, <4 x i1> %1)
  ret <4 x i32> %2
}
789
; Predicated rounding vqdmlah: intrinsic maps to vpst + vqrdmlaht.s8.
define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlahq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlahq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlaht.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}
804
; Predicated VQRDMLAH, i16 lanes: predicate mask moved to P0 via VMSR; the
; intrinsic must select to vqrdmlaht.s16 under VPST, accumulating into q0.
; NOTE(review): %c is signext but widened with zext; only the low 16 bits
; reach the r0 scalar operand, so this is presumably immaterial -- confirm.
define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlahq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlaht.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}
819
; Predicated VQRDMLAH, i32 lanes: the scalar operand is already i32 so no
; widening is needed; predicate moved to P0 via VMSR and the intrinsic must
; select to vqrdmlaht.s32 under VPST, accumulating into q0.
define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlahq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlaht.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}
833
; Predicated VQRDMLASH, i8 lanes: predicate mask moved to P0 via VMSR; the
; intrinsic must select to vqrdmlasht.s8 under VPST, accumulating into q0.
; NOTE(review): %c is signext but widened with zext; only the low 8 bits
; reach the r0 scalar operand, so this is presumably immaterial -- confirm.
define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlashq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlashq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlasht.s8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i8 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call <16 x i8> @llvm.arm.mve.vqrdmlash.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, i32 %0, <16 x i1> %2)
  ret <16 x i8> %3
}
848
; Predicated VQRDMLASH, i16 lanes: predicate mask moved to P0 via VMSR; the
; intrinsic must select to vqrdmlasht.s16 under VPST, accumulating into q0.
; NOTE(review): %c is signext but widened with zext; only the low 16 bits
; reach the r0 scalar operand, so this is presumably immaterial -- confirm.
define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlashq_m_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlashq_m_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlasht.s16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %c to i32
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call <8 x i16> @llvm.arm.mve.vqrdmlash.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, i32 %0, <8 x i1> %2)
  ret <8 x i16> %3
}
863
; Predicated VQRDMLASH, i32 lanes: the scalar operand is already i32 so no
; widening is needed; predicate moved to P0 via VMSR and the intrinsic must
; select to vqrdmlasht.s32 under VPST, accumulating into q0.
define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlashq_m_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c, i16 zeroext %p) {
; CHECK-LABEL: test_vqrdmlashq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vqrdmlasht.s32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.vqrdmlash.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, i32 %c, <4 x i1> %1)
  ret <4 x i32> %2
}
877
878declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
879declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
880declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
881
882declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
883declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
884declare <8 x half> @llvm.arm.mve.fma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x half>, <8 x i1>)
885declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
886declare <16 x i8> @llvm.arm.mve.vmla.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
887declare <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
888declare <4 x i32> @llvm.arm.mve.vmla.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
889declare <16 x i8> @llvm.arm.mve.vmlas.n.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
890declare <8 x i16> @llvm.arm.mve.vmlas.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
891declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
892declare <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8>, <16 x i8>, i32)
893declare <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16>, <8 x i16>, i32)
894declare <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32>, <4 x i32>, i32)
895declare <16 x i8> @llvm.arm.mve.vqdmlash.v16i8(<16 x i8>, <16 x i8>, i32)
896declare <8 x i16> @llvm.arm.mve.vqdmlash.v8i16(<8 x i16>, <8 x i16>, i32)
897declare <4 x i32> @llvm.arm.mve.vqdmlash.v4i32(<4 x i32>, <4 x i32>, i32)
898declare <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8>, <16 x i8>, i32)
899declare <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16>, <8 x i16>, i32)
900declare <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32>, <4 x i32>, i32)
901declare <16 x i8> @llvm.arm.mve.vqrdmlash.v16i8(<16 x i8>, <16 x i8>, i32)
902declare <8 x i16> @llvm.arm.mve.vqrdmlash.v8i16(<8 x i16>, <8 x i16>, i32)
903declare <4 x i32> @llvm.arm.mve.vqrdmlash.v4i32(<4 x i32>, <4 x i32>, i32)
904declare <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
905declare <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
906declare <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
907declare <16 x i8> @llvm.arm.mve.vqdmlash.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
908declare <8 x i16> @llvm.arm.mve.vqdmlash.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
909declare <4 x i32> @llvm.arm.mve.vqdmlash.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
910declare <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
911declare <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
912declare <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
913declare <16 x i8> @llvm.arm.mve.vqrdmlash.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>)
914declare <8 x i16> @llvm.arm.mve.vqrdmlash.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
915declare <4 x i32> @llvm.arm.mve.vqrdmlash.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>)
916