; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
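
; The MVE VIDUP/VDDUP/VIWDUP/VDWDUP intrinsics each return a { vector, i32 }
; pair: the incrementing or decrementing (optionally wrapping) result vector
; and the updated scalar offset. The first group of tests uses only the
; vector result.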

define arm_aapcs_vfpcc <16 x i8> @test_vidupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u8 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %a, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u16 q0, r0, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %a, i32 1)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vidup.u32 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %a, i32 4)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u8 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %a, i32 2)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u16 q0, r0, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %a, i32 4)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vddup.u32 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %a, i32 2)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u8 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %a, i32 %b, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u16 q0, r0, r1, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %a, i32 %b, i32 2)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    viwdup.u32 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %a, i32 %b, i32 8)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u8 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %a, i32 %b, i32 4)
  %1 = extractvalue { <16 x i8>, i32 } %0, 0
  ret <16 x i8> %1
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u16 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %a, i32 %b, i32 8)
  %1 = extractvalue { <8 x i16>, i32 } %0, 0
  ret <8 x i16> %1
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdwdup.u32 q0, r0, r1, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %a, i32 %b, i32 1)
  %1 = extractvalue { <4 x i32>, i32 } %0, 0
  ret <4 x i32> %1
}

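; Write-back forms: the updated offset (element 1 of the returned pair) is
; stored back through the pointer argument, so the MVE instruction is
; surrounded by an ldr/str of the offset.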
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_wb_u8(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u8 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %0, i32 8)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_wb_u16(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u16 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %0, i32 1)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_wb_u32(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vidup.u32 q0, r2, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %0, i32 4)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_wb_u8(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u8 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %0, i32 2)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_wb_u16(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u16 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %0, i32 8)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_wb_u32(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vddup.u32 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %0, i32 2)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_wb_u8(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u8 q0, r2, r1, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %0, i32 %b, i32 4)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_wb_u16(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u16 q0, r2, r1, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %0, i32 %b, i32 4)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_wb_u8(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u8 q0, r2, r1, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %0, i32 %b, i32 1)
  %2 = extractvalue { <16 x i8>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <16 x i8>, i32 } %1, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_wb_u16(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u16 q0, r2, r1, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %0, i32 %b, i32 1)
  %2 = extractvalue { <8 x i16>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <8 x i16>, i32 } %1, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_wb_u32(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    viwdup.u32 q0, r2, r1, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %0, i32 %b, i32 8)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_wb_u32(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vdwdup.u32 q0, r2, r1, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %0, i32 %b, i32 2)
  %2 = extractvalue { <4 x i32>, i32 } %1, 1
  store i32 %2, i32* %a, align 4
  %3 = extractvalue { <4 x i32>, i32 } %1, 0
  ret <4 x i32> %3
}

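; Predicated forms: the predicate %p is moved into p0 via vmsr, and the
; T-suffixed instruction (vidupt etc.) executes inside the one-instruction
; VPT block created by vpst.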
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u8 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u16 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 8, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u32 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 2, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u8 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u16 q0, r0, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 2, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u32 q0, r0, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 8, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u8 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 8, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u16 q0, r0, r1, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 8, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u8 q0, r0, r1, #1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 1, <16 x i1> %1)
  %3 = extractvalue { <16 x i8>, i32 } %2, 0
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u16 q0, r0, r1, #2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 2, <8 x i1> %1)
  %3 = extractvalue { <8 x i16>, i32 } %2, 0
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
  %3 = extractvalue { <4 x i32>, i32 } %2, 0
  ret <4 x i32> %3
}

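; Predicated write-back forms, combining the write-back of the updated
; offset with execution under a VPT block.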
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u8 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 8, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u16 q0, r2, #2
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 2, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vidupt.u32 q0, r2, #8
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 8, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u8 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 1, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u16 q0, r2, #1
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 1, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vddupt.u32 q0, r2, #4
; CHECK-NEXT:    str r2, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u8 q0, r12, r1, #8
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 8, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u16 q0, r12, r1, #8
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 8, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    viwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u8 q0, r12, r1, #1
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 1, <16 x i1> %2)
  %4 = extractvalue { <16 x i8>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <16 x i8>, i32 } %3, 0
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u16 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 4, <8 x i1> %2)
  %4 = extractvalue { <8 x i16>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <8 x i16>, i32 } %3, 0
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr.w r12, [r0]
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vdwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT:    str.w r12, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %a, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, i32 } %3, 1
  store i32 %4, i32* %a, align 4
  %5 = extractvalue { <4 x i32>, i32 } %3, 0
  ret <4 x i32> %5
}

declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)

declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)