; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s

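; Unpredicated forms: VSHLC shifts the whole vector left by the immediate,
; exchanging carry bits through the i32 value loaded from and stored back to %b.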
define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_s8(<16 x i8> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #18
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> %a, i32 %0, i32 18)
  %2 = extractvalue { i32, <16 x i8> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <16 x i8> } %1, 1
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_s16(<8 x i16> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #16
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %a, i32 %0, i32 16)
  %2 = extractvalue { i32, <8 x i16> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <8 x i16> } %1, 1
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_s32(<4 x i32> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #4
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %a, i32 %0, i32 4)
  %2 = extractvalue { i32, <4 x i32> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <4 x i32> } %1, 1
  ret <4 x i32> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_u8(<16 x i8> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #17
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8> %a, i32 %0, i32 17)
  %2 = extractvalue { i32, <16 x i8> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <16 x i8> } %1, 1
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_u16(<8 x i16> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #17
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %a, i32 %0, i32 17)
  %2 = extractvalue { i32, <8 x i16> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <8 x i16> } %1, 1
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_u32(<4 x i32> %a, i32* nocapture %b) {
; CHECK-LABEL: test_vshlcq_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vshlc q0, r1, #20
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %a, i32 %0, i32 20)
  %2 = extractvalue { i32, <4 x i32> } %1, 0
  store i32 %2, i32* %b, align 4
  %3 = extractvalue { i32, <4 x i32> } %1, 1
  ret <4 x i32> %3
}

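; Predicated forms: the mask %p is moved into P0 with VMSR, and the shift
; executes as VSHLCT inside a VPST block.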
define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_m_s8(<16 x i8> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #29
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> %a, i32 %0, i32 29, <16 x i1> %2)
  %4 = extractvalue { i32, <16 x i8> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <16 x i8> } %3, 1
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_m_s16(<8 x i16> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #17
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> %a, i32 %0, i32 17, <8 x i1> %2)
  %4 = extractvalue { i32, <8 x i16> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <8 x i16> } %3, 1
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_m_s32(<4 x i32> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #9
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> %a, i32 %0, i32 9, <4 x i1> %2)
  %4 = extractvalue { i32, <4 x i32> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <4 x i32> } %3, 1
  ret <4 x i32> %5
}

define arm_aapcs_vfpcc <16 x i8> @test_vshlcq_m_u8(<16 x i8> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #21
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
  %3 = tail call { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8> %a, i32 %0, i32 21, <16 x i1> %2)
  %4 = extractvalue { i32, <16 x i8> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <16 x i8> } %3, 1
  ret <16 x i8> %5
}

define arm_aapcs_vfpcc <8 x i16> @test_vshlcq_m_u16(<8 x i16> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #24
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
  %3 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16> %a, i32 %0, i32 24, <8 x i1> %2)
  %4 = extractvalue { i32, <8 x i16> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <8 x i16> } %3, 1
  ret <8 x i16> %5
}

define arm_aapcs_vfpcc <4 x i32> @test_vshlcq_m_u32(<4 x i32> %a, i32* nocapture %b, i16 zeroext %p) {
; CHECK-LABEL: test_vshlcq_m_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vshlct q0, r1, #26
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load i32, i32* %b, align 4
  %1 = zext i16 %p to i32
  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32> %a, i32 %0, i32 26, <4 x i1> %2)
  %4 = extractvalue { i32, <4 x i32> } %3, 0
  store i32 %4, i32* %b, align 4
  %5 = extractvalue { i32, <4 x i32> } %3, 1
  ret <4 x i32> %5
}

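; Intrinsic declarations for the plain and predicated VSHLC operations.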
declare { i32, <16 x i8> } @llvm.arm.mve.vshlc.v16i8(<16 x i8>, i32, i32)
declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16>, i32, i32)
declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32>, i32, i32)
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare { i32, <16 x i8> } @llvm.arm.mve.vshlc.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)