1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck %s
3
4; VSTRH.32 Qd, [base, offs, uxtw #1]
; Truncating scatter: %input is truncated from <4 x i32> to <4 x i16> and stored
; through an i16 GEP off %base indexed by <4 x i32> offsets loaded from %offptr.
; The mask is all-true; the CHECK lines expect a single vstrh.32 with uxtw #1.
5define arm_aapcs_vfpcc void @ext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
6; CHECK-LABEL: ext_scaled_i16_i32:
7; CHECK:       @ %bb.0: @ %entry
8; CHECK-NEXT:    vldrw.u32 q1, [r1]
9; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
10; CHECK-NEXT:    bx lr
11entry:
12  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
13  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
14  %t = trunc <4 x i32> %input to <4 x i16>
15  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
16  ret void
17}
18
19; VSTRW.32 Qd, [base, offs, uxtw #2]
; Scatter of <4 x i32> %input through an i32 GEP off %base indexed by <4 x i32>
; offsets loaded from %offptr. The mask is all-true; the CHECK lines expect a
; single vstrw.32 with uxtw #2.
20define arm_aapcs_vfpcc void @scaled_i32_i32(i32* %base, <4 x i32>* %offptr, <4 x i32> %input) {
21; CHECK-LABEL: scaled_i32_i32:
22; CHECK:       @ %bb.0: @ %entry
23; CHECK-NEXT:    vldrw.u32 q1, [r1]
24; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
25; CHECK-NEXT:    bx lr
26entry:
27  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
28  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
29  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
30  ret void
31}
32
33; VSTRW.32 Qd, [base, offs, uxtw #2]
; Float variant of the i32 scatter: the pointer vector is built with an i32 GEP
; off %base and then bitcast to <4 x float*> before the all-true scatter.
; The CHECK lines expect the same vstrw.32 uxtw #2 form as the integer case.
34define arm_aapcs_vfpcc void @scaled_f32_i32(i32* %base, <4 x i32>* %offptr, <4 x float> %input) {
35; CHECK-LABEL: scaled_f32_i32:
36; CHECK:       @ %bb.0: @ %entry
37; CHECK-NEXT:    vldrw.u32 q1, [r1]
38; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
39; CHECK-NEXT:    bx lr
40entry:
41  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
42  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
43  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
44  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
45  ret void
46}
47
48; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
; i32 scatter whose offsets are loaded as <4 x i16> and zero-extended to i32
; before indexing the i32 GEP. The CHECK lines expect the extension to appear
; as a widening vldrh.u32 load feeding a single vstrw.32 with uxtw #2.
49define arm_aapcs_vfpcc void @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr, <4 x i32> %input) {
50; CHECK-LABEL: unsigned_scaled_b_i32_i16:
51; CHECK:       @ %bb.0: @ %entry
52; CHECK-NEXT:    vldrh.u32 q1, [r1]
53; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
54; CHECK-NEXT:    bx lr
55entry:
56  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
57  %offs.zext = zext <4 x i16> %offs to <4 x i32>
58  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
59  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
60  ret void
61}
62
63; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
; i32 scatter whose offsets are loaded as <4 x i16> and sign-extended to i32
; before indexing the i32 GEP. The CHECK lines expect the extension to appear
; as a widening vldrh.s32 load feeding a single vstrw.32 with uxtw #2.
64define arm_aapcs_vfpcc void @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr, <4 x i32> %input) {
65; CHECK-LABEL: signed_scaled_i32_i16:
66; CHECK:       @ %bb.0: @ %entry
67; CHECK-NEXT:    vldrh.s32 q1, [r1]
68; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
69; CHECK-NEXT:    bx lr
70entry:
71  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
72  %offs.sext = sext <4 x i16> %offs to <4 x i32>
73  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
74  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
75  ret void
76}
77
78; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
; Float scatter with <4 x i16> offsets zero-extended to i32. The pointers come
; from an i32 GEP off %base that is bitcast to <4 x float*>. The CHECK lines
; expect vldrh.u32 for the offsets and a single vstrw.32 with uxtw #2.
79define arm_aapcs_vfpcc void @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr, <4 x float> %input) {
80; CHECK-LABEL: a_unsigned_scaled_f32_i16:
81; CHECK:       @ %bb.0: @ %entry
82; CHECK-NEXT:    vldrh.u32 q1, [r1]
83; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
84; CHECK-NEXT:    bx lr
85entry:
86  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
87  %offs.zext = zext <4 x i16> %offs to <4 x i32>
88  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
89  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
90  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
91  ret void
92}
93
94; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
; Float scatter with <4 x i16> offsets sign-extended to i32. The pointers come
; from an i32 GEP off %base that is bitcast to <4 x float*>. The CHECK lines
; expect vldrh.s32 for the offsets and a single vstrw.32 with uxtw #2.
95define arm_aapcs_vfpcc void @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr, <4 x float> %input) {
96; CHECK-LABEL: b_signed_scaled_f32_i16:
97; CHECK:       @ %bb.0: @ %entry
98; CHECK-NEXT:    vldrh.s32 q1, [r1]
99; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
100; CHECK-NEXT:    bx lr
101entry:
102  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
103  %offs.sext = sext <4 x i16> %offs to <4 x i32>
104  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
105  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
106  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
107  ret void
108}
109
110; VSTRH.32 Qd, [base, offs.sext, uxtw #1]
; Truncating scatter (<4 x i32> %input -> <4 x i16>) through an i16 GEP whose
; offsets are loaded as <4 x i16> and sign-extended to i32. The CHECK lines
; expect vldrh.s32 for the offsets and a single vstrh.32 with uxtw #1.
111define arm_aapcs_vfpcc void @ext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr, <4 x i32> %input) {
112; CHECK-LABEL: ext_signed_scaled_i16_i16:
113; CHECK:       @ %bb.0: @ %entry
114; CHECK-NEXT:    vldrh.s32 q1, [r1]
115; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
116; CHECK-NEXT:    bx lr
117entry:
118  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
119  %offs.sext = sext <4 x i16> %offs to <4 x i32>
120  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
121  %t = trunc <4 x i32> %input to <4 x i16>
122  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
123  ret void
124}
125
126; VSTRH.32 Qd, [base, offs.zext, uxtw #1]
; Truncating scatter (<4 x i32> %input -> <4 x i16>) through an i16 GEP whose
; offsets are loaded as <4 x i16> and zero-extended to i32. The CHECK lines
; expect vldrh.u32 for the offsets and a single vstrh.32 with uxtw #1.
127define arm_aapcs_vfpcc void @ext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr, <4 x i32> %input) {
128; CHECK-LABEL: ext_unsigned_scaled_i16_i16:
129; CHECK:       @ %bb.0: @ %entry
130; CHECK-NEXT:    vldrh.u32 q1, [r1]
131; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
132; CHECK-NEXT:    bx lr
133entry:
134  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
135  %offs.zext = zext <4 x i16> %offs to <4 x i32>
136  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
137  %t = trunc <4 x i32> %input to <4 x i16>
138  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
139  ret void
140}
141
142; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
; i32 scatter whose offsets are loaded as <4 x i8> and zero-extended to i32
; before indexing the i32 GEP. The CHECK lines expect a widening vldrb.u32
; load feeding a single vstrw.32 with uxtw #2.
143define arm_aapcs_vfpcc void @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr, <4 x i32> %input) {
144; CHECK-LABEL: unsigned_scaled_b_i32_i8:
145; CHECK:       @ %bb.0: @ %entry
146; CHECK-NEXT:    vldrb.u32 q1, [r1]
147; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
148; CHECK-NEXT:    bx lr
149entry:
150  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
151  %offs.zext = zext <4 x i8> %offs to <4 x i32>
152  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
153  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
154  ret void
155}
156
157; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
; i32 scatter whose offsets are loaded as <4 x i8> and sign-extended to i32
; before indexing the i32 GEP. The CHECK lines expect a widening vldrb.s32
; load feeding a single vstrw.32 with uxtw #2.
158define arm_aapcs_vfpcc void @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr, <4 x i32> %input) {
159; CHECK-LABEL: signed_scaled_i32_i8:
160; CHECK:       @ %bb.0: @ %entry
161; CHECK-NEXT:    vldrb.s32 q1, [r1]
162; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
163; CHECK-NEXT:    bx lr
164entry:
165  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
166  %offs.sext = sext <4 x i8> %offs to <4 x i32>
167  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
168  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
169  ret void
170}
171
172; VSTRW.32 Qd, [base, offs.zext, uxtw #2]
; Float scatter with <4 x i8> offsets zero-extended to i32. The pointers come
; from an i32 GEP off %base that is bitcast to <4 x float*>. The CHECK lines
; expect vldrb.u32 for the offsets and a single vstrw.32 with uxtw #2.
173define arm_aapcs_vfpcc void @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr, <4 x float> %input) {
174; CHECK-LABEL: a_unsigned_scaled_f32_i8:
175; CHECK:       @ %bb.0: @ %entry
176; CHECK-NEXT:    vldrb.u32 q1, [r1]
177; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
178; CHECK-NEXT:    bx lr
179entry:
180  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
181  %offs.zext = zext <4 x i8> %offs to <4 x i32>
182  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
183  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
184  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
185  ret void
186}
187
188; VSTRW.32 Qd, [base, offs.sext, uxtw #2]
; Float scatter with <4 x i8> offsets sign-extended to i32. The pointers come
; from an i32 GEP off %base that is bitcast to <4 x float*>. The CHECK lines
; expect vldrb.s32 for the offsets and a single vstrw.32 with uxtw #2.
189define arm_aapcs_vfpcc void @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr, <4 x float> %input) {
190; CHECK-LABEL: b_signed_scaled_f32_i8:
191; CHECK:       @ %bb.0: @ %entry
192; CHECK-NEXT:    vldrb.s32 q1, [r1]
193; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
194; CHECK-NEXT:    bx lr
195entry:
196  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
197  %offs.sext = sext <4 x i8> %offs to <4 x i32>
198  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
199  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
200  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
201  ret void
202}
203
204; VSTRH.32 Qd, [base, offs.sext, uxtw #1]
; Truncating scatter (<4 x i32> %input -> <4 x i16>) through an i16 GEP whose
; offsets are loaded as <4 x i8> and sign-extended to i32. The CHECK lines
; expect vldrb.s32 for the offsets and a single vstrh.32 with uxtw #1.
205define arm_aapcs_vfpcc void @ext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr, <4 x i32> %input) {
206; CHECK-LABEL: ext_signed_scaled_i16_i8:
207; CHECK:       @ %bb.0: @ %entry
208; CHECK-NEXT:    vldrb.s32 q1, [r1]
209; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
210; CHECK-NEXT:    bx lr
211entry:
212  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
213  %offs.sext = sext <4 x i8> %offs to <4 x i32>
214  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
215  %t = trunc <4 x i32> %input to <4 x i16>
216  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
217  ret void
218}
219
220; VSTRH.32 Qd, [base, offs.zext, uxtw #1]
; Truncating scatter (<4 x i32> %input -> <4 x i16>) through an i16 GEP whose
; offsets are loaded as <4 x i8> and zero-extended to i32. The CHECK lines
; expect vldrb.u32 for the offsets and a single vstrh.32 with uxtw #1.
221define arm_aapcs_vfpcc void @ext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr, <4 x i32> %input) {
222; CHECK-LABEL: ext_unsigned_scaled_i16_i8:
223; CHECK:       @ %bb.0: @ %entry
224; CHECK-NEXT:    vldrb.u32 q1, [r1]
225; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
226; CHECK-NEXT:    bx lr
227entry:
228  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
229  %offs.zext = zext <4 x i8> %offs to <4 x i32>
230  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
231  %t = trunc <4 x i32> %input to <4 x i16>
232  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
233  ret void
234}
235
; Truncating i16 scatter whose pointers come from two chained GEPs: the first
; indexes %base with <4 x i32> offsets loaded from %offptr, the second adds a
; constant 5 i16 elements to every lane.
; The CHECK lines record that no scatter instruction is expected here: the
; addresses are computed in vector registers (vshl #1, add of r0, add of a
; #0xa splat, i.e. 5 elements * 2 bytes) and each lane is stored with strh.
236define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
237; CHECK-LABEL: ext_scaled_i16_i32_2gep:
238; CHECK:       @ %bb.0: @ %entry
239; CHECK-NEXT:    vldrw.u32 q2, [r1]
240; CHECK-NEXT:    vmov.i32 q1, #0xa
241; CHECK-NEXT:    vmov r1, s0
242; CHECK-NEXT:    vshl.i32 q2, q2, #1
243; CHECK-NEXT:    vadd.i32 q2, q2, r0
244; CHECK-NEXT:    vadd.i32 q1, q2, q1
245; CHECK-NEXT:    vmov r0, s4
246; CHECK-NEXT:    strh r1, [r0]
247; CHECK-NEXT:    vmov r0, s5
248; CHECK-NEXT:    vmov r1, s1
249; CHECK-NEXT:    strh r1, [r0]
250; CHECK-NEXT:    vmov r0, s6
251; CHECK-NEXT:    vmov r1, s2
252; CHECK-NEXT:    strh r1, [r0]
253; CHECK-NEXT:    vmov r0, s7
254; CHECK-NEXT:    vmov r1, s3
255; CHECK-NEXT:    strh r1, [r0]
256; CHECK-NEXT:    bx lr
257entry:
258  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
259  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
260  %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5
261  %t = trunc <4 x i32> %input to <4 x i16>
262  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
263  ret void
264}
265
; Truncating i16 scatter whose pointers come from two chained GEPs with only
; constant indices: <0, 3, 6, 9> off %base, then +5 elements on every lane.
; %offptr is not used by the body.
; The CHECK lines expect the two index sets to be combined into one
; constant-pool vector <5, 8, 11, 14> that feeds a single vstrh.32 with uxtw #1.
266define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep2(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
267; CHECK-LABEL: ext_scaled_i16_i32_2gep2:
268; CHECK:       @ %bb.0: @ %entry
269; CHECK-NEXT:    adr r1, .LCPI16_0
270; CHECK-NEXT:    vldrw.u32 q1, [r1]
271; CHECK-NEXT:    vstrh.32 q0, [r0, q1, uxtw #1]
272; CHECK-NEXT:    bx lr
273; CHECK-NEXT:    .p2align 4
274; CHECK-NEXT:  @ %bb.1:
275; CHECK-NEXT:  .LCPI16_0:
276; CHECK-NEXT:    .long 5 @ 0x5
277; CHECK-NEXT:    .long 8 @ 0x8
278; CHECK-NEXT:    .long 11 @ 0xb
279; CHECK-NEXT:    .long 14 @ 0xe
280entry:
281  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16> <i16 0, i16 3, i16 6, i16 9>
282  %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5
283  %t = trunc <4 x i32> %input to <4 x i16>
284  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
285  ret void
286}
287
; Declarations of the masked-scatter intrinsic overloads. Operands are, in
; order: the value vector, the pointer vector, an i32 alignment, and the
; <4 x i1> mask. The functions visible in this file call only the v4i16, v4i32
; and v4f32 overloads; the v4i8 and v4f16 ones are declared but not called here.
288declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
289declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
290declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
291declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
292declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
293