1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-LE
3; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-BE
4
; <4 x i32> load with 4-byte alignment: alignment matches the 32-bit element
; size, so both endiannesses use a word load (vldrw.u32); BE additionally
; emits vrev64.32 (see the CHECK lines below).
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) {
; CHECK-LE-LABEL: load_4xi32_a4:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %vp, align 4
  ; The shift consumes the value as 32-bit lanes, pinning the element size
  ; the backend must honour when choosing the load width.
  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}
23
; Under-aligned (2-byte) <4 x i32> load: per the CHECK lines, LE narrows to a
; halfword load (vldrh.u16), while BE uses a byte load plus vrev32.8 to
; recover 32-bit lane order before the shift.
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a2(<4 x i32>* %vp) {
; CHECK-LE-LABEL: load_4xi32_a2:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrh.u16 q0, [r0]
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a2:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrb.u8 q0, [r0]
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %vp, align 2
  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}
43
; Byte-aligned <4 x i32> load: both endiannesses must fall back to a byte
; load (vldrb.u8); BE then adds vrev32.8 (and vrev64.32 for the return) to
; restore 32-bit lane order, per the CHECK lines.
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a1(<4 x i32>* %vp) {
; CHECK-LE-LABEL: load_4xi32_a1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrb.u8 q0, [r0]
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrb.u8 q0, [r0]
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vshr.u32 q1, q0, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %vp, align 1
  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}
63
; <4 x i32> store with 4-byte alignment: both endiannesses use a word store
; (vstrw.32); BE first applies vrev64.32 to the incoming argument, per the
; CHECK lines.
define arm_aapcs_vfpcc void @store_4xi32_a4(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LE-LABEL: store_4xi32_a4:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: store_4xi32_a4:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  ; The shift keeps the stored value live as 32-bit lanes so the backend
  ; cannot simply forward the argument.
  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %0, <4 x i32>* %vp, align 4
  ret void
}
82
; Under-aligned (2-byte) <4 x i32> store: per the CHECK lines, LE narrows to
; a halfword store (vstrh.16), while BE byte-reverses within 32-bit lanes
; (vrev32.8) and stores bytes (vstrb.8).
define arm_aapcs_vfpcc void @store_4xi32_a2(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LE-LABEL: store_4xi32_a2:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vstrh.16 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: store_4xi32_a2:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %0, <4 x i32>* %vp, align 2
  ret void
}
102
; Byte-aligned <4 x i32> store: both endiannesses fall back to a byte store
; (vstrb.8); BE additionally needs vrev32.8 before storing, per the CHECK
; lines.
define arm_aapcs_vfpcc void @store_4xi32_a1(<4 x i32>* %vp, <4 x i32> %val) {
; CHECK-LE-LABEL: store_4xi32_a1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vstrb.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: store_4xi32_a1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %0, <4 x i32>* %vp, align 1
  ret void
}
122
; Aligned load at a positive byte offset of 508 (= 127 * 4 from the GEP).
; LE folds the offset into the vldrw addressing mode; the BE path shown in
; the CHECK lines materializes the address with add.w instead — presumably
; because the byte-load form cannot take this offset (verify against the MVE
; VLDRB/VLDRW offset encodings).
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_pos(i32* %ip) {
; CHECK-LE-LABEL: load_4xi32_a4_offset_pos:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0, #508]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4_offset_pos:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    add.w r0, r0, #508
; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %ipoffset = getelementptr inbounds i32, i32* %ip, i32 127
  %vp = bitcast i32* %ipoffset to <4 x i32>*
  %0 = load <4 x i32>, <4 x i32>* %vp, align 4
  ret <4 x i32> %0
}
141
; Mirror of load_4xi32_a4_offset_pos with a negative offset of -508
; (= -127 * 4): LE folds the negative immediate into vldrw, while BE
; subtracts the offset and uses a byte load, per the CHECK lines.
define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_neg(i32* %ip) {
; CHECK-LE-LABEL: load_4xi32_a4_offset_neg:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0, #-508]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_4xi32_a4_offset_neg:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    sub.w r0, r0, #508
; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %ipoffset = getelementptr inbounds i32, i32* %ip, i32 -127
  %vp = bitcast i32* %ipoffset to <4 x i32>*
  %0 = load <4 x i32>, <4 x i32>* %vp, align 4
  ret <4 x i32> %0
}
160
; Vector store/load of <4 x i32> to a fixed stack slot at sp+16: LE folds the
; offset into vstrw/vldrw ([sp, #16]); BE reloads via vldrb.u8 + vrev64.8,
; per the CHECK lines. The interleaved scalar str forces the final vector
; load to actually read memory.
define arm_aapcs_vfpcc <4 x i32> @loadstore_4xi32_stack_off16() {
; CHECK-LE-LABEL: loadstore_4xi32_stack_off16:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .pad #40
; CHECK-LE-NEXT:    sub sp, #40
; CHECK-LE-NEXT:    vmov.i32 q0, #0x1
; CHECK-LE-NEXT:    mov r0, sp
; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
; CHECK-LE-NEXT:    movs r0, #3
; CHECK-LE-NEXT:    vstrw.32 q0, [sp, #16]
; CHECK-LE-NEXT:    str r0, [sp, #16]
; CHECK-LE-NEXT:    vldrw.u32 q0, [sp, #16]
; CHECK-LE-NEXT:    add sp, #40
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: loadstore_4xi32_stack_off16:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .pad #40
; CHECK-BE-NEXT:    sub sp, #40
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
; CHECK-BE-NEXT:    mov r0, sp
; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
; CHECK-BE-NEXT:    movs r0, #3
; CHECK-BE-NEXT:    vstrw.32 q0, [sp, #16]
; CHECK-BE-NEXT:    str r0, [sp, #16]
; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    add sp, #40
; CHECK-BE-NEXT:    bx lr
entry:
  %c = alloca [1 x [5 x [2 x i32]]], align 4
  ; NOTE(review): %0 and %arrayidx5 are unused — likely test-reduction
  ; leftovers; kept as-is to avoid churning the autogenerated CHECK lines.
  %0 = bitcast [1 x [5 x [2 x i32]]]* %c to i8*
  %arrayidx5 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 0, i32 0
  %1 = bitcast [1 x [5 x [2 x i32]]]* %c to <4 x i32>*
  store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %1, align 4
  ; Second slot is 16 bytes into the alloca (element [0][2][0]).
  %arrayidx5.2 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 2, i32 0
  %2 = bitcast i32* %arrayidx5.2 to <4 x i32>*
  store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %2, align 4
  store i32 3, i32* %arrayidx5.2, align 4
  %3 = load <4 x i32>, <4 x i32>* %2, align 4
  ret <4 x i32> %3
}
203
; Same pattern as loadstore_4xi32_stack_off16, but for <8 x i16>: LE uses
; vstrh/vldrh with the [sp, #16] offset; BE reloads via vldrb.u8 + vrev64.8,
; per the CHECK lines.
define arm_aapcs_vfpcc <8 x i16> @loadstore_8xi16_stack_off16() {
; CHECK-LE-LABEL: loadstore_8xi16_stack_off16:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .pad #40
; CHECK-LE-NEXT:    sub sp, #40
; CHECK-LE-NEXT:    vmov.i16 q0, #0x1
; CHECK-LE-NEXT:    mov r0, sp
; CHECK-LE-NEXT:    vstrh.16 q0, [r0]
; CHECK-LE-NEXT:    movs r0, #3
; CHECK-LE-NEXT:    vstrh.16 q0, [sp, #16]
; CHECK-LE-NEXT:    strh.w r0, [sp, #16]
; CHECK-LE-NEXT:    vldrh.u16 q0, [sp, #16]
; CHECK-LE-NEXT:    add sp, #40
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: loadstore_8xi16_stack_off16:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .pad #40
; CHECK-BE-NEXT:    sub sp, #40
; CHECK-BE-NEXT:    vmov.i16 q0, #0x1
; CHECK-BE-NEXT:    mov r0, sp
; CHECK-BE-NEXT:    vstrh.16 q0, [r0]
; CHECK-BE-NEXT:    movs r0, #3
; CHECK-BE-NEXT:    vstrh.16 q0, [sp, #16]
; CHECK-BE-NEXT:    strh.w r0, [sp, #16]
; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    add sp, #40
; CHECK-BE-NEXT:    bx lr
entry:
  %c = alloca [1 x [10 x [2 x i16]]], align 2
  ; NOTE(review): %0 and %arrayidx5 are unused — likely test-reduction
  ; leftovers; kept as-is to avoid churning the autogenerated CHECK lines.
  %0 = bitcast [1 x [10 x [2 x i16]]]* %c to i8*
  %arrayidx5 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 0, i32 0
  %1 = bitcast [1 x [10 x [2 x i16]]]* %c to <8 x i16>*
  store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %1, align 2
  ; Second slot is 16 bytes into the alloca (element [0][4][0]).
  %arrayidx5.2 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 4, i32 0
  %2 = bitcast i16* %arrayidx5.2 to <8 x i16>*
  store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %2, align 2
  store i16 3, i16* %arrayidx5.2, align 2
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  ret <8 x i16> %3
}
246
; Same pattern as the other stack_off16 tests, but for <16 x i8>: both
; endiannesses use vstrb/vldrb with the [sp, #16] offset; BE only needs
; the final vrev64.8 for the return, per the CHECK lines.
define arm_aapcs_vfpcc <16 x i8> @loadstore_16xi8_stack_off16() {
; CHECK-LE-LABEL: loadstore_16xi8_stack_off16:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .pad #40
; CHECK-LE-NEXT:    sub sp, #40
; CHECK-LE-NEXT:    vmov.i8 q0, #0x1
; CHECK-LE-NEXT:    mov r0, sp
; CHECK-LE-NEXT:    vstrb.8 q0, [r0]
; CHECK-LE-NEXT:    movs r0, #3
; CHECK-LE-NEXT:    vstrb.8 q0, [sp, #16]
; CHECK-LE-NEXT:    strb.w r0, [sp, #16]
; CHECK-LE-NEXT:    vldrb.u8 q0, [sp, #16]
; CHECK-LE-NEXT:    add sp, #40
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: loadstore_16xi8_stack_off16:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .pad #40
; CHECK-BE-NEXT:    sub sp, #40
; CHECK-BE-NEXT:    vmov.i8 q0, #0x1
; CHECK-BE-NEXT:    mov r0, sp
; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
; CHECK-BE-NEXT:    movs r0, #3
; CHECK-BE-NEXT:    vstrb.8 q0, [sp, #16]
; CHECK-BE-NEXT:    strb.w r0, [sp, #16]
; CHECK-BE-NEXT:    vldrb.u8 q1, [sp, #16]
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    add sp, #40
; CHECK-BE-NEXT:    bx lr
entry:
  %c = alloca [1 x [20 x [2 x i8]]], align 1
  ; NOTE(review): %0 and %arrayidx5 are unused — likely test-reduction
  ; leftovers; kept as-is to avoid churning the autogenerated CHECK lines.
  %0 = bitcast [1 x [20 x [2 x i8]]]* %c to i8*
  %arrayidx5 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 0, i32 0
  %1 = bitcast [1 x [20 x [2 x i8]]]* %c to <16 x i8>*
  store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %1, align 1
  ; Second slot is 16 bytes into the alloca (element [0][8][0]).
  %arrayidx5.2 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 8, i32 0
  %2 = bitcast i8* %arrayidx5.2 to <16 x i8>*
  store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %2, align 1
  store i8 3, i8* %arrayidx5.2, align 1
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  ret <16 x i8> %3
}
289