; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
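; The two RUN lines cover little-endian (CHECK-LE) and big-endian (CHECK-BE)
; code generation; the shared CHECK prefix matches where the two agree.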

define void @load_load_add_store(<4 x i32> *%src1, <4 x i32> *%src2) {
; CHECK-LABEL: load_load_add_store:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vadd.i32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, <4 x i32>* %src1, align 4
  %l2 = load <4 x i32>, <4 x i32>* %src2, align 4
  %a = add <4 x i32> %l1, %l2
  store <4 x i32> %a, <4 x i32>* %src1, align 4
  ret void
}

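; With only align 1, the accesses use the byte-oriented vldrb.u8 and vstrb.8,
; which load bytes in memory order. So in big-endian mode the 32-bit add needs
; its lanes byte-reversed with vrev32.8 on the way in and out.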
define void @load_load_add_store_align1(<4 x i32> *%src1, <4 x i32> *%src2) {
; CHECK-LE-LABEL: load_load_add_store_align1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrb.u8 q0, [r1]
; CHECK-LE-NEXT:    vldrb.u8 q1, [r0]
; CHECK-LE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-LE-NEXT:    vstrb.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_load_add_store_align1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrb.u8 q0, [r1]
; CHECK-BE-NEXT:    vldrb.u8 q1, [r0]
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vrev32.8 q1, q1
; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vstrb.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, <4 x i32>* %src1, align 1
  %l2 = load <4 x i32>, <4 x i32>* %src2, align 1
  %a = add <4 x i32> %l1, %l2
  store <4 x i32> %a, <4 x i32>* %src1, align 1
  ret void
}

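; Under arm_aapcs_vfpcc the vector argument arrives in q0. In big-endian mode
; that in-register format differs from what vldrw.u32 produces, so the
; argument is vrev64.32'd before the add.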
define arm_aapcs_vfpcc void @load_arg_add_store(<4 x i32> *%src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: load_arg_add_store:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q1, [r0]
; CHECK-LE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-LE-NEXT:    vstrw.32 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: load_arg_add_store:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vadd.i32 q0, q0, q1
; CHECK-BE-NEXT:    vstrw.32 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, <4 x i32>* %src1, align 4
  %a = add <4 x i32> %l1, %src2
  store <4 x i32> %a, <4 x i32>* %src1, align 4
  ret void
}

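; With the soft-float ABI the first vector is passed in r0-r3 and the second
; on the stack, so the function shuffles between GPRs and q registers at both
; ends. Note the swapped word order in the vmovs and the extra vrev64.32s in
; the BE version.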
define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: add_soft:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vmov d1, r2, r3
; CHECK-LE-NEXT:    vmov d0, r0, r1
; CHECK-LE-NEXT:    mov r0, sp
; CHECK-LE-NEXT:    vldrw.u32 q1, [r0]
; CHECK-LE-NEXT:    vadd.i32 q0, q0, q1
; CHECK-LE-NEXT:    vmov r0, r1, d0
; CHECK-LE-NEXT:    vmov r2, r3, d1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: add_soft:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vmov d1, r3, r2
; CHECK-BE-NEXT:    vmov d0, r1, r0
; CHECK-BE-NEXT:    mov r0, sp
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vmov r1, r0, d2
; CHECK-BE-NEXT:    vmov r3, r2, d3
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = add <4 x i32> %src1, %src2
  ret <4 x i32> %0
}

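; With the hard-float ABI both vectors arrive in q0 and q1, so LE needs only
; the vadd. BE still vrev64.32s each operand out of the ABI's in-register
; format and the result back into it.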
define arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: add_hard:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vadd.i32 q0, q0, q1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: add_hard:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q2, q1
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vadd.i32 q1, q1, q2
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = add <4 x i32> %src1, %src2
  ret <4 x i32> %0
}

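; Calling a soft-float function: the vectors round-trip through r0-r3 and the
; stack on both sides of the call, again with vrev64.32s in BE.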
define <4 x i32> @call_soft(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: call_soft:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .save {r7, lr}
; CHECK-LE-NEXT:    push {r7, lr}
; CHECK-LE-NEXT:    .pad #16
; CHECK-LE-NEXT:    sub sp, #16
; CHECK-LE-NEXT:    add.w r12, sp, #24
; CHECK-LE-NEXT:    vldrw.u32 q0, [r12]
; CHECK-LE-NEXT:    vstrw.32 q0, [sp]
; CHECK-LE-NEXT:    vmov d1, r2, r3
; CHECK-LE-NEXT:    vmov d0, r0, r1
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vmov r0, r1, d0
; CHECK-LE-NEXT:    vmov r2, r3, d1
; CHECK-LE-NEXT:    bl add_soft
; CHECK-LE-NEXT:    vmov d1, r2, r3
; CHECK-LE-NEXT:    vmov d0, r0, r1
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    vmov r0, r1, d0
; CHECK-LE-NEXT:    vmov r2, r3, d1
; CHECK-LE-NEXT:    add sp, #16
; CHECK-LE-NEXT:    pop {r7, pc}
;
; CHECK-BE-LABEL: call_soft:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .save {r7, lr}
; CHECK-BE-NEXT:    push {r7, lr}
; CHECK-BE-NEXT:    .pad #16
; CHECK-BE-NEXT:    sub sp, #16
; CHECK-BE-NEXT:    add.w r12, sp, #24
; CHECK-BE-NEXT:    vldrw.u32 q0, [r12]
; CHECK-BE-NEXT:    vstrw.32 q0, [sp]
; CHECK-BE-NEXT:    vmov d1, r3, r2
; CHECK-BE-NEXT:    vmov d0, r1, r0
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vmov r1, r0, d2
; CHECK-BE-NEXT:    vmov r3, r2, d3
; CHECK-BE-NEXT:    bl add_soft
; CHECK-BE-NEXT:    vmov d1, r3, r2
; CHECK-BE-NEXT:    vmov d0, r1, r0
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q0, q1, #1
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vmov r1, r0, d2
; CHECK-BE-NEXT:    vmov r3, r2, d3
; CHECK-BE-NEXT:    add sp, #16
; CHECK-BE-NEXT:    pop {r7, pc}
entry:
  %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
  %1 = call <4 x i32> @add_soft(<4 x i32> %0, <4 x i32> %src2)
  %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %2
}

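; Calling a hard-float function: the vectors stay in q registers across the
; call, so BE only adds the vrev64.32s around each vshr.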
define arm_aapcs_vfpcc <4 x i32> @call_hard(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LE-LABEL: call_hard:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .save {r7, lr}
; CHECK-LE-NEXT:    push {r7, lr}
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    bl add_hard
; CHECK-LE-NEXT:    vshr.u32 q0, q0, #1
; CHECK-LE-NEXT:    pop {r7, pc}
;
; CHECK-BE-LABEL: call_hard:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .save {r7, lr}
; CHECK-BE-NEXT:    push {r7, lr}
; CHECK-BE-NEXT:    vrev64.32 q2, q0
; CHECK-BE-NEXT:    vshr.u32 q2, q2, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q2
; CHECK-BE-NEXT:    bl add_hard
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vshr.u32 q1, q1, #1
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    pop {r7, pc}
entry:
  %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
  %1 = call arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %0, <4 x i32> %src2)
  %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %2
}

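; The vmov.i32 immediate has the same in-register form in both endiannesses,
; so BE only wraps the vand in the usual vrev64.32s.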
define arm_aapcs_vfpcc <16 x i8> @and_v4i32(<4 x i32> %src) {
; CHECK-LE-LABEL: and_v4i32:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vmov.i32 q1, #0x1
; CHECK-LE-NEXT:    vand q0, q0, q1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: and_v4i32:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
; CHECK-BE-NEXT:    vand q1, q1, q0
; CHECK-BE-NEXT:    vrev64.32 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %s1 = and <4 x i32> %src, <i32 1, i32 1, i32 1, i32 1>
  %r = bitcast <4 x i32> %s1 to <16 x i8>
  ret <16 x i8> %r
}

; Should be the same as and_v4i32 for LE
define arm_aapcs_vfpcc <16 x i8> @and_v16i8_le(<4 x i32> %src) {
; CHECK-LE-LABEL: and_v16i8_le:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vmov.i32 q1, #0x1
; CHECK-LE-NEXT:    vand q0, q0, q1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: and_v16i8_le:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.8 q1, q0
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vand q1, q1, q0
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = bitcast <4 x i32> %src to <16 x i8>
  %r = and <16 x i8> %0, <i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0>
  ret <16 x i8> %r
}

; Should be the same (or at least equivalent) as and_v4i32 for BE
define arm_aapcs_vfpcc <16 x i8> @and_v16i8_be(<4 x i32> %src) {
; CHECK-LE-LABEL: and_v16i8_be:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vmov.i32 q1, #0x1000000
; CHECK-LE-NEXT:    vand q0, q0, q1
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: and_v16i8_be:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.8 q1, q0
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1000000
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    vand q1, q1, q0
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %0 = bitcast <4 x i32> %src to <16 x i8>
  %r = and <16 x i8> %0, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
  ret <16 x i8> %r
}

; FIXME: This looks wrong: the BE version reverses the operand with vrev32.8
; before passing it to the inline asm.
define arm_aapcs_vfpcc <4 x i32> @test(i32* %data) {
; CHECK-LE-LABEL: test:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-LE-NEXT:    vmov.i32 q0, #0x1
; CHECK-LE-NEXT:    vadd.i32 q1, q1, q0
; CHECK-LE-NEXT:    @APP
; CHECK-LE-NEXT:    vmullb.s32 q0, q1, q1
; CHECK-LE-NEXT:    @NO_APP
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: test:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-BE-NEXT:    vmov.i32 q0, #0x1
; CHECK-BE-NEXT:    vadd.i32 q0, q1, q0
; CHECK-BE-NEXT:    vrev32.8 q0, q0
; CHECK-BE-NEXT:    @APP
; CHECK-BE-NEXT:    vmullb.s32 q1, q0, q0
; CHECK-BE-NEXT:    @NO_APP
; CHECK-BE-NEXT:    vrev64.8 q0, q1
; CHECK-BE-NEXT:    bx lr
entry:
  %add.ptr = getelementptr inbounds i32, i32* %data, i32 8
  %0 = bitcast i32* %add.ptr to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = add <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
  %3 = tail call <4 x i32> asm sideeffect "  VMULLB.s32 $0, $1, $1", "=&w,w"(<4 x i32> %2) #2
  ret <4 x i32> %3
}

; Test case demonstrating that 'bitcast' reinterprets the memory format of a
; vector, as if it were stored and then reloaded. So when it sits between two
; operations that treat a register as having different lane sizes, big-endian
; mode has to emit a vrev32.16, which has the same effect as a vstrw.32
; followed by a vldrh.16.
define arm_aapcs_vfpcc void @test_bitcast(<4 x i32>* readonly %in, <8 x i16>* %out) {
; CHECK-LE-LABEL: test_bitcast:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-LE-NEXT:    vmul.i32 q0, q0, q0
; CHECK-LE-NEXT:    vmul.i16 q0, q0, q0
; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: test_bitcast:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vmul.i32 q0, q0, q0
; CHECK-BE-NEXT:    vrev32.16 q0, q0
; CHECK-BE-NEXT:    vmul.i16 q0, q0, q0
; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
; CHECK-BE-NEXT:    bx lr
entry:
  %vin = load <4 x i32>, <4 x i32>* %in, align 8
  %vdbl = mul <4 x i32> %vin, %vin
  %cast = bitcast <4 x i32> %vdbl to <8 x i16>
  %cdbl = mul <8 x i16> %cast, %cast
  store <8 x i16> %cdbl, <8 x i16>* %out, align 8
  ret void
}

; Similar test case but using the arm.mve.vreinterpretq intrinsic instead,
; which is defined to reinterpret the in-register format, so it generates no
; instruction in either endianness.
define arm_aapcs_vfpcc void @test_vreinterpretq(<4 x i32>* readonly %in, <8 x i16>* %out) {
; CHECK-LE-LABEL: test_vreinterpretq:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-LE-NEXT:    vmul.i32 q0, q0, q0
; CHECK-LE-NEXT:    vmul.i16 q0, q0, q0
; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: test_vreinterpretq:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldrw.u32 q0, [r0]
; CHECK-BE-NEXT:    vmul.i32 q0, q0, q0
; CHECK-BE-NEXT:    vmul.i16 q0, q0, q0
; CHECK-BE-NEXT:    vstrh.16 q0, [r1]
; CHECK-BE-NEXT:    bx lr
entry:
  %vin = load <4 x i32>, <4 x i32>* %in, align 8
  %vdbl = mul <4 x i32> %vin, %vin
  %cast = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32> %vdbl)
  %cdbl = mul <8 x i16> %cast, %cast
  store <8 x i16> %cdbl, <8 x i16>* %out, align 8
  ret void
}

declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32>)