; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

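; Truncating stores: a wide load followed by a trunc should select an MVE
; truncating store (vstrb.32, vstrh.32 or vstrb.16) directly, with no
; intermediate vector narrowing instruction.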
define void @foo_int8_int32(<4 x i8>* %dest, <4 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vstrb.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, <4 x i32>* %src, align 4
  %0 = trunc <4 x i32> %wide.load to <4 x i8>
  store <4 x i8> %0, <4 x i8>* %dest, align 1
  ret void
}

define void @foo_int16_int32(<4 x i16>* %dest, <4 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, <4 x i32>* %src, align 4
  %0 = trunc <4 x i32> %wide.load to <4 x i16>
  store <4 x i16> %0, <4 x i16>* %dest, align 2
  ret void
}

define void @foo_int8_int16(<8 x i8>* %dest, <8 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, <8 x i16>* %src, align 2
  %0 = trunc <8 x i16> %wide.load to <8 x i8>
  store <8 x i8> %0, <8 x i8>* %dest, align 1
  ret void
}

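; The same trunc patterns at two and four times the vector width should split
; into one load/truncating-store pair per 128-bit source vector.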
define void @foo_int8_int32_double(<16 x i8>* %dest, <16 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int32_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r1, #32]
; CHECK-NEXT:    vldrw.u32 q3, [r1, #48]
; CHECK-NEXT:    vstrb.32 q1, [r0, #4]
; CHECK-NEXT:    vstrb.32 q0, [r0]
; CHECK-NEXT:    vstrb.32 q3, [r0, #12]
; CHECK-NEXT:    vstrb.32 q2, [r0, #8]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i32>, <16 x i32>* %src, align 4
  %0 = trunc <16 x i32> %wide.load to <16 x i8>
  store <16 x i8> %0, <16 x i8>* %dest, align 1
  ret void
}

define void @foo_int16_int32_double(<8 x i16>* %dest, <8 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int32_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i32>, <8 x i32>* %src, align 4
  %0 = trunc <8 x i32> %wide.load to <8 x i16>
  store <8 x i16> %0, <8 x i16>* %dest, align 2
  ret void
}

define void @foo_int8_int16_double(<16 x i8>* %dest, <16 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int8_int16_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
; CHECK-NEXT:    vstrb.16 q1, [r0, #8]
; CHECK-NEXT:    vstrb.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i16>, <16 x i16>* %src, align 2
  %0 = trunc <16 x i16> %wide.load to <16 x i8>
  store <16 x i8> %0, <16 x i8>* %dest, align 1
  ret void
}

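; Sign-extending loads: a narrow load followed by a sext should select an MVE
; extending load (vldrb.s32, vldrb.s16 or vldrh.s32) feeding a plain
; full-width store.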
define void @foo_int32_int8(<4 x i32>* %dest, <4 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i8>, <4 x i8>* %src, align 1
  %0 = sext <4 x i8> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

define void @foo_int16_int8(<8 x i16>* %dest, <8 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i8>, <8 x i8>* %src, align 1
  %0 = sext <8 x i8> %wide.load to <8 x i16>
  store <8 x i16> %0, <8 x i16>* %dest, align 2
  ret void
}

define void @foo_int32_int16(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i16>, <4 x i16>* %src, align 2
  %0 = sext <4 x i16> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

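; Double-width versions of the sign-extending loads, again splitting into one
; extending load and one full-width store per 128-bit result vector.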
define void @foo_int32_int8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int8_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q0, [r1]
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vldrb.s32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.s32 q3, [r1, #12]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = sext <16 x i8> %wide.load to <16 x i32>
  store <16 x i32> %0, <16 x i32>* %dest, align 4
  ret void
}

define void @foo_int16_int8_double(<16 x i16>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int8_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r1]
; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
; CHECK-NEXT:    vstrh.16 q1, [r0, #16]
; CHECK-NEXT:    vstrh.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = sext <16 x i8> %wide.load to <16 x i16>
  store <16 x i16> %0, <16 x i16>* %dest, align 2
  ret void
}

define void @foo_int32_int16_double(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int16_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, <8 x i16>* %src, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  store <8 x i32> %0, <8 x i32>* %dest, align 4
  ret void
}

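; Zero-extending loads: as above, but a zext should select the unsigned forms
; (vldrb.u32, vldrb.u16, vldrh.u32).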
define void @foo_uint32_uint8(<4 x i32>* %dest, <4 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i8>, <4 x i8>* %src, align 1
  %0 = zext <4 x i8> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

define void @foo_uint16_uint8(<8 x i16>* %dest, <8 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint16_uint8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i8>, <8 x i8>* %src, align 1
  %0 = zext <8 x i8> %wide.load to <8 x i16>
  store <8 x i16> %0, <8 x i16>* %dest, align 2
  ret void
}

define void @foo_uint32_uint16(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i16>, <4 x i16>* %src, align 2
  %0 = zext <4 x i16> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

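; Double-width versions of the zero-extending loads.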
define void @foo_uint32_uint8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint8_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vldrb.u32 q2, [r1, #8]
; CHECK-NEXT:    vldrb.u32 q3, [r1, #12]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  store <16 x i32> %0, <16 x i32>* %dest, align 4
  ret void
}

define void @foo_uint16_uint8_double(<16 x i16>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint16_uint8_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
; CHECK-NEXT:    vstrh.16 q1, [r0, #16]
; CHECK-NEXT:    vstrh.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i16>
  store <16 x i16> %0, <16 x i16>* %dest, align 2
  ret void
}

define void @foo_uint32_uint16_double(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16_double:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x i16>, <8 x i16>* %src, align 2
  %0 = zext <8 x i16> %wide.load to <8 x i32>
  store <8 x i32> %0, <8 x i32>* %dest, align 4
  ret void
}

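; A sext to i16 followed by a zext to i32 cannot be matched as a single
; extending load, so the second extension is currently expanded through
; per-lane vmov's and vmovlb.u16.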
define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int8_both:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
; CHECK-NEXT:    vmov.u16 r2, q1[4]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.u16 r2, q1[5]
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.u16 r2, q1[6]
; CHECK-NEXT:    vmov.32 q0[2], r2
; CHECK-NEXT:    vmov.u16 r2, q1[7]
; CHECK-NEXT:    vmov.32 q0[3], r2
; CHECK-NEXT:    vmovlb.u16 q2, q0
; CHECK-NEXT:    vldrb.s16 q0, [r1]
; CHECK-NEXT:    vmov.u16 r1, q1[0]
; CHECK-NEXT:    vstrw.32 q2, [r0, #48]
; CHECK-NEXT:    vmov.32 q2[0], r1
; CHECK-NEXT:    vmov.u16 r1, q1[1]
; CHECK-NEXT:    vmov.32 q2[1], r1
; CHECK-NEXT:    vmov.u16 r1, q1[2]
; CHECK-NEXT:    vmov.32 q2[2], r1
; CHECK-NEXT:    vmov.u16 r1, q1[3]
; CHECK-NEXT:    vmov.32 q2[3], r1
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    vmovlb.u16 q1, q2
; CHECK-NEXT:    vstrw.32 q1, [r0, #32]
; CHECK-NEXT:    vmov.32 q1[0], r1
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    vmov.32 q1[2], r1
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    vmov.32 q1[3], r1
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vmov.32 q1[0], r1
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    vmov.32 q1[2], r1
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    vmov.32 q1[3], r1
; CHECK-NEXT:    vmovlb.u16 q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
  %0 = sext <16 x i8> %wide.load to <16 x i16>
  %1 = zext <16 x i16> %0 to <16 x i32>
  store <16 x i32> %1, <16 x i32>* %dest, align 4
  ret void
}

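; Returning the incremented pointer should fold into the first extending load
; as a pre-increment (writeback) form, e.g. vldrh.s32 q0, [r1, #16]!.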
define <8 x i16>* @foo_uint32_uint16_double_offset(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16_double_offset:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1, #16]!
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds <8 x i16>, <8 x i16>* %src, i32 1
  %wide.load = load <8 x i16>, <8 x i16>* %z, align 2
  %0 = sext <8 x i16> %wide.load to <8 x i32>
  store <8 x i32> %0, <8 x i32>* %dest, align 4
  ret <8 x i16>* %z
}

define <16 x i16>* @foo_uint32_uint16_quad_offset(<16 x i32>* %dest, <16 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16_quad_offset:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q0, [r1, #32]!
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vldrh.s32 q2, [r1, #16]
; CHECK-NEXT:    vldrh.s32 q3, [r1, #24]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q2, [r0, #32]
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q3, [r0, #48]
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    bx lr
entry:
  %z = getelementptr inbounds <16 x i16>, <16 x i16>* %src, i32 1
  %wide.load = load <16 x i16>, <16 x i16>* %z, align 2
  %0 = sext <16 x i16> %wide.load to <16 x i32>
  store <16 x i32> %0, <16 x i32>* %dest, align 4
  ret <16 x i16>* %z
}

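; Under-aligned accesses: the MVE extending loads and truncating stores
; require the natural alignment of the narrow type, so these cases go via an
; aligned stack slot with scalar loads/stores on the under-aligned side.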
define void @foo_int16_int32_align1(<4 x i16>* %dest, <4 x i32>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int16_int32_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    ldrd r1, r2, [sp]
; CHECK-NEXT:    str r1, [r0]
; CHECK-NEXT:    str r2, [r0, #4]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i32>, <4 x i32>* %src, align 4
  %0 = trunc <4 x i32> %wide.load to <4 x i16>
  store <4 x i16> %0, <4 x i16>* %dest, align 1
  ret void
}

define void @foo_int32_int16_align1(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_int32_int16_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    ldr r2, [r1]
; CHECK-NEXT:    ldr r1, [r1, #4]
; CHECK-NEXT:    strd r2, r1, [sp]
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vldrh.s32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i16>, <4 x i16>* %src, align 1
  %0 = sext <4 x i16> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}

define void @foo_uint32_uint16_align1(<4 x i32>* %dest, <4 x i16>* readonly %src, i32 %n) {
; CHECK-LABEL: foo_uint32_uint16_align1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    ldr r2, [r1]
; CHECK-NEXT:    ldr r1, [r1, #4]
; CHECK-NEXT:    strd r2, r1, [sp]
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vldrh.u32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x i16>, <4 x i16>* %src, align 1
  %0 = zext <4 x i16> %wide.load to <4 x i32>
  store <4 x i32> %0, <4 x i32>* %dest, align 4
  ret void
}
426