; Test stores of byte-swapped vector elements.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s

; Byte-swap intrinsics used by the tests below, one per vector element width
; (16-, 32- and 64-bit elements of a 128-bit vector).
declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)

; Test v8i16 stores.
;
; Each 16-bit element of %val is byte-swapped and the result is stored to
; %ptr.  The expected output is a VSTBRH that stores the incoming vector
; register straight to the address held in the pointer argument register.
define void @f1(<8 x i16> %val, <8 x i16> *%ptr) {
; CHECK-LABEL: f1:
; CHECK: vstbrh %v24, 0(%r2)
; CHECK: br %r14
  %swap = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %val)
  store <8 x i16> %swap, <8 x i16> *%ptr
  ret void
}

; Test v4i32 stores.
;
; Each 32-bit element of %val is byte-swapped and the result is stored to
; %ptr.  The expected output is a VSTBRF that stores the incoming vector
; register straight to the address held in the pointer argument register.
define void @f2(<4 x i32> %val, <4 x i32> *%ptr) {
; CHECK-LABEL: f2:
; CHECK: vstbrf %v24, 0(%r2)
; CHECK: br %r14
  %swap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val)
  store <4 x i32> %swap, <4 x i32> *%ptr
  ret void
}

; Test v2i64 stores.
;
; Each 64-bit element of %val is byte-swapped and the result is stored to
; %ptr.  The expected output is a VSTBRG that stores the incoming vector
; register straight to the address held in the pointer argument register.
define void @f3(<2 x i64> %val, <2 x i64> *%ptr) {
; CHECK-LABEL: f3:
; CHECK: vstbrg %v24, 0(%r2)
; CHECK: br %r14
  %swap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val)
  store <2 x i64> %swap, <2 x i64> *%ptr
  ret void
}

; Test the highest aligned in-range offset.
;
; The store targets element 255 of a <4 x i32> array, i.e. byte offset
; 255 * 16 = 4080 from %base.  The expected output encodes that offset
; directly as the displacement of the VSTBRF, with no separate address
; arithmetic.
define void @f4(<4 x i32> %val, <4 x i32> *%base) {
; CHECK-LABEL: f4:
; CHECK: vstbrf %v24, 4080(%r2)
; CHECK: br %r14
  %ptr = getelementptr <4 x i32>, <4 x i32> *%base, i64 255
  %swap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val)
  store <4 x i32> %swap, <4 x i32> *%ptr
  ret void
}

; Test the highest unaligned in-range offset.
;
; The address is formed in bytes (i8 GEP of 4095) and then cast to a vector
; pointer, so the store is explicitly marked "align 1".  The expected output
; still folds the whole offset into the VSTBRF displacement.
define void @f5(<4 x i32> %val, i8 *%base) {
; CHECK-LABEL: f5:
; CHECK: vstbrf %v24, 4095(%r2)
; CHECK: br %r14
  %addr = getelementptr i8, i8 *%base, i64 4095
  %ptr = bitcast i8 *%addr to <4 x i32> *
  %swap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val)
  store <4 x i32> %swap, <4 x i32> *%ptr, align 1
  ret void
}

; Test the next offset up, which requires separate address logic.
;
; Element 256 of a <4 x i32> array is byte offset 256 * 16 = 4096 from
; %base.  The expected output adds 4096 to the base register with AGHI first
; and then issues the VSTBRF with a zero displacement.
define void @f6(<4 x i32> %val, <4 x i32> *%base) {
; CHECK-LABEL: f6:
; CHECK: aghi %r2, 4096
; CHECK: vstbrf %v24, 0(%r2)
; CHECK: br %r14
  %ptr = getelementptr <4 x i32>, <4 x i32> *%base, i64 256
  %swap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val)
  store <4 x i32> %swap, <4 x i32> *%ptr
  ret void
}

; Test negative offsets, which also require separate address logic.
;
; Element -1 of a <4 x i32> array is byte offset -16 from %base.  The
; expected output subtracts 16 from the base register with AGHI first and
; then issues the VSTBRF with a zero displacement.
define void @f7(<4 x i32> %val, <4 x i32> *%base) {
; CHECK-LABEL: f7:
; CHECK: aghi %r2, -16
; CHECK: vstbrf %v24, 0(%r2)
; CHECK: br %r14
  %ptr = getelementptr <4 x i32>, <4 x i32> *%base, i64 -1
  %swap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val)
  store <4 x i32> %swap, <4 x i32> *%ptr
  ret void
}

; Check that indexes are allowed.
;
; The address is %base plus a run-time byte %index, cast to a vector pointer
; and stored to with "align 1".  The expected output uses both argument
; registers in the VSTBRF address operand (index %r3, base %r2) rather than
; adding them beforehand.
define void @f8(<4 x i32> %val, i8 *%base, i64 %index) {
; CHECK-LABEL: f8:
; CHECK: vstbrf %v24, 0(%r3,%r2)
; CHECK: br %r14
  %addr = getelementptr i8, i8 *%base, i64 %index
  %ptr = bitcast i8 *%addr to <4 x i32> *
  %swap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val)
  store <4 x i32> %swap, <4 x i32> *%ptr, align 1
  ret void
}

98