1; Test replications of a byte-swapped scalar memory value.
2;
3; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s
4
5declare i16 @llvm.bswap.i16(i16)
6declare i32 @llvm.bswap.i32(i32)
7declare i64 @llvm.bswap.i64(i64)
8declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
9declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
10declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
11
; v8i16, no displacement: an i16 is loaded, byte-swapped as a scalar and then
; splatted into all eight lanes.  Expect VLBRREPH addressing the bare base.
define <8 x i16> @f1(i16* %ptr) {
; CHECK-LABEL: f1:
; CHECK: vlbrreph %v24, 0(%r2)
; CHECK: br %r14
  %elt = load i16, i16* %ptr
  %rev = call i16 @llvm.bswap.i16(i16 %elt)
  %lane0 = insertelement <8 x i16> undef, i16 %rev, i32 0
  %splat = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}
24
; v8i16 at the highest in-range offset: element 2047 of an i16 array is byte
; offset 4094, which the test expects to be folded into the VLBRREPH
; displacement.
define <8 x i16> @f2(i16* %base) {
; CHECK-LABEL: f2:
; CHECK: vlbrreph %v24, 4094(%r2)
; CHECK: br %r14
  %addr = getelementptr i16, i16* %base, i64 2047
  %elt = load i16, i16* %addr
  %rev = call i16 @llvm.bswap.i16(i16 %elt)
  %lane0 = insertelement <8 x i16> undef, i16 %rev, i32 0
  %splat = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}
38
; v8i16 at the first out-of-range offset: element 2048 is byte offset 4096,
; so the test expects the base to be adjusted with AGHI and the load to use a
; zero displacement.
define <8 x i16> @f3(i16* %base) {
; CHECK-LABEL: f3:
; CHECK: aghi %r2, 4096
; CHECK: vlbrreph %v24, 0(%r2)
; CHECK: br %r14
  %addr = getelementptr i16, i16* %base, i64 2048
  %elt = load i16, i16* %addr
  %rev = call i16 @llvm.bswap.i16(i16 %elt)
  %lane0 = insertelement <8 x i16> undef, i16 %rev, i32 0
  %splat = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}
53
; v8i16 where the byte swap is applied to the whole vector after the splat
; (rather than to the scalar before it).  The same VLBRREPH is still expected.
define <8 x i16> @f4(i16* %ptr) {
; CHECK-LABEL: f4:
; CHECK: vlbrreph %v24, 0(%r2)
; CHECK: br %r14
  %elt = load i16, i16* %ptr
  %lane0 = insertelement <8 x i16> undef, i16 %elt, i32 0
  %splat = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
  %rev = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %splat)
  ret <8 x i16> %rev
}
66
; v4i32, no displacement: an i32 is loaded, byte-swapped as a scalar and then
; splatted into all four lanes.  Expect VLBRREPF addressing the bare base.
define <4 x i32> @f5(i32* %ptr) {
; CHECK-LABEL: f5:
; CHECK: vlbrrepf %v24, 0(%r2)
; CHECK: br %r14
  %elt = load i32, i32* %ptr
  %rev = call i32 @llvm.bswap.i32(i32 %elt)
  %lane0 = insertelement <4 x i32> undef, i32 %rev, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
79
; v4i32 at the highest in-range offset: element 1023 of an i32 array is byte
; offset 4092, which the test expects to be folded into the VLBRREPF
; displacement.
define <4 x i32> @f6(i32* %base) {
; CHECK-LABEL: f6:
; CHECK: vlbrrepf %v24, 4092(%r2)
; CHECK: br %r14
  %addr = getelementptr i32, i32* %base, i64 1023
  %elt = load i32, i32* %addr
  %rev = call i32 @llvm.bswap.i32(i32 %elt)
  %lane0 = insertelement <4 x i32> undef, i32 %rev, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
93
; v4i32 at the first out-of-range offset: element 1024 is byte offset 4096,
; so the test expects the base to be adjusted with AGHI and the load to use a
; zero displacement.
define <4 x i32> @f7(i32* %base) {
; CHECK-LABEL: f7:
; CHECK: aghi %r2, 4096
; CHECK: vlbrrepf %v24, 0(%r2)
; CHECK: br %r14
  %addr = getelementptr i32, i32* %base, i64 1024
  %elt = load i32, i32* %addr
  %rev = call i32 @llvm.bswap.i32(i32 %elt)
  %lane0 = insertelement <4 x i32> undef, i32 %rev, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
108
; v4i32 where the byte swap is applied to the whole vector after the splat
; (rather than to the scalar before it).  The same VLBRREPF is still expected.
define <4 x i32> @f8(i32* %ptr) {
; CHECK-LABEL: f8:
; CHECK: vlbrrepf %v24, 0(%r2)
; CHECK: br %r14
  %elt = load i32, i32* %ptr
  %lane0 = insertelement <4 x i32> undef, i32 %elt, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  %rev = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %splat)
  ret <4 x i32> %rev
}
121
; v2i64, no displacement: an i64 is loaded, byte-swapped as a scalar and then
; splatted into both lanes.  Expect VLBRREPG addressing the bare base.
define <2 x i64> @f9(i64* %ptr) {
; CHECK-LABEL: f9:
; CHECK: vlbrrepg %v24, 0(%r2)
; CHECK: br %r14
  %elt = load i64, i64* %ptr
  %rev = call i64 @llvm.bswap.i64(i64 %elt)
  %lane0 = insertelement <2 x i64> undef, i64 %rev, i32 0
  %splat = shufflevector <2 x i64> %lane0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %splat
}
134
; v2i64 at the highest in-range offset: element 511 of an i64 array is byte
; offset 4088, which the test expects to be folded into the VLBRREPG
; displacement.  The GEP index is written as i64, matching the index type the
; v8i16 and v4i32 offset tests use; the computed address is the same as with
; an i32 index.
define <2 x i64> @f10(i64 *%base) {
; CHECK-LABEL: f10:
; CHECK: vlbrrepg %v24, 4088(%r2)
; CHECK: br %r14
  %ptr = getelementptr i64, i64 *%base, i64 511
  %scalar = load i64, i64 *%ptr
  %swap = call i64 @llvm.bswap.i64(i64 %scalar)
  %val = insertelement <2 x i64> undef, i64 %swap, i32 0
  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
                       <2 x i32> zeroinitializer
  ret <2 x i64> %ret
}
148
; v2i64 at the first out-of-range offset: element 512 is byte offset 4096, so
; the test expects the base to be adjusted with AGHI and the load to use a
; zero displacement.  The GEP index is written as i64, matching the index type
; the v8i16 and v4i32 offset tests use; the computed address is the same as
; with an i32 index.
define <2 x i64> @f11(i64 *%base) {
; CHECK-LABEL: f11:
; CHECK: aghi %r2, 4096
; CHECK: vlbrrepg %v24, 0(%r2)
; CHECK: br %r14
  %ptr = getelementptr i64, i64 *%base, i64 512
  %scalar = load i64, i64 *%ptr
  %swap = call i64 @llvm.bswap.i64(i64 %scalar)
  %val = insertelement <2 x i64> undef, i64 %swap, i32 0
  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
                       <2 x i32> zeroinitializer
  ret <2 x i64> %ret
}
163
; v2i64 where the byte swap is applied to the whole vector after the splat
; (rather than to the scalar before it).  The same VLBRREPG is still expected.
define <2 x i64> @f12(i64* %ptr) {
; CHECK-LABEL: f12:
; CHECK: vlbrrepg %v24, 0(%r2)
; CHECK: br %r14
  %elt = load i64, i64* %ptr
  %lane0 = insertelement <2 x i64> undef, i64 %elt, i32 0
  %splat = shufflevector <2 x i64> %lane0, <2 x i64> undef, <2 x i32> zeroinitializer
  %rev = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %splat)
  ret <2 x i64> %rev
}
176
; v8i16 with a variable index plus a constant offset.  The test expects the
; index to be scaled by a 1-bit left shift (SLLG) into a scratch register and
; the constant part (1023 elements = 2046 bytes) to be used as the VLBRREPH
; displacement, with the scaled index and the base forming the register pair.
define <8 x i16> @f13(i16* %base, i64 %index) {
; CHECK-LABEL: f13:
; CHECK: sllg [[REG:%r[1-5]]], %r3, 1
; CHECK: vlbrreph %v24, 2046([[REG]],%r2)
; CHECK: br %r14
  %indexed = getelementptr i16, i16* %base, i64 %index
  %addr = getelementptr i16, i16* %indexed, i64 1023
  %elt = load i16, i16* %addr
  %rev = call i16 @llvm.bswap.i16(i16 %elt)
  %lane0 = insertelement <8 x i16> undef, i16 %rev, i32 0
  %splat = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}
192
193