; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

define i32 @test_rev_w(i32 %a) nounwind {
entry:
; CHECK-LABEL: test_rev_w:
; CHECK: rev w0, w0
  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %0
}

define i64 @test_rev_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev_x:
; CHECK: rev x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %0
}

; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16-bits
; of %a are zero. This optimizes rev + lsr 16 to rev16.
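; An illustrative trace (assumed value, not checked by the test): for
; %a = 0xABCD, zext gives 0x0000ABCD, bswap gives 0xCDAB0000, and lshr 16
; gives 0x0000CDAB, which is exactly rev16 of 0x0000ABCD.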
define i32 @test_rev_w_srl16(i16 %a) {
entry:
; CHECK-LABEL: test_rev_w_srl16:
; CHECK: and [[REG:w[0-9]+]], w0, #0xffff
; CHECK: rev16 w0, [[REG]]
; CHECK-NOT: lsr
  %0 = zext i16 %a to i32
  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
  %2 = lshr i32 %1, 16
  ret i32 %2
}

; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32-bits
; of %a are zero. This optimizes rev + lsr 32 to rev32.
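; An illustrative trace (assumed value, not checked by the test): for
; %a = 0xAABBCCDD, zext gives 0x00000000AABBCCDD, bswap gives
; 0xDDCCBBAA00000000, and lshr 32 gives 0x00000000DDCCBBAA, which is
; exactly rev32 of the zero-extended input.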
define i64 @test_rev_x_srl32(i32 %a) {
entry:
; CHECK-LABEL: test_rev_x_srl32:
; CHECK: rev32 x0, {{x[0-9]+}}
; CHECK-NOT: lsr
  %0 = zext i32 %a to i64
  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
  %2 = lshr i64 %1, 32
  ret i64 %2
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone

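; The shift/mask/or sequence below hand-implements rev16: it swaps the two
; bytes within each 16-bit half of %X, so it should be selected to a single
; rev16 instruction.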
define i32 @test_rev16_w(i32 %X) nounwind {
entry:
; CHECK-LABEL: test_rev16_w:
; CHECK: rev16 w0, w0
  %tmp1 = lshr i32 %X, 8
  %X15 = bitcast i32 %X to i32
  %tmp4 = shl i32 %X15, 8
  %tmp2 = and i32 %tmp1, 16711680
  %tmp5 = and i32 %tmp4, -16777216
  %tmp9 = and i32 %tmp1, 255
  %tmp13 = and i32 %tmp4, 65280
  %tmp6 = or i32 %tmp5, %tmp2
  %tmp10 = or i32 %tmp6, %tmp13
  %tmp14 = or i32 %tmp10, %tmp9
  ret i32 %tmp14
}

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
;   01234567 ->(bswap) 76543210 ->(rotr) 10765432
;   01234567 ->(rev16) 10325476
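; An illustrative trace: for 0x0011223344556677, bswap + rotr 16 yields
; 0x1100776655443322, whereas rev16 yields 0x1100332255447766.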
define i64 @test_rev16_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev16_x:
; CHECK-NOT: rev16 x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 16
  %2 = shl i64 %0, 48
  %3 = or i64 %1, %2
  ret i64 %3
}

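; 64-bit REV32, by contrast, *is* a byte swap followed by a 32-bit rotation,
; so the rotate-by-32 pattern below should select it:
;   01234567 ->(bswap) 76543210 ->(rotr) 32107654
;   01234567 ->(rev32) 32107654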
define i64 @test_rev32_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev32_x:
; CHECK: rev32 x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 32
  %2 = shl i64 %0, 32
  %3 = or i64 %1, %2
  ret i64 %3
}

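; Vector reverse shuffles should be selected to REV64/REV32/REV16, with the
; lane arrangement taken from the suffix (e.g. rev64.8b reverses the bytes
; within each 64-bit doubleword).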
define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8:
;CHECK: rev64.8b
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
	ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev64D16:
;CHECK: rev64.4h
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
	ret <4 x i16> %tmp2
}

define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: test_vrev64D32:
;CHECK: rev64.2s
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
	ret <2 x i32> %tmp2
}

define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
;CHECK-LABEL: test_vrev64Df:
;CHECK: rev64.2s
	%tmp1 = load <2 x float>, <2 x float>* %A
	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
	ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q8:
;CHECK: rev64.16b
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
	ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q16:
;CHECK: rev64.8h
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
	ret <8 x i16> %tmp2
}

define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q32:
;CHECK: rev64.4s
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
;CHECK-LABEL: test_vrev64Qf:
;CHECK: rev64.4s
	%tmp1 = load <4 x float>, <4 x float>* %A
	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	ret <4 x float> %tmp2
}

define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev32D8:
;CHECK: rev32.8b
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
	ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32D16:
;CHECK: rev32.4h
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
	ret <4 x i16> %tmp2
}

define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q8:
;CHECK: rev32.16b
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
	ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q16:
;CHECK: rev32.8h
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	ret <8 x i16> %tmp2
}

define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev16D8:
;CHECK: rev16.8b
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
	ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev16Q8:
;CHECK: rev16.16b
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
	ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:

define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8_undef:
;CHECK: rev64.8b
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
	ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q16_undef:
;CHECK: rev32.8h
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
	ret <8 x i16> %tmp2
}

; vrev <4 x i16> should use REV32 and not REV64
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK: ldr [[DEST:q[0-9]+]],
; CHECK: st1.h
; CHECK: st1.h
entry:
  %0 = bitcast <4 x i16>* %source to <8 x i16>*
  %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
  ret void
}

; Test vrev of float4
define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK: ldr [[DEST:q[0-9]+]],
; CHECK: rev64.4s
entry:
  %0 = bitcast float* %source to <4 x float>*
  %tmp2 = load <4 x float>, <4 x float>* %0, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
  ret void
}

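; A bswap of <4 x i32> reverses the bytes within each 32-bit lane, which is
; what rev32.16b does, so no additional rev instruction should be emitted.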
define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK: rev32.16b
; CHECK-NOT: rev
; CHECK: ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone