; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,GCN,SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,VI

declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) nounwind readnone
declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) nounwind readnone
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
declare i48 @llvm.bswap.i48(i48) #1
define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v1, s4, s4, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v0, 0, s0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
  store i32 %bswap, i32 addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v1, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v2, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v3, s4, s4, 24
; SI-NEXT:    v_bfi_b32 v1, s6, v1, v0
; SI-NEXT:    v_bfi_b32 v0, s6, v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v1, 0, s3, v0
; VI-NEXT:    v_perm_b32 v0, 0, s2, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
  store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s8, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v1, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v2, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v4, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v5, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v6, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v7, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v8, s4, s4, 24
; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s8, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s8, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s8, v8, v7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v3, 0, s11, v0
; VI-NEXT:    v_perm_b32 v2, 0, s10, v0
; VI-NEXT:    v_perm_b32 v1, 0, s9, v0
; VI-NEXT:    v_perm_b32 v0, 0, s8, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
  store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v8i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s12, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s3, s3, 8
; SI-NEXT:    v_alignbit_b32 v1, s3, s3, 24
; SI-NEXT:    v_alignbit_b32 v2, s2, s2, 8
; SI-NEXT:    v_alignbit_b32 v4, s2, s2, 24
; SI-NEXT:    v_alignbit_b32 v5, s1, s1, 8
; SI-NEXT:    v_alignbit_b32 v6, s1, s1, 24
; SI-NEXT:    v_alignbit_b32 v7, s0, s0, 8
; SI-NEXT:    v_alignbit_b32 v8, s0, s0, 24
; SI-NEXT:    v_alignbit_b32 v9, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v10, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v11, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v12, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v13, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v14, s5, s5, 24
; SI-NEXT:    v_alignbit_b32 v15, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v16, s4, s4, 24
; SI-NEXT:    v_bfi_b32 v3, s12, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s12, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s12, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s12, v8, v7
; SI-NEXT:    v_bfi_b32 v7, s12, v10, v9
; SI-NEXT:    v_bfi_b32 v6, s12, v12, v11
; SI-NEXT:    v_bfi_b32 v5, s12, v14, v13
; SI-NEXT:    v_bfi_b32 v4, s12, v16, v15
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v8i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v4, 0x10203
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v3, 0, s3, v4
; VI-NEXT:    v_perm_b32 v2, 0, s2, v4
; VI-NEXT:    v_perm_b32 v1, 0, s1, v4
; VI-NEXT:    v_perm_b32 v0, 0, s0, v4
; VI-NEXT:    v_perm_b32 v7, 0, s7, v4
; VI-NEXT:    v_perm_b32 v6, 0, s6, v4
; VI-NEXT:    v_perm_b32 v5, 0, s5, v4
; VI-NEXT:    v_perm_b32 v4, 0, s4, v4
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
  %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
  store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s6, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v1, s4, s4, 24
; SI-NEXT:    v_alignbit_b32 v2, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v3, s5, s5, 24
; SI-NEXT:    v_bfi_b32 v1, s6, v1, v0
; SI-NEXT:    v_bfi_b32 v0, s6, v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v1, 0, s2, v0
; VI-NEXT:    v_perm_b32 v0, 0, s3, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load i64, i64 addrspace(1)* %in, align 8
  %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
  store i64 %bswap, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s8, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v1, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v2, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v4, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v5, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v6, s4, s4, 24
; SI-NEXT:    v_alignbit_b32 v7, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v8, s5, s5, 24
; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s8, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s8, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s8, v8, v7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x10203
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v3, 0, s10, v0
; VI-NEXT:    v_perm_b32 v2, 0, s11, v0
; VI-NEXT:    v_perm_b32 v1, 0, s8, v0
; VI-NEXT:    v_perm_b32 v0, 0, s9, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
  store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s12, 0xff00ff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v0, s2, s2, 8
; SI-NEXT:    v_alignbit_b32 v1, s2, s2, 24
; SI-NEXT:    v_alignbit_b32 v2, s3, s3, 8
; SI-NEXT:    v_alignbit_b32 v4, s3, s3, 24
; SI-NEXT:    v_alignbit_b32 v5, s0, s0, 8
; SI-NEXT:    v_alignbit_b32 v6, s0, s0, 24
; SI-NEXT:    v_alignbit_b32 v7, s1, s1, 8
; SI-NEXT:    v_alignbit_b32 v8, s1, s1, 24
; SI-NEXT:    v_alignbit_b32 v9, s6, s6, 8
; SI-NEXT:    v_alignbit_b32 v10, s6, s6, 24
; SI-NEXT:    v_alignbit_b32 v11, s7, s7, 8
; SI-NEXT:    v_alignbit_b32 v12, s7, s7, 24
; SI-NEXT:    v_alignbit_b32 v13, s4, s4, 8
; SI-NEXT:    v_alignbit_b32 v14, s4, s4, 24
; SI-NEXT:    v_alignbit_b32 v15, s5, s5, 8
; SI-NEXT:    v_alignbit_b32 v16, s5, s5, 24
; SI-NEXT:    v_bfi_b32 v3, s12, v1, v0
; SI-NEXT:    v_bfi_b32 v2, s12, v4, v2
; SI-NEXT:    v_bfi_b32 v1, s12, v6, v5
; SI-NEXT:    v_bfi_b32 v0, s12, v8, v7
; SI-NEXT:    v_bfi_b32 v7, s12, v10, v9
; SI-NEXT:    v_bfi_b32 v6, s12, v12, v11
; SI-NEXT:    v_bfi_b32 v5, s12, v14, v13
; SI-NEXT:    v_bfi_b32 v4, s12, v16, v15
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_bswap_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v4, 0x10203
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_perm_b32 v3, 0, s2, v4
; VI-NEXT:    v_perm_b32 v2, 0, s3, v4
; VI-NEXT:    v_perm_b32 v1, 0, s0, v4
; VI-NEXT:    v_perm_b32 v0, 0, s1, v4
; VI-NEXT:    v_perm_b32 v7, 0, s6, v4
; VI-NEXT:    v_perm_b32 v6, 0, s7, v4
; VI-NEXT:    v_perm_b32 v5, 0, s4, v4
; VI-NEXT:    v_perm_b32 v4, 0, s5, v4
; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
  %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
  store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
  ret void
}

define float @missing_truncate_promote_bswap(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bswap:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: missing_truncate_promote_bswap:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}

define i16 @v_bswap_i16(i16 %src) {
; SI-LABEL: v_bswap_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  ret i16 %bswap
}

define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
; SI-LABEL: v_bswap_i16_zext_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16_zext_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  %zext = zext i16 %bswap to i32
  ret i32 %zext
}

define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
; SI-LABEL: v_bswap_i16_sext_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16_sext_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  %zext = sext i16 %bswap to i32
  ret i32 %zext
}

define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; SI-LABEL: v_bswap_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v2, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_alignbit_b32 v3, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    v_bfi_b32 v1, s4, v1, v2
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x2030001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
  ret <2 x i16> %bswap
}

define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
; SI-LABEL: v_bswap_v3i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_alignbit_b32 v4, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    v_alignbit_b32 v5, v2, v2, 8
; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT:    v_bfi_b32 v1, s4, v1, v3
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v4
; SI-NEXT:    v_bfi_b32 v2, s4, v2, v5
; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v3i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x2030001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    v_perm_b32 v1, 0, v1, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
  ret <3 x i16> %bswap
}

define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
; SI-LABEL: v_bswap_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v4, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    s_mov_b32 s5, 0xffff0000
; SI-NEXT:    v_alignbit_b32 v5, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    v_alignbit_b32 v6, v3, v3, 8
; SI-NEXT:    v_alignbit_b32 v3, v3, v3, 24
; SI-NEXT:    v_alignbit_b32 v7, v2, v2, 8
; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT:    v_bfi_b32 v1, s4, v1, v4
; SI-NEXT:    v_bfi_b32 v0, s4, v0, v5
; SI-NEXT:    v_bfi_b32 v3, s4, v3, v6
; SI-NEXT:    v_bfi_b32 v2, s4, v2, v7
; SI-NEXT:    v_and_b32_e32 v4, s5, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v3, s5, v3
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v4
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x2030001
; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
; VI-NEXT:    v_perm_b32 v1, 0, v1, s4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %src)
  ret <4 x i16> %bswap
}

define i64 @v_bswap_i48(i64 %src) {
; SI-LABEL: v_bswap_i48:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT:    s_mov_b32 s4, 0xff00ff
; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT:    v_bfi_b32 v2, s4, v0, v2
; SI-NEXT:    v_bfi_b32 v0, s4, v1, v3
; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i48:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x10203
; VI-NEXT:    v_perm_b32 v2, 0, v0, s4
; VI-NEXT:    v_perm_b32 v0, 0, v1, s4
; VI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i64 %src to i48
  %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
  %zext = zext i48 %bswap to i64
  ret i64 %zext
}
