; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,GCN,SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,VI

; Tests codegen for the llvm.bswap.* intrinsic family on AMDGPU.
; Two lowering strategies are pinned by the checks below:
;   SI (default amdgcn):  two v_alignbit_b32 rotates (by 8 and 24) merged
;                         with v_bfi_b32 under the 0xff00ff byte mask.
;   VI (tonga):           a single v_perm_b32 with byte-selector constants
;                         (0x10203 for 32-bit, 0x2030001 / 0xc0c0001 for
;                         16-bit element swaps).
; The ; *-NEXT: lines are machine-generated FileCheck assertions; do not
; hand-edit them — rerun update_llc_test_checks.py instead.

declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) nounwind readnone
declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) nounwind readnone
declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>) nounwind readnone
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
; NOTE(review): attribute group #1 is referenced here but not defined in this
; chunk — confirm it exists elsewhere in the file.
declare i48 @llvm.bswap.i48(i48) #1

; Scalar i32 bswap through global memory (kernel, scalar loads/stores).
define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s4, s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v0, s4, s4, 8
; SI-NEXT: v_alignbit_b32 v1, s4, s4, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: v_bfi_b32 v0, s4, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_bswap_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v0, 0, s0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
  store i32 %bswap, i32 addrspace(1)* %out, align 4
  ret void
}

; <2 x i32> bswap: each 32-bit lane swapped independently.
define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s6, 0xff00ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v0, s5, s5, 8
; SI-NEXT: v_alignbit_b32 v1, s5, s5, 24
; SI-NEXT: v_alignbit_b32 v2, s4, s4, 8
; SI-NEXT: v_alignbit_b32 v3, s4, s4, 24
; SI-NEXT: v_bfi_b32 v1, s6, v1, v0
; SI-NEXT: v_bfi_b32 v0, s6, v3, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_bswap_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v1, 0, s3, v0
; VI-NEXT: v_perm_b32 v0, 0, s2, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
  %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
  store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> bswap: four independent 32-bit lane swaps, one dwordx4 store.
define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s8, 0xff00ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v0, s7, s7, 8
; SI-NEXT: v_alignbit_b32 v1, s7, s7, 24
; SI-NEXT: v_alignbit_b32 v2, s6, s6, 8
; SI-NEXT: v_alignbit_b32 v4, s6, s6, 24
; SI-NEXT: v_alignbit_b32 v5, s5, s5, 8
; SI-NEXT: v_alignbit_b32 v6, s5, s5, 24
; SI-NEXT: v_alignbit_b32 v7, s4, s4, 8
; SI-NEXT: v_alignbit_b32 v8, s4, s4, 24
; SI-NEXT: v_bfi_b32 v3, s8, v1, v0
; SI-NEXT: v_bfi_b32 v2, s8, v4, v2
; SI-NEXT: v_bfi_b32 v1, s8, v6, v5
; SI-NEXT: v_bfi_b32 v0, s8, v8, v7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_bswap_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s11, v0
; VI-NEXT: v_perm_b32 v2, 0, s10, v0
; VI-NEXT: v_perm_b32 v1, 0, s9, v0
; VI-NEXT: v_perm_b32 v0, 0, s8, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
  store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; <8 x i32> bswap: checks the split into two dwordx4 stores (offset:16 + base).
define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v8i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s12, 0xff00ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v0, s3, s3, 8
; SI-NEXT: v_alignbit_b32 v1, s3, s3, 24
; SI-NEXT: v_alignbit_b32 v2, s2, s2, 8
; SI-NEXT: v_alignbit_b32 v4, s2, s2, 24
; SI-NEXT: v_alignbit_b32 v5, s1, s1, 8
; SI-NEXT: v_alignbit_b32 v6, s1, s1, 24
; SI-NEXT: v_alignbit_b32 v7, s0, s0, 8
; SI-NEXT: v_alignbit_b32 v8, s0, s0, 24
; SI-NEXT: v_alignbit_b32 v9, s7, s7, 8
; SI-NEXT: v_alignbit_b32 v10, s7, s7, 24
; SI-NEXT: v_alignbit_b32 v11, s6, s6, 8
; SI-NEXT: v_alignbit_b32 v12, s6, s6, 24
; SI-NEXT: v_alignbit_b32 v13, s5, s5, 8
; SI-NEXT: v_alignbit_b32 v14, s5, s5, 24
; SI-NEXT: v_alignbit_b32 v15, s4, s4, 8
; SI-NEXT: v_alignbit_b32 v16, s4, s4, 24
; SI-NEXT: v_bfi_b32 v3, s12, v1, v0
; SI-NEXT: v_bfi_b32 v2, s12, v4, v2
; SI-NEXT: v_bfi_b32 v1, s12, v6, v5
; SI-NEXT: v_bfi_b32 v0, s12, v8, v7
; SI-NEXT: v_bfi_b32 v7, s12, v10, v9
; SI-NEXT: v_bfi_b32 v6, s12, v12, v11
; SI-NEXT: v_bfi_b32 v5, s12, v14, v13
; SI-NEXT: v_bfi_b32 v4, s12, v16, v15
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_bswap_v8i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v4, 0x10203
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s3, v4
; VI-NEXT: v_perm_b32 v2, 0, s2, v4
; VI-NEXT: v_perm_b32 v1, 0, s1, v4
; VI-NEXT: v_perm_b32 v0, 0, s0, v4
; VI-NEXT: v_perm_b32 v7, 0, s7, v4
; VI-NEXT: v_perm_b32 v6, 0, s6, v4
; VI-NEXT: v_perm_b32 v5, 0, s5, v4
; VI-NEXT: v_perm_b32 v4, 0, s4, v4
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: s_endpgm
  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
  %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
  store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

; i64 bswap: each 32-bit half is byte-swapped and the halves are exchanged
; (note the VI checks write the swapped high dword s3 into the low result v0).
define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s6, 0xff00ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v0, s4, s4, 8
; SI-NEXT: v_alignbit_b32 v1, s4, s4, 24
; SI-NEXT: v_alignbit_b32 v2, s5, s5, 8
; SI-NEXT: v_alignbit_b32 v3, s5, s5, 24
; SI-NEXT: v_bfi_b32 v1, s6, v1, v0
; SI-NEXT: v_bfi_b32 v0, s6, v3, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_bswap_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v1, 0, s2, v0
; VI-NEXT: v_perm_b32 v0, 0, s3, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
  %val = load i64, i64 addrspace(1)* %in, align 8
  %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
  store i64 %bswap, i64 addrspace(1)* %out, align 8
  ret void
}

; <2 x i64> bswap: per-element 64-bit swaps (dword halves exchanged per lane).
define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s8, 0xff00ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v0, s6, s6, 8
; SI-NEXT: v_alignbit_b32 v1, s6, s6, 24
; SI-NEXT: v_alignbit_b32 v2, s7, s7, 8
; SI-NEXT: v_alignbit_b32 v4, s7, s7, 24
; SI-NEXT: v_alignbit_b32 v5, s4, s4, 8
; SI-NEXT: v_alignbit_b32 v6, s4, s4, 24
; SI-NEXT: v_alignbit_b32 v7, s5, s5, 8
; SI-NEXT: v_alignbit_b32 v8, s5, s5, 24
; SI-NEXT: v_bfi_b32 v3, s8, v1, v0
; SI-NEXT: v_bfi_b32 v2, s8, v4, v2
; SI-NEXT: v_bfi_b32 v1, s8, v6, v5
; SI-NEXT: v_bfi_b32 v0, s8, v8, v7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_bswap_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s10, v0
; VI-NEXT: v_perm_b32 v2, 0, s11, v0
; VI-NEXT: v_perm_b32 v1, 0, s8, v0
; VI-NEXT: v_perm_b32 v0, 0, s9, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
  %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
  store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
  ret void
}

; <4 x i64> bswap: four 64-bit element swaps, written with two dwordx4 stores.
define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
; SI-LABEL: test_bswap_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s12, 0xff00ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v0, s2, s2, 8
; SI-NEXT: v_alignbit_b32 v1, s2, s2, 24
; SI-NEXT: v_alignbit_b32 v2, s3, s3, 8
; SI-NEXT: v_alignbit_b32 v4, s3, s3, 24
; SI-NEXT: v_alignbit_b32 v5, s0, s0, 8
; SI-NEXT: v_alignbit_b32 v6, s0, s0, 24
; SI-NEXT: v_alignbit_b32 v7, s1, s1, 8
; SI-NEXT: v_alignbit_b32 v8, s1, s1, 24
; SI-NEXT: v_alignbit_b32 v9, s6, s6, 8
; SI-NEXT: v_alignbit_b32 v10, s6, s6, 24
; SI-NEXT: v_alignbit_b32 v11, s7, s7, 8
; SI-NEXT: v_alignbit_b32 v12, s7, s7, 24
; SI-NEXT: v_alignbit_b32 v13, s4, s4, 8
; SI-NEXT: v_alignbit_b32 v14, s4, s4, 24
; SI-NEXT: v_alignbit_b32 v15, s5, s5, 8
; SI-NEXT: v_alignbit_b32 v16, s5, s5, 24
; SI-NEXT: v_bfi_b32 v3, s12, v1, v0
; SI-NEXT: v_bfi_b32 v2, s12, v4, v2
; SI-NEXT: v_bfi_b32 v1, s12, v6, v5
; SI-NEXT: v_bfi_b32 v0, s12, v8, v7
; SI-NEXT: v_bfi_b32 v7, s12, v10, v9
; SI-NEXT: v_bfi_b32 v6, s12, v12, v11
; SI-NEXT: v_bfi_b32 v5, s12, v14, v13
; SI-NEXT: v_bfi_b32 v4, s12, v16, v15
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_bswap_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v4, 0x10203
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s2, v4
; VI-NEXT: v_perm_b32 v2, 0, s3, v4
; VI-NEXT: v_perm_b32 v1, 0, s0, v4
; VI-NEXT: v_perm_b32 v0, 0, s1, v4
; VI-NEXT: v_perm_b32 v7, 0, s6, v4
; VI-NEXT: v_perm_b32 v6, 0, s7, v4
; VI-NEXT: v_perm_b32 v5, 0, s4, v4
; VI-NEXT: v_perm_b32 v4, 0, s5, v4
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: s_endpgm
  %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
  %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
  store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
  ret void
}

; i16 bswap fused with trunc + bitcast-to-half + fpext; the function name
; records the known-missed truncate during promotion being tested here.
define float @missing_truncate_promote_bswap(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bswap:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: missing_truncate_promote_bswap:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0xc0c0001
; VI-NEXT: v_perm_b32 v0, 0, v0, s4
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bswap.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}

; Plain i16 bswap with a VGPR argument (non-kernel calling convention).
define i16 @v_bswap_i16(i16 %src) {
; SI-LABEL: v_bswap_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0xc0c0001
; VI-NEXT: v_perm_b32 v0, 0, v0, s4
; VI-NEXT: s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  ret i16 %bswap
}

; i16 bswap + zext to i32: no extra extension instruction is expected
; (the checks show the same sequence as the plain i16 case).
define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
; SI-LABEL: v_bswap_i16_zext_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16_zext_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0xc0c0001
; VI-NEXT: v_perm_b32 v0, 0, v0, s4
; VI-NEXT: s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  %zext = zext i16 %bswap to i32
  ret i32 %zext
}

; i16 bswap + sext to i32: SI folds the extension into v_ashrrev_i32,
; VI needs an explicit v_bfe_i32 sign-extract.
; NOTE(review): the value is named %zext but performs a sign extension;
; misleading name kept as-is since checks were generated against it.
define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
; SI-LABEL: v_bswap_i16_sext_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v1, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i16_sext_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0xc0c0001
; VI-NEXT: v_perm_b32 v0, 0, v0, s4
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: s_setpc_b64 s[30:31]
  %bswap = call i16 @llvm.bswap.i16(i16 %src)
  %zext = sext i16 %bswap to i32
  ret i32 %zext
}

; <2 x i16> bswap: VI does both halves with one v_perm_b32 (0x2030001);
; SI works on unpacked 32-bit lanes and repacks.
define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
; SI-LABEL: v_bswap_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v2, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_bfi_b32 v1, s4, v1, v2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x2030001
; VI-NEXT: v_perm_b32 v0, 0, v0, s4
; VI-NEXT: s_setpc_b64 s[30:31]
  %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
  ret <2 x i16> %bswap
}

; <3 x i16> bswap: odd element count — VI still needs only two v_perm_b32s.
define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
; SI-LABEL: v_bswap_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: v_alignbit_b32 v4, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8
; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
; SI-NEXT: v_bfi_b32 v0, s4, v0, v4
; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x2030001
; VI-NEXT: v_perm_b32 v0, 0, v0, s4
; VI-NEXT: v_perm_b32 v1, 0, v1, s4
; VI-NEXT: s_setpc_b64 s[30:31]
  %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
  ret <3 x i16> %bswap
}

; <4 x i16> bswap: VI handles the four elements with two packed v_perm_b32s.
define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
; SI-LABEL: v_bswap_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: s_mov_b32 s5, 0xffff0000
; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8
; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
; SI-NEXT: v_alignbit_b32 v7, v2, v2, 8
; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
; SI-NEXT: v_bfi_b32 v0, s4, v0, v5
; SI-NEXT: v_bfi_b32 v3, s4, v3, v6
; SI-NEXT: v_bfi_b32 v2, s4, v2, v7
; SI-NEXT: v_and_b32_e32 v4, s5, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v3, s5, v3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x2030001
; VI-NEXT: v_perm_b32 v0, 0, v0, s4
; VI-NEXT: v_perm_b32 v1, 0, v1, s4
; VI-NEXT: s_setpc_b64 s[30:31]
  %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %src)
  ret <4 x i16> %bswap
}

; Non-power-of-two width: i48 bswap via trunc from / zext back to i64.
; Both targets swap the two dwords then realign with v_alignbit/v_lshrrev.
define i64 @v_bswap_i48(i64 %src) {
; SI-LABEL: v_bswap_i48:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: v_bfi_b32 v2, s4, v0, v2
; SI-NEXT: v_bfi_b32 v0, s4, v1, v3
; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_i48:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x10203
; VI-NEXT: v_perm_b32 v2, 0, v0, s4
; VI-NEXT: v_perm_b32 v0, 0, v1, s4
; VI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: s_setpc_b64 s[30:31]
  %trunc = trunc i64 %src to i48
  %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
  %zext = zext i48 %bswap to i64
  ret i64 %zext
}