1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,SI 3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,VI 4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,GFX9 5; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s -check-prefixes=R600 6 7declare i32 @llvm.fshr.i32(i32, i32, i32) 8declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) 9declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) 10declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) 11declare i16 @llvm.fshr.i16(i16, i16, i16) 12declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) 13declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) 14declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) 15declare i64 @llvm.fshr.i64(i64, i64, i64) 16declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) 17declare i24 @llvm.fshr.i24(i24, i24, i24) 18declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) 19 20define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) { 21; SI-LABEL: fshr_i32: 22; SI: ; %bb.0: ; %entry 23; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 24; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 25; SI-NEXT: s_load_dword s0, s[0:1], 0xd 26; SI-NEXT: s_mov_b32 s7, 0xf000 27; SI-NEXT: s_mov_b32 s6, -1 28; SI-NEXT: s_waitcnt lgkmcnt(0) 29; SI-NEXT: v_mov_b32_e32 v0, s3 30; SI-NEXT: v_mov_b32_e32 v1, s0 31; SI-NEXT: v_alignbit_b32 v0, s2, v0, v1 32; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 33; SI-NEXT: s_endpgm 34; 35; VI-LABEL: fshr_i32: 36; VI: ; %bb.0: ; %entry 37; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 38; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 39; VI-NEXT: s_load_dword s0, s[0:1], 
0x34 40; VI-NEXT: s_waitcnt lgkmcnt(0) 41; VI-NEXT: v_mov_b32_e32 v0, s5 42; VI-NEXT: v_mov_b32_e32 v1, s0 43; VI-NEXT: v_alignbit_b32 v2, s4, v0, v1 44; VI-NEXT: v_mov_b32_e32 v0, s2 45; VI-NEXT: v_mov_b32_e32 v1, s3 46; VI-NEXT: flat_store_dword v[0:1], v2 47; VI-NEXT: s_endpgm 48; 49; GFX9-LABEL: fshr_i32: 50; GFX9: ; %bb.0: ; %entry 51; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 52; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 53; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 54; GFX9-NEXT: v_mov_b32_e32 v0, 0 55; GFX9-NEXT: s_waitcnt lgkmcnt(0) 56; GFX9-NEXT: v_mov_b32_e32 v1, s5 57; GFX9-NEXT: v_mov_b32_e32 v2, s0 58; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2 59; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 60; GFX9-NEXT: s_endpgm 61; 62; R600-LABEL: fshr_i32: 63; R600: ; %bb.0: ; %entry 64; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 65; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 66; R600-NEXT: CF_END 67; R600-NEXT: PAD 68; R600-NEXT: ALU clause starting at 4: 69; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 70; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 71; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X, 72entry: 73 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) 74 store i32 %0, i32 addrspace(1)* %in 75 ret void 76} 77 78define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { 79; SI-LABEL: fshr_i32_imm: 80; SI: ; %bb.0: ; %entry 81; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 82; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 83; SI-NEXT: s_mov_b32 s7, 0xf000 84; SI-NEXT: s_mov_b32 s6, -1 85; SI-NEXT: s_waitcnt lgkmcnt(0) 86; SI-NEXT: v_mov_b32_e32 v0, s1 87; SI-NEXT: v_alignbit_b32 v0, s0, v0, 7 88; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 89; SI-NEXT: s_endpgm 90; 91; VI-LABEL: fshr_i32_imm: 92; VI: ; %bb.0: ; %entry 93; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 94; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 95; VI-NEXT: s_waitcnt lgkmcnt(0) 96; VI-NEXT: v_mov_b32_e32 v0, s1 97; VI-NEXT: 
v_alignbit_b32 v2, s0, v0, 7 98; VI-NEXT: v_mov_b32_e32 v0, s2 99; VI-NEXT: v_mov_b32_e32 v1, s3 100; VI-NEXT: flat_store_dword v[0:1], v2 101; VI-NEXT: s_endpgm 102; 103; GFX9-LABEL: fshr_i32_imm: 104; GFX9: ; %bb.0: ; %entry 105; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 106; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 107; GFX9-NEXT: v_mov_b32_e32 v0, 0 108; GFX9-NEXT: s_waitcnt lgkmcnt(0) 109; GFX9-NEXT: v_mov_b32_e32 v1, s1 110; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 7 111; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 112; GFX9-NEXT: s_endpgm 113; 114; R600-LABEL: fshr_i32_imm: 115; R600: ; %bb.0: ; %entry 116; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 117; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 118; R600-NEXT: CF_END 119; R600-NEXT: PAD 120; R600-NEXT: ALU clause starting at 4: 121; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 122; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 123; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x, 124; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 125entry: 126 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7) 127 store i32 %0, i32 addrspace(1)* %in 128 ret void 129} 130 131define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { 132; SI-LABEL: fshr_v2i32: 133; SI: ; %bb.0: ; %entry 134; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 135; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 136; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 137; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf 138; SI-NEXT: s_mov_b32 s7, 0xf000 139; SI-NEXT: s_mov_b32 s6, -1 140; SI-NEXT: s_waitcnt lgkmcnt(0) 141; SI-NEXT: v_mov_b32_e32 v0, s9 142; SI-NEXT: v_mov_b32_e32 v1, s1 143; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1 144; SI-NEXT: v_mov_b32_e32 v0, s8 145; SI-NEXT: v_mov_b32_e32 v2, s0 146; SI-NEXT: v_alignbit_b32 v0, s2, v0, v2 147; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 148; SI-NEXT: s_endpgm 149; 150; VI-LABEL: fshr_v2i32: 151; VI: ; %bb.0: ; %entry 152; 
VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 153; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 154; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 155; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 156; VI-NEXT: s_waitcnt lgkmcnt(0) 157; VI-NEXT: v_mov_b32_e32 v0, s7 158; VI-NEXT: v_mov_b32_e32 v1, s1 159; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 160; VI-NEXT: v_mov_b32_e32 v0, s6 161; VI-NEXT: v_mov_b32_e32 v2, s0 162; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2 163; VI-NEXT: v_mov_b32_e32 v2, s2 164; VI-NEXT: v_mov_b32_e32 v3, s3 165; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 166; VI-NEXT: s_endpgm 167; 168; GFX9-LABEL: fshr_v2i32: 169; GFX9: ; %bb.0: ; %entry 170; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 171; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 172; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 173; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c 174; GFX9-NEXT: v_mov_b32_e32 v2, 0 175; GFX9-NEXT: s_waitcnt lgkmcnt(0) 176; GFX9-NEXT: v_mov_b32_e32 v0, s7 177; GFX9-NEXT: v_mov_b32_e32 v1, s1 178; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 179; GFX9-NEXT: v_mov_b32_e32 v0, s6 180; GFX9-NEXT: v_mov_b32_e32 v3, s0 181; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 182; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 183; GFX9-NEXT: s_endpgm 184; 185; R600-LABEL: fshr_v2i32: 186; R600: ; %bb.0: ; %entry 187; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 188; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 189; R600-NEXT: CF_END 190; R600-NEXT: PAD 191; R600-NEXT: ALU clause starting at 4: 192; R600-NEXT: MOV * T0.W, KC0[4].X, 193; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W, 194; R600-NEXT: MOV * T0.W, KC0[3].W, 195; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W, 196; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 197; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 198entry: 199 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) 200 store <2 x i32> %0, <2 x i32> addrspace(1)* %in 201 ret void 202} 203 204define 
amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { 205; SI-LABEL: fshr_v2i32_imm: 206; SI: ; %bb.0: ; %entry 207; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 208; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 209; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 210; SI-NEXT: s_mov_b32 s7, 0xf000 211; SI-NEXT: s_mov_b32 s6, -1 212; SI-NEXT: s_waitcnt lgkmcnt(0) 213; SI-NEXT: v_mov_b32_e32 v0, s1 214; SI-NEXT: v_alignbit_b32 v1, s3, v0, 9 215; SI-NEXT: v_mov_b32_e32 v0, s0 216; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 217; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 218; SI-NEXT: s_endpgm 219; 220; VI-LABEL: fshr_v2i32_imm: 221; VI: ; %bb.0: ; %entry 222; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 223; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 224; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 225; VI-NEXT: s_waitcnt lgkmcnt(0) 226; VI-NEXT: v_mov_b32_e32 v0, s1 227; VI-NEXT: v_mov_b32_e32 v2, s0 228; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9 229; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7 230; VI-NEXT: v_mov_b32_e32 v2, s2 231; VI-NEXT: v_mov_b32_e32 v3, s3 232; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 233; VI-NEXT: s_endpgm 234; 235; GFX9-LABEL: fshr_v2i32_imm: 236; GFX9: ; %bb.0: ; %entry 237; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 238; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 239; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 240; GFX9-NEXT: v_mov_b32_e32 v2, 0 241; GFX9-NEXT: s_waitcnt lgkmcnt(0) 242; GFX9-NEXT: v_mov_b32_e32 v0, s1 243; GFX9-NEXT: v_mov_b32_e32 v3, s0 244; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 245; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 246; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 247; GFX9-NEXT: s_endpgm 248; 249; R600-LABEL: fshr_v2i32_imm: 250; R600: ; %bb.0: ; %entry 251; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 252; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 253; R600-NEXT: CF_END 254; R600-NEXT: PAD 255; R600-NEXT: ALU clause starting at 4: 256; R600-NEXT: 
BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x, 257; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) 258; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x, 259; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 260; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 261; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 262entry: 263 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>) 264 store <2 x i32> %0, <2 x i32> addrspace(1)* %in 265 ret void 266} 267 268define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { 269; SI-LABEL: fshr_v4i32: 270; SI: ; %bb.0: ; %entry 271; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 272; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 273; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 274; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 275; SI-NEXT: s_mov_b32 s7, 0xf000 276; SI-NEXT: s_mov_b32 s6, -1 277; SI-NEXT: s_waitcnt lgkmcnt(0) 278; SI-NEXT: v_mov_b32_e32 v0, s15 279; SI-NEXT: v_mov_b32_e32 v1, s3 280; SI-NEXT: v_alignbit_b32 v3, s11, v0, v1 281; SI-NEXT: v_mov_b32_e32 v0, s14 282; SI-NEXT: v_mov_b32_e32 v1, s2 283; SI-NEXT: v_alignbit_b32 v2, s10, v0, v1 284; SI-NEXT: v_mov_b32_e32 v0, s13 285; SI-NEXT: v_mov_b32_e32 v1, s1 286; SI-NEXT: v_alignbit_b32 v1, s9, v0, v1 287; SI-NEXT: v_mov_b32_e32 v0, s12 288; SI-NEXT: v_mov_b32_e32 v4, s0 289; SI-NEXT: v_alignbit_b32 v0, s8, v0, v4 290; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 291; SI-NEXT: s_endpgm 292; 293; VI-LABEL: fshr_v4i32: 294; VI: ; %bb.0: ; %entry 295; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 296; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 297; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 298; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 299; VI-NEXT: s_waitcnt lgkmcnt(0) 300; VI-NEXT: v_mov_b32_e32 v0, s11 301; VI-NEXT: v_mov_b32_e32 v1, s3 302; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1 303; VI-NEXT: v_mov_b32_e32 v0, s10 304; VI-NEXT: v_mov_b32_e32 v1, s2 305; 
VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 306; VI-NEXT: v_mov_b32_e32 v0, s9 307; VI-NEXT: v_mov_b32_e32 v1, s1 308; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 309; VI-NEXT: v_mov_b32_e32 v0, s8 310; VI-NEXT: v_mov_b32_e32 v4, s0 311; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 312; VI-NEXT: v_mov_b32_e32 v4, s12 313; VI-NEXT: v_mov_b32_e32 v5, s13 314; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 315; VI-NEXT: s_endpgm 316; 317; GFX9-LABEL: fshr_v4i32: 318; GFX9: ; %bb.0: ; %entry 319; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 320; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 321; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 322; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 323; GFX9-NEXT: v_mov_b32_e32 v4, 0 324; GFX9-NEXT: s_waitcnt lgkmcnt(0) 325; GFX9-NEXT: v_mov_b32_e32 v0, s11 326; GFX9-NEXT: v_mov_b32_e32 v1, s3 327; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 328; GFX9-NEXT: v_mov_b32_e32 v0, s10 329; GFX9-NEXT: v_mov_b32_e32 v1, s2 330; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 331; GFX9-NEXT: v_mov_b32_e32 v0, s9 332; GFX9-NEXT: v_mov_b32_e32 v1, s1 333; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 334; GFX9-NEXT: v_mov_b32_e32 v0, s8 335; GFX9-NEXT: v_mov_b32_e32 v5, s0 336; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 337; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] 338; GFX9-NEXT: s_endpgm 339; 340; R600-LABEL: fshr_v4i32: 341; R600: ; %bb.0: ; %entry 342; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] 343; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 344; R600-NEXT: CF_END 345; R600-NEXT: PAD 346; R600-NEXT: ALU clause starting at 4: 347; R600-NEXT: MOV * T0.W, KC0[6].X, 348; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W, 349; R600-NEXT: MOV * T1.W, KC0[5].W, 350; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W, 351; R600-NEXT: MOV * T1.W, KC0[5].Z, 352; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W, 353; R600-NEXT: MOV * T1.W, KC0[5].Y, 354; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W, 355; 
R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 356; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 357entry: 358 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) 359 store <4 x i32> %0, <4 x i32> addrspace(1)* %in 360 ret void 361} 362 363define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { 364; SI-LABEL: fshr_v4i32_imm: 365; SI: ; %bb.0: ; %entry 366; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 367; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 368; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 369; SI-NEXT: s_mov_b32 s7, 0xf000 370; SI-NEXT: s_mov_b32 s6, -1 371; SI-NEXT: s_waitcnt lgkmcnt(0) 372; SI-NEXT: v_mov_b32_e32 v0, s3 373; SI-NEXT: v_alignbit_b32 v3, s11, v0, 1 374; SI-NEXT: v_mov_b32_e32 v0, s2 375; SI-NEXT: v_alignbit_b32 v2, s10, v0, 9 376; SI-NEXT: v_mov_b32_e32 v0, s1 377; SI-NEXT: v_alignbit_b32 v1, s9, v0, 7 378; SI-NEXT: v_mov_b32_e32 v0, s0 379; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1 380; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 381; SI-NEXT: s_endpgm 382; 383; VI-LABEL: fshr_v4i32_imm: 384; VI: ; %bb.0: ; %entry 385; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 386; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 387; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 388; VI-NEXT: s_waitcnt lgkmcnt(0) 389; VI-NEXT: v_mov_b32_e32 v4, s8 390; VI-NEXT: v_mov_b32_e32 v5, s9 391; VI-NEXT: v_mov_b32_e32 v0, s3 392; VI-NEXT: v_mov_b32_e32 v1, s2 393; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1 394; VI-NEXT: v_mov_b32_e32 v0, s1 395; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9 396; VI-NEXT: v_alignbit_b32 v1, s5, v0, 7 397; VI-NEXT: v_mov_b32_e32 v0, s0 398; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 399; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 400; VI-NEXT: s_endpgm 401; 402; GFX9-LABEL: fshr_v4i32_imm: 403; GFX9: ; %bb.0: ; %entry 404; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 405; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 406; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 407; 
GFX9-NEXT: v_mov_b32_e32 v4, 0 408; GFX9-NEXT: s_waitcnt lgkmcnt(0) 409; GFX9-NEXT: v_mov_b32_e32 v0, s3 410; GFX9-NEXT: v_mov_b32_e32 v1, s2 411; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 412; GFX9-NEXT: v_mov_b32_e32 v0, s1 413; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 414; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 415; GFX9-NEXT: v_mov_b32_e32 v0, s0 416; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 417; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 418; GFX9-NEXT: s_endpgm 419; 420; R600-LABEL: fshr_v4i32_imm: 421; R600: ; %bb.0: ; %entry 422; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 423; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 424; R600-NEXT: CF_END 425; R600-NEXT: PAD 426; R600-NEXT: ALU clause starting at 4: 427; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1, 428; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x, 429; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) 430; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x, 431; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 432; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1, 433; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 434; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 435entry: 436 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>) 437 store <4 x i32> %0, <4 x i32> addrspace(1)* %in 438 ret void 439} 440 441define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { 442; GFX89-LABEL: v_fshr_i32: 443; GFX89: ; %bb.0: 444; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 445; GFX89-NEXT: v_alignbit_b32 v0, v0, v1, v2 446; GFX89-NEXT: s_setpc_b64 s[30:31] 447; 448; R600-LABEL: v_fshr_i32: 449; R600: ; %bb.0: 450; R600-NEXT: CF_END 451; R600-NEXT: PAD 452 %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) 453 ret i32 %ret 454} 455 456define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) { 457; GFX89-LABEL: v_fshr_v2i32: 458; GFX89: ; %bb.0: 459; GFX89-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 460; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4 461; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v5 462; GFX89-NEXT: s_setpc_b64 s[30:31] 463; 464; R600-LABEL: v_fshr_v2i32: 465; R600: ; %bb.0: 466; R600-NEXT: CF_END 467; R600-NEXT: PAD 468 %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) 469 ret <2 x i32> %ret 470} 471 472define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) { 473; GFX89-LABEL: v_fshr_v3i32: 474; GFX89: ; %bb.0: 475; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 476; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6 477; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v7 478; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v8 479; GFX89-NEXT: s_setpc_b64 s[30:31] 480; 481; R600-LABEL: v_fshr_v3i32: 482; R600: ; %bb.0: 483; R600-NEXT: CF_END 484; R600-NEXT: PAD 485 %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) 486 ret <3 x i32> %ret 487} 488 489define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) { 490; GFX89-LABEL: v_fshr_v4i32: 491; GFX89: ; %bb.0: 492; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 493; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8 494; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v9 495; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v10 496; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v11 497; GFX89-NEXT: s_setpc_b64 s[30:31] 498; 499; R600-LABEL: v_fshr_v4i32: 500; R600: ; %bb.0: 501; R600-NEXT: CF_END 502; R600-NEXT: PAD 503 %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) 504 ret <4 x i32> %ret 505} 506 507define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) { 508; SI-LABEL: v_fshr_i16: 509; SI: ; %bb.0: 510; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 511; SI-NEXT: v_or_b32_e32 v2, 16, v2 512; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 513; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 514; SI-NEXT: s_setpc_b64 s[30:31] 515; 516; VI-LABEL: v_fshr_i16: 
517; VI: ; %bb.0: 518; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 519; VI-NEXT: v_xor_b32_e32 v3, -1, v2 520; VI-NEXT: v_and_b32_e32 v2, 15, v2 521; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 522; VI-NEXT: v_and_b32_e32 v3, 15, v3 523; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 524; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 525; VI-NEXT: v_or_b32_e32 v0, v0, v1 526; VI-NEXT: s_setpc_b64 s[30:31] 527; 528; GFX9-LABEL: v_fshr_i16: 529; GFX9: ; %bb.0: 530; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 531; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 532; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 533; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 534; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 535; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 536; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 537; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 538; GFX9-NEXT: s_setpc_b64 s[30:31] 539; 540; R600-LABEL: v_fshr_i16: 541; R600: ; %bb.0: 542; R600-NEXT: CF_END 543; R600-NEXT: PAD 544 %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2) 545 ret i16 %ret 546} 547 548define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) { 549; SI-LABEL: v_fshr_v2i16: 550; SI: ; %bb.0: 551; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 552; SI-NEXT: v_or_b32_e32 v5, 16, v5 553; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 554; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5 555; SI-NEXT: v_or_b32_e32 v3, 16, v4 556; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 557; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3 558; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 559; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 560; SI-NEXT: v_or_b32_e32 v0, v0, v1 561; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 562; SI-NEXT: s_setpc_b64 s[30:31] 563; 564; VI-LABEL: v_fshr_v2i16: 565; VI: ; %bb.0: 566; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 567; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 568; VI-NEXT: v_and_b32_e32 v4, 15, v3 569; VI-NEXT: v_mov_b32_e32 v5, 1 570; VI-NEXT: v_xor_b32_e32 v3, -1, v3 571; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 572; VI-NEXT: v_and_b32_e32 v3, 15, v3 573; VI-NEXT: v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 574; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5 575; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 576; VI-NEXT: v_xor_b32_e32 v4, -1, v2 577; VI-NEXT: v_and_b32_e32 v2, 15, v2 578; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 579; VI-NEXT: v_and_b32_e32 v4, 15, v4 580; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0 581; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 582; VI-NEXT: v_or_b32_e32 v0, v0, v1 583; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 584; VI-NEXT: s_setpc_b64 s[30:31] 585; 586; GFX9-LABEL: v_fshr_v2i16: 587; GFX9: ; %bb.0: 588; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 589; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 590; GFX9-NEXT: s_mov_b32 s4, 0xf000f 591; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 592; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] 593; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 594; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 595; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 596; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 597; GFX9-NEXT: s_setpc_b64 s[30:31] 598; 599; R600-LABEL: v_fshr_v2i16: 600; R600: ; %bb.0: 601; R600-NEXT: CF_END 602; R600-NEXT: PAD 603 %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) 604 ret <2 x i16> %ret 605} 606 607define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) { 608; SI-LABEL: v_fshr_v3i16: 609; SI: ; %bb.0: 610; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 611; SI-NEXT: v_or_b32_e32 v7, 16, v7 612; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 613; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7 614; SI-NEXT: v_or_b32_e32 v4, 16, v6 615; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 616; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4 617; SI-NEXT: s_mov_b32 s4, 0xffff 618; SI-NEXT: v_or_b32_e32 
v3, 16, v8 619; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 620; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3 621; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 622; SI-NEXT: v_and_b32_e32 v0, s4, v0 623; SI-NEXT: v_or_b32_e32 v0, v0, v1 624; SI-NEXT: v_and_b32_e32 v2, s4, v3 625; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 626; SI-NEXT: s_setpc_b64 s[30:31] 627; 628; VI-LABEL: v_fshr_v3i16: 629; VI: ; %bb.0: 630; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 631; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 632; VI-NEXT: v_and_b32_e32 v7, 15, v6 633; VI-NEXT: v_mov_b32_e32 v8, 1 634; VI-NEXT: v_xor_b32_e32 v6, -1, v6 635; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 636; VI-NEXT: v_and_b32_e32 v6, 15, v6 637; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 638; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8 639; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 640; VI-NEXT: v_xor_b32_e32 v7, -1, v5 641; VI-NEXT: v_and_b32_e32 v5, 15, v5 642; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 643; VI-NEXT: v_and_b32_e32 v7, 15, v7 644; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1 645; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 646; VI-NEXT: v_or_b32_e32 v1, v1, v3 647; VI-NEXT: v_xor_b32_e32 v3, -1, v4 648; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 649; VI-NEXT: v_and_b32_e32 v3, 15, v3 650; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 651; VI-NEXT: v_and_b32_e32 v3, 15, v4 652; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2 653; VI-NEXT: v_or_b32_e32 v0, v0, v2 654; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 655; VI-NEXT: s_setpc_b64 s[30:31] 656; 657; GFX9-LABEL: v_fshr_v3i16: 658; GFX9: ; %bb.0: 659; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 660; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 661; GFX9-NEXT: v_and_b32_e32 v7, 15, v6 662; GFX9-NEXT: v_mov_b32_e32 v8, 1 663; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 664; 
GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 665; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 666; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 667; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8 668; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 669; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 670; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 671; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 672; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 673; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1 674; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 675; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 676; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 677; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 678; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 679; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 680; GFX9-NEXT: v_and_b32_e32 v3, 15, v4 681; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 682; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 683; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 684; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0 685; GFX9-NEXT: s_setpc_b64 s[30:31] 686; 687; R600-LABEL: v_fshr_v3i16: 688; R600: ; %bb.0: 689; R600-NEXT: CF_END 690; R600-NEXT: PAD 691 %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) 692 ret <3 x i16> %ret 693} 694 695define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) { 696; SI-LABEL: v_fshr_v4i16: 697; SI: ; %bb.0: 698; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 699; SI-NEXT: v_or_b32_e32 v9, 16, v9 700; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 701; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9 702; SI-NEXT: v_or_b32_e32 v5, 16, v8 703; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 704; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5 705; SI-NEXT: v_or_b32_e32 v4, 16, v11 706; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 707; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4 708; SI-NEXT: v_or_b32_e32 v4, 16, v10 709; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 710; SI-NEXT: s_mov_b32 s4, 0xffff 711; SI-NEXT: 
v_alignbit_b32 v2, v2, v5, v4 712; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 713; SI-NEXT: v_and_b32_e32 v2, s4, v2 714; SI-NEXT: v_or_b32_e32 v2, v2, v3 715; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 716; SI-NEXT: v_and_b32_e32 v0, s4, v0 717; SI-NEXT: v_or_b32_e32 v0, v0, v1 718; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 719; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 720; SI-NEXT: s_setpc_b64 s[30:31] 721; 722; VI-LABEL: v_fshr_v4i16: 723; VI: ; %bb.0: 724; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 725; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 726; VI-NEXT: v_and_b32_e32 v7, 15, v6 727; VI-NEXT: v_xor_b32_e32 v6, -1, v6 728; VI-NEXT: v_mov_b32_e32 v8, 1 729; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 730; VI-NEXT: v_and_b32_e32 v6, 15, v6 731; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 732; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9 733; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 734; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 735; VI-NEXT: v_and_b32_e32 v9, 15, v7 736; VI-NEXT: v_xor_b32_e32 v7, -1, v7 737; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 738; VI-NEXT: v_and_b32_e32 v7, 15, v7 739; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8 740; VI-NEXT: v_xor_b32_e32 v8, -1, v5 741; VI-NEXT: v_and_b32_e32 v5, 15, v5 742; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 743; VI-NEXT: v_and_b32_e32 v8, 15, v8 744; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1 745; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3 746; VI-NEXT: v_or_b32_e32 v1, v1, v3 747; VI-NEXT: v_xor_b32_e32 v3, -1, v4 748; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 749; VI-NEXT: v_and_b32_e32 v3, 15, v3 750; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 751; VI-NEXT: v_and_b32_e32 v3, 15, v4 752; VI-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 753; VI-NEXT: 
v_lshrrev_b16_e32 v2, v3, v2 754; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 755; VI-NEXT: v_or_b32_e32 v0, v0, v2 756; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 757; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 758; VI-NEXT: s_setpc_b64 s[30:31] 759; 760; GFX9-LABEL: v_fshr_v4i16: 761; GFX9: ; %bb.0: 762; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 763; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 764; GFX9-NEXT: v_and_b32_e32 v7, 15, v6 765; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 766; GFX9-NEXT: v_mov_b32_e32 v8, 1 767; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 768; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 769; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 770; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9 771; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 772; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 773; GFX9-NEXT: v_and_b32_e32 v9, 15, v7 774; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 775; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 776; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 777; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 778; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5 779; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 780; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 781; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 782; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1 783; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 784; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 785; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 786; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 787; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 788; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 789; GFX9-NEXT: v_and_b32_e32 v3, 15, v4 790; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 791; 
GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 792; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 793; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 794; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 795; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 796; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 797; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 798; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 799; GFX9-NEXT: s_setpc_b64 s[30:31] 800; 801; R600-LABEL: v_fshr_v4i16: 802; R600: ; %bb.0: 803; R600-NEXT: CF_END 804; R600-NEXT: PAD 805 %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) 806 ret <4 x i16> %ret 807} 808 809define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) { 810; SI-LABEL: v_fshr_i64: 811; SI: ; %bb.0: 812; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 813; SI-NEXT: v_and_b32_e32 v5, 63, v4 814; SI-NEXT: v_not_b32_e32 v4, v4 815; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 816; SI-NEXT: v_and_b32_e32 v4, 63, v4 817; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v5 818; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 819; SI-NEXT: v_or_b32_e32 v1, v1, v3 820; SI-NEXT: v_or_b32_e32 v0, v0, v2 821; SI-NEXT: s_setpc_b64 s[30:31] 822; 823; VI-LABEL: v_fshr_i64: 824; VI: ; %bb.0: 825; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 826; VI-NEXT: v_and_b32_e32 v5, 63, v4 827; VI-NEXT: v_not_b32_e32 v4, v4 828; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 829; VI-NEXT: v_and_b32_e32 v4, 63, v4 830; VI-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] 831; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 832; VI-NEXT: v_or_b32_e32 v1, v1, v3 833; VI-NEXT: v_or_b32_e32 v0, v0, v2 834; VI-NEXT: s_setpc_b64 s[30:31] 835; 836; GFX9-LABEL: v_fshr_i64: 837; GFX9: ; %bb.0: 838; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 839; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 840; GFX9-NEXT: v_not_b32_e32 v4, v4 841; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 842; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 843; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] 844; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 845; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 846; 
GFX9-NEXT: v_or_b32_e32 v0, v0, v2 847; GFX9-NEXT: s_setpc_b64 s[30:31] 848; 849; R600-LABEL: v_fshr_i64: 850; R600: ; %bb.0: 851; R600-NEXT: CF_END 852; R600-NEXT: PAD 853 %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2) 854 ret i64 %ret 855} 856 857define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) { 858; SI-LABEL: v_fshr_v2i64: 859; SI: ; %bb.0: 860; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 861; SI-NEXT: v_and_b32_e32 v9, 63, v8 862; SI-NEXT: v_not_b32_e32 v8, v8 863; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 864; SI-NEXT: v_and_b32_e32 v8, 63, v8 865; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v9 866; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 867; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 868; SI-NEXT: v_or_b32_e32 v1, v1, v5 869; SI-NEXT: v_and_b32_e32 v5, 63, v10 870; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v5 871; SI-NEXT: v_not_b32_e32 v7, v10 872; SI-NEXT: v_and_b32_e32 v7, 63, v7 873; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7 874; SI-NEXT: v_or_b32_e32 v0, v0, v4 875; SI-NEXT: v_or_b32_e32 v3, v3, v6 876; SI-NEXT: v_or_b32_e32 v2, v2, v5 877; SI-NEXT: s_setpc_b64 s[30:31] 878; 879; VI-LABEL: v_fshr_v2i64: 880; VI: ; %bb.0: 881; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 882; VI-NEXT: v_and_b32_e32 v9, 63, v8 883; VI-NEXT: v_not_b32_e32 v8, v8 884; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 885; VI-NEXT: v_and_b32_e32 v8, 63, v8 886; VI-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] 887; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 888; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 889; VI-NEXT: v_or_b32_e32 v1, v1, v5 890; VI-NEXT: v_and_b32_e32 v5, 63, v10 891; VI-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7] 892; VI-NEXT: v_not_b32_e32 v7, v10 893; VI-NEXT: v_and_b32_e32 v7, 63, v7 894; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 895; VI-NEXT: v_or_b32_e32 v0, v0, v4 896; VI-NEXT: v_or_b32_e32 v3, v3, v6 897; VI-NEXT: v_or_b32_e32 v2, v2, v5 898; VI-NEXT: s_setpc_b64 s[30:31] 899; 900; GFX9-LABEL: v_fshr_v2i64: 901; GFX9: ; %bb.0: 902; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 903; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 904; GFX9-NEXT: v_not_b32_e32 v8, v8 905; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 906; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 907; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] 908; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 909; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 910; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 911; GFX9-NEXT: v_and_b32_e32 v5, 63, v10 912; GFX9-NEXT: v_lshrrev_b64 v[5:6], v5, v[6:7] 913; GFX9-NEXT: v_not_b32_e32 v7, v10 914; GFX9-NEXT: v_and_b32_e32 v7, 63, v7 915; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 916; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 917; GFX9-NEXT: v_or_b32_e32 v3, v3, v6 918; GFX9-NEXT: v_or_b32_e32 v2, v2, v5 919; GFX9-NEXT: s_setpc_b64 s[30:31] 920; 921; R600-LABEL: v_fshr_v2i64: 922; R600: ; %bb.0: 923; R600-NEXT: CF_END 924; R600-NEXT: PAD 925 %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) 926 ret <2 x i64> %ret 927} 928 929define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { 930; SI-LABEL: v_fshr_i24: 931; SI: ; %bb.0: 932; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 933; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab 934; SI-NEXT: v_mul_hi_u32 v3, v2, s4 935; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 936; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v3 937; SI-NEXT: v_mul_lo_u32 v3, v3, 24 938; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 939; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2 940; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 941; SI-NEXT: s_setpc_b64 s[30:31] 942; 943; VI-LABEL: v_fshr_i24: 944; VI: ; %bb.0: 945; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 946; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab 947; VI-NEXT: v_mul_hi_u32 v3, v2, s4 948; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 949; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v3 950; VI-NEXT: v_mul_lo_u32 v3, v3, 24 951; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 952; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2 953; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2 954; VI-NEXT: s_setpc_b64 s[30:31] 
955; 956; GFX9-LABEL: v_fshr_i24: 957; GFX9: ; %bb.0: 958; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 959; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab 960; GFX9-NEXT: v_mul_hi_u32 v3, v2, s4 961; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 962; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v3 963; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 964; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 965; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 966; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2 967; GFX9-NEXT: s_setpc_b64 s[30:31] 968; 969; R600-LABEL: v_fshr_i24: 970; R600: ; %bb.0: 971; R600-NEXT: CF_END 972; R600-NEXT: PAD 973 %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2) 974 ret i24 %ret 975} 976 977define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) { 978; SI-LABEL: v_fshr_v2i24: 979; SI: ; %bb.0: 980; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 981; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab 982; SI-NEXT: v_mul_hi_u32 v6, v4, s4 983; SI-NEXT: v_mul_hi_u32 v7, v5, s4 984; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 985; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 986; SI-NEXT: v_mul_lo_u32 v6, v6, 24 987; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 988; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v7 989; SI-NEXT: v_mul_lo_u32 v6, v6, 24 990; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4 991; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 992; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 993; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 994; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3 995; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3 996; SI-NEXT: s_setpc_b64 s[30:31] 997; 998; VI-LABEL: v_fshr_v2i24: 999; VI: ; %bb.0: 1000; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1001; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab 1002; VI-NEXT: v_mul_hi_u32 v6, v4, s4 1003; VI-NEXT: v_mul_hi_u32 v7, v5, s4 1004; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1005; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 1006; VI-NEXT: v_mul_lo_u32 v6, v6, 24 1007; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 1008; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v7 1009; VI-NEXT: v_mul_lo_u32 v6, v6, 24 
1010; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4 1011; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4 1012; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 1013; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v6 1014; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3 1015; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3 1016; VI-NEXT: s_setpc_b64 s[30:31] 1017; 1018; GFX9-LABEL: v_fshr_v2i24: 1019; GFX9: ; %bb.0: 1020; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1021; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab 1022; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4 1023; GFX9-NEXT: v_mul_hi_u32 v7, v5, s4 1024; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1025; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 1026; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 1027; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 1028; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v7 1029; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 1030; GFX9-NEXT: v_add_u32_e32 v4, 8, v4 1031; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 1032; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3 1033; GFX9-NEXT: v_sub_u32_e32 v3, v5, v6 1034; GFX9-NEXT: v_add_u32_e32 v3, 8, v3 1035; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 1036; GFX9-NEXT: s_setpc_b64 s[30:31] 1037; 1038; R600-LABEL: v_fshr_v2i24: 1039; R600: ; %bb.0: 1040; R600-NEXT: CF_END 1041; R600-NEXT: PAD 1042 %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) 1043 ret <2 x i24> %ret 1044} 1045