; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,GFX9
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,R600

; Code generation checks for the funnel-shift-left intrinsic (llvm.fshl) on
; i32, <2 x i32> and <4 x i32> operands. Each width is exercised twice: once
; with a shift amount taken from a kernel argument and once with a constant
; shift amount. The expected assembly for every target is listed ahead of the
; IR body of each kernel.

declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

; Scalar i32 funnel shift with a variable shift amount %z; the result is
; stored through %in.
define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshl_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    s_not_b32 s0, s0
; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
; SI-NEXT:    s_lshr_b32 s1, s2, 1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_not_b32 s0, s0
; VI-NEXT:    s_lshr_b32 s1, s4, 1
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    v_alignbit_b32 v2, s1, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    s_not_b32 s0, s0
; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_alignbit_b32 v1, s1, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.Z, KC0[2].Z, 1,
; R600-NEXT:    BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
; R600-NEXT:    NOT_INT * T1.W, KC0[3].X,
; R600-NEXT:    BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %rot = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
  store i32 %rot, i32 addrspace(1)* %in
  ret void
}

; Scalar i32 funnel shift by the constant 7. The checks below expect a single
; bit-align operation with amount 25 (32 - 7) on every target.
define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-LABEL: fshl_i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 25
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_alignbit_b32 v2, s0, v0, 25
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, 25
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
entry:
  %rot = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
  store i32 %rot, i32 addrspace(1)* %in
  ret void
}

; Two-element vector funnel shift with a variable per-lane shift amount %z.
define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshl_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_not_b32 s1, s1
; SI-NEXT:    v_alignbit_b32 v0, s3, v0, 1
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    s_lshr_b32 s3, s3, 1
; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_not_b32 s0, s0
; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
; SI-NEXT:    s_lshr_b32 s1, s2, 1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    s_not_b32 s1, s1
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s7, s5, 1
; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_not_b32 s0, s0
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    s_lshr_b32 s1, s4, 1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v2i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    s_not_b32 s1, s1
; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_lshr_b32 s5, s5, 1
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    s_not_b32 s0, s0
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
; GFX9-NEXT:    v_mov_b32_e32 v3, s0
; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v2i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.Z, KC0[3].X, 1,
; R600-NEXT:    BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
; R600-NEXT:    NOT_INT * T1.W, KC0[4].X,
; R600-NEXT:    BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
; R600-NEXT:    LSHR T0.Z, KC0[2].W, 1,
; R600-NEXT:    BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
; R600-NEXT:    NOT_INT * T1.W, KC0[3].W,
; R600-NEXT:    BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %rot = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %rot, <2 x i32> addrspace(1)* %in
  ret void
}

; Two-element vector funnel shift by the constants <7, 9>. The checks below
; expect bit-align amounts 25 and 23 for lanes 0 and 1 respectively.
define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshl_v2i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 23
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 25
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v2i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v2i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_mov_b32_e32 v3, s0
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 23
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 25
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %rot = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %rot, <2 x i32> addrspace(1)* %in
  ret void
}

; Four-element vector funnel shift with a variable per-lane shift amount %z.
define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshl_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x11
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s15
; SI-NEXT:    s_not_b32 s3, s3
; SI-NEXT:    v_alignbit_b32 v0, s11, v0, 1
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    s_lshr_b32 s11, s11, 1
; SI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s14
; SI-NEXT:    s_not_b32 s2, s2
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_alignbit_b32 v0, s10, v0, 1
; SI-NEXT:    s_lshr_b32 s3, s10, 1
; SI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    s_not_b32 s1, s1
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
; SI-NEXT:    s_lshr_b32 s2, s9, 1
; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    s_not_b32 s0, s0
; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
; SI-NEXT:    s_lshr_b32 s1, s8, 1
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_not_b32 s3, s3
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    s_lshr_b32 s11, s7, 1
; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
; VI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_not_b32 s2, s2
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
; VI-NEXT:    s_lshr_b32 s3, s6, 1
; VI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    s_not_b32 s1, s1
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; VI-NEXT:    s_lshr_b32 s2, s5, 1
; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_not_b32 s0, s0
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    s_lshr_b32 s1, s4, 1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_alignbit_b32 v0, s1, v0, v4
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v4i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    s_not_b32 s3, s3
; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_lshr_b32 s7, s7, 1
; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s10
; GFX9-NEXT:    s_not_b32 s2, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, 1
; GFX9-NEXT:    s_lshr_b32 s3, s6, 1
; GFX9-NEXT:    v_alignbit_b32 v2, s3, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    s_not_b32 s1, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT:    s_lshr_b32 s2, s5, 1
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    s_not_b32 s0, s0
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    s_lshr_b32 s1, s4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, s0
; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v5
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v4i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.Z, KC0[4].X, 1,
; R600-NEXT:    BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT:    NOT_INT * T1.W, KC0[6].X,
; R600-NEXT:    LSHR T0.Y, KC0[3].W, 1,
; R600-NEXT:    BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
; R600-NEXT:    BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
; R600-NEXT:    NOT_INT * T1.W, KC0[5].W,
; R600-NEXT:    LSHR T1.Y, KC0[3].Z, 1,
; R600-NEXT:    BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
; R600-NEXT:    BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
; R600-NEXT:    NOT_INT * T2.W, KC0[5].Z,
; R600-NEXT:    BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
; R600-NEXT:    LSHR T1.Z, KC0[3].Y, 1,
; R600-NEXT:    BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT:    NOT_INT * T2.W, KC0[5].Y,
; R600-NEXT:    BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %rot = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %rot, <4 x i32> addrspace(1)* %in
  ret void
}

; Four-element vector funnel shift by the constants <1, 7, 9, 33>. The last
; amount is larger than the 32-bit element width; the checks below expect the
; same bit-align amount (31) for it as for the shift by 1 in lane 0.
define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshl_v4i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 23
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 25
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 31
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v4i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v4, s8
; VI-NEXT:    v_mov_b32_e32 v5, s9
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 25
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 31
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 23
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 31
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %rot = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %rot, <4 x i32> addrspace(1)* %in
  ret void
}