; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=FUNC,GCN,SI
; XUN: llc < %s -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=FUNC,GCN,VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s -allow-deprecated-dag-overlap -check-prefixes=FUNC,EG

; Codegen checks for the shl instruction, run for -mcpu=verde (amdgcn) and
; -mcpu=redwood (r600). The tonga run line above is disabled (spelled XUN),
; so the VI check prefix is never exercised by this file.

declare i32 @llvm.amdgcn.workitem.id.x() #0

declare i32 @llvm.amdgcn.workgroup.id.x() #0

; Element-wise shl of two <2 x i32> vectors loaded from %in[0] and %in[1];
; the result is stored to %out.
define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; GCN-LABEL: shl_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b32_e32 v1, v1, v3
; GCN-NEXT:    v_lshl_b32_e32 v0, v0, v2
; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T1.XY, T0.X, 8, #1
; EG-NEXT:    VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    LSHL * T0.Y, T0.Y, T1.Y,
; EG-NEXT:    LSHL T0.X, T0.X, T1.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = shl <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; Element-wise shl of two <4 x i32> vectors loaded from %in[0] and %in[1];
; the result is stored to %out.
define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; GCN-LABEL: shl_v4i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b32_e32 v3, v3, v7
; GCN-NEXT:    v_lshl_b32_e32 v2, v2, v6
; GCN-NEXT:    v_lshl_b32_e32 v1, v1, v5
; GCN-NEXT:    v_lshl_b32_e32 v0, v0, v4
; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    LSHL * T0.W, T0.W, T1.W,
; EG-NEXT:    LSHL * T0.Z, T0.Z, T1.Z,
; EG-NEXT:    LSHL * T0.Y, T0.Y, T1.Y,
; EG-NEXT:    LSHL T0.X, T0.X, T1.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = shl <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; Scalar i16 shl where both the value (%in[0]) and the shift amount (%in[1])
; are loaded from memory.
define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-LABEL: shl_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_mov_b32 s4, s6
; GCN-NEXT:    s_mov_b32 s5, s7
; GCN-NEXT:    s_mov_b32 s6, s2
; GCN-NEXT:    s_mov_b32 s7, s3
; GCN-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GCN-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:2
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT:    LSHL * T1.W, T0.X, T1.X,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T1.W, PS, literal.x,
; EG-NEXT:    LSHL * T0.W, PV.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %a = load i16, i16 addrspace(1)* %in
  %b = load i16, i16 addrspace(1)* %b_ptr
  %result = shl i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

; i16 shl of a value loaded from %in by the kernel argument %b.
define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
; GCN-LABEL: shl_i16_v_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_mov_b32 s4, s6
; GCN-NEXT:    s_mov_b32 s5, s7
; GCN-NEXT:    s_mov_b32 s6, s2
; GCN-NEXT:    s_mov_b32 s7, s3
; GCN-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GCN-NEXT:    s_and_b32 s8, s8, 0xffff
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, s8, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16_v_s:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 12, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 44, #3
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV T0.X, 0.0,
; EG-NEXT:    MOV * T1.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT:    LSHL * T1.W, T1.X, T0.X,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T1.W, PS, literal.x,
; EG-NEXT:    LSHL * T0.W, PV.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %a = load i16, i16 addrspace(1)* %in
  %result = shl i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

; i16 shl of a value loaded from %in by (%b + 3), where %b is a kernel argument.
define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
; GCN-LABEL: shl_i16_v_compute_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_mov_b32 s4, s6
; GCN-NEXT:    s_mov_b32 s5, s7
; GCN-NEXT:    s_mov_b32 s6, s2
; GCN-NEXT:    s_mov_b32 s7, s3
; GCN-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
; GCN-NEXT:    s_add_i32 s8, s8, 3
; GCN-NEXT:    s_and_b32 s4, s8, 0xffff
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16_v_compute_s:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 15, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 44, #3
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:    VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 13:
; EG-NEXT:    MOV * T1.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:    ADD_INT * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL * T0.W, T1.X, PV.W,
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %a = load i16, i16 addrspace(1)* %in
  %b.add = add i16 %b, 3
  %result = shl i16 %a, %b.add
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

; i16 shl of a volatile-loaded value by a volatile-loaded amount plus 3.
; NOTE(review): %a is loaded from %in rather than %gep, and %gep.out is
; computed but never used (the store goes to %out). The autogenerated checks
; below reflect the IR exactly as written, so it is left untouched here.
define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-LABEL: shl_i16_computed_amount:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 s14, 0
; GCN-NEXT:    s_mov_b32 s15, s3
; GCN-NEXT:    s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT:    v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16_computed_amount:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:    VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 2, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    MOV * T1.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 13:
; EG-NEXT:    LSHL * T0.W, T0.X, 1,
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    ADD_INT * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL * T0.W, T1.X, PV.W,
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i16 1
  %a = load volatile i16, i16 addrspace(1)* %in
  %b = load volatile i16, i16 addrspace(1)* %b_ptr
  %b.add = add i16 %b, 3
  %result = shl i16 %a, %b.add
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

; i16 shl of the zeroext kernel argument %a by the constant 12.
define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
; GCN-LABEL: shl_i16_i_s:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshl_b32 s0, s0, 12
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i16_i_s:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    BFE_INT T0.W, T0.X, 0.0, literal.x,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    61440(8.609578e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = shl i16 %a, 12
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

; Element-wise <2 x i16> shl; the shift amounts come from %gep[1], which is
; indexed by the work-item id.
; NOTE(review): %a is loaded from %in rather than %gep, and %gep.out is
; computed but never used (the store goes to %out). The autogenerated checks
; below reflect the IR exactly as written, so it is left untouched here.
define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; GCN-LABEL: shl_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 s14, 0
; GCN-NEXT:    s_mov_b32 s15, s3
; GCN-NEXT:    buffer_load_dword v2, off, s[8:11], 0
; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
; GCN-NEXT:    s_mov_b32 s6, 0xffff
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GCN-NEXT:    v_and_b32_e32 v0, s6, v0
; GCN-NEXT:    v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT:    v_lshl_b32_e32 v1, v1, v3
; GCN-NEXT:    v_and_b32_e32 v0, s6, v0
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 12, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:    VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    MOV * T7.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 16:
; EG-NEXT:    AND_INT T0.Y, T0.X, literal.x,
; EG-NEXT:    AND_INT T0.Z, T7.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    LSHR T0.W, T0.X, literal.y,
; EG-NEXT:    LSHR * T1.W, T7.X, literal.y,
; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT:    LSHL T0.W, PS, PV.W,
; EG-NEXT:    LSHL * T1.W, PV.Z, PV.Y,
; EG-NEXT:    AND_INT T1.W, PS, literal.x,
; EG-NEXT:    LSHL * T0.W, PV.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT:    OR_INT T0.X, PV.W, PS,
; EG-NEXT:    LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = shl <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; Element-wise <4 x i16> shl; operands are loaded from %gep[0] and %gep[1] and
; the result is stored to %gep.out, all indexed by the work-item id.
define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; GCN-LABEL: shl_v4i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, 0
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; GCN-NEXT:    s_mov_b32 s0, 0xffff
; GCN-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v8, s0, v4
; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT:    v_and_b32_e32 v9, s0, v5
; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT:    v_lshl_b32_e32 v5, v7, v5
; GCN-NEXT:    v_lshl_b32_e32 v3, v3, v9
; GCN-NEXT:    v_lshl_b32_e32 v4, v6, v4
; GCN-NEXT:    v_lshl_b32_e32 v2, v2, v8
; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT:    v_and_b32_e32 v3, s0, v3
; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
; GCN-NEXT:    v_or_b32_e32 v3, v3, v5
; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 3, @15, KC0[], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 49, @19, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:    VTX_READ_64 T10.XY, T0.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:    VTX_READ_64 T10.XY, T0.X, 8, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    MOV T4.X, T10.X,
; EG-NEXT:    MOV * T5.X, T10.Y,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    MOV * T0.Z, PS,
; EG-NEXT:    ALU clause starting at 19:
; EG-NEXT:    MOV T2.X, T10.X,
; EG-NEXT:    MOV * T3.X, T10.Y,
; EG-NEXT:    MOV T0.X, T6.X,
; EG-NEXT:    MOV * T1.Y, PV.X,
; EG-NEXT:    AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT:    AND_INT * T2.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T1.W, PS, PV.W,
; EG-NEXT:    AND_INT T1.W, PV.W, literal.x,
; EG-NEXT:    AND_INT * T2.W, T0.X, literal.y,
; EG-NEXT:    65535(9.183409e-41), -65536(nan)
; EG-NEXT:    OR_INT * T1.W, PS, PV.W,
; EG-NEXT:    MOV T0.X, T3.X,
; EG-NEXT:    MOV * T6.X, PV.W,
; EG-NEXT:    MOV T1.Z, PS,
; EG-NEXT:    LSHR T1.W, T1.Y, literal.x,
; EG-NEXT:    LSHR * T2.W, T0.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    LSHL T1.W, PS, PV.W,
; EG-NEXT:    AND_INT * T2.W, PV.Z, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T1.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT:    MOV T6.X, PV.W,
; EG-NEXT:    MOV T0.Y, T7.X,
; EG-NEXT:    AND_INT T1.W, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    AND_INT * T2.W, T0.Z, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL T1.W, PS, PV.W,
; EG-NEXT:    AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    AND_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T1.W, T2.W, PV.W,
; EG-NEXT:    MOV * T7.X, PV.W,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    LSHR T1.W, T0.X, literal.x,
; EG-NEXT:    LSHR * T2.W, T0.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    LSHL * T1.W, PS, PV.W,
; EG-NEXT:    AND_INT T0.Z, T0.Y, literal.x,
; EG-NEXT:    LSHL T1.W, PV.W, literal.y,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT:    LSHR T0.X, PS, literal.x,
; EG-NEXT:    OR_INT * T10.Y, PV.Z, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MOV T7.X, PV.Y,
; EG-NEXT:    MOV * T10.X, T6.X,
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = shl <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %gep.out
  ret void
}

; Scalar i64 shl where both the value (%in[0]) and the shift amount (%in[1])
; are loaded from memory.
define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-LABEL: shl_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    SUB_INT * T0.W, literal.x, T0.Z,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR * T0.W, T0.X, PV.W,
; EG-NEXT:    ADD_INT T1.Z, T0.Z, literal.x,
; EG-NEXT:    LSHR T0.W, PV.W, 1,
; EG-NEXT:    LSHL * T1.W, T0.Y, T0.Z,
; EG-NEXT:    -32(nan), 0(0.000000e+00)
; EG-NEXT:    OR_INT T2.Z, PS, PV.W,
; EG-NEXT:    LSHL T0.W, T0.X, PV.Z,
; EG-NEXT:    SETGT_UINT * T1.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.Y, PS, PV.Z, PV.W,
; EG-NEXT:    LSHL * T0.W, T0.X, T0.Z,
; EG-NEXT:    CNDE_INT T0.X, T1.W, PV.W, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
  %a = load i64, i64 addrspace(1)* %in
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = shl i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; Element-wise shl of two <2 x i64> vectors loaded from %in[0] and %in[1];
; the result is stored to %out.
define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
; GCN-LABEL: shl_v2i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 28, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    SUB_INT * T1.W, literal.x, T1.Z,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR * T1.W, T0.Z, PV.W,
; EG-NEXT:    SUB_INT T2.Z, literal.x, T1.X,
; EG-NEXT:    LSHR T1.W, PV.W, 1,
; EG-NEXT:    LSHL * T0.W, T0.W, T1.Z,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT T3.Z, PS, PV.W,
; EG-NEXT:    LSHR T0.W, T0.X, PV.Z,
; EG-NEXT:    ADD_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    -32(nan), 0(0.000000e+00)
; EG-NEXT:    LSHL T2.X, T0.Z, PS,
; EG-NEXT:    SETGT_UINT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    ADD_INT T2.Z, T1.X, literal.y,
; EG-NEXT:    LSHR T0.W, PV.W, 1,
; EG-NEXT:    LSHL * T1.W, T0.Y, T1.X,
; EG-NEXT:    31(4.344025e-44), -32(nan)
; EG-NEXT:    OR_INT T0.Y, PS, PV.W,
; EG-NEXT:    LSHL T2.Z, T0.X, PV.Z,
; EG-NEXT:    SETGT_UINT T0.W, T1.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    CNDE_INT * T2.W, PV.Y, T3.Z, PV.X,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T2.Y, PV.W, PV.Y, PV.Z,
; EG-NEXT:    LSHL * T1.W, T0.Z, T1.Z,
; EG-NEXT:    CNDE_INT T2.Z, T1.Y, PV.W, 0.0,
; EG-NEXT:    LSHL * T1.W, T0.X, T1.X,
; EG-NEXT:    CNDE_INT T2.X, T0.W, PV.W, 0.0,
; EG-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
  %result = shl <2 x i64> %a, %b
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
  ret void
}

; Element-wise shl of two <4 x i64> vectors loaded from %in[0] and %in[1];
; the result is stored to %out.
define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
; GCN-LABEL: shl_v4i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_mov_b32 s10, s2
; GCN-NEXT:    s_mov_b32 s11, s3
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s6
; GCN-NEXT:    s_mov_b32 s9, s7
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
; GCN-NEXT:    s_mov_b32 s0, s4
; GCN-NEXT:    s_mov_b32 s1, s5
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], v10
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[6:7], v[6:7], v13
; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v11
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; EG-LABEL: shl_v4i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 58, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T4.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 48, #1
; EG-NEXT:    VTX_READ_128 T2.XYZW, T0.X, 0, #1
; EG-NEXT:    VTX_READ_128 T3.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 32, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    SUB_INT * T0.W, literal.x, T1.Z,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    SUB_INT T4.Z, literal.x, T0.Z,
; EG-NEXT:    SUB_INT T1.W, literal.x, T0.X,
; EG-NEXT:    LSHR * T0.W, T3.Z, PV.W,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    SUB_INT T0.Y, literal.x, T1.X,
; EG-NEXT:    LSHR T5.Z, PS, 1,
; EG-NEXT:    LSHR T0.W, T2.X, PV.W,
; EG-NEXT:    LSHR * T1.W, T2.Z, PV.Z,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    LSHL T4.X, T3.W, T1.Z,
; EG-NEXT:    LSHR T1.Y, PS, 1,
; EG-NEXT:    LSHL T4.Z, T2.W, T0.Z, BS:VEC_120/SCL_212
; EG-NEXT:    LSHR T0.W, PV.W, 1,
; EG-NEXT:    LSHL * T1.W, T2.Y, T0.X,
; EG-NEXT:    OR_INT T5.X, PS, PV.W,
; EG-NEXT:    OR_INT T1.Y, PV.Z, PV.Y,
; EG-NEXT:    OR_INT T4.Z, PV.X, T5.Z,
; EG-NEXT:    LSHR T0.W, T3.X, T0.Y,
; EG-NEXT:    ADD_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    -32(nan), 0(0.000000e+00)
; EG-NEXT:    LSHL T4.X, T3.Z, PS,
; EG-NEXT:    SETGT_UINT T0.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    ADD_INT T5.Z, T1.X, literal.y,
; EG-NEXT:    LSHR T0.W, PV.W, 1,
; EG-NEXT:    LSHL * T1.W, T3.Y, T1.X,
; EG-NEXT:    31(4.344025e-44), -32(nan)
; EG-NEXT:    OR_INT T6.X, PS, PV.W,
; EG-NEXT:    LSHL T2.Y, T3.X, PV.Z,
; EG-NEXT:    SETGT_UINT T5.Z, T1.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    ADD_INT T0.W, T0.Z, literal.y,
; EG-NEXT:    CNDE_INT * T3.W, PV.Y, T4.Z, PV.X,
; EG-NEXT:    31(4.344025e-44), -32(nan)
; EG-NEXT:    LSHL T4.X, T2.Z, PV.W,
; EG-NEXT:    CNDE_INT T3.Y, PV.Z, PV.X, PV.Y,
; EG-NEXT:    SETGT_UINT * T4.Z, T0.Z, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    LSHL T0.W, T3.Z, T1.Z,
; EG-NEXT:    ADD_INT * T1.W, T0.X, literal.x,
; EG-NEXT:    -32(nan), 0(0.000000e+00)
; EG-NEXT:    LSHL T6.X, T2.X, PS,
; EG-NEXT:    SETGT_UINT T2.Y, T0.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    CNDE_INT * T3.Z, T0.Y, PV.W, 0.0,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    LSHL T0.W, T3.X, T1.X, BS:VEC_120/SCL_212
; EG-NEXT:    CNDE_INT * T1.W, T4.Z, T1.Y, T4.X,
; EG-NEXT:    CNDE_INT T3.X, T5.Z, PV.W, 0.0,
; EG-NEXT:    CNDE_INT T1.Y, T2.Y, T5.X, T6.X,
; EG-NEXT:    LSHL T0.W, T2.Z, T0.Z, BS:VEC_120/SCL_212
; EG-NEXT:    ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T4.X, PS, literal.x,
; EG-NEXT:    CNDE_INT T1.Z, T4.Z, PV.W, 0.0,
; EG-NEXT:    LSHL * T0.W, T2.X, T0.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T1.X, T2.Y, PV.W, 0.0,
; EG-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
  %result = shl <4 x i64> %a, %b
  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
  ret void
}

; Make sure load width gets reduced to i32 load.
; Scalar case: %a is a kernel argument shifted left by the constant 32. The
; checks expect a single dword load of %a that is stored as the high half of
; the result, with a zero low half.
define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
; GCN-LABEL: s_shl_32_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0x13
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_32_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.Y, KC0[4].W,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %result = shl i64 %a, 32
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; Memory case of the shift by 32: the i64 operand is loaded from %in and the
; result stored to %out, both indexed by the same id. The checks expect a
; one-dword load feeding the high half of a two-dword store.
; NOTE(review): the index comes from llvm.amdgcn.workgroup.id.x even though
; llvm.amdgcn.workitem.id.x is also declared and the v_ name suggests a
; per-workitem index - confirm this is intended before changing it, since the
; assertions below encode the current output.
define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-LABEL: v_shl_32_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_endpgm
;
; EG-LABEL: v_shl_32_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MOV T1.X, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR T2.X, PV.W, literal.x,
; EG-NEXT: MOV * T1.Y, T0.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = shl i64 %a, 32
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

; Constant shifted by a variable amount: 281474976710655 (0xffffffffffff) is
; shifted left by the kernel argument %a. The checks expect the constant to be
; materialized in a scalar register pair (0xffff high, -1 low).
define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
; GCN-LABEL: s_shl_constant_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s9, 0xffff
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s6
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_constant_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SUB_INT * T0.W, literal.x, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
; EG-NEXT: -1(nan), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.Z, KC0[2].W, literal.x,
; EG-NEXT: LSHR T0.W, PV.W, 1,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: -32(nan), 65535(9.183409e-41)
; EG-NEXT: OR_INT T1.Z, PS, PV.W,
; EG-NEXT: LSHL T0.W, literal.x, PV.Z,
; EG-NEXT: SETGT_UINT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: -1(nan), 31(4.344025e-44)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.Z, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, KC0[2].W,
; EG-NEXT: -1(nan), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T1.W, PV.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 281474976710655, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; The constant 1231231234567 (needs both 32-bit halves: 0x11e / 0xab19b207) is
; shifted left by an i64 amount loaded from %aptr. The checks expect only the
; low dword of the shift amount to be loaded.
define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
; GCN-LABEL: v_shl_constant_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_movk_i32 s7, 0x11e
; GCN-NEXT: s_mov_b32 s6, 0xab19b207
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshl_b64 v[0:1], s[6:7], v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: v_shl_constant_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: SUB_INT * T0.W, literal.x, T0.X,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
; EG-NEXT: -1424379385(-5.460358e-13), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.Z, T0.X, literal.x,
; EG-NEXT: LSHR T0.W, PV.W, 1,
; EG-NEXT: LSHL * T1.W, literal.y, T0.X,
; EG-NEXT: -32(nan), 286(4.007714e-43)
; EG-NEXT: OR_INT T1.Z, PS, PV.W,
; EG-NEXT: SETGT_UINT T0.W, T0.X, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, PV.Z,
; EG-NEXT: 31(4.344025e-44), -1424379385(-5.460358e-13)
; EG-NEXT: CNDE_INT T0.Y, PV.W, PV.Z, PS,
; EG-NEXT: LSHL * T1.W, literal.x, T0.X,
; EG-NEXT: -1424379385(-5.460358e-13), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.W, PV.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %shl = shl i64 1231231234567, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; The shifted constant 1234567 (0x12d687) fits in 32 bits, so its high half is
; zero; the shift amount is loaded from %aptr.
define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
; GCN-LABEL: v_shl_i64_32_bit_constant:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s7, 0
; GCN-NEXT: s_mov_b32 s6, 0x12d687
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshl_b64 v[0:1], s[6:7], v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: v_shl_i64_32_bit_constant:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: SUB_INT T0.W, literal.x, T0.X,
; EG-NEXT: ADD_INT * T1.W, T0.X, literal.y,
; EG-NEXT: 31(4.344025e-44), -32(nan)
; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00)
; EG-NEXT: LSHR T0.Z, PV.W, 1,
; EG-NEXT: LSHL T0.W, literal.x, T1.W,
; EG-NEXT: SETGT_UINT * T1.W, T0.X, literal.y,
; EG-NEXT: 1234567(1.729997e-39), 31(4.344025e-44)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.Z, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, T0.X,
; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T1.W, PV.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %shl = shl i64 1234567, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; The constant 64 shifted by a loaded amount. On the amdgcn target the checks
; expect 64 to appear directly as the source operand of v_lshl_b64 rather than
; being materialized in registers first.
define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
; GCN-LABEL: v_shl_inline_imm_64_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshl_b64 v[0:1], 64, v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: v_shl_inline_imm_64_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: SUB_INT T0.W, literal.x, T0.X,
; EG-NEXT: ADD_INT * T1.W, T0.X, literal.y,
; EG-NEXT: 31(4.344025e-44), -32(nan)
; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.Z, PV.W, 1,
; EG-NEXT: LSHL T0.W, literal.x, T1.W,
; EG-NEXT: SETGT_UINT * T1.W, T0.X, literal.y,
; EG-NEXT: 64(8.968310e-44), 31(4.344025e-44)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.Z, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, T0.X,
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T1.W, PV.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %shl = shl i64 64, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; The tests from here through s_shl_inline_imm_neg_4_0_i64 shift a constant by
; the kernel argument %a (%aptr is unused). On the amdgcn target the checks
; expect each constant to appear directly as the first source operand of
; s_lshl_b64. This one uses the integer 64.
define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_64_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], 64, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_64_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SUB_INT * T0.W, literal.x, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.W, literal.x, PV.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 64(8.968310e-44), -32(nan)
; EG-NEXT: LSHL T0.Z, literal.x, PS,
; EG-NEXT: LSHR T0.W, PV.W, 1,
; EG-NEXT: SETGT_UINT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 64(8.968310e-44), 31(4.344025e-44)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, PV.Z,
; EG-NEXT: LSHL * T0.W, literal.x, KC0[2].W,
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T1.W, PV.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 64, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; Shifted constant is the integer 1.
define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_1_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], 1, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_1_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: ADD_INT T0.Z, KC0[2].W, literal.x,
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.y,
; EG-NEXT: LSHL * T1.W, 1, KC0[2].W,
; EG-NEXT: -32(nan), 31(4.344025e-44)
; EG-NEXT: CNDE_INT T0.X, PV.W, PS, 0.0,
; EG-NEXT: LSHL T1.W, 1, PV.Z,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.Y, T0.W, 0.0, PV.W,
  %shl = shl i64 1, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 4607182418800017408 is 0x3ff0000000000000, the bit pattern of double 1.0;
; the amdgcn checks expect it printed as the operand 1.0.
define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_1_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], 1.0, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_1_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 1072693248(1.875000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4607182418800017408, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 13830554455654793216 is 0xbff0000000000000, the bit pattern of double -1.0.
define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_neg_1_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], -1.0, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), -1074790400(-1.875000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13830554455654793216, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 4602678819172646912 is 0x3fe0000000000000, the bit pattern of double 0.5.
define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_0_5_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], 0.5, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_0_5_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 1071644672(1.750000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4602678819172646912, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 13826050856027422720 is 0xbfe0000000000000, the bit pattern of double -0.5.
define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_neg_0_5_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], -0.5, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), -1075838976(-1.750000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13826050856027422720, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 4611686018427387904 is 0x4000000000000000, the bit pattern of double 2.0.
define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_2_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], 2.0, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_2_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 1073741824(2.000000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4611686018427387904, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 13835058055282163712 is 0xc000000000000000, the bit pattern of double -2.0.
define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_neg_2_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], -2.0, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), -1073741824(-2.000000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13835058055282163712, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 4616189618054758400 is 0x4010000000000000, the bit pattern of double 4.0.
define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_4_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], 4.0, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 1074790400(2.250000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4616189618054758400, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 13839561654909534208 is 0xc010000000000000, the bit pattern of double -4.0.
define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_neg_4_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s0, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], -4.0, s0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), -1072693248(-2.250000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13839561654909534208, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}


; Test with the 64-bit integer bitpattern for a 32-bit float in the
; low 32-bits, which is not a valid 64-bit inline immediate.
; 1082130432 is 0x40800000 (float 4.0); the amdgcn checks expect the value to
; be built in a register pair (low = 4.0, high = 0) before the shift.
define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_f32_4_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_mov_b32 s0, 4.0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SUB_INT * T0.W, literal.x, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.W, literal.x, PV.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 1082130432(4.000000e+00), -32(nan)
; EG-NEXT: LSHL T0.Z, literal.x, PS,
; EG-NEXT: LSHR T0.W, PV.W, 1,
; EG-NEXT: SETGT_UINT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 1082130432(4.000000e+00), 31(4.344025e-44)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, PV.Z,
; EG-NEXT: LSHL * T0.W, literal.x, KC0[2].W,
; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T1.W, PV.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 1082130432, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; The negative i64 -1065353216 has the float -4.0 bit pattern in its low half
; and all ones in its high half. The amdgcn checks currently expect the high
; half to be produced by copying the -1 already held in s6.
; FIXME: Copy of -1 register
define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s0, -4.0
; GCN-NEXT: s_mov_b32 s1, s6
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SUB_INT * T0.W, literal.x, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
; EG-NEXT: -1065353216(-4.000000e+00), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.Z, KC0[2].W, literal.x,
; EG-NEXT: LSHR T0.W, PV.W, 1,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: -32(nan), -1(nan)
; EG-NEXT: OR_INT T1.Z, PS, PV.W,
; EG-NEXT: LSHL T0.W, literal.x, PV.Z,
; EG-NEXT: SETGT_UINT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: -1065353216(-4.000000e+00), 31(4.344025e-44)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.Z, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, KC0[2].W,
; EG-NEXT: -1065353216(-4.000000e+00), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T1.W, PV.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 -1065353216, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 4647714815446351872 is 0x4080000000000000: the float 4.0 bit pattern sits in
; the high 32 bits and the low 32 bits are zero. The amdgcn checks expect it to
; be built in a register pair (high = 4.0, low = 0).
define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s1, 4.0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 1082130432(4.000000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4647714815446351872, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; 13871086852301127680 is 0xc080000000000000: the float -4.0 bit pattern in the
; high 32 bits with a zero low half.
define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; GCN-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dword s2, s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s1, -4.0
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: SETGT_UINT T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, literal.y, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), -1065353216(-4.000000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PS, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13871086852301127680, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

; A multiply by 2 written as mul in the IR; the checks on both targets expect
; it to be emitted as a left shift by 1. The result is stored with a volatile
; store to an undef address.
define amdgpu_kernel void @test_mul2(i32 %p) {
; GCN-LABEL: test_mul2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: test_mul2:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV T0.X, literal.x,
; EG-NEXT: LSHL * T1.X, KC0[2].Y, 1,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
  %i = mul i32 %p, 2
  store volatile i32 %i, i32 addrspace(1)* undef
  ret void
}

; (%in | 1) << 2 where the or has a single use. The checks expect the shift to
; be performed first and the or to use the pre-shifted constant 4.
; Note this is a plain function, not an amdgpu_kernel.
define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
; GCN-LABEL: shl_or_k:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v2
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_or_b32_e32 v2, 4, v2
; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: shl_or_k:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45)
  %tmp0 = or i32 %in, 1
  %tmp2 = shl i32 %tmp0, 2
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}

; Same expression as shl_or_k, but the or result is also stored to %out1. With
; the second use the checks expect the original order to be kept: or with 1,
; then shift by 2.
define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
; GCN-LABEL: shl_or_k_two_uses:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_or_b32_e32 v4, 1, v4
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
; GCN-NEXT: v_lshlrev_b32_e32 v5, 2, v4
; GCN-NEXT: buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: shl_or_k_two_uses:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Z, literal.x,
; EG-NEXT: OR_INT * T1.X, KC0[2].W, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T2.X, PS, literal.x,
; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %tmp0 = or i32 %in, 1
  %tmp2 = shl i32 %tmp0, 2
  store i32 %tmp2, i32 addrspace(1)* %out0
  store i32 %tmp0, i32 addrspace(1)* %out1
  ret void
}

; Attribute group referenced as #0 by the intrinsic declarations and the call
; in v_shl_32_i64.
attributes #0 = { nounwind readnone }
