; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s

; Test optimization to reduce shifts to narrower sizes.
; Each test shifts a zero- or sign-extended value whose range is constrained
; by a preceding 'and' mask. When the mask guarantees the shifted result still
; fits in the narrow type, the 64-bit (or 32-bit) shift should be reduced to a
; narrower shift plus a constant high half.

; Mask 0x3fffffff leaves the two top bits clear, so (zext x) << 2 fits in
; 32 bits: expect a 32-bit shift and a zeroed high word (s1 = 0).
define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_zext_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_andn2_b32 s0, s0, -2.0
; GCN-NEXT:    s_lshl_b32 s0, s0, 2
; GCN-NEXT:    s_mov_b32 s1, 0
; GCN-NEXT:    ; return to shader part epilog
  %and = and i32 %x, 1073741823
  %ext = zext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}

; Same reduction as above, with the value in a VGPR.
define i64 @v_shl_i64_zext_i32(i32 %x) {
; GCN-LABEL: v_shl_i64_zext_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %and = and i32 %x, 1073741823
  %ext = zext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}

; Mask 0x1fffffff keeps the value non-negative and small enough that the
; sext + shl-by-2 also reduces to a 32-bit shift with a zero high word.
define amdgpu_ps i64 @s_shl_i64_sext_i32(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_sext_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_and_b32 s0, s0, 0x1fffffff
; GCN-NEXT:    s_lshl_b32 s0, s0, 2
; GCN-NEXT:    s_mov_b32 s1, 0
; GCN-NEXT:    ; return to shader part epilog
  %and = and i32 %x, 536870911
  %ext = sext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}

; VGPR variant of the sext case.
define i64 @v_shl_i64_sext_i32(i32 %x) {
; GCN-LABEL: v_shl_i64_sext_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %and = and i32 %x, 536870911
  %ext = sext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}

; Negative test. Mask 0x7fffffff allows the shl-by-2 to carry into the high
; word, so the full 64-bit shift must be kept.
define amdgpu_ps i64 @s_shl_i64_zext_i32_overflow(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_zext_i32_overflow:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_bitset0_b32 s0, 31
; GCN-NEXT:    s_bfe_u64 s[0:1], s[0:1], 0x200000
; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN-NEXT:    ; return to shader part epilog
  %and = and i32 %x, 2147483647
  %ext = zext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}

; VGPR variant of the zext overflow case; keeps the 64-bit shift on all
; targets (v_lshl_b64 on GFX7, v_lshlrev_b64 on GFX8/9).
define i64 @v_shl_i64_zext_i32_overflow(i32 %x) {
; GFX7-LABEL: v_shl_i64_zext_i32_overflow:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i64_zext_i32_overflow:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i64_zext_i32_overflow:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %and = and i32 %x, 2147483647
  %ext = zext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}

; Negative test, signed version: sign bits matter, so the 64-bit shift of the
; sign-extended pair is kept.
define amdgpu_ps i64 @s_shl_i64_sext_i32_overflow(i32 inreg %x) {
; GCN-LABEL: s_shl_i64_sext_i32_overflow:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_bitset0_b32 s0, 31
; GCN-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x200000
; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
; GCN-NEXT:    ; return to shader part epilog
  %and = and i32 %x, 2147483647
  %ext = sext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}

; VGPR variant: the sext is materialized as an ashr of the low word before
; the 64-bit shift.
define i64 @v_shl_i64_sext_i32_overflow(i32 %x) {
; GFX7-LABEL: v_shl_i64_sext_i32_overflow:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i64_sext_i32_overflow:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i64_sext_i32_overflow:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %and = and i32 %x, 2147483647
  %ext = sext i32 %and to i64
  %shl = shl i64 %ext, 2
  ret i64 %shl
}

; GEP of a zext'd product: the 'mul nuw nsw' of small known-range operands
; selects to v_mul_u32_u24, and the address shift operates on the 64-bit pair
; with a zeroed high half.
define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) {
; GFX7-LABEL: mulu24_shl64:
; GFX7:       ; %bb.0: ; %bb
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    v_and_b32_e32 v0, 6, v0
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, 7, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_lshl_b64 v[2:3], v[0:1], 2
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    buffer_store_dword v1, v[2:3], s[0:3], 0 addr64
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: mulu24_shl64:
; GFX8:       ; %bb.0: ; %bb
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_and_b32_e32 v0, 6, v0
; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 7, v0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
; GFX8-NEXT:    flat_store_dword v[2:3], v1
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: mulu24_shl64:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_and_b32_e32 v0, 6, v0
; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 7, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v5, s1
; GFX9-NEXT:    v_mov_b32_e32 v4, s0
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
; GFX9-NEXT:    global_store_dword v[2:3], v1, off
; GFX9-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = and i32 %tmp, 6
  %mulconv = mul nuw nsw i32 %tmp1, 7
  %tmp2 = zext i32 %mulconv to i64
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp2
  store i32 0, i32 addrspace(1)* %tmp3, align 4
  ret void
}

; Signed variant: the or with 0xff800000 pins the loaded value into i24
; range, so the multiply selects to v_mul_i32_i24 before the 64-bit shift.
define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) {
; GFX7-LABEL: muli24_shl64:
; GFX7:       ; %bb.0: ; %bb
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v2, 0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
; GFX7-NEXT:    v_mov_b32_e32 v4, s1
; GFX7-NEXT:    v_mov_b32_e32 v3, s0
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, 0xff800000, v1
; GFX7-NEXT:    v_mul_i32_i24_e32 v1, -7, v0
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[1:2], 3
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v3, v5
; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: muli24_shl64:
; GFX8:       ; %bb.0: ; %bb
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT:    flat_load_dword v4, v[1:2]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_or_b32_e32 v0, 0xff800000, v4
; GFX8-NEXT:    v_mul_i32_i24_e32 v0, -7, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: muli24_shl64:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v1, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_or_b32_e32 v1, 0xff800000, v1
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, -7, v1
; GFX9-NEXT:    v_lshlrev_b64 v[1:2], 3, v[1:2]
; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[0:1]
; GFX9-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = sext i32 %tmp to i64
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp2
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  %tmp5 = or i32 %tmp4, -8388608
  %tmp6 = mul nsw i32 %tmp5, -7
  %tmp7 = zext i32 %tmp6 to i64
  %tmp8 = shl nuw nsw i64 %tmp7, 3
  %tmp9 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp2
  store i64 %tmp8, i64 addrspace(1)* %tmp9, align 8
  ret void
}

; Vector form of the scalar zext case. The splat mask (s_brev_b32 of -4 is
; 0x3fffffff) is applied as a 64-bit and, but the per-element shifts remain
; full 64-bit here.
define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
; GCN-LABEL: s_shl_v2i64_zext_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_brev_b32 s2, -4
; GCN-NEXT:    s_mov_b32 s3, s2
; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT:    s_bfe_u64 s[2:3], s[0:1], 0x200000
; GCN-NEXT:    s_mov_b32 s0, s1
; GCN-NEXT:    s_bfe_u64 s[4:5], s[0:1], 0x200000
; GCN-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], 2
; GCN-NEXT:    ; return to shader part epilog
  %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
  %ext = zext <2 x i32> %and to <2 x i64>
  %shl = shl <2 x i64> %ext, <i64 2, i64 2>
  ret <2 x i64> %shl
}

; VGPR vector zext case: each element is masked and shifted as a 64-bit pair
; with a zero high register.
define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
; GFX7-LABEL: v_shl_v2i64_zext_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_brev_b32 s4, -4
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], 2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i64_zext_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_brev_b32 s4, -4
; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_v2i64_zext_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_brev_b32 s4, -4
; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
  %ext = zext <2 x i32> %and to <2 x i64>
  %shl = shl <2 x i64> %ext, <i64 2, i64 2>
  ret <2 x i64> %shl
}

; Vector sext case (s_brev_b32 of -8 is 0x1fffffff); the sign-extend is done
; with s_bfe_i64 and the 64-bit shifts are kept.
define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) {
; GCN-LABEL: s_shl_v2i64_sext_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_brev_b32 s2, -8
; GCN-NEXT:    s_mov_b32 s3, s2
; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GCN-NEXT:    s_bfe_i64 s[2:3], s[0:1], 0x200000
; GCN-NEXT:    s_mov_b32 s0, s1
; GCN-NEXT:    s_bfe_i64 s[4:5], s[0:1], 0x200000
; GCN-NEXT:    s_lshl_b64 s[0:1], s[2:3], 2
; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], 2
; GCN-NEXT:    ; return to shader part epilog
  %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
  %ext = sext <2 x i32> %and to <2 x i64>
  %shl = shl <2 x i64> %ext, <i64 2, i64 2>
  ret <2 x i64> %shl
}

; VGPR vector sext case: per-element ashr materializes the high words before
; the pairwise 64-bit shifts.
define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
; GFX7-LABEL: v_shl_v2i64_sext_v2i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_brev_b32 s4, -8
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT:    v_lshl_b64 v[2:3], v[2:3], 2
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i64_sext_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_brev_b32 s4, -8
; GFX8-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_v2i64_sext_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_brev_b32 s4, -8
; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
  %ext = sext <2 x i32> %and to <2 x i64>
  %shl = shl <2 x i64> %ext, <i64 2, i64 2>
  ret <2 x i64> %shl
}

; i16 -> i32 case: mask 0x3fff means the shl-by-2 fits in 16 bits, so the
; shift can be done at 16-bit width (GFX8/9 use v_lshlrev_b16 in the VGPR
; variant below; here the SGPR form keeps 32-bit ops).
define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
; GFX7-LABEL: s_shl_i32_zext_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_and_b32 s0, s0, 0x3fff
; GFX7-NEXT:    s_lshl_b32 s0, s0, 2
; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_shl_i32_zext_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_and_b32 s0, s0, 0x3fff
; GFX8-NEXT:    s_bfe_u32 s1, 2, 0x100000
; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_shl_i32_zext_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_and_b32 s0, s0, 0x3fff
; GFX9-NEXT:    s_bfe_u32 s1, 2, 0x100000
; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
; GFX9-NEXT:    ; return to shader part epilog
  %and = and i16 %x, 16383
  %ext = zext i16 %and to i32
  %shl = shl i32 %ext, 2
  ret i32 %shl
}

; VGPR i16 -> i32 case: targets with 16-bit VALU ops (GFX8/9) reduce to
; v_lshlrev_b16; GFX7 masks back down to 16 bits after a 32-bit shift.
define i32 @v_shl_i32_zext_i16(i16 %x) {
; GFX7-LABEL: v_shl_i32_zext_i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_i32_zext_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_i32_zext_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %and = and i16 %x, 16383
  %ext = zext i16 %and to i32
  %shl = shl i32 %ext, 2
  ret i32 %shl
}

; Vector i16 -> i32 case in SGPRs: the splat mask 0x3fff3fff is applied to
; the packed halves before the halves are split and shifted separately.
define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s2, 0xffff
; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
; GFX7-NEXT:    s_and_b32 s0, s0, s2
; GFX7-NEXT:    s_or_b32 s0, s1, s0
; GFX7-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
; GFX7-NEXT:    s_lshr_b32 s1, s0, 16
; GFX7-NEXT:    s_and_b32 s0, s0, s2
; GFX7-NEXT:    s_lshl_b32 s0, s0, 2
; GFX7-NEXT:    s_lshl_b32 s1, s1, 2
; GFX7-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_shl_v2i32_zext_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_movk_i32 s2, 0x3fff
; GFX8-NEXT:    s_mov_b32 s4, 0xffff
; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
; GFX8-NEXT:    s_mov_b32 s3, s2
; GFX8-NEXT:    s_and_b32 s0, s0, s4
; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT:    s_mov_b32 s5, s4
; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT:    s_lshl_b32 s0, s0, 2
; GFX8-NEXT:    s_lshl_b32 s1, s1, 2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_shl_v2i32_zext_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_lshl_b32 s1, s1, 2
; GFX9-NEXT:    ; return to shader part epilog
  %and = and <2 x i16> %x, <i16 16383, i16 16383>
  %ext = zext <2 x i16> %and to <2 x i32>
  %shl = shl <2 x i32> %ext, <i32 2, i32 2>
  ret <2 x i32> %shl
}

; FIXME: This doesn't do what we want. The pre-legalizer combiner
; fails to handle the vector splat. The post-legalizer sees the zext
; legalized into the and. This is probably not that important, since
; we really do this combine in the machine level for lowered
; getelementptrs.
define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_shl_v2i32_zext_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v1, 0x3fff3fff, v0
; GFX8-NEXT:    v_mov_b32_e32 v2, 2
; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_shl_v2i32_zext_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 2
; GFX9-NEXT:    v_and_b32_e32 v1, 0x3fff3fff, v0
; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %and = and <2 x i16> %x, <i16 16383, i16 16383>
  %ext = zext <2 x i16> %and to <2 x i32>
  %shl = shl <2 x i32> %ext, <i32 2, i32 2>
  ret <2 x i32> %shl
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable willreturn }