; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s
; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s

; FIXME: Generated test checks do not check metadata at the end of the
; function, so this also includes manually added checks.

; Test that we can select a statically sized alloca outside of the
; entry block.

; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
; alignment less than the stack alignment.
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x8
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_movk_i32 s32, 0x400
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_cselect_b32 s6, 1, 0
; GCN-NEXT:    s_and_b32 s6, s6, 1
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_mov_b32 s33, 0
; GCN-NEXT:    s_cbranch_scc1 BB0_3
; GCN-NEXT:  ; %bb.1: ; %bb.0
; GCN-NEXT:    s_load_dword s6, s[4:5], 0xc
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_cselect_b32 s6, 1, 0
; GCN-NEXT:    s_and_b32 s6, s6, 1
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_cbranch_scc1 BB0_3
; GCN-NEXT:  ; %bb.2: ; %bb.1
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GCN-NEXT:    s_load_dword s4, s[4:5], 0x10
; GCN-NEXT:    s_add_u32 s5, s32, 0x1000
; GCN-NEXT:    s_add_u32 s8, s5, 4
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    v_mov_b32_e32 v2, s5
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshl_b32 s4, s4, 2
; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT:    v_mov_b32_e32 v2, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s8
; GCN-NEXT:    s_add_u32 s4, s5, s4
; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
; GCN-NEXT:    v_mov_b32_e32 v2, s4
; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
; GCN-NEXT:    global_store_dword v1, v0, s[6:7]
; GCN-NEXT:  BB0_3: ; %bb.2
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    global_store_dword v[0:1], v0, off
; GCN-NEXT:    s_endpgm

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}
; Manually added checks for the kernel's reported scratch usage (not emitted
; by update_llc_test_checks.py; see FIXME at the top of the file).
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112

; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040

; Same as the kernel test above, but the alloca is 64-byte aligned, so the
; computed stack object address is rounded up and masked in the checks below.
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x8
; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT:    s_add_u32 s0, s0, s9
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    s_movk_i32 s32, 0x1000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_cselect_b32 s6, 1, 0
; GCN-NEXT:    s_and_b32 s6, s6, 1
; GCN-NEXT:    s_cmp_lg_u32 s6, 0
; GCN-NEXT:    s_mov_b32 s33, 0
; GCN-NEXT:    s_cbranch_scc1 BB1_2
; GCN-NEXT:  ; %bb.1: ; %bb.0
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
; GCN-NEXT:    s_load_dword s4, s[4:5], 0xc
; GCN-NEXT:    s_add_u32 s5, s32, 0x1000
; GCN-NEXT:    s_and_b32 s5, s5, 0xfffff000
; GCN-NEXT:    s_add_u32 s8, s5, 4
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshl_b32 s4, s4, 2
; GCN-NEXT:    v_mov_b32_e32 v2, s5
; GCN-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; GCN-NEXT:    v_mov_b32_e32 v2, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s8
; GCN-NEXT:    s_add_u32 s4, s5, s4
; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
; GCN-NEXT:    v_mov_b32_e32 v2, s4
; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_u32_e32 v0, v2, v0
; GCN-NEXT:    global_store_dword v1, v0, s[6:7]
; GCN-NEXT:  BB1_2: ; %bb.1
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    global_store_dword v[0:1], v0, off
; GCN-NEXT:    s_endpgm
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

; Manually added checks for the kernel's reported scratch usage (not emitted
; by update_llc_test_checks.py; see FIXME at the top of the file).
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160

; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088


; Same as the align4 kernel test above, but in a callable (non-kernel)
; function, which uses the default calling convention and a real stack frame.
define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; GCN-LABEL: func_non_entry_block_static_alloca_align4:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s8, s33
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT:    s_mov_b32 s33, s32
; GCN-NEXT:    s_add_u32 s32, s32, 0x400
; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT:    s_cbranch_execz BB2_3
; GCN-NEXT:  ; %bb.1: ; %bb.0
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GCN-NEXT:    s_and_b64 exec, exec, vcc
; GCN-NEXT:    s_cbranch_execz BB2_3
; GCN-NEXT:  ; %bb.2: ; %bb.1
; GCN-NEXT:    s_add_u32 s6, s32, 0x1000
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, s6
; GCN-NEXT:    s_add_u32 s7, s6, 4
; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
; GCN-NEXT:    v_mov_b32_e32 v2, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s7
; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v4
; GCN-NEXT:    v_add_u32_e32 v2, s6, v2
; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
; GCN-NEXT:    global_store_dword v[0:1], v2, off
; GCN-NEXT:  BB2_3: ; %bb.2
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    global_store_dword v[0:1], v0, off
; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
; GCN-NEXT:    s_mov_b32 s33, s8
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

; Same as the align64 kernel test above, but in a callable (non-kernel)
; function; the 64-byte aligned alloca forces the frame setup to round up
; and mask the frame pointer.
define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; GCN-LABEL: func_non_entry_block_static_alloca_align64:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_add_u32 s4, s32, 0xfc0
; GCN-NEXT:    s_mov_b32 s8, s33
; GCN-NEXT:    s_and_b32 s33, s4, 0xfffff000
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT:    s_add_u32 s32, s32, 0x2000
; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT:    s_cbranch_execz BB3_2
; GCN-NEXT:  ; %bb.1: ; %bb.0
; GCN-NEXT:    s_add_u32 s6, s32, 0x1000
; GCN-NEXT:    s_and_b32 s6, s6, 0xfffff000
; GCN-NEXT:    s_add_u32 s7, s6, 4
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v5, s6
; GCN-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
; GCN-NEXT:    v_mov_b32_e32 v2, 1
; GCN-NEXT:    v_mov_b32_e32 v5, s7
; GCN-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v3
; GCN-NEXT:    v_add_u32_e32 v2, s6, v2
; GCN-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT:    v_and_b32_e32 v3, 0x3ff, v4
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_add_u32_e32 v2, v2, v3
; GCN-NEXT:    global_store_dword v[0:1], v2, off
; GCN-NEXT:  BB3_2: ; %bb.1
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    global_store_dword v[0:1], v0, off
; GCN-NEXT:    s_sub_u32 s32, s32, 0x2000
; GCN-NEXT:    s_mov_b32 s33, s8
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }