; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Code generation checks for dynamically sized allocas in addrspace(5) under
; GlobalISel, compiled for gfx900 and gfx1010. Each function allocates %n i32
; elements and stores 0 to the start of the allocation. The element count is
; either a kernel argument or a value loaded from the external global @gv.
;
; In every expected sequence the byte size is computed as (%n << 2) + 15,
; masked with -16, then shifted left by 6 on gfx900 and by 5 on gfx1010 before
; being added to s32 (the shift amount is presumably log2 of the wavefront
; size -- confirm against the target documentation).
;
; The expected-output lines below are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Source of the element count for the non-kernel (func_*) tests.
@gv = external addrspace(4) constant i32

; Kernel entry point; element count is the kernel argument %n and the alloca
; carries an explicit `align 4`.
define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GFX9-NEXT:    s_add_u32 s0, s0, s9
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT:    s_and_b32 s4, s4, -16
; GFX9-NEXT:    s_movk_i32 s32, 0x400
; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
; GFX9-NEXT:    s_add_u32 s4, s32, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    s_mov_b32 s33, 0
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s6, s6, s9
; GFX10-NEXT:    s_movk_i32 s32, 0x200
; GFX10-NEXT:    s_mov_b32 s33, 0
; GFX10-NEXT:    s_addc_u32 s7, s7, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT:    s_add_u32 s0, s0, s9
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT:    s_and_b32 s4, s4, -16
; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
; GFX10-NEXT:    s_add_u32 s4, s32, s4
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_endpgm
  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
  store i32 0, i32 addrspace(5)* %alloca
  ret void
}

; Non-kernel function; element count is loaded from @gv.
; NOTE(review): the `align 4` below is on the load of @gv; the alloca itself
; has no explicit alignment -- confirm this is what the test name intends.
define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s6, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s33, s6
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT:    s_and_b32 s4, s4, -16
; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
; GFX9-NEXT:    s_add_u32 s4, s32, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s6, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
; GFX10-NEXT:    s_getpc_b64 s[4:5]
; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT:    s_mov_b32 s33, s6
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT:    s_and_b32 s4, s4, -16
; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
; GFX10-NEXT:    s_add_u32 s4, s32, s4
; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %n = load i32, i32 addrspace(4)* @gv, align 4
  %alloca = alloca i32, i32 %n, addrspace(5)
  store i32 0, i32 addrspace(5)* %alloca
  ret void
}

; Kernel entry point; element count is the kernel argument %n and the alloca
; carries an explicit `align 16`. The expected output matches the align4
; kernel above instruction for instruction.
define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GFX9-NEXT:    s_add_u32 s0, s0, s9
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT:    s_and_b32 s4, s4, -16
; GFX9-NEXT:    s_movk_i32 s32, 0x400
; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
; GFX9-NEXT:    s_add_u32 s4, s32, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    s_mov_b32 s33, 0
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s6, s6, s9
; GFX10-NEXT:    s_movk_i32 s32, 0x200
; GFX10-NEXT:    s_mov_b32 s33, 0
; GFX10-NEXT:    s_addc_u32 s7, s7, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT:    s_add_u32 s0, s0, s9
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT:    s_and_b32 s4, s4, -16
; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
; GFX10-NEXT:    s_add_u32 s4, s32, s4
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_endpgm
  %alloca = alloca i32, i32 %n, align 16, addrspace(5)
  store i32 0, i32 addrspace(5)* %alloca
  ret void
}

; Non-kernel function; element count is loaded from @gv.
; NOTE(review): the `align 16` below is on the load of @gv; the alloca itself
; has no explicit alignment -- confirm this is what the test name intends.
define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s6, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_add_u32 s32, s32, 0x400
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s33, s6
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT:    s_and_b32 s4, s4, -16
; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
; GFX9-NEXT:    s_add_u32 s4, s32, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_sub_u32 s32, s32, 0x400
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s6, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_add_u32 s32, s32, 0x200
; GFX10-NEXT:    s_getpc_b64 s[4:5]
; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT:    s_mov_b32 s33, s6
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT:    s_and_b32 s4, s4, -16
; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
; GFX10-NEXT:    s_add_u32 s4, s32, s4
; GFX10-NEXT:    s_sub_u32 s32, s32, 0x200
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %n = load i32, i32 addrspace(4)* @gv, align 16
  %alloca = alloca i32, i32 %n, addrspace(5)
  store i32 0, i32 addrspace(5)* %alloca
  ret void
}

; Kernel entry point; element count is the kernel argument %n and the alloca
; carries an explicit `align 32`. Compared with the align4/align16 kernels the
; expected output uses a larger initial s32 value and masks the computed
; address with an extra s_and_b32 (0xfffff800 on gfx900, 0xfffffc00 on gfx1010).
define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
; GFX9-NEXT:    s_add_u32 s0, s0, s9
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT:    s_and_b32 s4, s4, -16
; GFX9-NEXT:    s_movk_i32 s32, 0x800
; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
; GFX9-NEXT:    s_add_u32 s4, s32, s4
; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff800
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    s_mov_b32 s33, 0
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s6, s6, s9
; GFX10-NEXT:    s_movk_i32 s32, 0x400
; GFX10-NEXT:    s_mov_b32 s33, 0
; GFX10-NEXT:    s_addc_u32 s7, s7, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT:    s_add_u32 s0, s0, s9
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT:    s_and_b32 s4, s4, -16
; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
; GFX10-NEXT:    s_add_u32 s4, s32, s4
; GFX10-NEXT:    s_and_b32 s4, s4, 0xfffffc00
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_endpgm
  %alloca = alloca i32, i32 %n, align 32, addrspace(5)
  store i32 0, i32 addrspace(5)* %alloca
  ret void
}

; Non-kernel function; element count is loaded from @gv and the alloca carries
; an explicit `align 32`. The expected prologue additionally rounds s33 up from
; s32 with an add and mask, and the computed address is masked the same way as
; in the align32 kernel. The %out parameter is not referenced in the body.
define void @func_dynamic_stackalloc_sgpr_align32(i32 addrspace(1)* %out) {
; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_add_u32 s4, s32, 0x7c0
; GFX9-NEXT:    s_mov_b32 s6, s33
; GFX9-NEXT:    s_and_b32 s33, s4, 0xfffff800
; GFX9-NEXT:    s_add_u32 s32, s32, 0x1000
; GFX9-NEXT:    s_getpc_b64 s[4:5]
; GFX9-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 s33, s6
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX9-NEXT:    s_and_b32 s4, s4, -16
; GFX9-NEXT:    s_lshl_b32 s4, s4, 6
; GFX9-NEXT:    s_add_u32 s4, s32, s4
; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff800
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_sub_u32 s32, s32, 0x1000
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_add_u32 s4, s32, 0x3e0
; GFX10-NEXT:    s_mov_b32 s6, s33
; GFX10-NEXT:    s_and_b32 s33, s4, 0xfffffc00
; GFX10-NEXT:    s_add_u32 s32, s32, 0x800
; GFX10-NEXT:    s_getpc_b64 s[4:5]
; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+12
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT:    s_mov_b32 s33, s6
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_lshl2_add_u32 s4, s4, 15
; GFX10-NEXT:    s_and_b32 s4, s4, -16
; GFX10-NEXT:    s_lshl_b32 s4, s4, 5
; GFX10-NEXT:    s_add_u32 s4, s32, s4
; GFX10-NEXT:    s_and_b32 s4, s4, 0xfffffc00
; GFX10-NEXT:    s_sub_u32 s32, s32, 0x800
; GFX10-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %n = load i32, i32 addrspace(4)* @gv
  %alloca = alloca i32, i32 %n, align 32, addrspace(5)
  store i32 0, i32 addrspace(5)* %alloca
  ret void
}