; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MESA %s
target datalayout = "A5"

; Codegen tests for fastcc sibling (tail) calls on amdgcn. In these tests
; s_swappc_b64 is the instruction checked for an ordinary call; tests in which
; the tail call is expected to succeed only check for the final s_setpc_b64.

; Leaf callee shared by the sibling-call tests; returns %arg0 + %arg1.
; FIXME: Why is this commuted only sometimes?
; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; Same addition, but the function also owns a stack object: a volatile store
; of 9 goes to element 5 of a 16 x i32 addrspace(5) alloca (byte offset 20).
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
; GCN: s_waitcnt vmcnt(0)
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; Simplest case: forward the first two of three arguments to the leaf callee
; in a tail call. Only the label is checked.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; The caller has a stack object (volatile store to element 5 of its alloca)
; and then tail calls a callee without one.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; Both the caller and the tail-called callee have a stack object; the expected
; scratch size is twice that of the single-object tests above.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 136
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
  ret i32 %ret
}

; The caller returns void and discards the callee's i32 result. Only the label
; is checked.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; It doesn't make sense to do a tail call from a kernel. Only the label is
; checked.
; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; Callee with a byval i32 argument; the checks expect its value to be loaded
; from the stack at s32 before the add.
; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
; GCN: s_waitcnt
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
; GCN-NEXT: s_waitcnt vmcnt(0)

; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1

; GCN-NEXT: s_setpc_b64 s[30:31]
define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval(i32) align 4 %arg1) #1 {
  %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
  %add0 = add i32 %arg0, %arg1.load
  ret i32 %add0
}

; Tail call disallowed with byval in parent.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval(i32) %b.byval, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) %b.byval)
  ret i32 %ret
}

; byval only in the callee, not in the parent: unlike the byval-parent case
; above, the checks for the next function end in s_setpc_b64 directly after
; the argument store, i.e. a tail call is expected. The callee's stack
; arguments must not need more space than the parent's own incoming stack
; argument area (see no_sibling_call_callee_more_stack_space).
; NOTE(review): the parent's [32 x i32] %large argument presumably exists to
; make that incoming area large enough - confirm.
; The parent has no byval argument of its own. The callee's byval i32 is read
; from scratch address 16 (the inttoptr constant) and, per the checks, stored
; to the outgoing stack slot at s32 immediately before the final s_setpc_b64.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
; GCN-NOT: v0
; GCN-NOT: s32
; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16
; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN-NEXT: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) inttoptr (i32 16 to i32 addrspace(5)*))
  ret i32 %ret
}

; Callee taking two i32 values plus a [32 x i32] aggregate. Only elements 30
; and 31 of the aggregate are used; the checks expect those two values to be
; loaded from the stack at s32 and s32 + 4 and added to %arg0 + %arg1.
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4

; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]


; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9: v_add3_u32 v0, v0, v3, v2

; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
  %val_firststack = extractvalue [32 x i32] %large, 30
  %val_laststack = extractvalue [32 x i32] %large, 31
  %add0 = add i32 %arg0, %arg1
  %add1 = add i32 %add0, %val_firststack
  %add2 = add i32 %add1, %val_laststack
  ret i32 %add2
}

; FIXME: Why load and store same location for stack args?
; Forwards all three arguments unchanged in a tail call. The checks expect the
; two stack-passed values to be loaded from s32 / s32 + 4 and stored back to
; the same offsets, with no other use of s32 before the final s_setpc_b64.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:

; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4

; GCN-NOT: s32

; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4

; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; Same forwarding tail call, but the caller also stores 9 to element 5 of a
; local 16 x i32 alloca.
; NOTE(review): the expected store offset of 28 is presumably 8 bytes of
; incoming stack arguments plus byte offset 20 into the alloca - confirm.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; If the callee requires more stack argument space than the caller,
; don't do a tail call.
; TODO: Do we really need this restriction?
; The caller takes only two i32 arguments while the callee also takes a
; [32 x i32] aggregate, so an ordinary call (s_swappc_b64) followed by a
; return (s_setpc_b64) is expected instead of a tail call.
; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
; GCN: s_swappc_b64
; GCN: s_setpc_b64
define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; Have another non-tail call in the function: the first call is marked `tail`
; but its result feeds the second call, so it is not in tail position. The
; checks expect s_swappc_b64 for the first call, spills/reloads of v40-v42 and
; s34/s35 around it, and a final s_setpc_b64 s[4:5] to the second callee.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
; GCN: s_mov_b32 s33, s32
; GCN-DAG: s_add_u32 s32, s32, 0x400

; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v42, s34, 0
; GCN-DAG: v_writelane_b32 v42, s35, 1

; GCN-DAG: s_getpc_b64 s[4:5]
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12


; GCN: s_swappc_b64

; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload

; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12

; GCN-DAG: v_readlane_b32 s34, v42, 0
; GCN-DAG: v_readlane_b32 s35, v42, 1

; GCN: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33,
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
  ret i32 %ret
}

; Have stack object in caller and stack passed arguments. SP should be
; in same place at function exit. The checks require that s33 is not used.

; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
; GCN-NOT: s33
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:

; GCN-NOT: s33

; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; Variant in which the caller's aggregate argument ([36 x i32]) is larger than
; the callee's ([32 x i32]); the callee receives a zeroinitializer aggregate.
; NOTE(review): the expected store offset of 44 is presumably 24 bytes of
; incoming stack arguments plus byte offset 20 into the alloca - confirm.
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
; GCN-NOT: s33
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44

; GCN-NOT: s33
; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; Attribute groups. Every function visible in this file uses #1; #0 is not
; referenced by any of them.
attributes #0 = { nounwind }
attributes #1 = { nounwind noinline }