; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s

; Checks that LDS globals reached only through constant expressions are
; counted when deciding whether a private alloca may be promoted to LDS.
; Every kernel below has the same body (a 4 x i32 private array that is
; filled with constants and then read at a dynamic index); the kernels
; differ only in the final instruction, which references an LDS global
; through some form of constant expression.

target datalayout = "A5"

; [16384 x i32] is 65536 bytes; [32 x i32] is 128 bytes.
@all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4
@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4

; Global-memory variables whose initializers take the address of the LDS
; globals above. The *_global_initializer kernels reference these.
@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4

; This function cannot promote to using LDS because of the size of the
; constant expression use in the function, which was previously not
; detected.
; The label and size checks are anchored in the same way as the other
; kernels in this file, so the size cannot match a longer number.
; IR-LABEL: @constant_expression_uses_all_lds(
; IR: alloca

; ASM-LABEL: {{^}}constant_expression_uses_all_lds:
; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out

  ; Single-level constant expression (ptrtoint) using all of @all_lds.
  store volatile i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), i32 addrspace(1)* undef
  ret void
}

; Has a constant expression use through a single level of constant
; expression, but not enough LDS to block promotion
; NOTE(review): 4224 is presumably 4096 bytes for the promoted alloca
; (256 work-items x 16 bytes) plus the 128 bytes of @some_lds -- confirm.

; IR-LABEL: @constant_expression_uses_some_lds(
; IR-NOT: alloca

; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out
  ; Single-level constant expression (ptrtoint) using the small @some_lds.
  store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef
  ret void
}

; External function that receives the nested constant expressions in the
; *_multi_level kernels.
declare void @callee(i8*)

; Same as the first kernel, but @all_lds is reached through three nested
; constant expressions (getelementptr, bitcast, addrspacecast). The alloca
; is still expected to remain.

; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
; IR: alloca

; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out
  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
  ret void
}

; Multi-level constant expression use of the small @some_lds; the checks
; expect the alloca to be gone and a workitem id intrinsic to appear.

; IR-LABEL: @constant_expression_uses_some_lds_multi_level(
; IR-NOT: alloca
; IR: llvm.amdgcn.workitem.id

; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out
  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
  ret void
}

; @some_lds is reached only through the initializer of the global
; @initializer_user_some. The expected size here is 4096 rather than the
; 4224 expected by the kernels that reference @some_lds directly.

; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
; IR-NOT: alloca
; IR: llvm.amdgcn.workitem.id

; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out

  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef
  ret void
}

; We can't actually handle LDS initializers in global initializers,
; but this should count as usage.
; Kernel whose only link to @all_lds is the initializer of the global
; @initializer_user_all, which it references through a ptrtoint constant
; expression. The checks expect the alloca to remain.
; NOTE(review): the size check below uses the ".group_segment_fixed_size"
; spelling with a colon, while the other kernels in this file check the
; ".amdhsa_group_segment_fixed_size" directive. It may be matching a
; different part of the output than its siblings -- confirm it is intended.

; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
; IR: alloca

; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
; ASM: .group_segment_fixed_size: 65536
define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  ; Private 4 x i32 scratch array and a pointer to each of its elements.
  %scratch = alloca [4 x i32], align 4, addrspace(5)
  %elt0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %scratch, i32 0, i32 0
  %elt1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %scratch, i32 0, i32 1
  %elt2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %scratch, i32 0, i32 2
  %elt3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %scratch, i32 0, i32 3

  ; Fill the array with constants.
  store i32 9, i32 addrspace(5)* %elt0
  store i32 10, i32 addrspace(5)* %elt1
  store i32 99, i32 addrspace(5)* %elt2
  store i32 43, i32 addrspace(5)* %elt3

  ; Read back at a dynamic index and write the value to %out.
  %picked.ptr = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %scratch, i32 0, i32 %idx
  %picked = load i32, i32 addrspace(5)* %picked.ptr, align 4
  store i32 %picked, i32 addrspace(1)* %out

  ; Reference the global whose initializer takes the address of @all_lds.
  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef
  ret void
}

; Shared by every kernel in this file: fixes the flat work-group size at
; exactly 256 and sets waves-per-eu to "1,5".
attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" }