; Checks that the AMDGPU promote-alloca pass accounts for LDS arrays that a
; kernel reaches only through constant expressions. The first command runs
; the pass on its own through opt and inspects the resulting module; the
; second runs llc and inspects the reported group segment size.
; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s

; Objects created by alloca live in address space 5.
target datalayout = "A5"

; Address-space-3 (LDS) arrays referenced by the kernels below.
;   @all_lds  : 16384 x i32 = 65536 bytes
;   @some_lds :    32 x i32 =   128 bytes
@all_lds = internal unnamed_addr addrspace(3) global [16384 x i32] undef, align 4
@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4

; Address-space-1 globals whose initializers take the address of the arrays
; above. The *_global_initializer kernels reference these globals rather than
; the LDS arrays themselves.
@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4

; This function cannot promote to using LDS because of the size of the
; constant expression use in the function, which was previously not
; detected.
;
; @all_lds (65536 bytes) is referenced directly through a ptrtoint constant
; expression, so the private array is expected to stay an alloca and the
; reported group segment size is expected to equal the size of @all_lds.
; IR-LABEL: @constant_expression_uses_all_lds(
; IR: alloca

; The label and size patterns are anchored like the other checks in this
; file so that neither can match a longer symbol or a longer number.
; ASM-LABEL: {{^}}constant_expression_uses_all_lds:
; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  ; Private 4 x i32 array: the promotion candidate.
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  ; Dynamically indexed read of the array, written out to %out.
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out

  ; The only reference to @all_lds in this kernel: a ptrtoint constant
  ; expression used as the stored value.
  store volatile i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), i32 addrspace(1)* undef
  ret void
}

; Has a constant expression use through a single level of constant
; expression, but not enough LDS to block promotion.
;
; The expected group segment size is 4224 = 4096 + 128, where 128 is the size
; of @some_lds. The 4096 is presumably the promoted [4 x i32] array
; (16 bytes) for each of the 256 work-items requested by attribute #0 --
; confirm against the pass.

; IR-LABEL: @constant_expression_uses_some_lds(
; IR-NOT: alloca

; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  ; Private 4 x i32 array: the promotion candidate.
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  ; Dynamically indexed read of the array, written out to %out.
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out
  ; The only reference to @some_lds in this kernel: one ptrtoint constant
  ; expression.
  store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef
  ret void
}

; External function taking an i8 pointer in the default address space. The
; *_multi_level kernels pass it a pointer derived from an LDS array through
; nested constant expressions.
declare void @callee(i8*)

; @all_lds is reached through three nested constant expressions
; (getelementptr inside bitcast inside addrspacecast). The private array is
; expected to stay an alloca, and the reported group segment size is expected
; to equal the size of @all_lds (65536 bytes).
; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
; IR: alloca

; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  ; Private 4 x i32 array: the promotion candidate.
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  ; Dynamically indexed read of the array, written out to %out.
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out
  ; Address of element 8 of @all_lds, cast to i8 addrspace(3)* and then to a
  ; default-address-space i8*, all as one nested constant expression.
  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
  ret void
}

; @some_lds (128 bytes) is reached through three nested constant expressions
; (getelementptr inside bitcast inside addrspacecast). No alloca is expected
; to remain, a workitem-id intrinsic is expected to appear in its place, and
; the expected group segment size is 4224 = 4096 + 128.
; IR-LABEL: @constant_expression_uses_some_lds_multi_level(
; IR-NOT: alloca
; IR: llvm.amdgcn.workitem.id

; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  ; Private 4 x i32 array: the promotion candidate.
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  ; Dynamically indexed read of the array, written out to %out.
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out
  ; Address of element 8 of @some_lds, cast to i8 addrspace(3)* and then to a
  ; default-address-space i8*, all as one nested constant expression.
  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
  ret void
}

; The kernel references @initializer_user_some, an address-space-1 global
; whose initializer takes the address of @some_lds; it never names @some_lds
; itself. No alloca is expected to remain. The expected group segment size is
; 4096, i.e. the 128 bytes of @some_lds that the directly-referencing kernels
; report (4224) are not included here.
; IR-LABEL: @constant_expression_uses_some_lds_global_initializer(
; IR-NOT: alloca
; IR: llvm.amdgcn.workitem.id

; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  ; Private 4 x i32 array: the promotion candidate.
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  ; Dynamically indexed read of the array, written out to %out.
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out

  ; Indirect route to @some_lds: the stored value is the address of the
  ; global whose initializer refers to it.
  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef
  ret void
}

; We can't actually handle LDS initializers in global initializers,
; but this should count as usage.
;
; The kernel references @initializer_user_all, an address-space-1 global
; whose initializer takes the address of @all_lds (65536 bytes); it never
; names @all_lds itself. The private array is expected to stay an alloca.

; IR-LABEL: @constant_expression_uses_all_lds_global_initializer(
; IR: alloca

; NOTE(review): the size pattern below is spelled `.group_segment_fixed_size:`
; while every other size pattern in this file uses
; `.amdhsa_group_segment_fixed_size`. Presumably it is meant to match a
; different part of the llc output -- confirm that this is intentional.
; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
; ASM: .group_segment_fixed_size: 65536
define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
entry:
  ; Private 4 x i32 array: the promotion candidate.
  %stack = alloca [4 x i32], align 4, addrspace(5)
  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
  store i32 9, i32 addrspace(5)* %gep0
  store i32 10, i32 addrspace(5)* %gep1
  store i32 99, i32 addrspace(5)* %gep2
  store i32 43, i32 addrspace(5)* %gep3
  ; Dynamically indexed read of the array, written out to %out.
  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
  store i32 %load, i32 addrspace(1)* %out
  ; Indirect route to @all_lds: the stored value is the address of the
  ; global whose initializer refers to it.
  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef
  ret void
}

; Attribute group shared by all six kernels in this file: waves-per-EU is
; given as "1,5" and the flat work-group size as "256,256" (minimum and
; maximum both 256).
attributes #0 = { "amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="256,256" }