; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=GCN %s

; This shows that the LDS size estimate should try not to be
; sensitive to the order of the LDS globals. It should
; estimate the worst case padding behavior to avoid overallocating
; LDS.

; These functions use the same amount of LDS, but the total, final
; size changes depending on the visit order of first use.

; The one with the suboptimal order, which results in extra padding,
; exceeds the desired limit.

; The padding estimate heuristic used by the promote alloca pass
; is mostly determined by the order of the globals.

; Raw usage = 1060 bytes
; Rounded usage:
; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
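
; A minimal sketch of the rounding above (hypothetical Python, not part
; of this test), assuming each global is placed at the next offset
; aligned to its alignment:
;
;   def align_to(off, align):
;       return (off + align - 1) // align * align
;
;   def lds_size(gvs):                   # [(size, align)] in visit order
;       off = 0
;       for size, align in gvs:
;           off = align_to(off, align) + size
;       return off
;
;   lds_size([(292, 4), (256, 8), (512, 16)])   # -> 1072
;   lds_size([(512, 16), (256, 8), (292, 4)])   # -> 1060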

; At the default occupancy guess of 7, 2340 bytes are available in total.

; 1280 bytes need to be left to promote the alloca: the [5 x i32] private
; array is 20 bytes per workitem, and the workgroup size is 64, so
; 20 * 64 = 1280 bytes. Optimally packed, this requires 1060 + 1280 = 2340
; bytes, which just fits; with the worst case padding, 1072 + 1280 = 2352
; exceeds the limit, so the alloca is not promoted in either kernel.
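
; Continuing the sketch above, the budget check these comments describe
; could look like (an assumption for illustration, not the pass's code):
;
;   LIMIT = 2340                          # bytes at occupancy guess 7
;   alloca_bytes = 5 * 4 * 64             # [5 x i32] across 64 workitems
;   worst_case = lds_size([(292, 4), (256, 8), (512, 16)])       # 1072
;   can_promote = worst_case + alloca_bytes <= LIMIT             # False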

@lds0 = internal unnamed_addr addrspace(3) global [32 x <4 x i32>] undef, align 16
@lds2 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
@lds1 = internal unnamed_addr addrspace(3) global [73 x i32] undef, align 4


; GCN-LABEL: {{^}}promote_alloca_size_order_0:
; GCN: workgroup_group_segment_byte_size = 1060
define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
  %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
  store i32 %tmp2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
  %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %tmp3, i32 addrspace(1)* %arrayidx13

  %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
  store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4

  %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
  store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8

  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16

  ret void
}

; GCN-LABEL: {{^}}promote_alloca_size_order_1:
; GCN: workgroup_group_segment_byte_size = 1072
define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
  %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
  store i32 %tmp2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
  %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %tmp3, i32 addrspace(1)* %arrayidx13

  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16

  %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
  store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8

  %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
  store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4

  ret void
}

@lds3 = internal unnamed_addr addrspace(3) global [13 x i32] undef, align 4
@lds4 = internal unnamed_addr addrspace(3) global [63 x <4 x i32>] undef, align 16

; The guess from the alignment padding pushes this over the determined
; size limit, so the alloca isn't promoted.
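
; Worked numbers, using the lds_size sketch above (an assumption about
; how the guess is formed, not the pass's actual code):
;
;   # lds3 = 13 * 4 = 52 bytes (align 4); lds4 = 63 * 16 = 1008 bytes (align 16)
;   lds_size([(52, 4), (1008, 16)])    # -> 1072 (worst case visit order)
;   lds_size([(1008, 16), (52, 4)])    # -> 1060 (final layout checked below)
;
; 1072 + 1280 = 2352 > 2340, so promotion is rejected even though the
; final layout is only 1060 bytes.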

; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
; GCN: workgroup_group_segment_byte_size = 1060
define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
  %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
  store i32 %tmp2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
  %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %tmp3, i32 addrspace(1)* %arrayidx13

  %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx
  store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4

  %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16

  ret void
}

attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,7" }