1; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
2
; Intrinsic declarations.
; In the part of the file visible here, the .i32 tbuffer store variant is not
; referenced, and the .v4i32 variant is referenced only from the commented-out
; tbuffer test.
3declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
4declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
; Called between the two loads of
; @no_reorder_barrier_local_load_global_store_local_load.
5declare void @llvm.amdgcn.s.barrier() #1
6
7
; Globals that hold a pointer; each lives in addrspace(3) and is initialised
; to undef. The tests load their base pointer from one of these.
; @stored_lds_ptr holds an addrspace(3) pointer, @stored_constant_ptr an
; addrspace(2) pointer, and @stored_global_ptr an addrspace(1) pointer.
; @stored_global_ptr is not referenced in the part of the file visible here.
8@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
9@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
10@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
11
; Two addrspace(3) loads (elements 1 and 3 of the pointer read from
; @stored_lds_ptr) are separated in the IR by a non-volatile store to the
; addrspace(1) pointer %gptr. The checks expect one ds_read2_b32 covering
; offset0:1 and offset1:3, followed later by a buffer_store_dword.
12; FUNC-LABEL: @reorder_local_load_global_store_local_load
13; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
14; CI: buffer_store_dword
15define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
  ; Base pointer comes from memory, not from an argument.
16  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
17
18  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
19  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
20
  ; load / store-to-other-address-space / load: the sequence under test.
21  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
22  store i32 99, i32 addrspace(1)* %gptr, align 4
23  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
24
  ; Both loaded values feed the result stored to %out.
25  %add = add nsw i32 %tmp1, %tmp2
26
27  store i32 %add, i32 addrspace(1)* %out, align 4
28  ret void
29}
30
; Same shape as @reorder_local_load_global_store_local_load, except that the
; intervening addrspace(1) store is volatile. The checks expect the original
; order to be kept: a ds_read_b32 at offset:4, then a buffer_store_dword, then
; a ds_read_b32 at offset:12 (two separate reads, not one ds_read2_b32).
31; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load
32; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
33; CI: buffer_store_dword
34; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
35define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
36  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
37
  ; i32 elements 1 and 3, i.e. byte offsets 4 and 12 as seen in the checks.
38  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
39  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
40
41  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
  ; The volatile qualifier on this store is the only IR difference from the
  ; reorderable variant of this test.
42  store volatile i32 99, i32 addrspace(1)* %gptr, align 4
43  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
44
45  %add = add nsw i32 %tmp1, %tmp2
46
47  store i32 %add, i32 addrspace(1)* %out, align 4
48  ret void
49}
50
; An addrspace(3) load, a non-volatile addrspace(1) store and a call to
; @llvm.amdgcn.s.barrier precede the second addrspace(3) load. The checks
; expect two separate ds_read_b32 instructions (offset:4, then offset:12)
; rather than a merged ds_read2_b32, followed by a buffer_store_dword.
; NOTE(review): the trailing buffer_store_dword check carries no operands, so
; it does not identify which of the two stores it matches; confirm whether the
; position of the store of 99 was meant to be pinned as well.
51; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
52; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
53; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
54; CI: buffer_store_dword
55define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
56  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
57
58  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
59  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
60
61  %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
62  store i32 99, i32 addrspace(1)* %gptr, align 4
  ; The barrier sits between the store and the second load.
63  call void @llvm.amdgcn.s.barrier() #1
64  %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
65
66  %add = add nsw i32 %tmp1, %tmp2
67
68  store i32 %add, i32 addrspace(1)* %out, align 4
69  ret void
70}
71
; Two addrspace(2) loads (elements 1 and 3 of the pointer read from
; @stored_constant_ptr) are separated in the IR by a store to the addrspace(1)
; pointer %gptr. The checks capture the SGPR pair produced by two
; v_readfirstlane_b32 instructions and expect two s_load_dword instructions
; using that pair with offsets 0x1 and 0x3. The DAG-style checks allow a
; buffer_store_dword to appear in any order relative to the first
; v_readfirstlane_b32, and the two s_load_dword in any order relative to each
; other; a final buffer_store_dword must follow the loads.
72; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
73; CI-DAG: buffer_store_dword
74; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
75; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
76; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
77; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
78; CI: buffer_store_dword
79define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
  ; The addrspace(2) base pointer is itself loaded from an addrspace(3) global.
80  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
81
82  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
83  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3
84
  ; load / store-to-other-address-space / load: the sequence under test.
85  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
86  store i32 99, i32 addrspace(1)* %gptr, align 4
87  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
88
89  %add = add nsw i32 %tmp1, %tmp2
90
91  store i32 %add, i32 addrspace(1)* %out, align 4
92  ret void
93}
94
; Two addrspace(2) loads (elements 1 and 3 of the pointer read from
; @stored_constant_ptr) are separated in the IR by a store to the addrspace(3)
; pointer %lptr. The checks expect both s_load_dword instructions (offsets 0x1
; and 0x3, in either order) to appear before the ds_write_b32, which is in
; turn followed by a buffer_store_dword.
95; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
96; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
97; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
98; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
99; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
100; CI: ds_write_b32
101; CI: buffer_store_dword
102define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
103  %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
104
105  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
106  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 3
107
108  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
  ; Intervening store goes to addrspace(3) here (addrspace(1) in the
  ; global-store variant of this test).
109  store i32 99, i32 addrspace(3)* %lptr, align 4
110  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
111
112  %add = add nsw i32 %tmp1, %tmp2
113
114  store i32 %add, i32 addrspace(1)* %out, align 4
115  ret void
116}
117
; Here the addrspace(2) base pointer %ptr0 is a function argument rather than
; a value loaded from a global, and %lptr is marked noalias. Two addrspace(2)
; loads (elements 1 and 2) are separated in the IR by a store to %lptr.
; The checks expect three s_load_dword instructions before the ds_write_b32,
; which is followed by a buffer_store_dword.
; NOTE(review): the s_load_dword checks carry no operands, so which three
; loads they match is not pinned by this test; presumably they include
; argument loads as well as the two loads below - confirm against llc output.
118; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load
119; CI: s_load_dword
120; CI: s_load_dword
121; CI: s_load_dword
122; CI: ds_write_b32
123; CI: buffer_store_dword
124define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
125  %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
126  %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
127
  ; load / store-to-other-address-space / load: the sequence under test.
128  %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4
129  store i32 99, i32 addrspace(3)* %lptr, align 4
130  %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4
131
132  %add = add nsw i32 %tmp1, %tmp2
133
134  store i32 %add, i32 addrspace(1)* %out, align 4
135  ret void
136}
137
; Two addrspace(1) loads (elements 1 and 3 of the argument %ptr0) are
; separated in the IR by a store to the addrspace(3) pointer %lptr. The checks
; expect two buffer_load_dword instructions before the ds_write_b32, which is
; followed by a buffer_store_dword.
138; FUNC-LABEL: @reorder_global_load_local_store_global_load
139; CI: buffer_load_dword
140; CI: buffer_load_dword
141; CI: ds_write_b32
142; CI: buffer_store_dword
143define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
144  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
145  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3
146
  ; load / store-to-other-address-space / load: the sequence under test.
147  %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4
148  store i32 99, i32 addrspace(3)* %lptr, align 4
149  %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4
150
151  %add = add nsw i32 %tmp1, %tmp2
152
153  store i32 %add, i32 addrspace(1)* %out, align 4
154  ret void
155}
156
; Interleaved loads and stores to three distinct i32 elements (3, 100 and 102)
; of a single noalias addrspace(3) pointer. The checks expect the loads of
; elements 100 and 102 to appear as one ds_read2_b32 (offset0:100
; offset1:102), followed by ds_write_b32 at offset:400 and offset:408
; (4 * 100 and 4 * 102), then a buffer_store_dword and s_endpgm.
; The store to and load from element 3 are not checked. %gptr is unused.
157; FUNC-LABEL: @reorder_local_offsets
158; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
159; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
160; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
161; CI: buffer_store_dword
162; CI: s_endpgm
163define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
164  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
165  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
166  %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102
167
  ; IR order: store [3], load [100], load [102], store [100], load [3],
  ; store [102]. Each load reads an element that the immediately preceding
  ; store does not write.
168  store i32 123, i32 addrspace(3)* %ptr1, align 4
169  %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4
170  %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4
171  store i32 123, i32 addrspace(3)* %ptr2, align 4
172  %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4
173  store i32 789, i32 addrspace(3)* %ptr3, align 4
174
  ; All three loaded values feed the result stored to %out.
175  %add.0 = add nsw i32 %tmp2, %tmp1
176  %add.1 = add nsw i32 %add.0, %tmp3
177  store i32 %add.1, i32 addrspace(1)* %out, align 4
178  ret void
179}
180
; Same access pattern as @reorder_local_offsets, but on a noalias addrspace(1)
; pointer. Byte offsets 12, 400 and 408 in the checks are 4 * 3, 4 * 100 and
; 4 * 102. The checks expect this instruction order: load 400, store 12,
; load 408, load 12, store 400, store 408, s_endpgm. The first load therefore
; appears ahead of the first store, unlike in the IR. %gptr is unused.
181; FUNC-LABEL: @reorder_global_offsets
182; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
183; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
184; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
185; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
186; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
187; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
188; CI: s_endpgm
189define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
190  %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
191  %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
192  %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102
193
  ; IR order: store [3], load [100], load [102], store [100], load [3],
  ; store [102].
194  store i32 123, i32 addrspace(1)* %ptr1, align 4
195  %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4
196  %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4
197  store i32 123, i32 addrspace(1)* %ptr2, align 4
198  %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4
199  store i32 789, i32 addrspace(1)* %ptr3, align 4
200
  ; All three loaded values feed the result stored to %out.
201  %add.0 = add nsw i32 %tmp2, %tmp1
202  %add.1 = add nsw i32 %add.0, %tmp3
203  store i32 %add.1, i32 addrspace(1)* %out, align 4
204  ret void
205}
206
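; The test below is disabled: its function body is commented out, and its
; check lines use the XFUNC and XCI prefixes, which the RUN line does not
; enable.
207; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load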
208; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
209; XCI: TBUFFER_STORE_FORMAT
210; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
211; define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 {
212;   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
213
214;   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
215;   %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2
216
217;   %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4
218
219;   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
220;   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
221;         i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
222;         i32 1, i32 0)
223
224;   %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
225
226;   %add = add nsw i32 %tmp1, %tmp2
227
228;   store i32 %add, i32 addrspace(1)* %out, align 4
229;   ret void
230; }
231
; Attribute groups. #0 is attached to every test function defined in this
; file; #1 is attached to the @llvm.amdgcn.s.barrier declaration and to its
; call site.
232attributes #0 = { nounwind }
233attributes #1 = { nounwind convergent }
234