; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s

; Overview: the invocations above compile this file for gfx803 and gfx900, each
; with and without the amdhsa OS component in the triple. The kernels below
; cover two groups of loads:
;   * atomic loads for every combination of sync scope (none, singlethread,
;     agent, workgroup, wavefront) and ordering (unordered, monotonic, acquire,
;     seq_cst); the checks look at the glc modifier, s_waitcnt vmcnt(0) and
;     buffer_wbinvl1_vol around the load;
;   * loads tagged with !nontemporal from address spaces 5, 1, 3 and the
;     default address space; the checks look at the selected load instruction
;     and its glc/slc modifiers.

; Work-item id intrinsic (per its name); the nontemporal_*_1 kernels use its
; result as the index of a getelementptr on the input pointer.
declare i32 @llvm.amdgcn.workitem.id.x()
7
; Unordered atomic load with no explicit syncscope. The checks require a flat
; load with nothing after the address operand, and neither s_waitcnt vmcnt(0)
; nor buffer_wbinvl1_vol before the store of the loaded value.
; GCN-LABEL: {{^}}system_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @system_unordered(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in unordered, align 4
  store i32 %loaded, i32* %out
  ret void
}
21
; Monotonic atomic load with no explicit syncscope. The checks require the
; flat load to carry glc, with no s_waitcnt vmcnt(0) and no buffer_wbinvl1_vol
; before the store of the loaded value.
; GCN-LABEL: {{^}}system_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @system_monotonic(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in monotonic, align 4
  store i32 %loaded, i32* %out
  ret void
}
35
; Acquire atomic load with no explicit syncscope. The checks require a glc
; flat load that is immediately followed by s_waitcnt vmcnt(0) and then
; buffer_wbinvl1_vol, with no such wait ahead of the load.
; GCN-LABEL: {{^}}system_acquire:
; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NEXT: buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @system_acquire(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in acquire, align 4
  store i32 %loaded, i32* %out
  ret void
}
49
; Sequentially consistent atomic load with no explicit syncscope. The checks
; require s_waitcnt vmcnt(0) on the line directly before the glc flat load,
; and s_waitcnt vmcnt(0) plus buffer_wbinvl1_vol on the lines directly after.
; GCN-LABEL: {{^}}system_seq_cst:
; GCN:        s_waitcnt vmcnt(0){{$}}
; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NEXT: buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @system_seq_cst(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in seq_cst, align 4
  store i32 %loaded, i32* %out
  ret void
}
63
; Unordered atomic load at syncscope("singlethread"). The checks require a flat
; load with nothing after the address operand, and neither s_waitcnt vmcnt(0)
; nor buffer_wbinvl1_vol before the store of the loaded value.
; GCN-LABEL: {{^}}singlethread_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @singlethread_unordered(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
  store i32 %loaded, i32* %out
  ret void
}
77
; Monotonic atomic load at syncscope("singlethread"). Unlike the kernel with no
; syncscope, the checks here require a flat load with no glc, plus no
; s_waitcnt vmcnt(0) and no buffer_wbinvl1_vol before the store.
; GCN-LABEL: {{^}}singlethread_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @singlethread_monotonic(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
  store i32 %loaded, i32* %out
  ret void
}
91
; Acquire atomic load at syncscope("singlethread"). The checks require a flat
; load with no glc, and neither s_waitcnt vmcnt(0) nor buffer_wbinvl1_vol
; before the store of the loaded value.
; GCN-LABEL: {{^}}singlethread_acquire:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @singlethread_acquire(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
  store i32 %loaded, i32* %out
  ret void
}
105
; Sequentially consistent atomic load at syncscope("singlethread"). The checks
; require a flat load with no glc, and neither s_waitcnt vmcnt(0) nor
; buffer_wbinvl1_vol before the store of the loaded value.
; GCN-LABEL: {{^}}singlethread_seq_cst:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @singlethread_seq_cst(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
  store i32 %loaded, i32* %out
  ret void
}
119
; Unordered atomic load at syncscope("agent"). The checks require a flat load
; with nothing after the address operand, and neither s_waitcnt vmcnt(0) nor
; buffer_wbinvl1_vol before the store of the loaded value.
; GCN-LABEL: {{^}}agent_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @agent_unordered(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("agent") unordered, align 4
  store i32 %loaded, i32* %out
  ret void
}
133
; Monotonic atomic load at syncscope("agent"). The checks require the flat load
; to carry glc, with no s_waitcnt vmcnt(0) and no buffer_wbinvl1_vol before
; the store of the loaded value.
; GCN-LABEL: {{^}}agent_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @agent_monotonic(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
  store i32 %loaded, i32* %out
  ret void
}
147
; Acquire atomic load at syncscope("agent"). The checks require a glc flat load
; that is immediately followed by s_waitcnt vmcnt(0) and then
; buffer_wbinvl1_vol, with no such wait ahead of the load.
; GCN-LABEL: {{^}}agent_acquire:
; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NEXT: buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @agent_acquire(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("agent") acquire, align 4
  store i32 %loaded, i32* %out
  ret void
}
161
; Sequentially consistent atomic load at syncscope("agent"). The checks require
; s_waitcnt vmcnt(0) on the line directly before the glc flat load, and
; s_waitcnt vmcnt(0) plus buffer_wbinvl1_vol on the lines directly after.
; GCN-LABEL: {{^}}agent_seq_cst:
; GCN:        s_waitcnt vmcnt(0){{$}}
; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NEXT: buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @agent_seq_cst(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
  store i32 %loaded, i32* %out
  ret void
}
175
; Unordered atomic load at syncscope("workgroup"). The checks require a flat
; load with nothing after the address operand, and neither s_waitcnt vmcnt(0)
; nor buffer_wbinvl1_vol before the store of the loaded value.
; GCN-LABEL: {{^}}workgroup_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @workgroup_unordered(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
  store i32 %loaded, i32* %out
  ret void
}
189
; Monotonic atomic load at syncscope("workgroup"). The checks require a flat
; load with no glc, and neither s_waitcnt vmcnt(0) nor buffer_wbinvl1_vol
; before the store of the loaded value.
; GCN-LABEL: {{^}}workgroup_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @workgroup_monotonic(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
  store i32 %loaded, i32* %out
  ret void
}
203
; Acquire atomic load at syncscope("workgroup"). The checks require a flat load
; with no glc, and neither s_waitcnt vmcnt(0) nor buffer_wbinvl1_vol before
; the store of the loaded value.
; GCN-LABEL: {{^}}workgroup_acquire:
; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
; GFX89-NOT:  buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @workgroup_acquire(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
  store i32 %loaded, i32* %out
  ret void
}
217
; Sequentially consistent atomic load at syncscope("workgroup"). The checks
; require a flat load with no glc, and neither s_waitcnt vmcnt(0) nor
; buffer_wbinvl1_vol before the store of the loaded value.
; GCN-LABEL: {{^}}workgroup_seq_cst:
; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
; GFX89:      flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GFX89-NOT:  s_waitcnt vmcnt(0){{$}}
; GFX89-NOT:  buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @workgroup_seq_cst(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
  store i32 %loaded, i32* %out
  ret void
}
231
; Unordered atomic load at syncscope("wavefront"). The checks require a flat
; load with nothing after the address operand, and neither s_waitcnt vmcnt(0)
; nor buffer_wbinvl1_vol before the store of the loaded value.
; GCN-LABEL: {{^}}wavefront_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @wavefront_unordered(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
  store i32 %loaded, i32* %out
  ret void
}
245
; Monotonic atomic load at syncscope("wavefront"). The checks require a flat
; load with no glc, and neither s_waitcnt vmcnt(0) nor buffer_wbinvl1_vol
; before the store of the loaded value.
; GCN-LABEL: {{^}}wavefront_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @wavefront_monotonic(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
  store i32 %loaded, i32* %out
  ret void
}
259
; Acquire atomic load at syncscope("wavefront"). The checks require a flat load
; with no glc, and neither s_waitcnt vmcnt(0) nor buffer_wbinvl1_vol before
; the store of the loaded value.
; GCN-LABEL: {{^}}wavefront_acquire:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @wavefront_acquire(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
  store i32 %loaded, i32* %out
  ret void
}
273
; Sequentially consistent atomic load at syncscope("wavefront"). The checks
; require a flat load with no glc, and neither s_waitcnt vmcnt(0) nor
; buffer_wbinvl1_vol before the store of the loaded value.
; GCN-LABEL: {{^}}wavefront_seq_cst:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @wavefront_seq_cst(i32* %in, i32* %out) {
bb:
  %loaded = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
  store i32 %loaded, i32* %out
  ret void
}
287
; Nontemporal load through an addrspace(5) pointer taken directly from the
; kernel argument. The check requires a buffer_load_dword using offen
; addressing with both glc and slc set.
; GCN-LABEL: {{^}}nontemporal_private_0:
; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_private_0(i32 addrspace(5)* %in, i32* %out) {
bb:
  %loaded = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
  store i32 %loaded, i32* %out
  ret void
}
297
; Nontemporal load through an addrspace(5) pointer indexed by the result of
; the workitem.id.x intrinsic. The check requires a buffer_load_dword using
; offen addressing with both glc and slc set.
; GCN-LABEL: {{^}}nontemporal_private_1:
; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_private_1(i32 addrspace(5)* %in, i32* %out) {
bb:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %lane
  %loaded = load i32, i32 addrspace(5)* %elt.ptr, align 4, !nontemporal !0
  store i32 %loaded, i32* %out
  ret void
}
309
; Nontemporal load through an addrspace(1) pointer taken directly from the
; kernel argument. The check requires an s_load_dword at offset 0x0 with
; nothing following the offset on that line.
; GCN-LABEL: {{^}}nontemporal_global_0:
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}}
define amdgpu_kernel void @nontemporal_global_0(i32 addrspace(1)* %in, i32* %out) {
bb:
  %loaded = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
  store i32 %loaded, i32* %out
  ret void
}
319
; Nontemporal load through an addrspace(1) pointer indexed by the result of
; the workitem.id.x intrinsic. The GFX8 prefix requires a flat_load_dword and
; the GFX9 prefix a global_load_dword; both must carry glc and slc.
; GCN-LABEL: {{^}}nontemporal_global_1:
; GFX8:  flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
; GFX9:  global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
define amdgpu_kernel void @nontemporal_global_1(i32 addrspace(1)* %in, i32* %out) {
bb:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lane
  %loaded = load i32, i32 addrspace(1)* %elt.ptr, align 4, !nontemporal !0
  store i32 %loaded, i32* %out
  ret void
}
332
; Nontemporal load through an addrspace(3) pointer taken directly from the
; kernel argument. The check requires a ds_read_b32 with nothing after its
; address operand.
; GCN-LABEL: {{^}}nontemporal_local_0:
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_local_0(i32 addrspace(3)* %in, i32* %out) {
bb:
  %loaded = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
  store i32 %loaded, i32* %out
  ret void
}
342
; Nontemporal load through an addrspace(3) pointer indexed by the result of
; the workitem.id.x intrinsic. The check requires a ds_read_b32 with nothing
; after its address operand.
; GCN-LABEL: {{^}}nontemporal_local_1:
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_local_1(i32 addrspace(3)* %in, i32* %out) {
bb:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %lane
  %loaded = load i32, i32 addrspace(3)* %elt.ptr, align 4, !nontemporal !0
  store i32 %loaded, i32* %out
  ret void
}
354
; Nontemporal load through a default-address-space pointer taken directly from
; the kernel argument. The check requires a flat_load_dword with both glc and
; slc set.
; GCN-LABEL: {{^}}nontemporal_flat_0:
; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_flat_0(i32* %in, i32* %out) {
bb:
  %loaded = load i32, i32* %in, align 4, !nontemporal !0
  store i32 %loaded, i32* %out
  ret void
}
364
; Nontemporal load through a default-address-space pointer indexed by the
; result of the workitem.id.x intrinsic. The check requires a flat_load_dword
; with both glc and slc set.
; GCN-LABEL: {{^}}nontemporal_flat_1:
; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_flat_1(i32* %in, i32* %out) {
bb:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr inbounds i32, i32* %in, i32 %lane
  %loaded = load i32, i32* %elt.ptr, align 4, !nontemporal !0
  store i32 %loaded, i32* %out
  ret void
}
376
; Metadata node referenced by every `!nontemporal !0` attachment on the loads
; in the nontemporal_* kernels of this file.
!0 = !{i32 1}
378