1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7
; Test: workgroup-scope *unordered* atomic i32 load from global memory
; (addrspace(1)) whose result is written to %out with a plain store.
; In every run configuration the expected load instruction is plain (no glc
; modifier) and no cache-invalidate instruction is expected.
8define amdgpu_kernel void @global_workgroup_unordered_load(
9; GFX6-LABEL: global_workgroup_unordered_load:
10; GFX6:       ; %bb.0: ; %entry
11; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
12; GFX6-NEXT:    s_mov_b32 s3, 0xf000
13; GFX6-NEXT:    s_mov_b32 s2, -1
14; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX6-NEXT:    s_mov_b32 s0, s4
16; GFX6-NEXT:    s_mov_b32 s1, s5
17; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
18; GFX6-NEXT:    s_mov_b32 s4, s6
19; GFX6-NEXT:    s_mov_b32 s5, s7
20; GFX6-NEXT:    s_mov_b32 s6, s2
21; GFX6-NEXT:    s_mov_b32 s7, s3
22; GFX6-NEXT:    s_waitcnt vmcnt(0)
23; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
24; GFX6-NEXT:    s_endpgm
25;
26; GFX7-LABEL: global_workgroup_unordered_load:
27; GFX7:       ; %bb.0: ; %entry
28; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
29; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
30; GFX7-NEXT:    v_mov_b32_e32 v0, s0
31; GFX7-NEXT:    v_mov_b32_e32 v1, s1
32; GFX7-NEXT:    flat_load_dword v0, v[0:1]
33; GFX7-NEXT:    v_mov_b32_e32 v2, s2
34; GFX7-NEXT:    v_mov_b32_e32 v3, s3
35; GFX7-NEXT:    s_waitcnt vmcnt(0)
36; GFX7-NEXT:    flat_store_dword v[2:3], v0
37; GFX7-NEXT:    s_endpgm
38;
39; GFX10-WGP-LABEL: global_workgroup_unordered_load:
40; GFX10-WGP:       ; %bb.0: ; %entry
41; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
42; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
43; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
45; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
46; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
47; GFX10-WGP-NEXT:    s_endpgm
48;
49; GFX10-CU-LABEL: global_workgroup_unordered_load:
50; GFX10-CU:       ; %bb.0: ; %entry
51; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
52; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
53; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
55; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
56; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
57; GFX10-CU-NEXT:    s_endpgm
58;
59; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_load:
60; SKIP-CACHE-INV:       ; %bb.0: ; %entry
61; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
62; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
63; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
64; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
65; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
66; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
67; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
68; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
69; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
70; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
71; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
72; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
73; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
74; SKIP-CACHE-INV-NEXT:    s_endpgm
75    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
76entry:
77  %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4
78  store i32 %val, i32 addrspace(1)* %out
79  ret void
80}
81
; Test: workgroup-scope *monotonic* atomic i32 load from global memory
; (addrspace(1)) whose result is written to %out with a plain store.
; Only the GFX10-WGP checks expect the glc modifier on the load; all other
; run configurations expect a plain load. No invalidate is expected anywhere.
82define amdgpu_kernel void @global_workgroup_monotonic_load(
83; GFX6-LABEL: global_workgroup_monotonic_load:
84; GFX6:       ; %bb.0: ; %entry
85; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
86; GFX6-NEXT:    s_mov_b32 s3, 0xf000
87; GFX6-NEXT:    s_mov_b32 s2, -1
88; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX6-NEXT:    s_mov_b32 s0, s4
90; GFX6-NEXT:    s_mov_b32 s1, s5
91; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
92; GFX6-NEXT:    s_mov_b32 s4, s6
93; GFX6-NEXT:    s_mov_b32 s5, s7
94; GFX6-NEXT:    s_mov_b32 s6, s2
95; GFX6-NEXT:    s_mov_b32 s7, s3
96; GFX6-NEXT:    s_waitcnt vmcnt(0)
97; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
98; GFX6-NEXT:    s_endpgm
99;
100; GFX7-LABEL: global_workgroup_monotonic_load:
101; GFX7:       ; %bb.0: ; %entry
102; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
103; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX7-NEXT:    v_mov_b32_e32 v0, s0
105; GFX7-NEXT:    v_mov_b32_e32 v1, s1
106; GFX7-NEXT:    flat_load_dword v0, v[0:1]
107; GFX7-NEXT:    v_mov_b32_e32 v2, s2
108; GFX7-NEXT:    v_mov_b32_e32 v3, s3
109; GFX7-NEXT:    s_waitcnt vmcnt(0)
110; GFX7-NEXT:    flat_store_dword v[2:3], v0
111; GFX7-NEXT:    s_endpgm
112;
113; GFX10-WGP-LABEL: global_workgroup_monotonic_load:
114; GFX10-WGP:       ; %bb.0: ; %entry
115; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
116; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
117; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
119; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
120; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
121; GFX10-WGP-NEXT:    s_endpgm
122;
123; GFX10-CU-LABEL: global_workgroup_monotonic_load:
124; GFX10-CU:       ; %bb.0: ; %entry
125; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
126; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
127; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
129; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
130; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
131; GFX10-CU-NEXT:    s_endpgm
132;
133; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_load:
134; SKIP-CACHE-INV:       ; %bb.0: ; %entry
135; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
136; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
137; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
138; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
139; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
140; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
141; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
142; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
143; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
144; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
145; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
146; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
147; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
148; SKIP-CACHE-INV-NEXT:    s_endpgm
149    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
150entry:
151  %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4
152  store i32 %val, i32 addrspace(1)* %out
153  ret void
154}
155
; Test: workgroup-scope *acquire* atomic i32 load from global memory
; (addrspace(1)) whose result is written to %out with a plain store.
; The GFX10-WGP checks expect a glc load followed by s_waitcnt vmcnt(0) and
; buffer_gl0_inv ahead of the store. All other run configurations expect a
; plain load and no invalidate instruction.
156define amdgpu_kernel void @global_workgroup_acquire_load(
157; GFX6-LABEL: global_workgroup_acquire_load:
158; GFX6:       ; %bb.0: ; %entry
159; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
160; GFX6-NEXT:    s_mov_b32 s3, 0xf000
161; GFX6-NEXT:    s_mov_b32 s2, -1
162; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX6-NEXT:    s_mov_b32 s0, s4
164; GFX6-NEXT:    s_mov_b32 s1, s5
165; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
166; GFX6-NEXT:    s_mov_b32 s4, s6
167; GFX6-NEXT:    s_mov_b32 s5, s7
168; GFX6-NEXT:    s_mov_b32 s6, s2
169; GFX6-NEXT:    s_mov_b32 s7, s3
170; GFX6-NEXT:    s_waitcnt vmcnt(0)
171; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
172; GFX6-NEXT:    s_endpgm
173;
174; GFX7-LABEL: global_workgroup_acquire_load:
175; GFX7:       ; %bb.0: ; %entry
176; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
177; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
178; GFX7-NEXT:    v_mov_b32_e32 v0, s0
179; GFX7-NEXT:    v_mov_b32_e32 v1, s1
180; GFX7-NEXT:    flat_load_dword v0, v[0:1]
181; GFX7-NEXT:    v_mov_b32_e32 v2, s2
182; GFX7-NEXT:    v_mov_b32_e32 v3, s3
183; GFX7-NEXT:    s_waitcnt vmcnt(0)
184; GFX7-NEXT:    flat_store_dword v[2:3], v0
185; GFX7-NEXT:    s_endpgm
186;
187; GFX10-WGP-LABEL: global_workgroup_acquire_load:
188; GFX10-WGP:       ; %bb.0: ; %entry
189; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
190; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
191; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
192; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
193; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
194; GFX10-WGP-NEXT:    buffer_gl0_inv
195; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
196; GFX10-WGP-NEXT:    s_endpgm
197;
198; GFX10-CU-LABEL: global_workgroup_acquire_load:
199; GFX10-CU:       ; %bb.0: ; %entry
200; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
201; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
202; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
204; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
205; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
206; GFX10-CU-NEXT:    s_endpgm
207;
208; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_load:
209; SKIP-CACHE-INV:       ; %bb.0: ; %entry
210; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
211; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
212; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
213; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
214; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
215; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
216; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
217; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
218; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
219; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
220; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
221; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
222; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
223; SKIP-CACHE-INV-NEXT:    s_endpgm
224    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
225entry:
226  %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
227  store i32 %val, i32 addrspace(1)* %out
228  ret void
229}
230
; Test: workgroup-scope *seq_cst* atomic i32 load from global memory
; (addrspace(1)) whose result is written to %out with a plain store.
; Expected differences between run configurations:
;  - GFX6, GFX7 and SKIP-CACHE-INV checks: a second s_waitcnt lgkmcnt(0)
;    immediately before the (plain) load.
;  - GFX10-WGP checks: s_waitcnt vmcnt(0) lgkmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 before a glc load, then s_waitcnt vmcnt(0) and
;    buffer_gl0_inv after it.
;  - GFX10-CU checks: a plain load preceded by a single s_waitcnt lgkmcnt(0)
;    and no invalidate.
231define amdgpu_kernel void @global_workgroup_seq_cst_load(
232; GFX6-LABEL: global_workgroup_seq_cst_load:
233; GFX6:       ; %bb.0: ; %entry
234; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
235; GFX6-NEXT:    s_mov_b32 s3, 0xf000
236; GFX6-NEXT:    s_mov_b32 s2, -1
237; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX6-NEXT:    s_mov_b32 s0, s4
239; GFX6-NEXT:    s_mov_b32 s1, s5
240; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
242; GFX6-NEXT:    s_mov_b32 s4, s6
243; GFX6-NEXT:    s_mov_b32 s5, s7
244; GFX6-NEXT:    s_mov_b32 s6, s2
245; GFX6-NEXT:    s_mov_b32 s7, s3
246; GFX6-NEXT:    s_waitcnt vmcnt(0)
247; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
248; GFX6-NEXT:    s_endpgm
249;
250; GFX7-LABEL: global_workgroup_seq_cst_load:
251; GFX7:       ; %bb.0: ; %entry
252; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
253; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX7-NEXT:    v_mov_b32_e32 v0, s0
255; GFX7-NEXT:    v_mov_b32_e32 v1, s1
256; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX7-NEXT:    flat_load_dword v0, v[0:1]
258; GFX7-NEXT:    v_mov_b32_e32 v2, s2
259; GFX7-NEXT:    v_mov_b32_e32 v3, s3
260; GFX7-NEXT:    s_waitcnt vmcnt(0)
261; GFX7-NEXT:    flat_store_dword v[2:3], v0
262; GFX7-NEXT:    s_endpgm
263;
264; GFX10-WGP-LABEL: global_workgroup_seq_cst_load:
265; GFX10-WGP:       ; %bb.0: ; %entry
266; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
267; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
268; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
269; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
270; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
271; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
272; GFX10-WGP-NEXT:    buffer_gl0_inv
273; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
274; GFX10-WGP-NEXT:    s_endpgm
275;
276; GFX10-CU-LABEL: global_workgroup_seq_cst_load:
277; GFX10-CU:       ; %bb.0: ; %entry
278; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
279; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
280; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
282; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
283; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
284; GFX10-CU-NEXT:    s_endpgm
285;
286; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_load:
287; SKIP-CACHE-INV:       ; %bb.0: ; %entry
288; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
289; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
290; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
291; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
292; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
293; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
294; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
295; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
296; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
297; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
298; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
299; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
300; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
301; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
302; SKIP-CACHE-INV-NEXT:    s_endpgm
303    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
304entry:
305  %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4
306  store i32 %val, i32 addrspace(1)* %out
307  ret void
308}
309
; Test: workgroup-scope *unordered* atomic store of the i32 kernel argument
; %in to global memory (addrspace(1)) at %out.
; Every run configuration expects a plain store instruction. The only
; s_waitcnt expected is the lgkmcnt(0) that follows the scalar argument loads.
310define amdgpu_kernel void @global_workgroup_unordered_store(
311; GFX6-LABEL: global_workgroup_unordered_store:
312; GFX6:       ; %bb.0: ; %entry
313; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
314; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
315; GFX6-NEXT:    s_mov_b32 s3, 0xf000
316; GFX6-NEXT:    s_mov_b32 s2, -1
317; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX6-NEXT:    v_mov_b32_e32 v0, s4
319; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
320; GFX6-NEXT:    s_endpgm
321;
322; GFX7-LABEL: global_workgroup_unordered_store:
323; GFX7:       ; %bb.0: ; %entry
324; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
325; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
326; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX7-NEXT:    v_mov_b32_e32 v2, s2
328; GFX7-NEXT:    v_mov_b32_e32 v0, s0
329; GFX7-NEXT:    v_mov_b32_e32 v1, s1
330; GFX7-NEXT:    flat_store_dword v[0:1], v2
331; GFX7-NEXT:    s_endpgm
332;
333; GFX10-WGP-LABEL: global_workgroup_unordered_store:
334; GFX10-WGP:       ; %bb.0: ; %entry
335; GFX10-WGP-NEXT:    s_clause 0x1
336; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
337; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
338; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
339; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
341; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
342; GFX10-WGP-NEXT:    s_endpgm
343;
344; GFX10-CU-LABEL: global_workgroup_unordered_store:
345; GFX10-CU:       ; %bb.0: ; %entry
346; GFX10-CU-NEXT:    s_clause 0x1
347; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
348; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
349; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
350; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
352; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
353; GFX10-CU-NEXT:    s_endpgm
354;
355; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_store:
356; SKIP-CACHE-INV:       ; %bb.0: ; %entry
357; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
358; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
359; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
360; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
361; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
362; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
363; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
364; SKIP-CACHE-INV-NEXT:    s_endpgm
365    i32 %in, i32 addrspace(1)* %out) {
366entry:
367  store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4
368  ret void
369}
370
; Test: workgroup-scope *monotonic* atomic store of the i32 kernel argument
; %in to global memory (addrspace(1)) at %out.
; Every run configuration expects a plain store instruction. The only
; s_waitcnt expected is the lgkmcnt(0) that follows the scalar argument loads.
371define amdgpu_kernel void @global_workgroup_monotonic_store(
372; GFX6-LABEL: global_workgroup_monotonic_store:
373; GFX6:       ; %bb.0: ; %entry
374; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
375; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
376; GFX6-NEXT:    s_mov_b32 s3, 0xf000
377; GFX6-NEXT:    s_mov_b32 s2, -1
378; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX6-NEXT:    v_mov_b32_e32 v0, s4
380; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
381; GFX6-NEXT:    s_endpgm
382;
383; GFX7-LABEL: global_workgroup_monotonic_store:
384; GFX7:       ; %bb.0: ; %entry
385; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
386; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
387; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX7-NEXT:    v_mov_b32_e32 v2, s2
389; GFX7-NEXT:    v_mov_b32_e32 v0, s0
390; GFX7-NEXT:    v_mov_b32_e32 v1, s1
391; GFX7-NEXT:    flat_store_dword v[0:1], v2
392; GFX7-NEXT:    s_endpgm
393;
394; GFX10-WGP-LABEL: global_workgroup_monotonic_store:
395; GFX10-WGP:       ; %bb.0: ; %entry
396; GFX10-WGP-NEXT:    s_clause 0x1
397; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
398; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
399; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
400; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
401; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
402; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
403; GFX10-WGP-NEXT:    s_endpgm
404;
405; GFX10-CU-LABEL: global_workgroup_monotonic_store:
406; GFX10-CU:       ; %bb.0: ; %entry
407; GFX10-CU-NEXT:    s_clause 0x1
408; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
409; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
410; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
411; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
412; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
413; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
414; GFX10-CU-NEXT:    s_endpgm
415;
416; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_store:
417; SKIP-CACHE-INV:       ; %bb.0: ; %entry
418; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
419; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
420; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
421; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
422; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
423; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
424; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
425; SKIP-CACHE-INV-NEXT:    s_endpgm
426    i32 %in, i32 addrspace(1)* %out) {
427entry:
428  store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4
429  ret void
430}
431
; Test: workgroup-scope *release* atomic store of the i32 kernel argument
; %in to global memory (addrspace(1)) at %out.
; Expected differences between run configurations:
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV checks: an extra
;    s_waitcnt lgkmcnt(0) immediately before the store.
;  - GFX10-WGP checks: s_waitcnt vmcnt(0) lgkmcnt(0) followed by
;    s_waitcnt_vscnt null, 0x0 immediately before the store.
432define amdgpu_kernel void @global_workgroup_release_store(
433; GFX6-LABEL: global_workgroup_release_store:
434; GFX6:       ; %bb.0: ; %entry
435; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
436; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
437; GFX6-NEXT:    s_mov_b32 s3, 0xf000
438; GFX6-NEXT:    s_mov_b32 s2, -1
439; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
440; GFX6-NEXT:    v_mov_b32_e32 v0, s4
441; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
442; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
443; GFX6-NEXT:    s_endpgm
444;
445; GFX7-LABEL: global_workgroup_release_store:
446; GFX7:       ; %bb.0: ; %entry
447; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
448; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
449; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX7-NEXT:    v_mov_b32_e32 v2, s2
451; GFX7-NEXT:    v_mov_b32_e32 v0, s0
452; GFX7-NEXT:    v_mov_b32_e32 v1, s1
453; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
454; GFX7-NEXT:    flat_store_dword v[0:1], v2
455; GFX7-NEXT:    s_endpgm
456;
457; GFX10-WGP-LABEL: global_workgroup_release_store:
458; GFX10-WGP:       ; %bb.0: ; %entry
459; GFX10-WGP-NEXT:    s_clause 0x1
460; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
461; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
462; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
463; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
465; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
466; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
467; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
468; GFX10-WGP-NEXT:    s_endpgm
469;
470; GFX10-CU-LABEL: global_workgroup_release_store:
471; GFX10-CU:       ; %bb.0: ; %entry
472; GFX10-CU-NEXT:    s_clause 0x1
473; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
474; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
475; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
476; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
478; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
480; GFX10-CU-NEXT:    s_endpgm
481;
482; SKIP-CACHE-INV-LABEL: global_workgroup_release_store:
483; SKIP-CACHE-INV:       ; %bb.0: ; %entry
484; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
485; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
486; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
487; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
488; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
489; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
490; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
491; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
492; SKIP-CACHE-INV-NEXT:    s_endpgm
493    i32 %in, i32 addrspace(1)* %out) {
494entry:
495  store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
496  ret void
497}
498
; Test: workgroup-scope *seq_cst* atomic store of the i32 kernel argument
; %in to global memory (addrspace(1)) at %out.
; Expected differences between run configurations:
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV checks: an extra
;    s_waitcnt lgkmcnt(0) immediately before the store.
;  - GFX10-WGP checks: s_waitcnt vmcnt(0) lgkmcnt(0) followed by
;    s_waitcnt_vscnt null, 0x0 immediately before the store.
499define amdgpu_kernel void @global_workgroup_seq_cst_store(
500; GFX6-LABEL: global_workgroup_seq_cst_store:
501; GFX6:       ; %bb.0: ; %entry
502; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
503; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
504; GFX6-NEXT:    s_mov_b32 s3, 0xf000
505; GFX6-NEXT:    s_mov_b32 s2, -1
506; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX6-NEXT:    v_mov_b32_e32 v0, s4
508; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
509; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
510; GFX6-NEXT:    s_endpgm
511;
512; GFX7-LABEL: global_workgroup_seq_cst_store:
513; GFX7:       ; %bb.0: ; %entry
514; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
515; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
516; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
517; GFX7-NEXT:    v_mov_b32_e32 v2, s2
518; GFX7-NEXT:    v_mov_b32_e32 v0, s0
519; GFX7-NEXT:    v_mov_b32_e32 v1, s1
520; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX7-NEXT:    flat_store_dword v[0:1], v2
522; GFX7-NEXT:    s_endpgm
523;
524; GFX10-WGP-LABEL: global_workgroup_seq_cst_store:
525; GFX10-WGP:       ; %bb.0: ; %entry
526; GFX10-WGP-NEXT:    s_clause 0x1
527; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
528; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
529; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
530; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
531; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
532; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
533; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
534; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
535; GFX10-WGP-NEXT:    s_endpgm
536;
537; GFX10-CU-LABEL: global_workgroup_seq_cst_store:
538; GFX10-CU:       ; %bb.0: ; %entry
539; GFX10-CU-NEXT:    s_clause 0x1
540; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
541; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
542; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
543; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
544; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
545; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
546; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
547; GFX10-CU-NEXT:    s_endpgm
548;
549; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_store:
550; SKIP-CACHE-INV:       ; %bb.0: ; %entry
551; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
552; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
553; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
554; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
555; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
556; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
557; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
558; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
559; SKIP-CACHE-INV-NEXT:    s_endpgm
560    i32 %in, i32 addrspace(1)* %out) {
561entry:
562  store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4
563  ret void
564}
565
; Test: workgroup-scope *monotonic* volatile atomicrmw xchg of %in into global
; memory (addrspace(1)) at %out; the returned value %val is unused.
; Every run configuration expects an atomic swap instruction followed
; directly by s_endpgm. The only s_waitcnt expected is the lgkmcnt(0) that
; follows the scalar argument loads.
566define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
567; GFX6-LABEL: global_workgroup_monotonic_atomicrmw:
568; GFX6:       ; %bb.0: ; %entry
569; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
570; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
571; GFX6-NEXT:    s_mov_b32 s7, 0xf000
572; GFX6-NEXT:    s_mov_b32 s6, -1
573; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
574; GFX6-NEXT:    v_mov_b32_e32 v0, s0
575; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
576; GFX6-NEXT:    s_endpgm
577;
578; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
579; GFX7:       ; %bb.0: ; %entry
580; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
581; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
582; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX7-NEXT:    v_mov_b32_e32 v0, s0
584; GFX7-NEXT:    v_mov_b32_e32 v1, s1
585; GFX7-NEXT:    v_mov_b32_e32 v2, s2
586; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
587; GFX7-NEXT:    s_endpgm
588;
589; GFX10-WGP-LABEL: global_workgroup_monotonic_atomicrmw:
590; GFX10-WGP:       ; %bb.0: ; %entry
591; GFX10-WGP-NEXT:    s_clause 0x1
592; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
593; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
594; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
595; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
596; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
597; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
598; GFX10-WGP-NEXT:    s_endpgm
599;
600; GFX10-CU-LABEL: global_workgroup_monotonic_atomicrmw:
601; GFX10-CU:       ; %bb.0: ; %entry
602; GFX10-CU-NEXT:    s_clause 0x1
603; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
604; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
605; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
606; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
608; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
609; GFX10-CU-NEXT:    s_endpgm
610;
611; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_atomicrmw:
612; SKIP-CACHE-INV:       ; %bb.0: ; %entry
613; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
614; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
615; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
616; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
617; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
618; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
619; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
620; SKIP-CACHE-INV-NEXT:    s_endpgm
621    i32 addrspace(1)* %out, i32 %in) {
622entry:
623  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic
624  ret void
625}
626
; Test: workgroup-scope *acquire* volatile atomicrmw xchg of %in into global
; memory (addrspace(1)) at %out; the returned value %val is unused.
; Expected differences between run configurations:
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV checks: an s_waitcnt lgkmcnt(0)
;    immediately after the swap.
;  - GFX10-WGP checks: s_waitcnt lgkmcnt(0), s_waitcnt_vscnt null, 0x0 and
;    buffer_gl0_inv after the swap.
627define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
628; GFX6-LABEL: global_workgroup_acquire_atomicrmw:
629; GFX6:       ; %bb.0: ; %entry
630; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
631; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
632; GFX6-NEXT:    s_mov_b32 s7, 0xf000
633; GFX6-NEXT:    s_mov_b32 s6, -1
634; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX6-NEXT:    v_mov_b32_e32 v0, s0
636; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
637; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX6-NEXT:    s_endpgm
639;
640; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
641; GFX7:       ; %bb.0: ; %entry
642; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
643; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
644; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
645; GFX7-NEXT:    v_mov_b32_e32 v0, s0
646; GFX7-NEXT:    v_mov_b32_e32 v1, s1
647; GFX7-NEXT:    v_mov_b32_e32 v2, s2
648; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
649; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX7-NEXT:    s_endpgm
651;
652; GFX10-WGP-LABEL: global_workgroup_acquire_atomicrmw:
653; GFX10-WGP:       ; %bb.0: ; %entry
654; GFX10-WGP-NEXT:    s_clause 0x1
655; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
656; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
657; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
658; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
660; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
661; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
662; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
663; GFX10-WGP-NEXT:    buffer_gl0_inv
664; GFX10-WGP-NEXT:    s_endpgm
665;
666; GFX10-CU-LABEL: global_workgroup_acquire_atomicrmw:
667; GFX10-CU:       ; %bb.0: ; %entry
668; GFX10-CU-NEXT:    s_clause 0x1
669; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
670; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
671; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
672; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
673; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
674; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
675; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
676; GFX10-CU-NEXT:    s_endpgm
677;
678; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_atomicrmw:
679; SKIP-CACHE-INV:       ; %bb.0: ; %entry
680; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
681; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
682; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
683; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
684; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
685; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
686; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
687; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
688; SKIP-CACHE-INV-NEXT:    s_endpgm
689    i32 addrspace(1)* %out, i32 %in) {
690entry:
691  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire
692  ret void
693}
694
; Test: workgroup-scope *release* volatile atomicrmw xchg of %in into global
; memory (addrspace(1)) at %out; the returned value %val is unused.
; Expected differences between run configurations:
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV checks: an extra
;    s_waitcnt lgkmcnt(0) immediately before the swap, nothing after it.
;  - GFX10-WGP checks: s_waitcnt vmcnt(0) lgkmcnt(0) followed by
;    s_waitcnt_vscnt null, 0x0 immediately before the swap, nothing after it.
695define amdgpu_kernel void @global_workgroup_release_atomicrmw(
696; GFX6-LABEL: global_workgroup_release_atomicrmw:
697; GFX6:       ; %bb.0: ; %entry
698; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
699; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
700; GFX6-NEXT:    s_mov_b32 s7, 0xf000
701; GFX6-NEXT:    s_mov_b32 s6, -1
702; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
703; GFX6-NEXT:    v_mov_b32_e32 v0, s0
704; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
706; GFX6-NEXT:    s_endpgm
707;
708; GFX7-LABEL: global_workgroup_release_atomicrmw:
709; GFX7:       ; %bb.0: ; %entry
710; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
711; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
712; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
713; GFX7-NEXT:    v_mov_b32_e32 v0, s0
714; GFX7-NEXT:    v_mov_b32_e32 v1, s1
715; GFX7-NEXT:    v_mov_b32_e32 v2, s2
716; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
718; GFX7-NEXT:    s_endpgm
719;
720; GFX10-WGP-LABEL: global_workgroup_release_atomicrmw:
721; GFX10-WGP:       ; %bb.0: ; %entry
722; GFX10-WGP-NEXT:    s_clause 0x1
723; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
724; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
725; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
726; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
728; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
729; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
730; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
731; GFX10-WGP-NEXT:    s_endpgm
732;
733; GFX10-CU-LABEL: global_workgroup_release_atomicrmw:
734; GFX10-CU:       ; %bb.0: ; %entry
735; GFX10-CU-NEXT:    s_clause 0x1
736; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
737; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
738; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
739; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
740; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
741; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
742; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
743; GFX10-CU-NEXT:    s_endpgm
744;
745; SKIP-CACHE-INV-LABEL: global_workgroup_release_atomicrmw:
746; SKIP-CACHE-INV:       ; %bb.0: ; %entry
747; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
748; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
749; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
750; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
751; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
752; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
753; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
754; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
755; SKIP-CACHE-INV-NEXT:    s_endpgm
756    i32 addrspace(1)* %out, i32 %in) {
757entry:
758  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release
759  ret void
760}
761
; Volatile atomicrmw xchg on a global (addrspace(1)) i32 at "workgroup" sync
; scope with acq_rel ordering; the returned value is unused.
; The expected code waits on lgkmcnt both before and after the swap on every
; target. Only the gfx1010 run without +cumode additionally expects a vmcnt
; wait and s_waitcnt_vscnt before the swap, and s_waitcnt_vscnt plus
; buffer_gl0_inv after it.
762define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
763; GFX6-LABEL: global_workgroup_acq_rel_atomicrmw:
764; GFX6:       ; %bb.0: ; %entry
765; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
766; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
767; GFX6-NEXT:    s_mov_b32 s7, 0xf000
768; GFX6-NEXT:    s_mov_b32 s6, -1
769; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX6-NEXT:    v_mov_b32_e32 v0, s0
771; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
772; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
773; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX6-NEXT:    s_endpgm
775;
776; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
777; GFX7:       ; %bb.0: ; %entry
778; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
779; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
780; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
781; GFX7-NEXT:    v_mov_b32_e32 v0, s0
782; GFX7-NEXT:    v_mov_b32_e32 v1, s1
783; GFX7-NEXT:    v_mov_b32_e32 v2, s2
784; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
785; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
786; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX7-NEXT:    s_endpgm
788;
789; GFX10-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
790; GFX10-WGP:       ; %bb.0: ; %entry
791; GFX10-WGP-NEXT:    s_clause 0x1
792; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
793; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
794; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
795; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
796; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
797; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
798; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
799; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
800; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
801; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
802; GFX10-WGP-NEXT:    buffer_gl0_inv
803; GFX10-WGP-NEXT:    s_endpgm
804;
805; GFX10-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
806; GFX10-CU:       ; %bb.0: ; %entry
807; GFX10-CU-NEXT:    s_clause 0x1
808; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
809; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
810; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
811; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
812; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
813; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
815; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
816; GFX10-CU-NEXT:    s_endpgm
817;
818; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw:
819; SKIP-CACHE-INV:       ; %bb.0: ; %entry
820; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
821; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
822; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
823; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
824; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
825; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
826; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
827; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
828; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
829; SKIP-CACHE-INV-NEXT:    s_endpgm
830    i32 addrspace(1)* %out, i32 %in) {
831entry:
832  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel
833  ret void
834}
835
; Volatile atomicrmw xchg on a global (addrspace(1)) i32 at "workgroup" sync
; scope with seq_cst ordering; the returned value is unused.
; The expected code waits on lgkmcnt both before and after the swap on every
; target. Only the gfx1010 run without +cumode additionally expects a vmcnt
; wait and s_waitcnt_vscnt before the swap, and s_waitcnt_vscnt plus
; buffer_gl0_inv after it.
836define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
837; GFX6-LABEL: global_workgroup_seq_cst_atomicrmw:
838; GFX6:       ; %bb.0: ; %entry
839; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
840; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
841; GFX6-NEXT:    s_mov_b32 s7, 0xf000
842; GFX6-NEXT:    s_mov_b32 s6, -1
843; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX6-NEXT:    v_mov_b32_e32 v0, s0
845; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
847; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
848; GFX6-NEXT:    s_endpgm
849;
850; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
851; GFX7:       ; %bb.0: ; %entry
852; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
853; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
854; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX7-NEXT:    v_mov_b32_e32 v0, s0
856; GFX7-NEXT:    v_mov_b32_e32 v1, s1
857; GFX7-NEXT:    v_mov_b32_e32 v2, s2
858; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
859; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
860; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
861; GFX7-NEXT:    s_endpgm
862;
863; GFX10-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
864; GFX10-WGP:       ; %bb.0: ; %entry
865; GFX10-WGP-NEXT:    s_clause 0x1
866; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
867; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
868; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
869; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
870; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
871; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
872; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
873; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
874; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
875; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
876; GFX10-WGP-NEXT:    buffer_gl0_inv
877; GFX10-WGP-NEXT:    s_endpgm
878;
879; GFX10-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
880; GFX10-CU:       ; %bb.0: ; %entry
881; GFX10-CU-NEXT:    s_clause 0x1
882; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
883; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
884; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
885; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
887; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
888; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
889; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
890; GFX10-CU-NEXT:    s_endpgm
891;
892; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw:
893; SKIP-CACHE-INV:       ; %bb.0: ; %entry
894; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
895; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
896; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
897; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
898; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
899; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
900; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
901; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
902; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
903; SKIP-CACHE-INV-NEXT:    s_endpgm
904    i32 addrspace(1)* %out, i32 %in) {
905entry:
906  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
907  ret void
908}
909
; Returning variant: volatile atomicrmw xchg on a global (addrspace(1)) i32 at
; "workgroup" sync scope with acquire ordering, whose result is stored back to
; %out.
; The expected code issues the swap with glc and waits on vmcnt and lgkmcnt
; before the store; no extra wait is expected in front of the swap. Only the
; gfx1010 run without +cumode expects buffer_gl0_inv between that wait and the
; store.
910define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
911; GFX6-LABEL: global_workgroup_acquire_ret_atomicrmw:
912; GFX6:       ; %bb.0: ; %entry
913; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
914; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
915; GFX6-NEXT:    s_mov_b32 s7, 0xf000
916; GFX6-NEXT:    s_mov_b32 s6, -1
917; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
918; GFX6-NEXT:    v_mov_b32_e32 v0, s0
919; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
920; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
921; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
922; GFX6-NEXT:    s_endpgm
923;
924; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
925; GFX7:       ; %bb.0: ; %entry
926; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
927; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
928; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
929; GFX7-NEXT:    v_mov_b32_e32 v0, s0
930; GFX7-NEXT:    v_mov_b32_e32 v1, s1
931; GFX7-NEXT:    v_mov_b32_e32 v2, s2
932; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
933; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
934; GFX7-NEXT:    flat_store_dword v[0:1], v2
935; GFX7-NEXT:    s_endpgm
936;
937; GFX10-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw:
938; GFX10-WGP:       ; %bb.0: ; %entry
939; GFX10-WGP-NEXT:    s_clause 0x1
940; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
941; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
942; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
943; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
944; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
945; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
946; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
947; GFX10-WGP-NEXT:    buffer_gl0_inv
948; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
949; GFX10-WGP-NEXT:    s_endpgm
950;
951; GFX10-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
952; GFX10-CU:       ; %bb.0: ; %entry
953; GFX10-CU-NEXT:    s_clause 0x1
954; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
955; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
956; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
957; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
958; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
959; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
960; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
961; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
962; GFX10-CU-NEXT:    s_endpgm
963;
964; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_ret_atomicrmw:
965; SKIP-CACHE-INV:       ; %bb.0: ; %entry
966; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
967; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
968; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
969; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
970; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
971; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
972; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
973; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
974; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
975; SKIP-CACHE-INV-NEXT:    s_endpgm
976    i32 addrspace(1)* %out, i32 %in) {
977entry:
978  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire
979  store i32 %val, i32 addrspace(1)* %out, align 4
980  ret void
981}
982
; Returning variant: volatile atomicrmw xchg on a global (addrspace(1)) i32 at
; "workgroup" sync scope with acq_rel ordering, whose result is stored back to
; %out.
; The expected code waits on lgkmcnt before the glc swap and on vmcnt and
; lgkmcnt after it, ahead of the store. Only the gfx1010 run without +cumode
; additionally expects a vmcnt wait and s_waitcnt_vscnt before the swap and
; buffer_gl0_inv before the store.
983define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
984; GFX6-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
985; GFX6:       ; %bb.0: ; %entry
986; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
987; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
988; GFX6-NEXT:    s_mov_b32 s7, 0xf000
989; GFX6-NEXT:    s_mov_b32 s6, -1
990; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX6-NEXT:    v_mov_b32_e32 v0, s0
992; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
994; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
995; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
996; GFX6-NEXT:    s_endpgm
997;
998; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
999; GFX7:       ; %bb.0: ; %entry
1000; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1001; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1002; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1003; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1004; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1005; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1006; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1008; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1009; GFX7-NEXT:    flat_store_dword v[0:1], v2
1010; GFX7-NEXT:    s_endpgm
1011;
1012; GFX10-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
1013; GFX10-WGP:       ; %bb.0: ; %entry
1014; GFX10-WGP-NEXT:    s_clause 0x1
1015; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1016; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1017; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1018; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1019; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1020; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1021; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1022; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1023; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1024; GFX10-WGP-NEXT:    buffer_gl0_inv
1025; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1026; GFX10-WGP-NEXT:    s_endpgm
1027;
1028; GFX10-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
1029; GFX10-CU:       ; %bb.0: ; %entry
1030; GFX10-CU-NEXT:    s_clause 0x1
1031; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1032; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1033; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1034; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1036; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1037; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1038; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1039; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1040; GFX10-CU-NEXT:    s_endpgm
1041;
1042; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
1043; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1044; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1045; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1046; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1047; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1048; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1049; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1050; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1051; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1052; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1053; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1054; SKIP-CACHE-INV-NEXT:    s_endpgm
1055    i32 addrspace(1)* %out, i32 %in) {
1056entry:
1057  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel
1058  store i32 %val, i32 addrspace(1)* %out, align 4
1059  ret void
1060}
1061
; Returning variant: volatile atomicrmw xchg on a global (addrspace(1)) i32 at
; "workgroup" sync scope with seq_cst ordering, whose result is stored back to
; %out.
; The expected code waits on lgkmcnt before the glc swap and on vmcnt and
; lgkmcnt after it, ahead of the store. Only the gfx1010 run without +cumode
; additionally expects a vmcnt wait and s_waitcnt_vscnt before the swap and
; buffer_gl0_inv before the store.
1062define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
1063; GFX6-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
1064; GFX6:       ; %bb.0: ; %entry
1065; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1066; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
1067; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1068; GFX6-NEXT:    s_mov_b32 s6, -1
1069; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1071; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1072; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1073; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1074; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1075; GFX6-NEXT:    s_endpgm
1076;
1077; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
1078; GFX7:       ; %bb.0: ; %entry
1079; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1080; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1081; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1083; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1084; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1085; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1087; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1088; GFX7-NEXT:    flat_store_dword v[0:1], v2
1089; GFX7-NEXT:    s_endpgm
1090;
1091; GFX10-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
1092; GFX10-WGP:       ; %bb.0: ; %entry
1093; GFX10-WGP-NEXT:    s_clause 0x1
1094; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1095; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1096; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1097; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1098; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1099; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1100; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1101; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1102; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1103; GFX10-WGP-NEXT:    buffer_gl0_inv
1104; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1105; GFX10-WGP-NEXT:    s_endpgm
1106;
1107; GFX10-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
1108; GFX10-CU:       ; %bb.0: ; %entry
1109; GFX10-CU-NEXT:    s_clause 0x1
1110; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1111; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1112; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1113; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1115; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1117; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1118; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1119; GFX10-CU-NEXT:    s_endpgm
1120;
1121; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
1122; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1123; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1124; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1125; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1126; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1127; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1128; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1129; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1130; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1131; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1132; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1133; SKIP-CACHE-INV-NEXT:    s_endpgm
1134    i32 addrspace(1)* %out, i32 %in) {
1135entry:
1136  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
1137  store i32 %val, i32 addrspace(1)* %out, align 4
1138  ret void
1139}
1140
; Volatile cmpxchg on a global (addrspace(1)) i32 at "workgroup" sync scope
; with monotonic success and monotonic failure ordering; the result is unused.
; The expected code has no wait or cache invalidate around the cmpswap beyond
; the wait for the kernel-argument loads. The buffer and global forms carry the
; address offset as offset:16, while the flat form on the amdhsa gfx700 run
; adds 16 to the pointer with s_add_u32/s_addc_u32.
1141define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
1142; GFX6-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
1143; GFX6:       ; %bb.0: ; %entry
1144; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1145; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1146; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1147; GFX6-NEXT:    s_mov_b32 s6, -1
1148; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1150; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1151; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1152; GFX6-NEXT:    s_endpgm
1153;
1154; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
1155; GFX7:       ; %bb.0: ; %entry
1156; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1157; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1158; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1159; GFX7-NEXT:    s_add_u32 s0, s0, 16
1160; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1161; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1162; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1163; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1164; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1165; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1166; GFX7-NEXT:    s_endpgm
1167;
1168; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
1169; GFX10-WGP:       ; %bb.0: ; %entry
1170; GFX10-WGP-NEXT:    s_clause 0x1
1171; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1172; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1173; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1174; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1175; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1176; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1177; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1178; GFX10-WGP-NEXT:    s_endpgm
1179;
1180; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
1181; GFX10-CU:       ; %bb.0: ; %entry
1182; GFX10-CU-NEXT:    s_clause 0x1
1183; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1184; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1185; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1186; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1188; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1189; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1190; GFX10-CU-NEXT:    s_endpgm
1191;
1192; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
1193; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1194; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1195; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1196; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1197; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1198; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1199; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1200; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1201; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1202; SKIP-CACHE-INV-NEXT:    s_endpgm
1203    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1204entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
1205  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1206  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
1207  ret void
1208}
1209
; Volatile cmpxchg on a global (addrspace(1)) i32 at "workgroup" sync scope
; with acquire success and monotonic failure ordering; the result is unused.
; The expected code waits on lgkmcnt after the cmpswap and adds no wait in
; front of it. Only the gfx1010 run without +cumode additionally expects
; s_waitcnt_vscnt and buffer_gl0_inv after the cmpswap.
1210define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
1211; GFX6-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
1212; GFX6:       ; %bb.0: ; %entry
1213; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1214; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1215; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1216; GFX6-NEXT:    s_mov_b32 s6, -1
1217; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1218; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1219; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1220; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1221; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX6-NEXT:    s_endpgm
1223;
1224; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
1225; GFX7:       ; %bb.0: ; %entry
1226; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1227; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1228; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX7-NEXT:    s_add_u32 s0, s0, 16
1230; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1231; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1232; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1233; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1234; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1235; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1236; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1237; GFX7-NEXT:    s_endpgm
1238;
1239; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
1240; GFX10-WGP:       ; %bb.0: ; %entry
1241; GFX10-WGP-NEXT:    s_clause 0x1
1242; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1243; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1244; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1245; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1246; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1247; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1248; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1249; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1250; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1251; GFX10-WGP-NEXT:    buffer_gl0_inv
1252; GFX10-WGP-NEXT:    s_endpgm
1253;
1254; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
1255; GFX10-CU:       ; %bb.0: ; %entry
1256; GFX10-CU-NEXT:    s_clause 0x1
1257; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1258; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1259; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1260; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1262; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1263; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1264; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1265; GFX10-CU-NEXT:    s_endpgm
1266;
1267; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
1268; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1269; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1270; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1271; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1272; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1273; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1274; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1275; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1276; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1277; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1278; SKIP-CACHE-INV-NEXT:    s_endpgm
1279    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1280entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
1281  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1282  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
1283  ret void
1284}
1285
; Volatile cmpxchg on a global (addrspace(1)) i32 at "workgroup" sync scope
; with release success and monotonic failure ordering; the result is unused.
; The expected code waits on lgkmcnt immediately before the cmpswap and adds
; nothing after it. Only the gfx1010 run without +cumode additionally expects a
; vmcnt wait and s_waitcnt_vscnt before the cmpswap.
1286define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
1287; GFX6-LABEL: global_workgroup_release_monotonic_cmpxchg:
1288; GFX6:       ; %bb.0: ; %entry
1289; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1290; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1291; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1292; GFX6-NEXT:    s_mov_b32 s6, -1
1293; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1295; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1296; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1297; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1298; GFX6-NEXT:    s_endpgm
1299;
1300; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
1301; GFX7:       ; %bb.0: ; %entry
1302; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1303; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1304; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1305; GFX7-NEXT:    s_add_u32 s0, s0, 16
1306; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1307; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1308; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1309; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1310; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1311; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1312; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1313; GFX7-NEXT:    s_endpgm
1314;
1315; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
1316; GFX10-WGP:       ; %bb.0: ; %entry
1317; GFX10-WGP-NEXT:    s_clause 0x1
1318; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1319; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1320; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1321; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1323; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1324; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1325; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1326; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1327; GFX10-WGP-NEXT:    s_endpgm
1328;
1329; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
1330; GFX10-CU:       ; %bb.0: ; %entry
1331; GFX10-CU-NEXT:    s_clause 0x1
1332; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1333; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1334; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1335; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1336; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1337; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1338; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1339; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1340; GFX10-CU-NEXT:    s_endpgm
1341;
1342; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg:
1343; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1344; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1345; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1346; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1347; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1348; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1349; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1350; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1351; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1352; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1353; SKIP-CACHE-INV-NEXT:    s_endpgm
1354    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1355entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
1356  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1357  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
1358  ret void
1359}
1360
; Volatile cmpxchg on a global (addrspace(1)) i32 at "workgroup" sync scope
; with acq_rel success and monotonic failure ordering; the result is unused.
; The expected code waits on lgkmcnt both before and after the cmpswap on
; every target. Only the gfx1010 run without +cumode additionally expects a
; vmcnt wait and s_waitcnt_vscnt before the cmpswap, and s_waitcnt_vscnt plus
; buffer_gl0_inv after it.
1361define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
1362; GFX6-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
1363; GFX6:       ; %bb.0: ; %entry
1364; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1365; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1366; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1367; GFX6-NEXT:    s_mov_b32 s6, -1
1368; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1369; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1370; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1371; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1372; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1373; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1374; GFX6-NEXT:    s_endpgm
1375;
1376; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
1377; GFX7:       ; %bb.0: ; %entry
1378; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1379; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1380; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1381; GFX7-NEXT:    s_add_u32 s0, s0, 16
1382; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1383; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1384; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1385; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1386; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1387; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1388; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1389; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1390; GFX7-NEXT:    s_endpgm
1391;
1392; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
1393; GFX10-WGP:       ; %bb.0: ; %entry
1394; GFX10-WGP-NEXT:    s_clause 0x1
1395; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1396; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1397; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1398; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1399; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1400; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1401; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1402; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1403; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1404; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1405; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1406; GFX10-WGP-NEXT:    buffer_gl0_inv
1407; GFX10-WGP-NEXT:    s_endpgm
1408;
1409; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
1410; GFX10-CU:       ; %bb.0: ; %entry
1411; GFX10-CU-NEXT:    s_clause 0x1
1412; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1413; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1414; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1415; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1417; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1418; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1419; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1420; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1421; GFX10-CU-NEXT:    s_endpgm
1422;
1423; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
1424; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1425; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1426; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1427; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1428; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1429; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1430; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1431; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1432; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1433; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1434; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1435; SKIP-CACHE-INV-NEXT:    s_endpgm
1436    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1437entry:
  ; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
1438  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1439  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
1440  ret void
1441}
1442
; Test kernel that performs a volatile cmpxchg through an addrspace(1) pointer
; at syncscope("workgroup") with seq_cst success and monotonic failure ordering.
; The cmpxchg result is not used. The check lines pin the expected llc output
; for each of the five FileCheck prefixes; every expected sequence has an
; s_waitcnt immediately before the atomic and another after it, and only the
; GFX10-WGP sequence ends with buffer_gl0_inv. The check lines are autogenerated
; (see the first line of this file), so regenerate them instead of hand-editing.
1443define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
1444; GFX6-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
1445; GFX6:       ; %bb.0: ; %entry
1446; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1447; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1448; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1449; GFX6-NEXT:    s_mov_b32 s6, -1
1450; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1452; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1453; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1454; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1455; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1456; GFX6-NEXT:    s_endpgm
1457;
1458; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
1459; GFX7:       ; %bb.0: ; %entry
1460; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1461; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1462; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1463; GFX7-NEXT:    s_add_u32 s0, s0, 16
1464; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1465; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1466; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1467; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1468; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1469; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1470; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1471; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1472; GFX7-NEXT:    s_endpgm
1473;
1474; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
1475; GFX10-WGP:       ; %bb.0: ; %entry
1476; GFX10-WGP-NEXT:    s_clause 0x1
1477; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1478; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1479; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1480; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1481; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1482; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1483; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1484; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1485; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1486; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1487; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1488; GFX10-WGP-NEXT:    buffer_gl0_inv
1489; GFX10-WGP-NEXT:    s_endpgm
1490;
1491; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
1492; GFX10-CU:       ; %bb.0: ; %entry
1493; GFX10-CU-NEXT:    s_clause 0x1
1494; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1495; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1496; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1497; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1499; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1500; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1501; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1502; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX10-CU-NEXT:    s_endpgm
1504;
1505; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
1506; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1507; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1508; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1509; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1510; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1511; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1512; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1513; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1514; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1515; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1516; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1517; SKIP-CACHE-INV-NEXT:    s_endpgm
1518    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1519entry:
  ; Index 4 of i32 elements, i.e. the 16-byte offset seen in the check lines.
1520  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1521  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
1522  ret void
1523}
1524
; Test kernel that performs a volatile cmpxchg through an addrspace(1) pointer
; at syncscope("workgroup") with acquire ordering for both success and failure.
; The cmpxchg result is not used. In the expected output for every prefix the
; atomic follows the register moves directly (no s_waitcnt immediately before
; it) and is followed by an s_waitcnt; only the GFX10-WGP sequence also emits
; s_waitcnt_vscnt and buffer_gl0_inv after the atomic. The check lines are
; autogenerated, so regenerate them instead of hand-editing.
1525define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
1526; GFX6-LABEL: global_workgroup_acquire_acquire_cmpxchg:
1527; GFX6:       ; %bb.0: ; %entry
1528; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1529; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1530; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1531; GFX6-NEXT:    s_mov_b32 s6, -1
1532; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1533; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1534; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1535; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1536; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1537; GFX6-NEXT:    s_endpgm
1538;
1539; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
1540; GFX7:       ; %bb.0: ; %entry
1541; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1542; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1543; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1544; GFX7-NEXT:    s_add_u32 s0, s0, 16
1545; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1546; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1547; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1548; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1549; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1550; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1551; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1552; GFX7-NEXT:    s_endpgm
1553;
1554; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
1555; GFX10-WGP:       ; %bb.0: ; %entry
1556; GFX10-WGP-NEXT:    s_clause 0x1
1557; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1558; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1559; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1560; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1561; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1562; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1563; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1564; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1565; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1566; GFX10-WGP-NEXT:    buffer_gl0_inv
1567; GFX10-WGP-NEXT:    s_endpgm
1568;
1569; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
1570; GFX10-CU:       ; %bb.0: ; %entry
1571; GFX10-CU-NEXT:    s_clause 0x1
1572; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1573; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1574; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1575; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1576; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1577; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1578; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1579; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1580; GFX10-CU-NEXT:    s_endpgm
1581;
1582; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_cmpxchg:
1583; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1584; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1585; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1586; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1587; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1588; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1589; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1590; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1591; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1592; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1593; SKIP-CACHE-INV-NEXT:    s_endpgm
1594    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1595entry:
  ; Index 4 of i32 elements, i.e. the 16-byte offset seen in the check lines.
1596  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1597  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
1598  ret void
1599}
1600
; Test kernel that performs a volatile cmpxchg through an addrspace(1) pointer
; at syncscope("workgroup") with release success and acquire failure ordering.
; The cmpxchg result is not used. Every expected sequence has an s_waitcnt
; immediately before the atomic and another after it; only the GFX10-WGP
; sequence also emits s_waitcnt_vscnt on both sides and buffer_gl0_inv at the
; end. The check lines are autogenerated, so regenerate them instead of
; hand-editing.
1601define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
1602; GFX6-LABEL: global_workgroup_release_acquire_cmpxchg:
1603; GFX6:       ; %bb.0: ; %entry
1604; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1605; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1606; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1607; GFX6-NEXT:    s_mov_b32 s6, -1
1608; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1609; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1610; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1611; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1612; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1613; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1614; GFX6-NEXT:    s_endpgm
1615;
1616; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
1617; GFX7:       ; %bb.0: ; %entry
1618; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1619; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1620; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1621; GFX7-NEXT:    s_add_u32 s0, s0, 16
1622; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1623; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1624; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1625; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1626; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1627; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1628; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1629; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1630; GFX7-NEXT:    s_endpgm
1631;
1632; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
1633; GFX10-WGP:       ; %bb.0: ; %entry
1634; GFX10-WGP-NEXT:    s_clause 0x1
1635; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1636; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1637; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1638; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1639; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1640; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1641; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1642; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1643; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1644; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1645; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1646; GFX10-WGP-NEXT:    buffer_gl0_inv
1647; GFX10-WGP-NEXT:    s_endpgm
1648;
1649; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
1650; GFX10-CU:       ; %bb.0: ; %entry
1651; GFX10-CU-NEXT:    s_clause 0x1
1652; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1653; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1654; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1655; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1656; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1657; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1658; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1659; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1660; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1661; GFX10-CU-NEXT:    s_endpgm
1662;
1663; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg:
1664; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1665; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1666; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1667; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1668; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1669; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1670; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1671; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1672; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1673; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1674; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1675; SKIP-CACHE-INV-NEXT:    s_endpgm
1676    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1677entry:
  ; Index 4 of i32 elements, i.e. the 16-byte offset seen in the check lines.
1678  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1679  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
1680  ret void
1681}
1682
; Test kernel that performs a volatile cmpxchg through an addrspace(1) pointer
; at syncscope("workgroup") with acq_rel success and acquire failure ordering.
; The cmpxchg result is not used. Every expected sequence has an s_waitcnt
; immediately before the atomic and another after it; only the GFX10-WGP
; sequence also emits s_waitcnt_vscnt on both sides and buffer_gl0_inv at the
; end. The check lines are autogenerated, so regenerate them instead of
; hand-editing.
1683define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
1684; GFX6-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
1685; GFX6:       ; %bb.0: ; %entry
1686; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1687; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1688; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1689; GFX6-NEXT:    s_mov_b32 s6, -1
1690; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1691; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1692; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1693; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1694; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1695; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1696; GFX6-NEXT:    s_endpgm
1697;
1698; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
1699; GFX7:       ; %bb.0: ; %entry
1700; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1701; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1702; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1703; GFX7-NEXT:    s_add_u32 s0, s0, 16
1704; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1705; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1706; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1707; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1708; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1709; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1711; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1712; GFX7-NEXT:    s_endpgm
1713;
1714; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
1715; GFX10-WGP:       ; %bb.0: ; %entry
1716; GFX10-WGP-NEXT:    s_clause 0x1
1717; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1718; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1719; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1720; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1721; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1722; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1723; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1724; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1725; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1726; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1727; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1728; GFX10-WGP-NEXT:    buffer_gl0_inv
1729; GFX10-WGP-NEXT:    s_endpgm
1730;
1731; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
1732; GFX10-CU:       ; %bb.0: ; %entry
1733; GFX10-CU-NEXT:    s_clause 0x1
1734; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1735; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1736; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1737; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1738; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1739; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1740; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1741; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1742; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1743; GFX10-CU-NEXT:    s_endpgm
1744;
1745; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
1746; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1747; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1748; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1749; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1750; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1751; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1752; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1753; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1754; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1755; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1756; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1757; SKIP-CACHE-INV-NEXT:    s_endpgm
1758    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1759entry:
  ; Index 4 of i32 elements, i.e. the 16-byte offset seen in the check lines.
1760  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1761  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
1762  ret void
1763}
1764
; Test kernel that performs a volatile cmpxchg through an addrspace(1) pointer
; at syncscope("workgroup") with seq_cst success and acquire failure ordering.
; The cmpxchg result is not used. Every expected sequence has an s_waitcnt
; immediately before the atomic and another after it; only the GFX10-WGP
; sequence also emits s_waitcnt_vscnt on both sides and buffer_gl0_inv at the
; end. The check lines are autogenerated, so regenerate them instead of
; hand-editing.
1765define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
1766; GFX6-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
1767; GFX6:       ; %bb.0: ; %entry
1768; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1769; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1770; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1771; GFX6-NEXT:    s_mov_b32 s6, -1
1772; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1773; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1774; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1775; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1776; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1777; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1778; GFX6-NEXT:    s_endpgm
1779;
1780; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
1781; GFX7:       ; %bb.0: ; %entry
1782; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1783; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1784; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1785; GFX7-NEXT:    s_add_u32 s0, s0, 16
1786; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1787; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1788; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1789; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1790; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1791; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1792; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1793; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1794; GFX7-NEXT:    s_endpgm
1795;
1796; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
1797; GFX10-WGP:       ; %bb.0: ; %entry
1798; GFX10-WGP-NEXT:    s_clause 0x1
1799; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1800; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1801; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1802; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1803; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1804; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1805; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1806; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1807; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1808; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1809; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1810; GFX10-WGP-NEXT:    buffer_gl0_inv
1811; GFX10-WGP-NEXT:    s_endpgm
1812;
1813; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
1814; GFX10-CU:       ; %bb.0: ; %entry
1815; GFX10-CU-NEXT:    s_clause 0x1
1816; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1817; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1818; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1819; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1820; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1821; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1822; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1823; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1824; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1825; GFX10-CU-NEXT:    s_endpgm
1826;
1827; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
1828; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1829; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1830; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1831; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1832; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1833; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1834; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1835; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1836; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1837; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1838; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1839; SKIP-CACHE-INV-NEXT:    s_endpgm
1840    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1841entry:
  ; Index 4 of i32 elements, i.e. the 16-byte offset seen in the check lines.
1842  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1843  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
1844  ret void
1845}
1846
; Test kernel that performs a volatile cmpxchg through an addrspace(1) pointer
; at syncscope("workgroup") with seq_cst ordering for both success and failure.
; The cmpxchg result is not used. Every expected sequence has an s_waitcnt
; immediately before the atomic and another after it; only the GFX10-WGP
; sequence also emits s_waitcnt_vscnt on both sides and buffer_gl0_inv at the
; end. The check lines are autogenerated, so regenerate them instead of
; hand-editing.
1847define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
1848; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
1849; GFX6:       ; %bb.0: ; %entry
1850; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1851; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1852; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1853; GFX6-NEXT:    s_mov_b32 s6, -1
1854; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1855; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1856; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1857; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1858; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1859; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1860; GFX6-NEXT:    s_endpgm
1861;
1862; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
1863; GFX7:       ; %bb.0: ; %entry
1864; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1865; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1866; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1867; GFX7-NEXT:    s_add_u32 s0, s0, 16
1868; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1869; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1870; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1871; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1872; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1873; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1874; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1875; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1876; GFX7-NEXT:    s_endpgm
1877;
1878; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
1879; GFX10-WGP:       ; %bb.0: ; %entry
1880; GFX10-WGP-NEXT:    s_clause 0x1
1881; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1882; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1883; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1884; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1885; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1886; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1887; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1888; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1889; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1890; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1891; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1892; GFX10-WGP-NEXT:    buffer_gl0_inv
1893; GFX10-WGP-NEXT:    s_endpgm
1894;
1895; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
1896; GFX10-CU:       ; %bb.0: ; %entry
1897; GFX10-CU-NEXT:    s_clause 0x1
1898; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1899; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1900; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1901; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1902; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1903; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1904; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1905; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1906; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1907; GFX10-CU-NEXT:    s_endpgm
1908;
1909; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
1910; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1911; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1912; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1913; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1914; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1915; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1916; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1917; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1918; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1919; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1920; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1921; SKIP-CACHE-INV-NEXT:    s_endpgm
1922    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1923entry:
  ; Index 4 of i32 elements, i.e. the 16-byte offset seen in the check lines.
1924  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
1925  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
1926  ret void
1927}
1928
; Test kernel that performs a volatile cmpxchg through an addrspace(1) pointer
; at syncscope("workgroup") with acquire success and monotonic failure
; ordering, then stores the i32 field of the result to %out. Because the result
; is used, the expected atomic carries the glc modifier for every prefix and is
; followed by an s_waitcnt before the store; no s_waitcnt sits immediately
; before the atomic. Only the GFX10-WGP sequence emits buffer_gl0_inv between
; the atomic and the store. The check lines are autogenerated, so regenerate
; them instead of hand-editing.
1929define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
1930; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
1931; GFX6:       ; %bb.0: ; %entry
1932; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1933; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1934; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1935; GFX6-NEXT:    s_mov_b32 s6, -1
1936; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1937; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1938; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1939; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
1940; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1941; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1942; GFX6-NEXT:    s_endpgm
1943;
1944; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
1945; GFX7:       ; %bb.0: ; %entry
1946; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1947; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1948; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1949; GFX7-NEXT:    s_add_u32 s4, s0, 16
1950; GFX7-NEXT:    s_addc_u32 s5, s1, 0
1951; GFX7-NEXT:    v_mov_b32_e32 v0, s4
1952; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1953; GFX7-NEXT:    v_mov_b32_e32 v1, s5
1954; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1955; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
1956; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1958; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1959; GFX7-NEXT:    s_waitcnt vmcnt(0)
1960; GFX7-NEXT:    flat_store_dword v[0:1], v2
1961; GFX7-NEXT:    s_endpgm
1962;
1963; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
1964; GFX10-WGP:       ; %bb.0: ; %entry
1965; GFX10-WGP-NEXT:    s_clause 0x1
1966; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1967; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1968; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1969; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1970; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1971; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1972; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
1973; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1974; GFX10-WGP-NEXT:    buffer_gl0_inv
1975; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
1976; GFX10-WGP-NEXT:    s_endpgm
1977;
1978; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
1979; GFX10-CU:       ; %bb.0: ; %entry
1980; GFX10-CU-NEXT:    s_clause 0x1
1981; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1982; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1983; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1984; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1986; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1987; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
1988; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1989; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
1990; GFX10-CU-NEXT:    s_endpgm
1991;
1992; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
1993; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1994; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1995; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1996; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1997; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1998; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1999; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2000; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2001; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2002; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2003; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2004; SKIP-CACHE-INV-NEXT:    s_endpgm
2005    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2006entry:
  ; Index 4 of i32 elements, i.e. the 16-byte offset seen in the check lines.
2007  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2008  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
  ; Field 0 of the { i32, i1 } result is the i32 written back to %out itself
  ; (not to %gep), so the store in the check lines has no offset.
2009  %val0 = extractvalue { i32, i1 } %val, 0
2010  store i32 %val0, i32 addrspace(1)* %out, align 4
2011  ret void
2012}
2013
; Test kernel that performs a volatile cmpxchg through an addrspace(1) pointer
; at syncscope("workgroup") with acq_rel success and monotonic failure
; ordering, then stores the i32 field of the result to %out. Because the result
; is used, the expected atomic carries the glc modifier for every prefix. Every
; expected sequence has an s_waitcnt immediately before the atomic and another
; between the atomic and the store. Only the GFX10-WGP sequence adds
; s_waitcnt_vscnt before the atomic and buffer_gl0_inv after it. The check
; lines are autogenerated, so regenerate them instead of hand-editing.
2014define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
2015; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
2016; GFX6:       ; %bb.0: ; %entry
2017; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2018; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2019; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2020; GFX6-NEXT:    s_mov_b32 s6, -1
2021; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2022; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2023; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2024; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2025; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2026; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2027; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2028; GFX6-NEXT:    s_endpgm
2029;
2030; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
2031; GFX7:       ; %bb.0: ; %entry
2032; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2033; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2034; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2035; GFX7-NEXT:    s_add_u32 s4, s0, 16
2036; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2037; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2038; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2039; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2040; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2041; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2042; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2043; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2044; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2045; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2046; GFX7-NEXT:    s_waitcnt vmcnt(0)
2047; GFX7-NEXT:    flat_store_dword v[0:1], v2
2048; GFX7-NEXT:    s_endpgm
2049;
2050; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
2051; GFX10-WGP:       ; %bb.0: ; %entry
2052; GFX10-WGP-NEXT:    s_clause 0x1
2053; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2054; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2055; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2056; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2057; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2058; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2059; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2060; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2061; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2062; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2063; GFX10-WGP-NEXT:    buffer_gl0_inv
2064; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2065; GFX10-WGP-NEXT:    s_endpgm
2066;
2067; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
2068; GFX10-CU:       ; %bb.0: ; %entry
2069; GFX10-CU-NEXT:    s_clause 0x1
2070; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2071; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2072; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2073; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2074; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2075; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2076; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2077; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2078; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2079; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2080; GFX10-CU-NEXT:    s_endpgm
2081;
2082; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
2083; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2084; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2085; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2086; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2087; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2088; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2089; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2090; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2091; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2092; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2093; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2094; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2095; SKIP-CACHE-INV-NEXT:    s_endpgm
2096    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2097entry:
  ; Index 4 of i32 elements, i.e. the 16-byte offset seen in the check lines.
2098  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2099  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
  ; Field 0 of the { i32, i1 } result is the i32 written back to %out itself
  ; (not to %gep), so the store in the check lines has no offset.
2100  %val0 = extractvalue { i32, i1 } %val, 0
2101  store i32 %val0, i32 addrspace(1)* %out, align 4
2102  ret void
2103}
2104
; Test: workgroup-scope cmpxchg on global memory (addrspace(1)) with seq_cst
; success ordering and monotonic failure ordering, where the returned value is
; used. The exchange targets %out + 4 i32 elements (a 16-byte offset in the
; expected code); field 0 of the result pair is then stored to %out.
; The expected code has an s_waitcnt directly before the atomic and waits again
; after it; only the gfx1010 WGP-mode output additionally expects
; buffer_gl0_inv after the atomic.
2105define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
2106; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
2107; GFX6:       ; %bb.0: ; %entry
2108; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2109; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2110; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2111; GFX6-NEXT:    s_mov_b32 s6, -1
2112; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2113; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2114; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2115; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2116; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2117; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2118; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2119; GFX6-NEXT:    s_endpgm
2120;
2121; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
2122; GFX7:       ; %bb.0: ; %entry
2123; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2124; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2125; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2126; GFX7-NEXT:    s_add_u32 s4, s0, 16
2127; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2128; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2129; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2130; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2131; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2132; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2133; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2134; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2135; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2136; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2137; GFX7-NEXT:    s_waitcnt vmcnt(0)
2138; GFX7-NEXT:    flat_store_dword v[0:1], v2
2139; GFX7-NEXT:    s_endpgm
2140;
2141; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
2142; GFX10-WGP:       ; %bb.0: ; %entry
2143; GFX10-WGP-NEXT:    s_clause 0x1
2144; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2145; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2146; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2147; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2149; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2150; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2151; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2152; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2153; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2154; GFX10-WGP-NEXT:    buffer_gl0_inv
2155; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2156; GFX10-WGP-NEXT:    s_endpgm
2157;
2158; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
2159; GFX10-CU:       ; %bb.0: ; %entry
2160; GFX10-CU-NEXT:    s_clause 0x1
2161; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2162; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2163; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2164; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2165; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2166; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2167; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2168; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2169; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2170; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2171; GFX10-CU-NEXT:    s_endpgm
2172;
2173; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
2174; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2175; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2176; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2177; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2178; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2179; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2180; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2181; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2182; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2183; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2184; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2185; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2186; SKIP-CACHE-INV-NEXT:    s_endpgm
2187    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2188entry:
2189  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2190  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
2191  %val0 = extractvalue { i32, i1 } %val, 0
2192  store i32 %val0, i32 addrspace(1)* %out, align 4
2193  ret void
2194}
2195
; Test: workgroup-scope cmpxchg on global memory (addrspace(1)) with acquire
; ordering for both success and failure, where the returned value is used.
; The exchange targets %out + 4 i32 elements (a 16-byte offset in the expected
; code); field 0 of the result pair is then stored to %out.
; The expected code has no s_waitcnt immediately before the atomic
; instruction; the waits come after it, and only the gfx1010 WGP-mode output
; additionally expects buffer_gl0_inv.
2196define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
2197; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
2198; GFX6:       ; %bb.0: ; %entry
2199; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2200; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2201; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2202; GFX6-NEXT:    s_mov_b32 s6, -1
2203; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2204; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2205; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2206; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2207; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2208; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2209; GFX6-NEXT:    s_endpgm
2210;
2211; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
2212; GFX7:       ; %bb.0: ; %entry
2213; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2214; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2215; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2216; GFX7-NEXT:    s_add_u32 s4, s0, 16
2217; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2218; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2219; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2220; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2221; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2222; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2223; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2224; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2225; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2226; GFX7-NEXT:    s_waitcnt vmcnt(0)
2227; GFX7-NEXT:    flat_store_dword v[0:1], v2
2228; GFX7-NEXT:    s_endpgm
2229;
2230; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
2231; GFX10-WGP:       ; %bb.0: ; %entry
2232; GFX10-WGP-NEXT:    s_clause 0x1
2233; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2234; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2235; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2236; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2237; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2238; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2239; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2240; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2241; GFX10-WGP-NEXT:    buffer_gl0_inv
2242; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2243; GFX10-WGP-NEXT:    s_endpgm
2244;
2245; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
2246; GFX10-CU:       ; %bb.0: ; %entry
2247; GFX10-CU-NEXT:    s_clause 0x1
2248; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2249; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2250; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2251; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2252; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2253; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2254; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2255; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2256; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2257; GFX10-CU-NEXT:    s_endpgm
2258;
2259; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
2260; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2261; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2262; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2263; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2264; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2265; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2266; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2267; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2268; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2269; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2270; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2271; SKIP-CACHE-INV-NEXT:    s_endpgm
2272    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2273entry:
2274  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2275  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
2276  %val0 = extractvalue { i32, i1 } %val, 0
2277  store i32 %val0, i32 addrspace(1)* %out, align 4
2278  ret void
2279}
2280
; Test: workgroup-scope cmpxchg on global memory (addrspace(1)) with release
; success ordering and acquire failure ordering, where the returned value is
; used. The exchange targets %out + 4 i32 elements (a 16-byte offset in the
; expected code); field 0 of the result pair is then stored to %out.
; The expected code has an s_waitcnt directly before the atomic and waits again
; after it; only the gfx1010 WGP-mode output additionally expects
; buffer_gl0_inv after the atomic.
2281define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
2282; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
2283; GFX6:       ; %bb.0: ; %entry
2284; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2285; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2286; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2287; GFX6-NEXT:    s_mov_b32 s6, -1
2288; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2289; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2290; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2291; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2292; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2293; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2294; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2295; GFX6-NEXT:    s_endpgm
2296;
2297; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
2298; GFX7:       ; %bb.0: ; %entry
2299; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2300; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2301; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2302; GFX7-NEXT:    s_add_u32 s4, s0, 16
2303; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2304; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2305; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2306; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2307; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2308; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2309; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2310; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2311; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2312; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2313; GFX7-NEXT:    s_waitcnt vmcnt(0)
2314; GFX7-NEXT:    flat_store_dword v[0:1], v2
2315; GFX7-NEXT:    s_endpgm
2316;
2317; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
2318; GFX10-WGP:       ; %bb.0: ; %entry
2319; GFX10-WGP-NEXT:    s_clause 0x1
2320; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2321; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2322; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2323; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2324; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2325; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2326; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2327; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2328; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2329; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2330; GFX10-WGP-NEXT:    buffer_gl0_inv
2331; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2332; GFX10-WGP-NEXT:    s_endpgm
2333;
2334; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
2335; GFX10-CU:       ; %bb.0: ; %entry
2336; GFX10-CU-NEXT:    s_clause 0x1
2337; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2338; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2339; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2340; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2341; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2342; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2343; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2344; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2345; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2346; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2347; GFX10-CU-NEXT:    s_endpgm
2348;
2349; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
2350; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2351; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2352; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2353; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2354; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2355; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2356; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2357; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2358; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2359; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2360; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2361; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2362; SKIP-CACHE-INV-NEXT:    s_endpgm
2363    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2364entry:
2365  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2366  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
2367  %val0 = extractvalue { i32, i1 } %val, 0
2368  store i32 %val0, i32 addrspace(1)* %out, align 4
2369  ret void
2370}
2371
; Test: workgroup-scope cmpxchg on global memory (addrspace(1)) with acq_rel
; success ordering and acquire failure ordering, where the returned value is
; used. The exchange targets %out + 4 i32 elements (a 16-byte offset in the
; expected code); field 0 of the result pair is then stored to %out.
; The expected code has an s_waitcnt directly before the atomic and waits again
; after it; only the gfx1010 WGP-mode output additionally expects
; buffer_gl0_inv after the atomic.
2372define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
2373; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
2374; GFX6:       ; %bb.0: ; %entry
2375; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2376; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2377; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2378; GFX6-NEXT:    s_mov_b32 s6, -1
2379; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2380; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2381; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2382; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2383; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2384; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2385; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2386; GFX6-NEXT:    s_endpgm
2387;
2388; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
2389; GFX7:       ; %bb.0: ; %entry
2390; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2391; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2392; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2393; GFX7-NEXT:    s_add_u32 s4, s0, 16
2394; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2395; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2396; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2397; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2398; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2399; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2400; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2401; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2402; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2403; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2404; GFX7-NEXT:    s_waitcnt vmcnt(0)
2405; GFX7-NEXT:    flat_store_dword v[0:1], v2
2406; GFX7-NEXT:    s_endpgm
2407;
2408; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
2409; GFX10-WGP:       ; %bb.0: ; %entry
2410; GFX10-WGP-NEXT:    s_clause 0x1
2411; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2412; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2413; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2414; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2415; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2416; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2417; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2418; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2419; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2420; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2421; GFX10-WGP-NEXT:    buffer_gl0_inv
2422; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2423; GFX10-WGP-NEXT:    s_endpgm
2424;
2425; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
2426; GFX10-CU:       ; %bb.0: ; %entry
2427; GFX10-CU-NEXT:    s_clause 0x1
2428; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2429; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2430; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2431; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2432; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2433; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2434; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2435; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2436; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2437; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2438; GFX10-CU-NEXT:    s_endpgm
2439;
2440; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
2441; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2442; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2443; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2444; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2445; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2446; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2447; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2448; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2449; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2450; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2451; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2452; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2453; SKIP-CACHE-INV-NEXT:    s_endpgm
2454    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2455entry:
2456  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2457  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
2458  %val0 = extractvalue { i32, i1 } %val, 0
2459  store i32 %val0, i32 addrspace(1)* %out, align 4
2460  ret void
2461}
2462
; Test: workgroup-scope cmpxchg on global memory (addrspace(1)) with seq_cst
; success ordering and acquire failure ordering, where the returned value is
; used. The exchange targets %out + 4 i32 elements (a 16-byte offset in the
; expected code); field 0 of the result pair is then stored to %out.
; The expected code has an s_waitcnt directly before the atomic and waits again
; after it; only the gfx1010 WGP-mode output additionally expects
; buffer_gl0_inv after the atomic.
2463define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
2464; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
2465; GFX6:       ; %bb.0: ; %entry
2466; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2467; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2468; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2469; GFX6-NEXT:    s_mov_b32 s6, -1
2470; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2471; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2472; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2473; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2474; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2475; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2476; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2477; GFX6-NEXT:    s_endpgm
2478;
2479; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
2480; GFX7:       ; %bb.0: ; %entry
2481; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2482; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2483; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2484; GFX7-NEXT:    s_add_u32 s4, s0, 16
2485; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2486; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2487; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2488; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2489; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2490; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2491; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2492; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2493; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2494; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2495; GFX7-NEXT:    s_waitcnt vmcnt(0)
2496; GFX7-NEXT:    flat_store_dword v[0:1], v2
2497; GFX7-NEXT:    s_endpgm
2498;
2499; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
2500; GFX10-WGP:       ; %bb.0: ; %entry
2501; GFX10-WGP-NEXT:    s_clause 0x1
2502; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2503; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2504; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2505; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2506; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2507; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2508; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2509; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2510; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2511; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2512; GFX10-WGP-NEXT:    buffer_gl0_inv
2513; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2514; GFX10-WGP-NEXT:    s_endpgm
2515;
2516; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
2517; GFX10-CU:       ; %bb.0: ; %entry
2518; GFX10-CU-NEXT:    s_clause 0x1
2519; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2520; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2521; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2522; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2523; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2524; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2525; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2526; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2527; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2528; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2529; GFX10-CU-NEXT:    s_endpgm
2530;
2531; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
2532; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2533; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2534; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2535; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2536; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2537; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2538; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2539; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2540; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2541; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2542; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2543; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2544; SKIP-CACHE-INV-NEXT:    s_endpgm
2545    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2546entry:
2547  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2548  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
2549  %val0 = extractvalue { i32, i1 } %val, 0
2550  store i32 %val0, i32 addrspace(1)* %out, align 4
2551  ret void
2552}
2553
; Test: workgroup-scope cmpxchg on global memory (addrspace(1)) with seq_cst
; ordering for both success and failure, where the returned value is used.
; The exchange targets %out + 4 i32 elements (a 16-byte offset in the expected
; code); field 0 of the result pair is then stored to %out.
; The expected code has an s_waitcnt directly before the atomic and waits again
; after it; only the gfx1010 WGP-mode output additionally expects
; buffer_gl0_inv after the atomic.
2554define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
2555; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
2556; GFX6:       ; %bb.0: ; %entry
2557; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2558; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2559; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2560; GFX6-NEXT:    s_mov_b32 s6, -1
2561; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2562; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2563; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2564; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2565; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2566; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2567; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2568; GFX6-NEXT:    s_endpgm
2569;
2570; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
2571; GFX7:       ; %bb.0: ; %entry
2572; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2573; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2574; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2575; GFX7-NEXT:    s_add_u32 s4, s0, 16
2576; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2577; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2578; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2579; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2580; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2581; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2582; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2583; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2584; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2585; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2586; GFX7-NEXT:    s_waitcnt vmcnt(0)
2587; GFX7-NEXT:    flat_store_dword v[0:1], v2
2588; GFX7-NEXT:    s_endpgm
2589;
2590; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
2591; GFX10-WGP:       ; %bb.0: ; %entry
2592; GFX10-WGP-NEXT:    s_clause 0x1
2593; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2594; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2595; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2596; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2597; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2598; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2599; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2600; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2601; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2602; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2603; GFX10-WGP-NEXT:    buffer_gl0_inv
2604; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2605; GFX10-WGP-NEXT:    s_endpgm
2606;
2607; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
2608; GFX10-CU:       ; %bb.0: ; %entry
2609; GFX10-CU-NEXT:    s_clause 0x1
2610; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2611; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2612; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2613; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2614; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2615; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2616; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2617; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2618; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2619; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2620; GFX10-CU-NEXT:    s_endpgm
2621;
2622; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
2623; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2624; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2625; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2626; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2627; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2628; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2629; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2630; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2631; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2632; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2633; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2634; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2635; SKIP-CACHE-INV-NEXT:    s_endpgm
2636    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2637entry:
2638  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2639  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
2640  %val0 = extractvalue { i32, i1 } %val, 0
2641  store i32 %val0, i32 addrspace(1)* %out, align 4
2642  ret void
2643}
2644
; Test: atomic i32 load from global memory (addrspace(1)) at
; syncscope("workgroup-one-as") with unordered ordering; the loaded value is
; stored to %out.
; In every configuration the expected code is a load without the glc bit,
; an s_waitcnt vmcnt(0), and the store; no cache invalidate is expected.
2645define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
2646; GFX6-LABEL: global_workgroup_one_as_unordered_load:
2647; GFX6:       ; %bb.0: ; %entry
2648; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2649; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2650; GFX6-NEXT:    s_mov_b32 s2, -1
2651; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2652; GFX6-NEXT:    s_mov_b32 s0, s4
2653; GFX6-NEXT:    s_mov_b32 s1, s5
2654; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2655; GFX6-NEXT:    s_mov_b32 s4, s6
2656; GFX6-NEXT:    s_mov_b32 s5, s7
2657; GFX6-NEXT:    s_mov_b32 s6, s2
2658; GFX6-NEXT:    s_mov_b32 s7, s3
2659; GFX6-NEXT:    s_waitcnt vmcnt(0)
2660; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2661; GFX6-NEXT:    s_endpgm
2662;
2663; GFX7-LABEL: global_workgroup_one_as_unordered_load:
2664; GFX7:       ; %bb.0: ; %entry
2665; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2666; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2667; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2668; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2669; GFX7-NEXT:    flat_load_dword v0, v[0:1]
2670; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2671; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2672; GFX7-NEXT:    s_waitcnt vmcnt(0)
2673; GFX7-NEXT:    flat_store_dword v[2:3], v0
2674; GFX7-NEXT:    s_endpgm
2675;
2676; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load:
2677; GFX10-WGP:       ; %bb.0: ; %entry
2678; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2679; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2680; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2681; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
2682; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2683; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
2684; GFX10-WGP-NEXT:    s_endpgm
2685;
2686; GFX10-CU-LABEL: global_workgroup_one_as_unordered_load:
2687; GFX10-CU:       ; %bb.0: ; %entry
2688; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2689; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2690; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2691; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
2692; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2693; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
2694; GFX10-CU-NEXT:    s_endpgm
2695;
2696; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_load:
2697; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2698; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2699; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
2700; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
2701; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2702; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2703; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2704; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2705; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
2706; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
2707; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
2708; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
2709; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2710; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2711; SKIP-CACHE-INV-NEXT:    s_endpgm
2712    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
2713entry:
2714  %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4
2715  store i32 %val, i32 addrspace(1)* %out
2716  ret void
2717}
2718
; Test: atomic i32 load from global memory (addrspace(1)) at
; syncscope("workgroup-one-as") with monotonic ordering; the loaded value is
; stored to %out.
; The glc bit on the load is expected only in the gfx1010 WGP-mode output;
; the other configurations expect a load without glc. No cache invalidate is
; expected in any configuration.
2719define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
2720; GFX6-LABEL: global_workgroup_one_as_monotonic_load:
2721; GFX6:       ; %bb.0: ; %entry
2722; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2723; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2724; GFX6-NEXT:    s_mov_b32 s2, -1
2725; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2726; GFX6-NEXT:    s_mov_b32 s0, s4
2727; GFX6-NEXT:    s_mov_b32 s1, s5
2728; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2729; GFX6-NEXT:    s_mov_b32 s4, s6
2730; GFX6-NEXT:    s_mov_b32 s5, s7
2731; GFX6-NEXT:    s_mov_b32 s6, s2
2732; GFX6-NEXT:    s_mov_b32 s7, s3
2733; GFX6-NEXT:    s_waitcnt vmcnt(0)
2734; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2735; GFX6-NEXT:    s_endpgm
2736;
2737; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
2738; GFX7:       ; %bb.0: ; %entry
2739; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2740; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2741; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2742; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2743; GFX7-NEXT:    flat_load_dword v0, v[0:1]
2744; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2745; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2746; GFX7-NEXT:    s_waitcnt vmcnt(0)
2747; GFX7-NEXT:    flat_store_dword v[2:3], v0
2748; GFX7-NEXT:    s_endpgm
2749;
2750; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load:
2751; GFX10-WGP:       ; %bb.0: ; %entry
2752; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2753; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2754; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2755; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
2756; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2757; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
2758; GFX10-WGP-NEXT:    s_endpgm
2759;
2760; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_load:
2761; GFX10-CU:       ; %bb.0: ; %entry
2762; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2763; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2764; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2765; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
2766; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2767; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
2768; GFX10-CU-NEXT:    s_endpgm
2769;
2770; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_load:
2771; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2772; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2773; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
2774; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
2775; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2776; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2777; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2778; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2779; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
2780; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
2781; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
2782; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
2783; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2784; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2785; SKIP-CACHE-INV-NEXT:    s_endpgm
2786    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
2787entry:
2788  %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4
2789  store i32 %val, i32 addrspace(1)* %out
2790  ret void
2791}
2792
; Test: atomic i32 load from global memory (addrspace(1)) at
; syncscope("workgroup-one-as") with acquire ordering; the loaded value is
; stored to %out.
; The gfx1010 WGP-mode output expects glc on the load and buffer_gl0_inv
; after the s_waitcnt vmcnt(0); the other configurations expect a load without
; glc, the wait, and the store, with no cache invalidate.
2793define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
2794; GFX6-LABEL: global_workgroup_one_as_acquire_load:
2795; GFX6:       ; %bb.0: ; %entry
2796; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2797; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2798; GFX6-NEXT:    s_mov_b32 s2, -1
2799; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2800; GFX6-NEXT:    s_mov_b32 s0, s4
2801; GFX6-NEXT:    s_mov_b32 s1, s5
2802; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2803; GFX6-NEXT:    s_mov_b32 s4, s6
2804; GFX6-NEXT:    s_mov_b32 s5, s7
2805; GFX6-NEXT:    s_mov_b32 s6, s2
2806; GFX6-NEXT:    s_mov_b32 s7, s3
2807; GFX6-NEXT:    s_waitcnt vmcnt(0)
2808; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2809; GFX6-NEXT:    s_endpgm
2810;
2811; GFX7-LABEL: global_workgroup_one_as_acquire_load:
2812; GFX7:       ; %bb.0: ; %entry
2813; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2814; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2815; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2816; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2817; GFX7-NEXT:    flat_load_dword v0, v[0:1]
2818; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2819; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2820; GFX7-NEXT:    s_waitcnt vmcnt(0)
2821; GFX7-NEXT:    flat_store_dword v[2:3], v0
2822; GFX7-NEXT:    s_endpgm
2823;
2824; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load:
2825; GFX10-WGP:       ; %bb.0: ; %entry
2826; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2827; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2828; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2829; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
2830; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2831; GFX10-WGP-NEXT:    buffer_gl0_inv
2832; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
2833; GFX10-WGP-NEXT:    s_endpgm
2834;
2835; GFX10-CU-LABEL: global_workgroup_one_as_acquire_load:
2836; GFX10-CU:       ; %bb.0: ; %entry
2837; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2838; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2839; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2840; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
2841; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2842; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
2843; GFX10-CU-NEXT:    s_endpgm
2844;
2845; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_load:
2846; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2847; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2848; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
2849; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
2850; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2851; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2852; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2853; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2854; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
2855; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
2856; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
2857; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
2858; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2859; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2860; SKIP-CACHE-INV-NEXT:    s_endpgm
2861    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
2862entry:
2863  %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4
2864  store i32 %val, i32 addrspace(1)* %out
2865  ret void
2866}
2867
; Test case: atomic i32 load from addrspace(1) with seq_cst ordering at
; syncscope("workgroup-one-as"); the loaded value is then written to %out
; with an ordinary (non-atomic) store.
; Expected code, as pinned by the checks below:
;  - GFX10-WGP run (gfx1010 without +cumode): s_waitcnt vmcnt(0) lgkmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 precede the load; the load is emitted with glc
;    and is followed by s_waitcnt vmcnt(0) and buffer_gl0_inv.
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV runs: a load without glc,
;    s_waitcnt vmcnt(0), then the store; no cache invalidate is expected.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
2868define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
2869; GFX6-LABEL: global_workgroup_one_as_seq_cst_load:
2870; GFX6:       ; %bb.0: ; %entry
2871; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2872; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2873; GFX6-NEXT:    s_mov_b32 s2, -1
2874; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2875; GFX6-NEXT:    s_mov_b32 s0, s4
2876; GFX6-NEXT:    s_mov_b32 s1, s5
2877; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2878; GFX6-NEXT:    s_mov_b32 s4, s6
2879; GFX6-NEXT:    s_mov_b32 s5, s7
2880; GFX6-NEXT:    s_mov_b32 s6, s2
2881; GFX6-NEXT:    s_mov_b32 s7, s3
2882; GFX6-NEXT:    s_waitcnt vmcnt(0)
2883; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2884; GFX6-NEXT:    s_endpgm
2885;
2886; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
2887; GFX7:       ; %bb.0: ; %entry
2888; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2889; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2890; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2891; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2892; GFX7-NEXT:    flat_load_dword v0, v[0:1]
2893; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2894; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2895; GFX7-NEXT:    s_waitcnt vmcnt(0)
2896; GFX7-NEXT:    flat_store_dword v[2:3], v0
2897; GFX7-NEXT:    s_endpgm
2898;
2899; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load:
2900; GFX10-WGP:       ; %bb.0: ; %entry
2901; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2902; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2903; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2904; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2905; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc
2906; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2907; GFX10-WGP-NEXT:    buffer_gl0_inv
2908; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
2909; GFX10-WGP-NEXT:    s_endpgm
2910;
2911; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_load:
2912; GFX10-CU:       ; %bb.0: ; %entry
2913; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2914; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2915; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2916; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
2917; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2918; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
2919; GFX10-CU-NEXT:    s_endpgm
2920;
2921; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_load:
2922; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2923; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2924; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
2925; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
2926; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2927; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2928; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2929; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2930; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
2931; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
2932; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
2933; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
2934; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2935; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2936; SKIP-CACHE-INV-NEXT:    s_endpgm
2937    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
2938entry:
  ; Operation under test: the seq_cst load. The store below only consumes the
  ; loaded value.
2939  %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4
2940  store i32 %val, i32 addrspace(1)* %out
2941  ret void
2942}
2943
; Test case: atomic i32 store of %in to addrspace(1) pointer %out with
; unordered ordering at syncscope("workgroup-one-as").
; Expected code, as pinned by the checks below: in all five run
; configurations the store is a single plain store instruction. The only wait
; expected is the s_waitcnt lgkmcnt(0) that follows the scalar loads; no
; vmcnt/vscnt wait and no cache invalidate appear.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
2944define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
2945; GFX6-LABEL: global_workgroup_one_as_unordered_store:
2946; GFX6:       ; %bb.0: ; %entry
2947; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
2948; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2949; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2950; GFX6-NEXT:    s_mov_b32 s2, -1
2951; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2952; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2953; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2954; GFX6-NEXT:    s_endpgm
2955;
2956; GFX7-LABEL: global_workgroup_one_as_unordered_store:
2957; GFX7:       ; %bb.0: ; %entry
2958; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
2959; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
2960; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2961; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2962; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2963; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2964; GFX7-NEXT:    flat_store_dword v[0:1], v2
2965; GFX7-NEXT:    s_endpgm
2966;
2967; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_store:
2968; GFX10-WGP:       ; %bb.0: ; %entry
2969; GFX10-WGP-NEXT:    s_clause 0x1
2970; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
2971; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2972; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2973; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2974; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
2975; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
2976; GFX10-WGP-NEXT:    s_endpgm
2977;
2978; GFX10-CU-LABEL: global_workgroup_one_as_unordered_store:
2979; GFX10-CU:       ; %bb.0: ; %entry
2980; GFX10-CU-NEXT:    s_clause 0x1
2981; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
2982; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2983; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2984; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2985; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
2986; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
2987; GFX10-CU-NEXT:    s_endpgm
2988;
2989; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_store:
2990; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2991; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
2992; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2993; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
2994; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
2995; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2996; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
2997; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2998; SKIP-CACHE-INV-NEXT:    s_endpgm
2999    i32 %in, i32 addrspace(1)* %out) {
3000entry:
  ; Operation under test: the unordered atomic store.
3001  store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4
3002  ret void
3003}
3004
; Test case: atomic i32 store of %in to addrspace(1) pointer %out with
; monotonic ordering at syncscope("workgroup-one-as").
; Expected code, as pinned by the checks below: in all five run
; configurations the store is a single plain store instruction. The only wait
; expected is the s_waitcnt lgkmcnt(0) that follows the scalar loads; no
; vmcnt/vscnt wait and no cache invalidate appear.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3005define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
3006; GFX6-LABEL: global_workgroup_one_as_monotonic_store:
3007; GFX6:       ; %bb.0: ; %entry
3008; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
3009; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3010; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3011; GFX6-NEXT:    s_mov_b32 s2, -1
3012; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3013; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3014; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3015; GFX6-NEXT:    s_endpgm
3016;
3017; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
3018; GFX7:       ; %bb.0: ; %entry
3019; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3020; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3021; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3022; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3023; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3024; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3025; GFX7-NEXT:    flat_store_dword v[0:1], v2
3026; GFX7-NEXT:    s_endpgm
3027;
3028; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_store:
3029; GFX10-WGP:       ; %bb.0: ; %entry
3030; GFX10-WGP-NEXT:    s_clause 0x1
3031; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3032; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3033; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3034; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3035; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3036; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3037; GFX10-WGP-NEXT:    s_endpgm
3038;
3039; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_store:
3040; GFX10-CU:       ; %bb.0: ; %entry
3041; GFX10-CU-NEXT:    s_clause 0x1
3042; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3043; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3044; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3045; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3046; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3047; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3048; GFX10-CU-NEXT:    s_endpgm
3049;
3050; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_store:
3051; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3052; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
3053; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3054; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3055; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3056; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3057; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3058; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3059; SKIP-CACHE-INV-NEXT:    s_endpgm
3060    i32 %in, i32 addrspace(1)* %out) {
3061entry:
  ; Operation under test: the monotonic atomic store.
3062  store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4
3063  ret void
3064}
3065
; Test case: atomic i32 store of %in to addrspace(1) pointer %out with
; release ordering at syncscope("workgroup-one-as").
; Expected code, as pinned by the checks below:
;  - GFX10-WGP run (gfx1010 without +cumode): s_waitcnt vmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 are emitted immediately before the store.
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV runs: a plain store with no
;    additional wait beyond the s_waitcnt lgkmcnt(0) after the scalar loads.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3066define amdgpu_kernel void @global_workgroup_one_as_release_store(
3067; GFX6-LABEL: global_workgroup_one_as_release_store:
3068; GFX6:       ; %bb.0: ; %entry
3069; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
3070; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3071; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3072; GFX6-NEXT:    s_mov_b32 s2, -1
3073; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3074; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3075; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3076; GFX6-NEXT:    s_endpgm
3077;
3078; GFX7-LABEL: global_workgroup_one_as_release_store:
3079; GFX7:       ; %bb.0: ; %entry
3080; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3081; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3082; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3083; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3084; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3085; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3086; GFX7-NEXT:    flat_store_dword v[0:1], v2
3087; GFX7-NEXT:    s_endpgm
3088;
3089; GFX10-WGP-LABEL: global_workgroup_one_as_release_store:
3090; GFX10-WGP:       ; %bb.0: ; %entry
3091; GFX10-WGP-NEXT:    s_clause 0x1
3092; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3093; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3094; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3095; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3096; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3097; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3098; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3099; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3100; GFX10-WGP-NEXT:    s_endpgm
3101;
3102; GFX10-CU-LABEL: global_workgroup_one_as_release_store:
3103; GFX10-CU:       ; %bb.0: ; %entry
3104; GFX10-CU-NEXT:    s_clause 0x1
3105; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3106; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3107; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3108; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3109; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3110; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3111; GFX10-CU-NEXT:    s_endpgm
3112;
3113; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_store:
3114; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3115; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
3116; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3117; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3118; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3119; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3120; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3121; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3122; SKIP-CACHE-INV-NEXT:    s_endpgm
3123    i32 %in, i32 addrspace(1)* %out) {
3124entry:
  ; Operation under test: the release atomic store.
3125  store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4
3126  ret void
3127}
3128
; Test case: atomic i32 store of %in to addrspace(1) pointer %out with
; seq_cst ordering at syncscope("workgroup-one-as").
; Expected code, as pinned by the checks below:
;  - GFX10-WGP run (gfx1010 without +cumode): s_waitcnt vmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 are emitted immediately before the store.
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV runs: a plain store with no
;    additional wait beyond the s_waitcnt lgkmcnt(0) after the scalar loads.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3129define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
3130; GFX6-LABEL: global_workgroup_one_as_seq_cst_store:
3131; GFX6:       ; %bb.0: ; %entry
3132; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
3133; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3134; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3135; GFX6-NEXT:    s_mov_b32 s2, -1
3136; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3137; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3138; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3139; GFX6-NEXT:    s_endpgm
3140;
3141; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
3142; GFX7:       ; %bb.0: ; %entry
3143; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3144; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3145; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3146; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3147; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3148; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3149; GFX7-NEXT:    flat_store_dword v[0:1], v2
3150; GFX7-NEXT:    s_endpgm
3151;
3152; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_store:
3153; GFX10-WGP:       ; %bb.0: ; %entry
3154; GFX10-WGP-NEXT:    s_clause 0x1
3155; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3156; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3157; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3158; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3159; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3160; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3161; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3162; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3163; GFX10-WGP-NEXT:    s_endpgm
3164;
3165; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_store:
3166; GFX10-CU:       ; %bb.0: ; %entry
3167; GFX10-CU-NEXT:    s_clause 0x1
3168; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3169; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3170; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3171; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3172; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3173; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3174; GFX10-CU-NEXT:    s_endpgm
3175;
3176; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_store:
3177; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3178; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
3179; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3180; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3181; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3182; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3183; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3184; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3185; SKIP-CACHE-INV-NEXT:    s_endpgm
3186    i32 %in, i32 addrspace(1)* %out) {
3187entry:
  ; Operation under test: the seq_cst atomic store.
3188  store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4
3189  ret void
3190}
3191
; Test case: volatile atomicrmw xchg of %in into addrspace(1) pointer %out
; with monotonic ordering at syncscope("workgroup-one-as"). The result %val
; is not used.
; Expected code, as pinned by the checks below: in all five run
; configurations a single atomic swap without glc is emitted. The only wait
; expected is the s_waitcnt lgkmcnt(0) after the scalar loads; no vmcnt/vscnt
; wait and no cache invalidate appear.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3192define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
3193; GFX6-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
3194; GFX6:       ; %bb.0: ; %entry
3195; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3196; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3197; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3198; GFX6-NEXT:    s_mov_b32 s6, -1
3199; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3200; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3201; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3202; GFX6-NEXT:    s_endpgm
3203;
3204; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
3205; GFX7:       ; %bb.0: ; %entry
3206; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3207; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3208; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3209; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3210; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3211; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3212; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3213; GFX7-NEXT:    s_endpgm
3214;
3215; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
3216; GFX10-WGP:       ; %bb.0: ; %entry
3217; GFX10-WGP-NEXT:    s_clause 0x1
3218; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3219; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3220; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3221; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3222; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3223; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3224; GFX10-WGP-NEXT:    s_endpgm
3225;
3226; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
3227; GFX10-CU:       ; %bb.0: ; %entry
3228; GFX10-CU-NEXT:    s_clause 0x1
3229; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3230; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3231; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3232; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3233; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3234; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3235; GFX10-CU-NEXT:    s_endpgm
3236;
3237; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
3238; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3239; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3240; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3241; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3242; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3243; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3244; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3245; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3246; SKIP-CACHE-INV-NEXT:    s_endpgm
3247    i32 addrspace(1)* %out, i32 %in) {
3248entry:
  ; Operation under test: the monotonic xchg; %val is intentionally unused.
3249  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic
3250  ret void
3251}
3252
; Test case: volatile atomicrmw xchg of %in into addrspace(1) pointer %out
; with acquire ordering at syncscope("workgroup-one-as"). The result %val is
; not used.
; Expected code, as pinned by the checks below:
;  - GFX10-WGP run (gfx1010 without +cumode): the swap (without glc) is
;    followed by s_waitcnt_vscnt null, 0x0 and buffer_gl0_inv.
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV runs: only the swap without glc;
;    no wait after it and no cache invalidate.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3253define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
3254; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw:
3255; GFX6:       ; %bb.0: ; %entry
3256; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3257; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3258; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3259; GFX6-NEXT:    s_mov_b32 s6, -1
3260; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3261; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3262; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3263; GFX6-NEXT:    s_endpgm
3264;
3265; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
3266; GFX7:       ; %bb.0: ; %entry
3267; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3268; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3269; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3270; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3271; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3272; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3273; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3274; GFX7-NEXT:    s_endpgm
3275;
3276; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw:
3277; GFX10-WGP:       ; %bb.0: ; %entry
3278; GFX10-WGP-NEXT:    s_clause 0x1
3279; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3280; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3281; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3282; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3283; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3284; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3285; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3286; GFX10-WGP-NEXT:    buffer_gl0_inv
3287; GFX10-WGP-NEXT:    s_endpgm
3288;
3289; GFX10-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
3290; GFX10-CU:       ; %bb.0: ; %entry
3291; GFX10-CU-NEXT:    s_clause 0x1
3292; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3293; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3294; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3295; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3296; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3297; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3298; GFX10-CU-NEXT:    s_endpgm
3299;
3300; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw:
3301; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3302; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3303; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3304; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3305; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3306; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3307; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3308; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3309; SKIP-CACHE-INV-NEXT:    s_endpgm
3310    i32 addrspace(1)* %out, i32 %in) {
3311entry:
  ; Operation under test: the acquire xchg; %val is intentionally unused.
3312  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire
3313  ret void
3314}
3315
; Test case: volatile atomicrmw xchg of %in into addrspace(1) pointer %out
; with release ordering at syncscope("workgroup-one-as"). The result %val is
; not used.
; Expected code, as pinned by the checks below:
;  - GFX10-WGP run (gfx1010 without +cumode): s_waitcnt vmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 are emitted immediately before the swap;
;    nothing follows the swap.
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV runs: only the swap without glc,
;    with no wait beyond the s_waitcnt lgkmcnt(0) after the scalar loads.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3316define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
3317; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw:
3318; GFX6:       ; %bb.0: ; %entry
3319; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3320; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3321; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3322; GFX6-NEXT:    s_mov_b32 s6, -1
3323; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3324; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3325; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3326; GFX6-NEXT:    s_endpgm
3327;
3328; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
3329; GFX7:       ; %bb.0: ; %entry
3330; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3331; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3332; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3333; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3334; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3335; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3336; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3337; GFX7-NEXT:    s_endpgm
3338;
3339; GFX10-WGP-LABEL: global_workgroup_one_as_release_atomicrmw:
3340; GFX10-WGP:       ; %bb.0: ; %entry
3341; GFX10-WGP-NEXT:    s_clause 0x1
3342; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3343; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3344; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3345; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3346; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3347; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3348; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3349; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3350; GFX10-WGP-NEXT:    s_endpgm
3351;
3352; GFX10-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
3353; GFX10-CU:       ; %bb.0: ; %entry
3354; GFX10-CU-NEXT:    s_clause 0x1
3355; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3356; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3357; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3358; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3359; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3360; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3361; GFX10-CU-NEXT:    s_endpgm
3362;
3363; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_atomicrmw:
3364; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3365; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3366; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3367; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3368; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3369; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3370; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3371; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3372; SKIP-CACHE-INV-NEXT:    s_endpgm
3373    i32 addrspace(1)* %out, i32 %in) {
3374entry:
  ; Operation under test: the release xchg; %val is intentionally unused.
3375  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release
3376  ret void
3377}
3378
; Test case: volatile atomicrmw xchg of %in into addrspace(1) pointer %out
; with acq_rel ordering at syncscope("workgroup-one-as"). The result %val is
; not used.
; Expected code, as pinned by the checks below:
;  - GFX10-WGP run (gfx1010 without +cumode): s_waitcnt vmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 precede the swap, and the swap is followed by
;    another s_waitcnt_vscnt null, 0x0 and buffer_gl0_inv.
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV runs: only the swap without glc;
;    no extra waits and no cache invalidate.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3379define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
3380; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
3381; GFX6:       ; %bb.0: ; %entry
3382; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3383; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3384; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3385; GFX6-NEXT:    s_mov_b32 s6, -1
3386; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3387; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3388; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3389; GFX6-NEXT:    s_endpgm
3390;
3391; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
3392; GFX7:       ; %bb.0: ; %entry
3393; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3394; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3395; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3396; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3397; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3398; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3399; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3400; GFX7-NEXT:    s_endpgm
3401;
3402; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
3403; GFX10-WGP:       ; %bb.0: ; %entry
3404; GFX10-WGP-NEXT:    s_clause 0x1
3405; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3406; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3407; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3408; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3409; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3410; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3411; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3412; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3413; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3414; GFX10-WGP-NEXT:    buffer_gl0_inv
3415; GFX10-WGP-NEXT:    s_endpgm
3416;
3417; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
3418; GFX10-CU:       ; %bb.0: ; %entry
3419; GFX10-CU-NEXT:    s_clause 0x1
3420; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3421; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3422; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3423; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3424; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3425; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3426; GFX10-CU-NEXT:    s_endpgm
3427;
3428; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
3429; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3430; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3431; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3432; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3433; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3434; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3435; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3436; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3437; SKIP-CACHE-INV-NEXT:    s_endpgm
3438    i32 addrspace(1)* %out, i32 %in) {
3439entry:
  ; Operation under test: the acq_rel xchg; %val is intentionally unused.
3440  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
3441  ret void
3442}
3443
; Test case: volatile atomicrmw xchg of %in into addrspace(1) pointer %out
; with seq_cst ordering at syncscope("workgroup-one-as"). The result %val is
; not used.
; Expected code, as pinned by the checks below:
;  - GFX10-WGP run (gfx1010 without +cumode): s_waitcnt vmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 precede the swap, and the swap is followed by
;    another s_waitcnt_vscnt null, 0x0 and buffer_gl0_inv.
;  - GFX6, GFX7, GFX10-CU and SKIP-CACHE-INV runs: only the swap without glc;
;    no extra waits and no cache invalidate.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3444define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
3445; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
3446; GFX6:       ; %bb.0: ; %entry
3447; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3448; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3449; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3450; GFX6-NEXT:    s_mov_b32 s6, -1
3451; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3452; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3453; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3454; GFX6-NEXT:    s_endpgm
3455;
3456; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
3457; GFX7:       ; %bb.0: ; %entry
3458; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3459; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3460; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3461; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3462; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3463; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3464; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3465; GFX7-NEXT:    s_endpgm
3466;
3467; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
3468; GFX10-WGP:       ; %bb.0: ; %entry
3469; GFX10-WGP-NEXT:    s_clause 0x1
3470; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3471; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3472; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3473; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3474; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3475; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3476; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3477; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3478; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3479; GFX10-WGP-NEXT:    buffer_gl0_inv
3480; GFX10-WGP-NEXT:    s_endpgm
3481;
3482; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
3483; GFX10-CU:       ; %bb.0: ; %entry
3484; GFX10-CU-NEXT:    s_clause 0x1
3485; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3486; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3487; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3488; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3489; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3490; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3491; GFX10-CU-NEXT:    s_endpgm
3492;
3493; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
3494; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3495; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3496; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3497; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3498; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3499; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3500; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3501; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3502; SKIP-CACHE-INV-NEXT:    s_endpgm
3503    i32 addrspace(1)* %out, i32 %in) {
3504entry:
  ; Operation under test: the seq_cst xchg; %val is intentionally unused.
3505  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
3506  ret void
3507}
3508
; Test case: volatile atomicrmw xchg of %in into addrspace(1) pointer %out
; with acquire ordering at syncscope("workgroup-one-as"), where the returned
; old value %val is used: it is written back to %out with an ordinary store.
; Expected code, as pinned by the checks below:
;  - All five run configurations: the swap is emitted with glc, followed by
;    s_waitcnt vmcnt(0) and then the store of the returned value.
;  - GFX10-WGP run (gfx1010 without +cumode) only: buffer_gl0_inv is emitted
;    between the s_waitcnt vmcnt(0) and the store.
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them instead of editing them by hand.
3509define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
3510; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
3511; GFX6:       ; %bb.0: ; %entry
3512; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3513; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3514; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3515; GFX6-NEXT:    s_mov_b32 s6, -1
3516; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3517; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3518; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3519; GFX6-NEXT:    s_waitcnt vmcnt(0)
3520; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3521; GFX6-NEXT:    s_endpgm
3522;
3523; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
3524; GFX7:       ; %bb.0: ; %entry
3525; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3526; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3527; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3528; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3529; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3530; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3531; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3532; GFX7-NEXT:    s_waitcnt vmcnt(0)
3533; GFX7-NEXT:    flat_store_dword v[0:1], v2
3534; GFX7-NEXT:    s_endpgm
3535;
3536; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
3537; GFX10-WGP:       ; %bb.0: ; %entry
3538; GFX10-WGP-NEXT:    s_clause 0x1
3539; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3540; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3541; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3542; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3543; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3544; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3545; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3546; GFX10-WGP-NEXT:    buffer_gl0_inv
3547; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3548; GFX10-WGP-NEXT:    s_endpgm
3549;
3550; GFX10-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
3551; GFX10-CU:       ; %bb.0: ; %entry
3552; GFX10-CU-NEXT:    s_clause 0x1
3553; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3554; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3555; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3556; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3557; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3558; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3559; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3560; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3561; GFX10-CU-NEXT:    s_endpgm
3562;
3563; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
3564; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3565; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3566; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3567; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3568; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3569; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3570; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3571; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3572; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3573; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3574; SKIP-CACHE-INV-NEXT:    s_endpgm
3575    i32 addrspace(1)* %out, i32 %in) {
3576entry:
  ; Operation under test: the acquire xchg whose result is consumed.
3577  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire
  ; Using %val here is what distinguishes this "ret" case from the variant
  ; whose result is unused.
3578  store i32 %val, i32 addrspace(1)* %out, align 4
3579  ret void
3580}
3581
; Test: volatile `atomicrmw xchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as") and acq_rel ordering; the returned value is
; stored back to %out.
; Per the checks below, the gfx1010 run without +cumode waits on vmcnt(0) and
; vscnt 0 before the atomic and issues buffer_gl0_inv after the post-atomic
; vmcnt(0) wait. Every other run only waits on vmcnt(0) for the returned value
; before the store.
3582define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
3583; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
3584; GFX6:       ; %bb.0: ; %entry
3585; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3586; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3587; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3588; GFX6-NEXT:    s_mov_b32 s6, -1
3589; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3590; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3591; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3592; GFX6-NEXT:    s_waitcnt vmcnt(0)
3593; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3594; GFX6-NEXT:    s_endpgm
3595;
3596; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
3597; GFX7:       ; %bb.0: ; %entry
3598; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3599; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3600; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3601; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3602; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3603; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3604; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3605; GFX7-NEXT:    s_waitcnt vmcnt(0)
3606; GFX7-NEXT:    flat_store_dword v[0:1], v2
3607; GFX7-NEXT:    s_endpgm
3608;
3609; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
3610; GFX10-WGP:       ; %bb.0: ; %entry
3611; GFX10-WGP-NEXT:    s_clause 0x1
3612; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3613; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3614; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3615; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3616; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3617; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3618; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3619; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3620; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3621; GFX10-WGP-NEXT:    buffer_gl0_inv
3622; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3623; GFX10-WGP-NEXT:    s_endpgm
3624;
3625; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
3626; GFX10-CU:       ; %bb.0: ; %entry
3627; GFX10-CU-NEXT:    s_clause 0x1
3628; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3629; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3630; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3631; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3632; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3633; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3634; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3635; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3636; GFX10-CU-NEXT:    s_endpgm
3637;
3638; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
3639; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3640; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3641; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3642; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3643; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3644; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3645; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3646; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3647; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3648; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3649; SKIP-CACHE-INV-NEXT:    s_endpgm
3650    i32 addrspace(1)* %out, i32 %in) {
3651entry:
3652  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
3653  store i32 %val, i32 addrspace(1)* %out, align 4
3654  ret void
3655}
3656
; Test: volatile `atomicrmw xchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as") and seq_cst ordering; the returned value is
; stored back to %out.
; Per the checks below, the gfx1010 run without +cumode waits on vmcnt(0) and
; vscnt 0 before the atomic and issues buffer_gl0_inv after the post-atomic
; vmcnt(0) wait. Every other run only waits on vmcnt(0) for the returned value
; before the store.
3657define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
3658; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
3659; GFX6:       ; %bb.0: ; %entry
3660; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3661; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3662; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3663; GFX6-NEXT:    s_mov_b32 s6, -1
3664; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3665; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3666; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3667; GFX6-NEXT:    s_waitcnt vmcnt(0)
3668; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3669; GFX6-NEXT:    s_endpgm
3670;
3671; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
3672; GFX7:       ; %bb.0: ; %entry
3673; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3674; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3675; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3676; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3677; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3678; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3679; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3680; GFX7-NEXT:    s_waitcnt vmcnt(0)
3681; GFX7-NEXT:    flat_store_dword v[0:1], v2
3682; GFX7-NEXT:    s_endpgm
3683;
3684; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
3685; GFX10-WGP:       ; %bb.0: ; %entry
3686; GFX10-WGP-NEXT:    s_clause 0x1
3687; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3688; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3689; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3690; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3691; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3692; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3693; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3694; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3695; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3696; GFX10-WGP-NEXT:    buffer_gl0_inv
3697; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3698; GFX10-WGP-NEXT:    s_endpgm
3699;
3700; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
3701; GFX10-CU:       ; %bb.0: ; %entry
3702; GFX10-CU-NEXT:    s_clause 0x1
3703; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3704; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3705; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3706; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3707; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3708; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3709; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3710; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3711; GFX10-CU-NEXT:    s_endpgm
3712;
3713; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
3714; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3715; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3716; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3717; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3718; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3719; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3720; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3721; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3722; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3723; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3724; SKIP-CACHE-INV-NEXT:    s_endpgm
3725    i32 addrspace(1)* %out, i32 %in) {
3726entry:
3727  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
3728  store i32 %val, i32 addrspace(1)* %out, align 4
3729  ret void
3730}
3731
; Test: volatile `cmpxchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as"), success ordering monotonic and failure
; ordering monotonic. The cmpxchg result is unused; the expected atomic
; instructions carry no glc bit.
; Per the checks below, no run expects vmcnt/vscnt waits or a cache invalidate
; around the atomic; only the lgkmcnt(0) wait for the kernel-argument loads.
3732define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
3733; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
3734; GFX6:       ; %bb.0: ; %entry
3735; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3736; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3737; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3738; GFX6-NEXT:    s_mov_b32 s6, -1
3739; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3740; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3741; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3742; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3743; GFX6-NEXT:    s_endpgm
3744;
3745; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
3746; GFX7:       ; %bb.0: ; %entry
3747; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3748; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3749; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3750; GFX7-NEXT:    s_add_u32 s0, s0, 16
3751; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3752; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3753; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3754; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3755; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3756; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3757; GFX7-NEXT:    s_endpgm
3758;
3759; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
3760; GFX10-WGP:       ; %bb.0: ; %entry
3761; GFX10-WGP-NEXT:    s_clause 0x1
3762; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3763; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3764; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3765; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3766; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3767; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3768; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
3769; GFX10-WGP-NEXT:    s_endpgm
3770;
3771; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
3772; GFX10-CU:       ; %bb.0: ; %entry
3773; GFX10-CU-NEXT:    s_clause 0x1
3774; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3775; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3776; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3777; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3778; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3779; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3780; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
3781; GFX10-CU-NEXT:    s_endpgm
3782;
3783; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
3784; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3785; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3786; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3787; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3788; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3789; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3790; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3791; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3792; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3793; SKIP-CACHE-INV-NEXT:    s_endpgm
3794    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3795entry:
  ; Element 4 of the i32 array at %out, i.e. a 16-byte offset; the checks show
  ; it as an immediate offset of 16 or as an explicit 64-bit add of 16.
3796  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
3797  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
3798  ret void
3799}
3800
; Test: volatile `cmpxchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as"), success ordering acquire and failure ordering
; monotonic. The cmpxchg result is unused; the expected atomic instructions
; carry no glc bit.
; Per the checks below, the gfx1010 run without +cumode expects a vscnt 0 wait
; followed by buffer_gl0_inv after the atomic. Every other run expects no
; extra waits or invalidates around the atomic.
3801define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
3802; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
3803; GFX6:       ; %bb.0: ; %entry
3804; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3805; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3806; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3807; GFX6-NEXT:    s_mov_b32 s6, -1
3808; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3809; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3810; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3811; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3812; GFX6-NEXT:    s_endpgm
3813;
3814; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
3815; GFX7:       ; %bb.0: ; %entry
3816; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3817; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3818; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3819; GFX7-NEXT:    s_add_u32 s0, s0, 16
3820; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3821; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3822; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3823; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3824; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3825; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3826; GFX7-NEXT:    s_endpgm
3827;
3828; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
3829; GFX10-WGP:       ; %bb.0: ; %entry
3830; GFX10-WGP-NEXT:    s_clause 0x1
3831; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3832; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3833; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3834; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3835; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3836; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3837; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
3838; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3839; GFX10-WGP-NEXT:    buffer_gl0_inv
3840; GFX10-WGP-NEXT:    s_endpgm
3841;
3842; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
3843; GFX10-CU:       ; %bb.0: ; %entry
3844; GFX10-CU-NEXT:    s_clause 0x1
3845; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3846; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3847; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3848; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3849; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3850; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3851; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
3852; GFX10-CU-NEXT:    s_endpgm
3853;
3854; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
3855; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3856; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3857; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3858; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3859; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3860; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3861; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3862; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3863; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3864; SKIP-CACHE-INV-NEXT:    s_endpgm
3865    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3866entry:
  ; Element 4 of the i32 array at %out, i.e. a 16-byte offset; the checks show
  ; it as an immediate offset of 16 or as an explicit 64-bit add of 16.
3867  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
3868  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
3869  ret void
3870}
3871
; Test: volatile `cmpxchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as"), success ordering release and failure ordering
; monotonic. The cmpxchg result is unused; the expected atomic instructions
; carry no glc bit.
; Per the checks below, the gfx1010 run without +cumode expects vmcnt(0) and
; vscnt 0 waits before the atomic and nothing after it. Every other run
; expects no extra waits or invalidates around the atomic.
3872define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
3873; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
3874; GFX6:       ; %bb.0: ; %entry
3875; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3876; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3877; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3878; GFX6-NEXT:    s_mov_b32 s6, -1
3879; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3880; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3881; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3882; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3883; GFX6-NEXT:    s_endpgm
3884;
3885; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
3886; GFX7:       ; %bb.0: ; %entry
3887; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3888; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3889; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3890; GFX7-NEXT:    s_add_u32 s0, s0, 16
3891; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3892; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3893; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3894; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3895; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3896; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3897; GFX7-NEXT:    s_endpgm
3898;
3899; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
3900; GFX10-WGP:       ; %bb.0: ; %entry
3901; GFX10-WGP-NEXT:    s_clause 0x1
3902; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3903; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3904; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3905; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3906; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3907; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3908; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3909; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3910; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
3911; GFX10-WGP-NEXT:    s_endpgm
3912;
3913; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
3914; GFX10-CU:       ; %bb.0: ; %entry
3915; GFX10-CU-NEXT:    s_clause 0x1
3916; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3917; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3918; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3919; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3920; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3921; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3922; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
3923; GFX10-CU-NEXT:    s_endpgm
3924;
3925; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
3926; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3927; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3928; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3929; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3930; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3931; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3932; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3933; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
3934; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3935; SKIP-CACHE-INV-NEXT:    s_endpgm
3936    i32 addrspace(1)* %out, i32 %in, i32 %old) {
3937entry:
  ; Element 4 of the i32 array at %out, i.e. a 16-byte offset; the checks show
  ; it as an immediate offset of 16 or as an explicit 64-bit add of 16.
3938  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
3939  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
3940  ret void
3941}
3942
; Test: volatile `cmpxchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as"), success ordering acq_rel and failure ordering
; monotonic. The cmpxchg result is unused; the expected atomic instructions
; carry no glc bit.
; Per the checks below, the gfx1010 run without +cumode expects vmcnt(0) and
; vscnt 0 waits before the atomic, then a vscnt 0 wait and buffer_gl0_inv
; after it. Every other run expects no extra waits or invalidates around the
; atomic.
3943define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
3944; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
3945; GFX6:       ; %bb.0: ; %entry
3946; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3947; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3948; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3949; GFX6-NEXT:    s_mov_b32 s6, -1
3950; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3951; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3952; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3953; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3954; GFX6-NEXT:    s_endpgm
3955;
3956; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
3957; GFX7:       ; %bb.0: ; %entry
3958; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3959; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3960; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3961; GFX7-NEXT:    s_add_u32 s0, s0, 16
3962; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3963; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3964; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3965; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3966; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3967; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3968; GFX7-NEXT:    s_endpgm
3969;
3970; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
3971; GFX10-WGP:       ; %bb.0: ; %entry
3972; GFX10-WGP-NEXT:    s_clause 0x1
3973; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3974; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3975; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
3976; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3977; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
3978; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
3979; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3980; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3981; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
3982; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3983; GFX10-WGP-NEXT:    buffer_gl0_inv
3984; GFX10-WGP-NEXT:    s_endpgm
3985;
3986; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
3987; GFX10-CU:       ; %bb.0: ; %entry
3988; GFX10-CU-NEXT:    s_clause 0x1
3989; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3990; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3991; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
3992; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3993; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
3994; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
3995; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
3996; GFX10-CU-NEXT:    s_endpgm
3997;
3998; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
3999; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4000; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4001; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4002; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4003; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4004; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4005; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4006; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4007; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4008; SKIP-CACHE-INV-NEXT:    s_endpgm
4009    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4010entry:
  ; Element 4 of the i32 array at %out, i.e. a 16-byte offset; the checks show
  ; it as an immediate offset of 16 or as an explicit 64-bit add of 16.
4011  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4012  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
4013  ret void
4014}
4015
; Test: volatile `cmpxchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as"), success ordering seq_cst and failure ordering
; monotonic. The cmpxchg result is unused; the expected atomic instructions
; carry no glc bit.
; Per the checks below, the gfx1010 run without +cumode expects vmcnt(0) and
; vscnt 0 waits before the atomic, then a vscnt 0 wait and buffer_gl0_inv
; after it. Every other run expects no extra waits or invalidates around the
; atomic.
4016define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
4017; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
4018; GFX6:       ; %bb.0: ; %entry
4019; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4020; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4021; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4022; GFX6-NEXT:    s_mov_b32 s6, -1
4023; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4024; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4025; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4026; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4027; GFX6-NEXT:    s_endpgm
4028;
4029; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
4030; GFX7:       ; %bb.0: ; %entry
4031; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4032; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4033; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4034; GFX7-NEXT:    s_add_u32 s0, s0, 16
4035; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4036; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4037; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4038; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4039; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4040; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4041; GFX7-NEXT:    s_endpgm
4042;
4043; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
4044; GFX10-WGP:       ; %bb.0: ; %entry
4045; GFX10-WGP-NEXT:    s_clause 0x1
4046; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4047; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4048; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4049; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4050; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4051; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4052; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4053; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4054; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4055; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4056; GFX10-WGP-NEXT:    buffer_gl0_inv
4057; GFX10-WGP-NEXT:    s_endpgm
4058;
4059; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
4060; GFX10-CU:       ; %bb.0: ; %entry
4061; GFX10-CU-NEXT:    s_clause 0x1
4062; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4063; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4064; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4065; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4066; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4067; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4068; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4069; GFX10-CU-NEXT:    s_endpgm
4070;
4071; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
4072; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4073; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4074; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4075; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4076; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4077; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4078; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4079; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4080; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4081; SKIP-CACHE-INV-NEXT:    s_endpgm
4082    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4083entry:
  ; Element 4 of the i32 array at %out, i.e. a 16-byte offset; the checks show
  ; it as an immediate offset of 16 or as an explicit 64-bit add of 16.
4084  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4085  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
4086  ret void
4087}
4088
; Test: volatile `cmpxchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as"), success ordering acquire and failure ordering
; acquire. The cmpxchg result is unused; the expected atomic instructions
; carry no glc bit.
; Per the checks below, the gfx1010 run without +cumode expects a vscnt 0 wait
; followed by buffer_gl0_inv after the atomic. Every other run expects no
; extra waits or invalidates around the atomic.
4089define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
4090; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
4091; GFX6:       ; %bb.0: ; %entry
4092; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4093; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4094; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4095; GFX6-NEXT:    s_mov_b32 s6, -1
4096; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4097; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4098; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4099; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4100; GFX6-NEXT:    s_endpgm
4101;
4102; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
4103; GFX7:       ; %bb.0: ; %entry
4104; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4105; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4106; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4107; GFX7-NEXT:    s_add_u32 s0, s0, 16
4108; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4109; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4110; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4111; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4112; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4113; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4114; GFX7-NEXT:    s_endpgm
4115;
4116; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
4117; GFX10-WGP:       ; %bb.0: ; %entry
4118; GFX10-WGP-NEXT:    s_clause 0x1
4119; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4120; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4121; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4122; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4123; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4124; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4125; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4126; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4127; GFX10-WGP-NEXT:    buffer_gl0_inv
4128; GFX10-WGP-NEXT:    s_endpgm
4129;
4130; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
4131; GFX10-CU:       ; %bb.0: ; %entry
4132; GFX10-CU-NEXT:    s_clause 0x1
4133; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4134; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4135; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4136; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4137; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4138; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4139; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4140; GFX10-CU-NEXT:    s_endpgm
4141;
4142; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
4143; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4144; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4145; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4146; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4147; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4148; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4149; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4150; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4151; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4152; SKIP-CACHE-INV-NEXT:    s_endpgm
4153    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4154entry:
  ; Element 4 of the i32 array at %out, i.e. a 16-byte offset; the checks show
  ; it as an immediate offset of 16 or as an explicit 64-bit add of 16.
4155  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4156  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
4157  ret void
4158}
4159
; Test: volatile `cmpxchg` on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as"), success ordering release and failure ordering
; acquire. The cmpxchg result is unused; the expected atomic instructions
; carry no glc bit.
; Per the checks below, the gfx1010 run without +cumode expects vmcnt(0) and
; vscnt 0 waits before the atomic, then a vscnt 0 wait and buffer_gl0_inv
; after it. Every other run expects no extra waits or invalidates around the
; atomic.
4160define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
4161; GFX6-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
4162; GFX6:       ; %bb.0: ; %entry
4163; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4164; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4165; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4166; GFX6-NEXT:    s_mov_b32 s6, -1
4167; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4168; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4169; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4170; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4171; GFX6-NEXT:    s_endpgm
4172;
4173; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
4174; GFX7:       ; %bb.0: ; %entry
4175; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4176; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4177; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4178; GFX7-NEXT:    s_add_u32 s0, s0, 16
4179; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4180; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4181; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4182; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4183; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4184; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4185; GFX7-NEXT:    s_endpgm
4186;
4187; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
4188; GFX10-WGP:       ; %bb.0: ; %entry
4189; GFX10-WGP-NEXT:    s_clause 0x1
4190; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4191; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4192; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4193; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4194; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4195; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4196; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4197; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4198; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4199; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4200; GFX10-WGP-NEXT:    buffer_gl0_inv
4201; GFX10-WGP-NEXT:    s_endpgm
4202;
4203; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
4204; GFX10-CU:       ; %bb.0: ; %entry
4205; GFX10-CU-NEXT:    s_clause 0x1
4206; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4207; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4208; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4209; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4210; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4211; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4212; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4213; GFX10-CU-NEXT:    s_endpgm
4214;
4215; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
4216; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4217; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4218; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4219; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4220; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4221; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4222; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4223; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4224; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4225; SKIP-CACHE-INV-NEXT:    s_endpgm
4226    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4227entry:
  ; Element 4 of the i32 array at %out, i.e. a 16-byte offset; the checks show
  ; it as an immediate offset of 16 or as an explicit 64-bit add of 16.
4228  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4229  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
4230  ret void
4231}
4232
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), success ordering
; acq_rel, failure ordering acquire. The cmpxchg result is unused and none of
; the expected atomics below carries glc.
; Only the GFX10-WGP checks expect synchronization around the atomic:
; s_waitcnt vmcnt(0) + s_waitcnt_vscnt before it, and s_waitcnt_vscnt +
; buffer_gl0_inv after it. The other four check prefixes expect the bare atomic.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4233define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
4234; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
4235; GFX6:       ; %bb.0: ; %entry
4236; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4237; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4238; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4239; GFX6-NEXT:    s_mov_b32 s6, -1
4240; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4241; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4242; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4243; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4244; GFX6-NEXT:    s_endpgm
4245;
4246; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
4247; GFX7:       ; %bb.0: ; %entry
4248; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4249; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4250; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4251; GFX7-NEXT:    s_add_u32 s0, s0, 16
4252; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4253; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4254; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4255; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4256; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4257; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4258; GFX7-NEXT:    s_endpgm
4259;
4260; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
4261; GFX10-WGP:       ; %bb.0: ; %entry
4262; GFX10-WGP-NEXT:    s_clause 0x1
4263; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4264; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4265; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4266; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4267; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4268; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4269; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4270; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4271; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4272; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4273; GFX10-WGP-NEXT:    buffer_gl0_inv
4274; GFX10-WGP-NEXT:    s_endpgm
4275;
4276; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
4277; GFX10-CU:       ; %bb.0: ; %entry
4278; GFX10-CU-NEXT:    s_clause 0x1
4279; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4280; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4281; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4282; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4283; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4284; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4285; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4286; GFX10-CU-NEXT:    s_endpgm
4287;
4288; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
4289; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4290; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4291; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4292; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4293; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4294; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4295; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4296; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4297; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4298; SKIP-CACHE-INV-NEXT:    s_endpgm
4299    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4300entry:
4301  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4302  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
4303  ret void
4304}
4305
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), success ordering
; seq_cst, failure ordering acquire. The cmpxchg result is unused and none of
; the expected atomics below carries glc.
; Only the GFX10-WGP checks expect synchronization around the atomic:
; s_waitcnt vmcnt(0) + s_waitcnt_vscnt before it, and s_waitcnt_vscnt +
; buffer_gl0_inv after it. The other four check prefixes expect the bare atomic.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4306define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
4307; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
4308; GFX6:       ; %bb.0: ; %entry
4309; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4310; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4311; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4312; GFX6-NEXT:    s_mov_b32 s6, -1
4313; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4314; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4315; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4316; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4317; GFX6-NEXT:    s_endpgm
4318;
4319; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
4320; GFX7:       ; %bb.0: ; %entry
4321; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4322; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4323; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4324; GFX7-NEXT:    s_add_u32 s0, s0, 16
4325; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4326; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4327; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4328; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4329; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4330; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4331; GFX7-NEXT:    s_endpgm
4332;
4333; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
4334; GFX10-WGP:       ; %bb.0: ; %entry
4335; GFX10-WGP-NEXT:    s_clause 0x1
4336; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4337; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4338; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4339; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4340; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4341; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4342; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4343; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4344; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4345; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4346; GFX10-WGP-NEXT:    buffer_gl0_inv
4347; GFX10-WGP-NEXT:    s_endpgm
4348;
4349; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
4350; GFX10-CU:       ; %bb.0: ; %entry
4351; GFX10-CU-NEXT:    s_clause 0x1
4352; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4353; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4354; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4355; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4356; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4357; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4358; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4359; GFX10-CU-NEXT:    s_endpgm
4360;
4361; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
4362; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4363; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4364; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4365; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4366; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4367; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4368; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4369; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4370; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4371; SKIP-CACHE-INV-NEXT:    s_endpgm
4372    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4373entry:
4374  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4375  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
4376  ret void
4377}
4378
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), with seq_cst as
; both the success and the failure ordering. The cmpxchg result is unused and
; none of the expected atomics below carries glc.
; Only the GFX10-WGP checks expect synchronization around the atomic:
; s_waitcnt vmcnt(0) + s_waitcnt_vscnt before it, and s_waitcnt_vscnt +
; buffer_gl0_inv after it. The other four check prefixes expect the bare atomic.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4379define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
4380; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
4381; GFX6:       ; %bb.0: ; %entry
4382; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4383; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4384; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4385; GFX6-NEXT:    s_mov_b32 s6, -1
4386; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4387; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4388; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4389; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4390; GFX6-NEXT:    s_endpgm
4391;
4392; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
4393; GFX7:       ; %bb.0: ; %entry
4394; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4395; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4396; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4397; GFX7-NEXT:    s_add_u32 s0, s0, 16
4398; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4399; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4400; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4401; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4402; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4403; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4404; GFX7-NEXT:    s_endpgm
4405;
4406; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
4407; GFX10-WGP:       ; %bb.0: ; %entry
4408; GFX10-WGP-NEXT:    s_clause 0x1
4409; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4410; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4411; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4412; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4413; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4414; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4415; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4416; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4417; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4418; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4419; GFX10-WGP-NEXT:    buffer_gl0_inv
4420; GFX10-WGP-NEXT:    s_endpgm
4421;
4422; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
4423; GFX10-CU:       ; %bb.0: ; %entry
4424; GFX10-CU-NEXT:    s_clause 0x1
4425; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4426; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4427; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4428; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4429; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4430; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4431; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4432; GFX10-CU-NEXT:    s_endpgm
4433;
4434; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
4435; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4436; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4437; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4438; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4439; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4440; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4441; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4442; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4443; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4444; SKIP-CACHE-INV-NEXT:    s_endpgm
4445    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4446entry:
4447  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4448  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
4449  ret void
4450}
4451
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), success ordering
; acquire, failure ordering monotonic. Element 0 of the cmpxchg result (the
; value loaded from memory) is stored to %out, so every expected atomic below
; carries glc and an s_waitcnt vmcnt(0) precedes the store.
; Only the GFX10-WGP checks expect a buffer_gl0_inv after that wait. No check
; prefix expects additional waits in front of the atomic for this ordering.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4452define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
4453; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
4454; GFX6:       ; %bb.0: ; %entry
4455; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4456; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4457; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4458; GFX6-NEXT:    s_mov_b32 s6, -1
4459; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4460; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4461; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4462; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4463; GFX6-NEXT:    s_waitcnt vmcnt(0)
4464; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4465; GFX6-NEXT:    s_endpgm
4466;
4467; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
4468; GFX7:       ; %bb.0: ; %entry
4469; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4470; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4471; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4472; GFX7-NEXT:    s_add_u32 s4, s0, 16
4473; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4474; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4475; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4476; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4477; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4478; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4479; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4480; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4481; GFX7-NEXT:    s_waitcnt vmcnt(0)
4482; GFX7-NEXT:    flat_store_dword v[0:1], v2
4483; GFX7-NEXT:    s_endpgm
4484;
4485; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
4486; GFX10-WGP:       ; %bb.0: ; %entry
4487; GFX10-WGP-NEXT:    s_clause 0x1
4488; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4489; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4490; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4491; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4492; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4493; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4494; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4495; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4496; GFX10-WGP-NEXT:    buffer_gl0_inv
4497; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
4498; GFX10-WGP-NEXT:    s_endpgm
4499;
4500; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
4501; GFX10-CU:       ; %bb.0: ; %entry
4502; GFX10-CU-NEXT:    s_clause 0x1
4503; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4504; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4505; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4506; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4507; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4508; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4509; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4510; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4511; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
4512; GFX10-CU-NEXT:    s_endpgm
4513;
4514; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
4515; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4516; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4517; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4518; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4519; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4520; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4521; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4522; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4523; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4524; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4525; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4526; SKIP-CACHE-INV-NEXT:    s_endpgm
4527    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4528entry:
4529  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4530  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
4531  %val0 = extractvalue { i32, i1 } %val, 0
4532  store i32 %val0, i32 addrspace(1)* %out, align 4
4533  ret void
4534}
4535
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), success ordering
; acq_rel, failure ordering monotonic. Element 0 of the cmpxchg result (the
; value loaded from memory) is stored to %out, so every expected atomic below
; carries glc and an s_waitcnt vmcnt(0) precedes the store.
; Only the GFX10-WGP checks expect extra synchronization: s_waitcnt vmcnt(0) +
; s_waitcnt_vscnt in front of the atomic, and buffer_gl0_inv after the
; post-atomic wait. The other four check prefixes expect neither.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4536define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
4537; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
4538; GFX6:       ; %bb.0: ; %entry
4539; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4540; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4541; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4542; GFX6-NEXT:    s_mov_b32 s6, -1
4543; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4544; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4545; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4546; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4547; GFX6-NEXT:    s_waitcnt vmcnt(0)
4548; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4549; GFX6-NEXT:    s_endpgm
4550;
4551; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
4552; GFX7:       ; %bb.0: ; %entry
4553; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4554; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4555; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4556; GFX7-NEXT:    s_add_u32 s4, s0, 16
4557; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4558; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4559; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4560; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4561; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4562; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4563; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4564; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4565; GFX7-NEXT:    s_waitcnt vmcnt(0)
4566; GFX7-NEXT:    flat_store_dword v[0:1], v2
4567; GFX7-NEXT:    s_endpgm
4568;
4569; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
4570; GFX10-WGP:       ; %bb.0: ; %entry
4571; GFX10-WGP-NEXT:    s_clause 0x1
4572; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4573; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4574; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4575; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4576; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4577; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4578; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4579; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4580; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4581; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4582; GFX10-WGP-NEXT:    buffer_gl0_inv
4583; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
4584; GFX10-WGP-NEXT:    s_endpgm
4585;
4586; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
4587; GFX10-CU:       ; %bb.0: ; %entry
4588; GFX10-CU-NEXT:    s_clause 0x1
4589; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4590; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4591; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4592; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4593; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4594; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4595; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4596; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4597; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
4598; GFX10-CU-NEXT:    s_endpgm
4599;
4600; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
4601; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4602; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4603; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4604; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4605; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4606; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4607; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4608; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4609; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4610; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4611; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4612; SKIP-CACHE-INV-NEXT:    s_endpgm
4613    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4614entry:
4615  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4616  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
4617  %val0 = extractvalue { i32, i1 } %val, 0
4618  store i32 %val0, i32 addrspace(1)* %out, align 4
4619  ret void
4620}
4621
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), success ordering
; seq_cst, failure ordering monotonic. Element 0 of the cmpxchg result (the
; value loaded from memory) is stored to %out, so every expected atomic below
; carries glc and an s_waitcnt vmcnt(0) precedes the store.
; Only the GFX10-WGP checks expect extra synchronization: s_waitcnt vmcnt(0) +
; s_waitcnt_vscnt in front of the atomic, and buffer_gl0_inv after the
; post-atomic wait. The other four check prefixes expect neither.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4622define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
4623; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
4624; GFX6:       ; %bb.0: ; %entry
4625; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4626; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4627; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4628; GFX6-NEXT:    s_mov_b32 s6, -1
4629; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4630; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4631; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4632; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4633; GFX6-NEXT:    s_waitcnt vmcnt(0)
4634; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4635; GFX6-NEXT:    s_endpgm
4636;
4637; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
4638; GFX7:       ; %bb.0: ; %entry
4639; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4640; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4641; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4642; GFX7-NEXT:    s_add_u32 s4, s0, 16
4643; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4644; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4645; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4646; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4647; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4648; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4649; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4650; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4651; GFX7-NEXT:    s_waitcnt vmcnt(0)
4652; GFX7-NEXT:    flat_store_dword v[0:1], v2
4653; GFX7-NEXT:    s_endpgm
4654;
4655; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
4656; GFX10-WGP:       ; %bb.0: ; %entry
4657; GFX10-WGP-NEXT:    s_clause 0x1
4658; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4659; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4660; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4661; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4662; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4663; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4664; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4665; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4666; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4667; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4668; GFX10-WGP-NEXT:    buffer_gl0_inv
4669; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
4670; GFX10-WGP-NEXT:    s_endpgm
4671;
4672; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
4673; GFX10-CU:       ; %bb.0: ; %entry
4674; GFX10-CU-NEXT:    s_clause 0x1
4675; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4676; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4677; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4678; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4679; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4680; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4681; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4682; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4683; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
4684; GFX10-CU-NEXT:    s_endpgm
4685;
4686; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
4687; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4688; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4689; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4690; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4691; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4692; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4693; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4694; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4695; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4696; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4697; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4698; SKIP-CACHE-INV-NEXT:    s_endpgm
4699    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4700entry:
4701  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4702  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
4703  %val0 = extractvalue { i32, i1 } %val, 0
4704  store i32 %val0, i32 addrspace(1)* %out, align 4
4705  ret void
4706}
4707
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), with acquire as
; both the success and the failure ordering. Element 0 of the cmpxchg result
; (the value loaded from memory) is stored to %out, so every expected atomic
; below carries glc and an s_waitcnt vmcnt(0) precedes the store.
; Only the GFX10-WGP checks expect a buffer_gl0_inv after that wait. No check
; prefix expects additional waits in front of the atomic for this ordering.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4708define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
4709; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
4710; GFX6:       ; %bb.0: ; %entry
4711; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4712; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4713; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4714; GFX6-NEXT:    s_mov_b32 s6, -1
4715; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4716; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4717; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4718; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4719; GFX6-NEXT:    s_waitcnt vmcnt(0)
4720; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4721; GFX6-NEXT:    s_endpgm
4722;
4723; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
4724; GFX7:       ; %bb.0: ; %entry
4725; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4726; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4727; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4728; GFX7-NEXT:    s_add_u32 s4, s0, 16
4729; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4730; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4731; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4732; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4733; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4734; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4735; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4736; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4737; GFX7-NEXT:    s_waitcnt vmcnt(0)
4738; GFX7-NEXT:    flat_store_dword v[0:1], v2
4739; GFX7-NEXT:    s_endpgm
4740;
4741; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
4742; GFX10-WGP:       ; %bb.0: ; %entry
4743; GFX10-WGP-NEXT:    s_clause 0x1
4744; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4745; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4746; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4747; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4748; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4749; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4750; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4751; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4752; GFX10-WGP-NEXT:    buffer_gl0_inv
4753; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
4754; GFX10-WGP-NEXT:    s_endpgm
4755;
4756; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
4757; GFX10-CU:       ; %bb.0: ; %entry
4758; GFX10-CU-NEXT:    s_clause 0x1
4759; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4760; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4761; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4762; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4763; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4764; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4765; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4766; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4767; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
4768; GFX10-CU-NEXT:    s_endpgm
4769;
4770; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
4771; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4772; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4773; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4774; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4775; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4776; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4777; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4778; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4779; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4780; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4781; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4782; SKIP-CACHE-INV-NEXT:    s_endpgm
4783    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4784entry:
4785  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4786  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
4787  %val0 = extractvalue { i32, i1 } %val, 0
4788  store i32 %val0, i32 addrspace(1)* %out, align 4
4789  ret void
4790}
4791
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), success ordering
; release, failure ordering acquire. Element 0 of the cmpxchg result (the
; value loaded from memory) is stored to %out, so every expected atomic below
; carries glc and an s_waitcnt vmcnt(0) precedes the store.
; Only the GFX10-WGP checks expect extra synchronization: s_waitcnt vmcnt(0) +
; s_waitcnt_vscnt in front of the atomic, and buffer_gl0_inv after the
; post-atomic wait. The other four check prefixes expect neither.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4792define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
4793; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
4794; GFX6:       ; %bb.0: ; %entry
4795; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4796; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4797; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4798; GFX6-NEXT:    s_mov_b32 s6, -1
4799; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4800; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4801; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4802; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4803; GFX6-NEXT:    s_waitcnt vmcnt(0)
4804; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4805; GFX6-NEXT:    s_endpgm
4806;
4807; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
4808; GFX7:       ; %bb.0: ; %entry
4809; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4810; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4811; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4812; GFX7-NEXT:    s_add_u32 s4, s0, 16
4813; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4814; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4815; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4816; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4817; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4818; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4819; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4820; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4821; GFX7-NEXT:    s_waitcnt vmcnt(0)
4822; GFX7-NEXT:    flat_store_dword v[0:1], v2
4823; GFX7-NEXT:    s_endpgm
4824;
4825; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
4826; GFX10-WGP:       ; %bb.0: ; %entry
4827; GFX10-WGP-NEXT:    s_clause 0x1
4828; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4829; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4830; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4831; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4832; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4833; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4834; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4835; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4836; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4837; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4838; GFX10-WGP-NEXT:    buffer_gl0_inv
4839; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
4840; GFX10-WGP-NEXT:    s_endpgm
4841;
4842; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
4843; GFX10-CU:       ; %bb.0: ; %entry
4844; GFX10-CU-NEXT:    s_clause 0x1
4845; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4846; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4847; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4848; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4849; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4850; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4851; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4852; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4853; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
4854; GFX10-CU-NEXT:    s_endpgm
4855;
4856; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
4857; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4858; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4859; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4860; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4861; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4862; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4863; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4864; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4865; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4866; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4867; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4868; SKIP-CACHE-INV-NEXT:    s_endpgm
4869    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4870entry:
4871  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4872  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
4873  %val0 = extractvalue { i32, i1 } %val, 0
4874  store i32 %val0, i32 addrspace(1)* %out, align 4
4875  ret void
4876}
4877
; Test: volatile cmpxchg on a global (addrspace(1)) i32 located at %out + 16
; bytes (i32 element index 4), syncscope("workgroup-one-as"), success ordering
; acq_rel, failure ordering acquire. Element 0 of the cmpxchg result (the
; value loaded from memory) is stored to %out, so every expected atomic below
; carries glc and an s_waitcnt vmcnt(0) precedes the store.
; Only the GFX10-WGP checks expect extra synchronization: s_waitcnt vmcnt(0) +
; s_waitcnt_vscnt in front of the atomic, and buffer_gl0_inv after the
; post-atomic wait. The other four check prefixes expect neither.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4878define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
4879; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
4880; GFX6:       ; %bb.0: ; %entry
4881; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4882; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4883; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4884; GFX6-NEXT:    s_mov_b32 s6, -1
4885; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4886; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4887; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4888; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4889; GFX6-NEXT:    s_waitcnt vmcnt(0)
4890; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4891; GFX6-NEXT:    s_endpgm
4892;
4893; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
4894; GFX7:       ; %bb.0: ; %entry
4895; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4896; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4897; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4898; GFX7-NEXT:    s_add_u32 s4, s0, 16
4899; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4900; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4901; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4902; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4903; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4904; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4905; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4906; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4907; GFX7-NEXT:    s_waitcnt vmcnt(0)
4908; GFX7-NEXT:    flat_store_dword v[0:1], v2
4909; GFX7-NEXT:    s_endpgm
4910;
4911; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
4912; GFX10-WGP:       ; %bb.0: ; %entry
4913; GFX10-WGP-NEXT:    s_clause 0x1
4914; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4915; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4916; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4917; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4918; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4919; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4920; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4921; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4922; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4923; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4924; GFX10-WGP-NEXT:    buffer_gl0_inv
4925; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
4926; GFX10-WGP-NEXT:    s_endpgm
4927;
4928; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
4929; GFX10-CU:       ; %bb.0: ; %entry
4930; GFX10-CU-NEXT:    s_clause 0x1
4931; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4932; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4933; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4934; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4935; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4936; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4937; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4938; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4939; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
4940; GFX10-CU-NEXT:    s_endpgm
4941;
4942; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
4943; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4944; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4945; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4946; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4947; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4948; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4949; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4950; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4951; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4952; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4953; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4954; SKIP-CACHE-INV-NEXT:    s_endpgm
4955    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4956entry:
4957  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4958  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
4959  %val0 = extractvalue { i32, i1 } %val, 0
4960  store i32 %val0, i32 addrspace(1)* %out, align 4
4961  ret void
4962}
4963
; Test case for a value-returning cmpxchg on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as"), success ordering seq_cst and failure ordering
; acquire. The value read by the cmpxchg is stored back to %out.
;
; The check lines inside this function are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of the file);
; regenerate them with that script rather than editing them by hand.
;
; What the checks currently pin down
;  - every run emits the atomic with glc and has an s_waitcnt vmcnt(0) between
;    the atomic and the store of its result;
;  - only the GFX10-WGP run additionally expects s_waitcnt vmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 before the atomic and buffer_gl0_inv after it;
;    the GFX10-CU run (+cumode) expects none of these;
;  - the SKIP-CACHE-INV run expects the same instructions as the GFX6 run.
4964define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
4965; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
4966; GFX6:       ; %bb.0: ; %entry
4967; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4968; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4969; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4970; GFX6-NEXT:    s_mov_b32 s6, -1
4971; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4972; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4973; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4974; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4975; GFX6-NEXT:    s_waitcnt vmcnt(0)
4976; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4977; GFX6-NEXT:    s_endpgm
4978;
4979; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
4980; GFX7:       ; %bb.0: ; %entry
4981; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4982; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4983; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4984; GFX7-NEXT:    s_add_u32 s4, s0, 16
4985; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4986; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4987; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4988; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4989; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4990; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4991; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4992; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4993; GFX7-NEXT:    s_waitcnt vmcnt(0)
4994; GFX7-NEXT:    flat_store_dword v[0:1], v2
4995; GFX7-NEXT:    s_endpgm
4996;
4997; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
4998; GFX10-WGP:       ; %bb.0: ; %entry
4999; GFX10-WGP-NEXT:    s_clause 0x1
5000; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5001; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5002; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5003; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5004; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5005; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5006; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5007; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5008; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5009; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5010; GFX10-WGP-NEXT:    buffer_gl0_inv
5011; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
5012; GFX10-WGP-NEXT:    s_endpgm
5013;
5014; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5015; GFX10-CU:       ; %bb.0: ; %entry
5016; GFX10-CU-NEXT:    s_clause 0x1
5017; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5018; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5019; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5020; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5021; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5022; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5023; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5024; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5025; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
5026; GFX10-CU-NEXT:    s_endpgm
5027;
5028; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
5029; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5030; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5031; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5032; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5033; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5034; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5035; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5036; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5037; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5038; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5039; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5040; SKIP-CACHE-INV-NEXT:    s_endpgm
5041    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5042entry:
  ; Element 4 of an i32 array, i.e. byte offset 16 from %out; this is the
  ; "offset 16" / "s_add_u32 ..., 16" that shows up in the expected output above.
5043  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, write %in on a match.
5044  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
  ; Field 0 of the cmpxchg result pair is the i32 read from memory; field 1
  ; (the i1 success flag) is not used by this test.
5045  %val0 = extractvalue { i32, i1 } %val, 0
5046  store i32 %val0, i32 addrspace(1)* %out, align 4
5047  ret void
5048}
5049
; Test case for a value-returning cmpxchg on a global (addrspace(1)) i32 with
; syncscope("workgroup-one-as") and seq_cst for both the success and the
; failure ordering. The value read by the cmpxchg is stored back to %out.
;
; The check lines inside this function are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE on the first line of the file);
; regenerate them with that script rather than editing them by hand.
;
; What the checks currently pin down
;  - the expected instruction sequences for all five runs are identical to
;    those of @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg;
;  - every run emits the atomic with glc and has an s_waitcnt vmcnt(0) between
;    the atomic and the store of its result;
;  - only the GFX10-WGP run additionally expects s_waitcnt vmcnt(0) and
;    s_waitcnt_vscnt null, 0x0 before the atomic and buffer_gl0_inv after it;
;    the GFX10-CU run (+cumode) expects none of these;
;  - the SKIP-CACHE-INV run expects the same instructions as the GFX6 run.
5050define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
5051; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5052; GFX6:       ; %bb.0: ; %entry
5053; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5054; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5055; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5056; GFX6-NEXT:    s_mov_b32 s6, -1
5057; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5058; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5059; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5060; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5061; GFX6-NEXT:    s_waitcnt vmcnt(0)
5062; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5063; GFX6-NEXT:    s_endpgm
5064;
5065; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5066; GFX7:       ; %bb.0: ; %entry
5067; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5068; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5069; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5070; GFX7-NEXT:    s_add_u32 s4, s0, 16
5071; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5072; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5073; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5074; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5075; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5076; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5077; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5078; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5079; GFX7-NEXT:    s_waitcnt vmcnt(0)
5080; GFX7-NEXT:    flat_store_dword v[0:1], v2
5081; GFX7-NEXT:    s_endpgm
5082;
5083; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5084; GFX10-WGP:       ; %bb.0: ; %entry
5085; GFX10-WGP-NEXT:    s_clause 0x1
5086; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5087; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5088; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5089; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5090; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5091; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5092; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5093; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5094; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5095; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5096; GFX10-WGP-NEXT:    buffer_gl0_inv
5097; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
5098; GFX10-WGP-NEXT:    s_endpgm
5099;
5100; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5101; GFX10-CU:       ; %bb.0: ; %entry
5102; GFX10-CU-NEXT:    s_clause 0x1
5103; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5104; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5105; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5106; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5107; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5108; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5109; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5110; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5111; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
5112; GFX10-CU-NEXT:    s_endpgm
5113;
5114; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
5115; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5116; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5117; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5118; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5119; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5120; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5121; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5122; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5123; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5124; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5125; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5126; SKIP-CACHE-INV-NEXT:    s_endpgm
5127    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5128entry:
  ; Element 4 of an i32 array, i.e. byte offset 16 from %out; this is the
  ; "offset 16" / "s_add_u32 ..., 16" that shows up in the expected output above.
5129  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; Compare against %old, write %in on a match.
5130  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
  ; Field 0 of the cmpxchg result pair is the i32 read from memory; field 1
  ; (the i1 success flag) is not used by this test.
5131  %val0 = extractvalue { i32, i1 } %val, 0
5132  store i32 %val0, i32 addrspace(1)* %out, align 4
5133  ret void
5134}
5135
5136