1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7
; Atomic i32 load from the addrspace(1) pointer %in with `unordered` ordering
; and no syncscope, followed by a plain store of the loaded value to %out.
; Every run configuration expects an ordinary load: no glc/dlc modifier on the
; load and no cache-invalidate instruction. The only vector-memory wait is the
; s_waitcnt vmcnt(0) placed before the store.
8define amdgpu_kernel void @global_system_unordered_load(
9; GFX6-LABEL: global_system_unordered_load:
10; GFX6:       ; %bb.0: ; %entry
11; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
12; GFX6-NEXT:    s_mov_b32 s3, 0xf000
13; GFX6-NEXT:    s_mov_b32 s2, -1
14; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX6-NEXT:    s_mov_b32 s0, s4
16; GFX6-NEXT:    s_mov_b32 s1, s5
17; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
18; GFX6-NEXT:    s_mov_b32 s4, s6
19; GFX6-NEXT:    s_mov_b32 s5, s7
20; GFX6-NEXT:    s_mov_b32 s6, s2
21; GFX6-NEXT:    s_mov_b32 s7, s3
22; GFX6-NEXT:    s_waitcnt vmcnt(0)
23; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
24; GFX6-NEXT:    s_endpgm
25;
26; GFX7-LABEL: global_system_unordered_load:
27; GFX7:       ; %bb.0: ; %entry
28; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
29; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
30; GFX7-NEXT:    v_mov_b32_e32 v0, s0
31; GFX7-NEXT:    v_mov_b32_e32 v1, s1
32; GFX7-NEXT:    flat_load_dword v0, v[0:1]
33; GFX7-NEXT:    v_mov_b32_e32 v2, s2
34; GFX7-NEXT:    v_mov_b32_e32 v3, s3
35; GFX7-NEXT:    s_waitcnt vmcnt(0)
36; GFX7-NEXT:    flat_store_dword v[2:3], v0
37; GFX7-NEXT:    s_endpgm
38;
39; GFX10-WGP-LABEL: global_system_unordered_load:
40; GFX10-WGP:       ; %bb.0: ; %entry
41; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
42; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
43; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
44; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
45; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
46; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
47; GFX10-WGP-NEXT:    s_endpgm
48;
49; GFX10-CU-LABEL: global_system_unordered_load:
50; GFX10-CU:       ; %bb.0: ; %entry
51; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
52; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
53; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
55; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
56; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
57; GFX10-CU-NEXT:    s_endpgm
58;
59; SKIP-CACHE-INV-LABEL: global_system_unordered_load:
60; SKIP-CACHE-INV:       ; %bb.0: ; %entry
61; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
62; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
63; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
64; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
65; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
66; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
67; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
68; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
69; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
70; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
71; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
72; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
73; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
74; SKIP-CACHE-INV-NEXT:    s_endpgm
75    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
76entry:
77  %val = load atomic i32, i32 addrspace(1)* %in unordered, align 4
78  store i32 %val, i32 addrspace(1)* %out
79  ret void
80}
81
; Atomic i32 load from the addrspace(1) pointer %in with `monotonic` ordering
; and no syncscope, followed by a plain store of the loaded value to %out.
; The expected load carries the glc modifier on the gfx600, gfx700 and
; -amdgcn-skip-cache-invalidations runs, and glc dlc on both gfx1010 runs.
; No cache-invalidate instruction is expected in any configuration.
82define amdgpu_kernel void @global_system_monotonic_load(
83; GFX6-LABEL: global_system_monotonic_load:
84; GFX6:       ; %bb.0: ; %entry
85; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
86; GFX6-NEXT:    s_mov_b32 s3, 0xf000
87; GFX6-NEXT:    s_mov_b32 s2, -1
88; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX6-NEXT:    s_mov_b32 s0, s4
90; GFX6-NEXT:    s_mov_b32 s1, s5
91; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
92; GFX6-NEXT:    s_mov_b32 s4, s6
93; GFX6-NEXT:    s_mov_b32 s5, s7
94; GFX6-NEXT:    s_mov_b32 s6, s2
95; GFX6-NEXT:    s_mov_b32 s7, s3
96; GFX6-NEXT:    s_waitcnt vmcnt(0)
97; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
98; GFX6-NEXT:    s_endpgm
99;
100; GFX7-LABEL: global_system_monotonic_load:
101; GFX7:       ; %bb.0: ; %entry
102; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
103; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX7-NEXT:    v_mov_b32_e32 v0, s0
105; GFX7-NEXT:    v_mov_b32_e32 v1, s1
106; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
107; GFX7-NEXT:    v_mov_b32_e32 v2, s2
108; GFX7-NEXT:    v_mov_b32_e32 v3, s3
109; GFX7-NEXT:    s_waitcnt vmcnt(0)
110; GFX7-NEXT:    flat_store_dword v[2:3], v0
111; GFX7-NEXT:    s_endpgm
112;
113; GFX10-WGP-LABEL: global_system_monotonic_load:
114; GFX10-WGP:       ; %bb.0: ; %entry
115; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
116; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
117; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
119; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
120; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
121; GFX10-WGP-NEXT:    s_endpgm
122;
123; GFX10-CU-LABEL: global_system_monotonic_load:
124; GFX10-CU:       ; %bb.0: ; %entry
125; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
126; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
127; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
129; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
130; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
131; GFX10-CU-NEXT:    s_endpgm
132;
133; SKIP-CACHE-INV-LABEL: global_system_monotonic_load:
134; SKIP-CACHE-INV:       ; %bb.0: ; %entry
135; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
136; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
137; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
138; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
139; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
140; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
141; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
142; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
143; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
144; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
145; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
146; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
147; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
148; SKIP-CACHE-INV-NEXT:    s_endpgm
149    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
150entry:
151  %val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4
152  store i32 %val, i32 addrspace(1)* %out
153  ret void
154}
155
; Atomic i32 load from the addrspace(1) pointer %in with `acquire` ordering
; and no syncscope, followed by a plain store of the loaded value to %out.
; Expected sequence after the glc load (glc dlc on gfx1010) is
; s_waitcnt vmcnt(0) immediately followed by a cache invalidate. That is
; buffer_wbinvl1 on gfx600, buffer_wbinvl1_vol on gfx700, and
; buffer_gl0_inv + buffer_gl1_inv on both gfx1010 runs.
; The -amdgcn-skip-cache-invalidations run keeps the wait but expects no
; invalidate instruction.
156define amdgpu_kernel void @global_system_acquire_load(
157; GFX6-LABEL: global_system_acquire_load:
158; GFX6:       ; %bb.0: ; %entry
159; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
160; GFX6-NEXT:    s_mov_b32 s3, 0xf000
161; GFX6-NEXT:    s_mov_b32 s2, -1
162; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX6-NEXT:    s_mov_b32 s0, s4
164; GFX6-NEXT:    s_mov_b32 s1, s5
165; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
166; GFX6-NEXT:    s_waitcnt vmcnt(0)
167; GFX6-NEXT:    buffer_wbinvl1
168; GFX6-NEXT:    s_mov_b32 s4, s6
169; GFX6-NEXT:    s_mov_b32 s5, s7
170; GFX6-NEXT:    s_mov_b32 s6, s2
171; GFX6-NEXT:    s_mov_b32 s7, s3
172; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
173; GFX6-NEXT:    s_endpgm
174;
175; GFX7-LABEL: global_system_acquire_load:
176; GFX7:       ; %bb.0: ; %entry
177; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
178; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX7-NEXT:    v_mov_b32_e32 v0, s0
180; GFX7-NEXT:    v_mov_b32_e32 v1, s1
181; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
182; GFX7-NEXT:    s_waitcnt vmcnt(0)
183; GFX7-NEXT:    buffer_wbinvl1_vol
184; GFX7-NEXT:    v_mov_b32_e32 v2, s2
185; GFX7-NEXT:    v_mov_b32_e32 v3, s3
186; GFX7-NEXT:    flat_store_dword v[2:3], v0
187; GFX7-NEXT:    s_endpgm
188;
189; GFX10-WGP-LABEL: global_system_acquire_load:
190; GFX10-WGP:       ; %bb.0: ; %entry
191; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
192; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
193; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
195; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
196; GFX10-WGP-NEXT:    buffer_gl0_inv
197; GFX10-WGP-NEXT:    buffer_gl1_inv
198; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
199; GFX10-WGP-NEXT:    s_endpgm
200;
201; GFX10-CU-LABEL: global_system_acquire_load:
202; GFX10-CU:       ; %bb.0: ; %entry
203; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
204; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
205; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
207; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
208; GFX10-CU-NEXT:    buffer_gl0_inv
209; GFX10-CU-NEXT:    buffer_gl1_inv
210; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
211; GFX10-CU-NEXT:    s_endpgm
212;
213; SKIP-CACHE-INV-LABEL: global_system_acquire_load:
214; SKIP-CACHE-INV:       ; %bb.0: ; %entry
215; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
216; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
217; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
218; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
219; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
220; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
221; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
222; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
223; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
224; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
225; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
226; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
227; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
228; SKIP-CACHE-INV-NEXT:    s_endpgm
229    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
230entry:
231  %val = load atomic i32, i32 addrspace(1)* %in acquire, align 4
232  store i32 %val, i32 addrspace(1)* %out
233  ret void
234}
235
; Atomic i32 load from the addrspace(1) pointer %in with `seq_cst` ordering
; and no syncscope, followed by a plain store of the loaded value to %out.
; Before the glc load (glc dlc on gfx1010) the checks expect
; s_waitcnt vmcnt(0) lgkmcnt(0); both gfx1010 runs additionally expect
; s_waitcnt_vscnt null, 0x0. After the load the checks expect
; s_waitcnt vmcnt(0) followed by a cache invalidate. That is buffer_wbinvl1 on
; gfx600, buffer_wbinvl1_vol on gfx700, and buffer_gl0_inv + buffer_gl1_inv
; on gfx1010. The -amdgcn-skip-cache-invalidations run keeps both waits but
; expects no invalidate instruction.
236define amdgpu_kernel void @global_system_seq_cst_load(
237; GFX6-LABEL: global_system_seq_cst_load:
238; GFX6:       ; %bb.0: ; %entry
239; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
240; GFX6-NEXT:    s_mov_b32 s3, 0xf000
241; GFX6-NEXT:    s_mov_b32 s2, -1
242; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
243; GFX6-NEXT:    s_mov_b32 s0, s4
244; GFX6-NEXT:    s_mov_b32 s1, s5
245; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
246; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
247; GFX6-NEXT:    s_waitcnt vmcnt(0)
248; GFX6-NEXT:    buffer_wbinvl1
249; GFX6-NEXT:    s_mov_b32 s4, s6
250; GFX6-NEXT:    s_mov_b32 s5, s7
251; GFX6-NEXT:    s_mov_b32 s6, s2
252; GFX6-NEXT:    s_mov_b32 s7, s3
253; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
254; GFX6-NEXT:    s_endpgm
255;
256; GFX7-LABEL: global_system_seq_cst_load:
257; GFX7:       ; %bb.0: ; %entry
258; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
259; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX7-NEXT:    v_mov_b32_e32 v0, s0
261; GFX7-NEXT:    v_mov_b32_e32 v1, s1
262; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
263; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
264; GFX7-NEXT:    s_waitcnt vmcnt(0)
265; GFX7-NEXT:    buffer_wbinvl1_vol
266; GFX7-NEXT:    v_mov_b32_e32 v2, s2
267; GFX7-NEXT:    v_mov_b32_e32 v3, s3
268; GFX7-NEXT:    flat_store_dword v[2:3], v0
269; GFX7-NEXT:    s_endpgm
270;
271; GFX10-WGP-LABEL: global_system_seq_cst_load:
272; GFX10-WGP:       ; %bb.0: ; %entry
273; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
274; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
275; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
276; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
277; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
278; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
279; GFX10-WGP-NEXT:    buffer_gl0_inv
280; GFX10-WGP-NEXT:    buffer_gl1_inv
281; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
282; GFX10-WGP-NEXT:    s_endpgm
283;
284; GFX10-CU-LABEL: global_system_seq_cst_load:
285; GFX10-CU:       ; %bb.0: ; %entry
286; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
287; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
288; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
289; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
291; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
292; GFX10-CU-NEXT:    buffer_gl0_inv
293; GFX10-CU-NEXT:    buffer_gl1_inv
294; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
295; GFX10-CU-NEXT:    s_endpgm
296;
297; SKIP-CACHE-INV-LABEL: global_system_seq_cst_load:
298; SKIP-CACHE-INV:       ; %bb.0: ; %entry
299; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
300; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
301; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
302; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
303; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
304; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
305; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
306; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
307; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
308; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
309; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
310; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
311; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
312; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
313; SKIP-CACHE-INV-NEXT:    s_endpgm
314    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
315entry:
316  %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
317  store i32 %val, i32 addrspace(1)* %out
318  ret void
319}
320
; Atomic store of the i32 argument %in to the addrspace(1) pointer %out with
; `unordered` ordering and no syncscope.
; Every run configuration expects an ordinary store. The only wait in the
; expected output is the s_waitcnt lgkmcnt(0) that follows the scalar loads,
; and no cache-control instruction appears.
321define amdgpu_kernel void @global_system_unordered_store(
322; GFX6-LABEL: global_system_unordered_store:
323; GFX6:       ; %bb.0: ; %entry
324; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
325; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
326; GFX6-NEXT:    s_mov_b32 s3, 0xf000
327; GFX6-NEXT:    s_mov_b32 s2, -1
328; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX6-NEXT:    v_mov_b32_e32 v0, s4
330; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
331; GFX6-NEXT:    s_endpgm
332;
333; GFX7-LABEL: global_system_unordered_store:
334; GFX7:       ; %bb.0: ; %entry
335; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
336; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
337; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX7-NEXT:    v_mov_b32_e32 v2, s2
339; GFX7-NEXT:    v_mov_b32_e32 v0, s0
340; GFX7-NEXT:    v_mov_b32_e32 v1, s1
341; GFX7-NEXT:    flat_store_dword v[0:1], v2
342; GFX7-NEXT:    s_endpgm
343;
344; GFX10-WGP-LABEL: global_system_unordered_store:
345; GFX10-WGP:       ; %bb.0: ; %entry
346; GFX10-WGP-NEXT:    s_clause 0x1
347; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
348; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
349; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
350; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
352; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
353; GFX10-WGP-NEXT:    s_endpgm
354;
355; GFX10-CU-LABEL: global_system_unordered_store:
356; GFX10-CU:       ; %bb.0: ; %entry
357; GFX10-CU-NEXT:    s_clause 0x1
358; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
359; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
360; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
361; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
363; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
364; GFX10-CU-NEXT:    s_endpgm
365;
366; SKIP-CACHE-INV-LABEL: global_system_unordered_store:
367; SKIP-CACHE-INV:       ; %bb.0: ; %entry
368; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
369; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
370; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
371; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
372; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
373; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
374; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
375; SKIP-CACHE-INV-NEXT:    s_endpgm
376    i32 %in, i32 addrspace(1)* %out) {
377entry:
378  store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4
379  ret void
380}
381
; Atomic store of the i32 argument %in to the addrspace(1) pointer %out with
; `monotonic` ordering and no syncscope.
; The expected instruction sequences contain a single ordinary store with no
; extra s_waitcnt ahead of it and no cache-control instruction, in every run
; configuration.
382define amdgpu_kernel void @global_system_monotonic_store(
383; GFX6-LABEL: global_system_monotonic_store:
384; GFX6:       ; %bb.0: ; %entry
385; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
386; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
387; GFX6-NEXT:    s_mov_b32 s3, 0xf000
388; GFX6-NEXT:    s_mov_b32 s2, -1
389; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
390; GFX6-NEXT:    v_mov_b32_e32 v0, s4
391; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
392; GFX6-NEXT:    s_endpgm
393;
394; GFX7-LABEL: global_system_monotonic_store:
395; GFX7:       ; %bb.0: ; %entry
396; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
397; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
398; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX7-NEXT:    v_mov_b32_e32 v2, s2
400; GFX7-NEXT:    v_mov_b32_e32 v0, s0
401; GFX7-NEXT:    v_mov_b32_e32 v1, s1
402; GFX7-NEXT:    flat_store_dword v[0:1], v2
403; GFX7-NEXT:    s_endpgm
404;
405; GFX10-WGP-LABEL: global_system_monotonic_store:
406; GFX10-WGP:       ; %bb.0: ; %entry
407; GFX10-WGP-NEXT:    s_clause 0x1
408; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
409; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
410; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
411; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
412; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
413; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
414; GFX10-WGP-NEXT:    s_endpgm
415;
416; GFX10-CU-LABEL: global_system_monotonic_store:
417; GFX10-CU:       ; %bb.0: ; %entry
418; GFX10-CU-NEXT:    s_clause 0x1
419; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
420; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
421; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
422; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
424; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
425; GFX10-CU-NEXT:    s_endpgm
426;
427; SKIP-CACHE-INV-LABEL: global_system_monotonic_store:
428; SKIP-CACHE-INV:       ; %bb.0: ; %entry
429; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
430; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
431; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
432; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
433; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
434; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
435; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
436; SKIP-CACHE-INV-NEXT:    s_endpgm
437    i32 %in, i32 addrspace(1)* %out) {
438entry:
439  store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4
440  ret void
441}
442
; Atomic store of the i32 argument %in to the addrspace(1) pointer %out with
; `release` ordering and no syncscope.
; The checks expect s_waitcnt vmcnt(0) lgkmcnt(0) immediately before the
; store; both gfx1010 runs additionally expect s_waitcnt_vscnt null, 0x0
; between that wait and the store. No cache-invalidate instruction is
; expected, so the -amdgcn-skip-cache-invalidations run expects the same
; instruction sequence as the gfx600 run.
443define amdgpu_kernel void @global_system_release_store(
444; GFX6-LABEL: global_system_release_store:
445; GFX6:       ; %bb.0: ; %entry
446; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
447; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
448; GFX6-NEXT:    s_mov_b32 s3, 0xf000
449; GFX6-NEXT:    s_mov_b32 s2, -1
450; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX6-NEXT:    v_mov_b32_e32 v0, s4
452; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
453; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
454; GFX6-NEXT:    s_endpgm
455;
456; GFX7-LABEL: global_system_release_store:
457; GFX7:       ; %bb.0: ; %entry
458; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
459; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
460; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
461; GFX7-NEXT:    v_mov_b32_e32 v2, s2
462; GFX7-NEXT:    v_mov_b32_e32 v0, s0
463; GFX7-NEXT:    v_mov_b32_e32 v1, s1
464; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
465; GFX7-NEXT:    flat_store_dword v[0:1], v2
466; GFX7-NEXT:    s_endpgm
467;
468; GFX10-WGP-LABEL: global_system_release_store:
469; GFX10-WGP:       ; %bb.0: ; %entry
470; GFX10-WGP-NEXT:    s_clause 0x1
471; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
472; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
473; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
474; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
476; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
477; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
478; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
479; GFX10-WGP-NEXT:    s_endpgm
480;
481; GFX10-CU-LABEL: global_system_release_store:
482; GFX10-CU:       ; %bb.0: ; %entry
483; GFX10-CU-NEXT:    s_clause 0x1
484; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
485; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
486; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
487; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
489; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
490; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
491; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
492; GFX10-CU-NEXT:    s_endpgm
493;
494; SKIP-CACHE-INV-LABEL: global_system_release_store:
495; SKIP-CACHE-INV:       ; %bb.0: ; %entry
496; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
497; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
498; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
499; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
500; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
501; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
502; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
503; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
504; SKIP-CACHE-INV-NEXT:    s_endpgm
505    i32 %in, i32 addrspace(1)* %out) {
506entry:
507  store atomic i32 %in, i32 addrspace(1)* %out release, align 4
508  ret void
509}
510
; Atomic store of the i32 argument %in to the addrspace(1) pointer %out with
; `seq_cst` ordering and no syncscope.
; The checks expect s_waitcnt vmcnt(0) lgkmcnt(0) immediately before the
; store; both gfx1010 runs additionally expect s_waitcnt_vscnt null, 0x0
; between that wait and the store. Nothing is expected after the store other
; than s_endpgm, and no cache-invalidate instruction appears.
511define amdgpu_kernel void @global_system_seq_cst_store(
512; GFX6-LABEL: global_system_seq_cst_store:
513; GFX6:       ; %bb.0: ; %entry
514; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
515; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
516; GFX6-NEXT:    s_mov_b32 s3, 0xf000
517; GFX6-NEXT:    s_mov_b32 s2, -1
518; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
519; GFX6-NEXT:    v_mov_b32_e32 v0, s4
520; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
521; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
522; GFX6-NEXT:    s_endpgm
523;
524; GFX7-LABEL: global_system_seq_cst_store:
525; GFX7:       ; %bb.0: ; %entry
526; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
527; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
528; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX7-NEXT:    v_mov_b32_e32 v2, s2
530; GFX7-NEXT:    v_mov_b32_e32 v0, s0
531; GFX7-NEXT:    v_mov_b32_e32 v1, s1
532; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
533; GFX7-NEXT:    flat_store_dword v[0:1], v2
534; GFX7-NEXT:    s_endpgm
535;
536; GFX10-WGP-LABEL: global_system_seq_cst_store:
537; GFX10-WGP:       ; %bb.0: ; %entry
538; GFX10-WGP-NEXT:    s_clause 0x1
539; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
540; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
541; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
542; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
544; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
545; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
546; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
547; GFX10-WGP-NEXT:    s_endpgm
548;
549; GFX10-CU-LABEL: global_system_seq_cst_store:
550; GFX10-CU:       ; %bb.0: ; %entry
551; GFX10-CU-NEXT:    s_clause 0x1
552; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
553; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
554; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
555; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
556; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
557; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
558; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
559; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
560; GFX10-CU-NEXT:    s_endpgm
561;
562; SKIP-CACHE-INV-LABEL: global_system_seq_cst_store:
563; SKIP-CACHE-INV:       ; %bb.0: ; %entry
564; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
565; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
566; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
567; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
568; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
569; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
570; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
571; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
572; SKIP-CACHE-INV-NEXT:    s_endpgm
573    i32 %in, i32 addrspace(1)* %out) {
574entry:
575  store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
576  ret void
577}
578
; Volatile `atomicrmw xchg` of the i32 argument %in into the addrspace(1)
; pointer %out with `monotonic` ordering and no syncscope; the result %val is
; not used.
; Every run configuration expects a single atomic swap instruction without
; the glc modifier, with no s_waitcnt after it and no cache-invalidate
; instruction.
579define amdgpu_kernel void @global_system_monotonic_atomicrmw(
580; GFX6-LABEL: global_system_monotonic_atomicrmw:
581; GFX6:       ; %bb.0: ; %entry
582; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
583; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
584; GFX6-NEXT:    s_mov_b32 s7, 0xf000
585; GFX6-NEXT:    s_mov_b32 s6, -1
586; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
587; GFX6-NEXT:    v_mov_b32_e32 v0, s0
588; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
589; GFX6-NEXT:    s_endpgm
590;
591; GFX7-LABEL: global_system_monotonic_atomicrmw:
592; GFX7:       ; %bb.0: ; %entry
593; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
594; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
595; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
596; GFX7-NEXT:    v_mov_b32_e32 v0, s0
597; GFX7-NEXT:    v_mov_b32_e32 v1, s1
598; GFX7-NEXT:    v_mov_b32_e32 v2, s2
599; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
600; GFX7-NEXT:    s_endpgm
601;
602; GFX10-WGP-LABEL: global_system_monotonic_atomicrmw:
603; GFX10-WGP:       ; %bb.0: ; %entry
604; GFX10-WGP-NEXT:    s_clause 0x1
605; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
606; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
607; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
608; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
609; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
610; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
611; GFX10-WGP-NEXT:    s_endpgm
612;
613; GFX10-CU-LABEL: global_system_monotonic_atomicrmw:
614; GFX10-CU:       ; %bb.0: ; %entry
615; GFX10-CU-NEXT:    s_clause 0x1
616; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
617; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
618; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
619; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
620; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
621; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
622; GFX10-CU-NEXT:    s_endpgm
623;
624; SKIP-CACHE-INV-LABEL: global_system_monotonic_atomicrmw:
625; SKIP-CACHE-INV:       ; %bb.0: ; %entry
626; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
627; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
628; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
629; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
630; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
631; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
632; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
633; SKIP-CACHE-INV-NEXT:    s_endpgm
634    i32 addrspace(1)* %out, i32 %in) {
635entry:
636  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic
637  ret void
638}
639
; Volatile `atomicrmw xchg` of the i32 argument %in into the addrspace(1)
; pointer %out with `acquire` ordering and no syncscope; the result %val is
; not used.
; After the swap, the gfx600, gfx700 and -amdgcn-skip-cache-invalidations runs
; expect s_waitcnt vmcnt(0) lgkmcnt(0). Both gfx1010 runs instead expect
; s_waitcnt lgkmcnt(0) followed by s_waitcnt_vscnt null, 0x0.
; The wait is followed by a cache invalidate. That is buffer_wbinvl1 on
; gfx600, buffer_wbinvl1_vol on gfx700, and buffer_gl0_inv + buffer_gl1_inv
; on gfx1010. The -amdgcn-skip-cache-invalidations run expects no invalidate
; instruction.
640define amdgpu_kernel void @global_system_acquire_atomicrmw(
641; GFX6-LABEL: global_system_acquire_atomicrmw:
642; GFX6:       ; %bb.0: ; %entry
643; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
644; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
645; GFX6-NEXT:    s_mov_b32 s7, 0xf000
646; GFX6-NEXT:    s_mov_b32 s6, -1
647; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX6-NEXT:    v_mov_b32_e32 v0, s0
649; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
650; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
651; GFX6-NEXT:    buffer_wbinvl1
652; GFX6-NEXT:    s_endpgm
653;
654; GFX7-LABEL: global_system_acquire_atomicrmw:
655; GFX7:       ; %bb.0: ; %entry
656; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
657; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
658; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX7-NEXT:    v_mov_b32_e32 v0, s0
660; GFX7-NEXT:    v_mov_b32_e32 v1, s1
661; GFX7-NEXT:    v_mov_b32_e32 v2, s2
662; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
663; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
664; GFX7-NEXT:    buffer_wbinvl1_vol
665; GFX7-NEXT:    s_endpgm
666;
667; GFX10-WGP-LABEL: global_system_acquire_atomicrmw:
668; GFX10-WGP:       ; %bb.0: ; %entry
669; GFX10-WGP-NEXT:    s_clause 0x1
670; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
671; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
672; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
673; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
674; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
675; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
676; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
678; GFX10-WGP-NEXT:    buffer_gl0_inv
679; GFX10-WGP-NEXT:    buffer_gl1_inv
680; GFX10-WGP-NEXT:    s_endpgm
681;
682; GFX10-CU-LABEL: global_system_acquire_atomicrmw:
683; GFX10-CU:       ; %bb.0: ; %entry
684; GFX10-CU-NEXT:    s_clause 0x1
685; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
686; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
687; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
688; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
690; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
691; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
692; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
693; GFX10-CU-NEXT:    buffer_gl0_inv
694; GFX10-CU-NEXT:    buffer_gl1_inv
695; GFX10-CU-NEXT:    s_endpgm
696;
697; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw:
698; SKIP-CACHE-INV:       ; %bb.0: ; %entry
699; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
700; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
701; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
702; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
703; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
704; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
705; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
706; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
707; SKIP-CACHE-INV-NEXT:    s_endpgm
708    i32 addrspace(1)* %out, i32 %in) {
709entry:
710  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
711  ret void
712}
713
; Volatile `atomicrmw xchg` of the i32 argument %in into the addrspace(1)
; pointer %out with `release` ordering and no syncscope; the result %val is
; not used.
; The checks expect s_waitcnt vmcnt(0) lgkmcnt(0) immediately before the
; swap; both gfx1010 runs additionally expect s_waitcnt_vscnt null, 0x0
; between that wait and the swap. Nothing is expected after the swap other
; than s_endpgm, so no cache-invalidate instruction appears in any run.
714define amdgpu_kernel void @global_system_release_atomicrmw(
715; GFX6-LABEL: global_system_release_atomicrmw:
716; GFX6:       ; %bb.0: ; %entry
717; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
718; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
719; GFX6-NEXT:    s_mov_b32 s7, 0xf000
720; GFX6-NEXT:    s_mov_b32 s6, -1
721; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
722; GFX6-NEXT:    v_mov_b32_e32 v0, s0
723; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
724; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
725; GFX6-NEXT:    s_endpgm
726;
727; GFX7-LABEL: global_system_release_atomicrmw:
728; GFX7:       ; %bb.0: ; %entry
729; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
730; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
731; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX7-NEXT:    v_mov_b32_e32 v0, s0
733; GFX7-NEXT:    v_mov_b32_e32 v1, s1
734; GFX7-NEXT:    v_mov_b32_e32 v2, s2
735; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
736; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
737; GFX7-NEXT:    s_endpgm
738;
739; GFX10-WGP-LABEL: global_system_release_atomicrmw:
740; GFX10-WGP:       ; %bb.0: ; %entry
741; GFX10-WGP-NEXT:    s_clause 0x1
742; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
743; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
744; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
745; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
746; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
747; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
748; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
749; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
750; GFX10-WGP-NEXT:    s_endpgm
751;
752; GFX10-CU-LABEL: global_system_release_atomicrmw:
753; GFX10-CU:       ; %bb.0: ; %entry
754; GFX10-CU-NEXT:    s_clause 0x1
755; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
756; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
757; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
758; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
759; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
760; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
761; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
762; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
763; GFX10-CU-NEXT:    s_endpgm
764;
765; SKIP-CACHE-INV-LABEL: global_system_release_atomicrmw:
766; SKIP-CACHE-INV:       ; %bb.0: ; %entry
767; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
768; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
769; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
770; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
771; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
772; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
773; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
774; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
775; SKIP-CACHE-INV-NEXT:    s_endpgm
776    i32 addrspace(1)* %out, i32 %in) {
777entry:
778  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release
779  ret void
780}
781
; Checks the code generated for a volatile `atomicrmw xchg` on an addrspace(1)
; pointer with `acq_rel` ordering and no explicit syncscope.
; Expected shape in the sequences below:
;   * before the swap: `s_waitcnt vmcnt(0) lgkmcnt(0)` (plus
;     `s_waitcnt_vscnt null, 0x0` on the gfx1010 runs);
;   * after the swap: another wait, then a cache invalidate
;     (`buffer_wbinvl1`, `buffer_wbinvl1_vol`, or
;     `buffer_gl0_inv` + `buffer_gl1_inv`, depending on the run);
;   * in the run that passes -amdgcn-skip-cache-invalidations the trailing
;     wait is still expected, but no invalidate instruction follows it.
782define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
783; GFX6-LABEL: global_system_acq_rel_atomicrmw:
784; GFX6:       ; %bb.0: ; %entry
785; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
786; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
787; GFX6-NEXT:    s_mov_b32 s7, 0xf000
788; GFX6-NEXT:    s_mov_b32 s6, -1
789; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX6-NEXT:    v_mov_b32_e32 v0, s0
791; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
792; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
793; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
794; GFX6-NEXT:    buffer_wbinvl1
795; GFX6-NEXT:    s_endpgm
796;
797; GFX7-LABEL: global_system_acq_rel_atomicrmw:
798; GFX7:       ; %bb.0: ; %entry
799; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
800; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
801; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX7-NEXT:    v_mov_b32_e32 v0, s0
803; GFX7-NEXT:    v_mov_b32_e32 v1, s1
804; GFX7-NEXT:    v_mov_b32_e32 v2, s2
805; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
806; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
807; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
808; GFX7-NEXT:    buffer_wbinvl1_vol
809; GFX7-NEXT:    s_endpgm
810;
811; GFX10-WGP-LABEL: global_system_acq_rel_atomicrmw:
812; GFX10-WGP:       ; %bb.0: ; %entry
813; GFX10-WGP-NEXT:    s_clause 0x1
814; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
815; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
816; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
817; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
818; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
819; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
820; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
821; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
822; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
824; GFX10-WGP-NEXT:    buffer_gl0_inv
825; GFX10-WGP-NEXT:    buffer_gl1_inv
826; GFX10-WGP-NEXT:    s_endpgm
827;
828; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw:
829; GFX10-CU:       ; %bb.0: ; %entry
830; GFX10-CU-NEXT:    s_clause 0x1
831; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
832; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
833; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
834; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
836; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
837; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
838; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
839; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
840; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
841; GFX10-CU-NEXT:    buffer_gl0_inv
842; GFX10-CU-NEXT:    buffer_gl1_inv
843; GFX10-CU-NEXT:    s_endpgm
844;
845; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw:
846; SKIP-CACHE-INV:       ; %bb.0: ; %entry
847; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
848; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
849; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
850; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
851; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
852; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
853; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
854; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
855; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
856; SKIP-CACHE-INV-NEXT:    s_endpgm
857    i32 addrspace(1)* %out, i32 %in) {
858entry:
  ; %val is never used; the expected swap instructions above carry no `glc`
  ; modifier.
859  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
860  ret void
861}
862
; Checks the code generated for a volatile `atomicrmw xchg` on an addrspace(1)
; pointer with `seq_cst` ordering and no explicit syncscope.
; Expected shape in the sequences below:
;   * before the swap: `s_waitcnt vmcnt(0) lgkmcnt(0)` (plus
;     `s_waitcnt_vscnt null, 0x0` on the gfx1010 runs);
;   * after the swap: another wait, then a cache invalidate
;     (`buffer_wbinvl1`, `buffer_wbinvl1_vol`, or
;     `buffer_gl0_inv` + `buffer_gl1_inv`, depending on the run);
;   * in the run that passes -amdgcn-skip-cache-invalidations the trailing
;     wait is still expected, but no invalidate instruction follows it.
863define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
864; GFX6-LABEL: global_system_seq_cst_atomicrmw:
865; GFX6:       ; %bb.0: ; %entry
866; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
867; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
868; GFX6-NEXT:    s_mov_b32 s7, 0xf000
869; GFX6-NEXT:    s_mov_b32 s6, -1
870; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
871; GFX6-NEXT:    v_mov_b32_e32 v0, s0
872; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
873; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
874; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
875; GFX6-NEXT:    buffer_wbinvl1
876; GFX6-NEXT:    s_endpgm
877;
878; GFX7-LABEL: global_system_seq_cst_atomicrmw:
879; GFX7:       ; %bb.0: ; %entry
880; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
881; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
882; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX7-NEXT:    v_mov_b32_e32 v0, s0
884; GFX7-NEXT:    v_mov_b32_e32 v1, s1
885; GFX7-NEXT:    v_mov_b32_e32 v2, s2
886; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
887; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
888; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
889; GFX7-NEXT:    buffer_wbinvl1_vol
890; GFX7-NEXT:    s_endpgm
891;
892; GFX10-WGP-LABEL: global_system_seq_cst_atomicrmw:
893; GFX10-WGP:       ; %bb.0: ; %entry
894; GFX10-WGP-NEXT:    s_clause 0x1
895; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
896; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
897; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
898; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
900; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
901; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
902; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
903; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
904; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
905; GFX10-WGP-NEXT:    buffer_gl0_inv
906; GFX10-WGP-NEXT:    buffer_gl1_inv
907; GFX10-WGP-NEXT:    s_endpgm
908;
909; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw:
910; GFX10-CU:       ; %bb.0: ; %entry
911; GFX10-CU-NEXT:    s_clause 0x1
912; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
913; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
914; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
915; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
917; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
918; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
919; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
920; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
922; GFX10-CU-NEXT:    buffer_gl0_inv
923; GFX10-CU-NEXT:    buffer_gl1_inv
924; GFX10-CU-NEXT:    s_endpgm
925;
926; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw:
927; SKIP-CACHE-INV:       ; %bb.0: ; %entry
928; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
929; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
930; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
931; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
932; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
933; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
934; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
935; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
936; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
937; SKIP-CACHE-INV-NEXT:    s_endpgm
938    i32 addrspace(1)* %out, i32 %in) {
939entry:
  ; %val is never used; the expected swap instructions above carry no `glc`
  ; modifier.
940  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
941  ret void
942}
943
; Checks the code generated for a volatile `atomicrmw xchg` with `acquire`
; ordering (no explicit syncscope) whose result is consumed by a later store.
; Expected shape in the sequences below:
;   * the only wait before the swap is the `s_waitcnt lgkmcnt(0)` that follows
;     the scalar loads;
;   * the swap carries the `glc` modifier;
;   * after the swap: `s_waitcnt vmcnt(0) lgkmcnt(0)`, a cache invalidate, then
;     the store of the swapped-out value;
;   * in the run that passes -amdgcn-skip-cache-invalidations the wait is
;     followed directly by the store, with no invalidate in between.
944define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
945; GFX6-LABEL: global_system_acquire_ret_atomicrmw:
946; GFX6:       ; %bb.0: ; %entry
947; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
948; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
949; GFX6-NEXT:    s_mov_b32 s7, 0xf000
950; GFX6-NEXT:    s_mov_b32 s6, -1
951; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
952; GFX6-NEXT:    v_mov_b32_e32 v0, s0
953; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
954; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
955; GFX6-NEXT:    buffer_wbinvl1
956; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
957; GFX6-NEXT:    s_endpgm
958;
959; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
960; GFX7:       ; %bb.0: ; %entry
961; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
962; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
963; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
964; GFX7-NEXT:    v_mov_b32_e32 v0, s0
965; GFX7-NEXT:    v_mov_b32_e32 v1, s1
966; GFX7-NEXT:    v_mov_b32_e32 v2, s2
967; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
968; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
969; GFX7-NEXT:    buffer_wbinvl1_vol
970; GFX7-NEXT:    flat_store_dword v[0:1], v2
971; GFX7-NEXT:    s_endpgm
972;
973; GFX10-WGP-LABEL: global_system_acquire_ret_atomicrmw:
974; GFX10-WGP:       ; %bb.0: ; %entry
975; GFX10-WGP-NEXT:    s_clause 0x1
976; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
977; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
978; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
979; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
980; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
981; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
982; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
983; GFX10-WGP-NEXT:    buffer_gl0_inv
984; GFX10-WGP-NEXT:    buffer_gl1_inv
985; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
986; GFX10-WGP-NEXT:    s_endpgm
987;
988; GFX10-CU-LABEL: global_system_acquire_ret_atomicrmw:
989; GFX10-CU:       ; %bb.0: ; %entry
990; GFX10-CU-NEXT:    s_clause 0x1
991; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
992; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
993; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
994; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
995; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
996; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
997; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
998; GFX10-CU-NEXT:    buffer_gl0_inv
999; GFX10-CU-NEXT:    buffer_gl1_inv
1000; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1001; GFX10-CU-NEXT:    s_endpgm
1002;
1003; SKIP-CACHE-INV-LABEL: global_system_acquire_ret_atomicrmw:
1004; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1005; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1006; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1007; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1008; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1009; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1010; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1011; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1012; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1013; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1014; SKIP-CACHE-INV-NEXT:    s_endpgm
1015    i32 addrspace(1)* %out, i32 %in) {
1016entry:
1017  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
  ; Storing %val back to %out keeps the atomic's result live, so the
  ; value-returning form of the swap is what the checks above expect.
1018  store i32 %val, i32 addrspace(1)* %out, align 4
1019  ret void
1020}
1021
; Checks the code generated for a volatile `atomicrmw xchg` with `acq_rel`
; ordering (no explicit syncscope) whose result is consumed by a later store.
; Expected shape in the sequences below:
;   * before the swap: `s_waitcnt vmcnt(0) lgkmcnt(0)` (plus
;     `s_waitcnt_vscnt null, 0x0` on the gfx1010 runs);
;   * the swap carries the `glc` modifier;
;   * after the swap: `s_waitcnt vmcnt(0) lgkmcnt(0)`, a cache invalidate, then
;     the store of the swapped-out value;
;   * in the run that passes -amdgcn-skip-cache-invalidations the trailing
;     wait is followed directly by the store, with no invalidate in between.
1022define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
1023; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw:
1024; GFX6:       ; %bb.0: ; %entry
1025; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1026; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
1027; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1028; GFX6-NEXT:    s_mov_b32 s6, -1
1029; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1030; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1031; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1032; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1033; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1034; GFX6-NEXT:    buffer_wbinvl1
1035; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1036; GFX6-NEXT:    s_endpgm
1037;
1038; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
1039; GFX7:       ; %bb.0: ; %entry
1040; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1041; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1042; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1043; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1044; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1045; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1046; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1047; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1048; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1049; GFX7-NEXT:    buffer_wbinvl1_vol
1050; GFX7-NEXT:    flat_store_dword v[0:1], v2
1051; GFX7-NEXT:    s_endpgm
1052;
1053; GFX10-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
1054; GFX10-WGP:       ; %bb.0: ; %entry
1055; GFX10-WGP-NEXT:    s_clause 0x1
1056; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1057; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1058; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1059; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1060; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1061; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1062; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1063; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1064; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1065; GFX10-WGP-NEXT:    buffer_gl0_inv
1066; GFX10-WGP-NEXT:    buffer_gl1_inv
1067; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1068; GFX10-WGP-NEXT:    s_endpgm
1069;
1070; GFX10-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
1071; GFX10-CU:       ; %bb.0: ; %entry
1072; GFX10-CU-NEXT:    s_clause 0x1
1073; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1074; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1075; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1076; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1077; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1078; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1079; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1080; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1081; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1082; GFX10-CU-NEXT:    buffer_gl0_inv
1083; GFX10-CU-NEXT:    buffer_gl1_inv
1084; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1085; GFX10-CU-NEXT:    s_endpgm
1086;
1087; SKIP-CACHE-INV-LABEL: global_system_acq_rel_ret_atomicrmw:
1088; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1089; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1090; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1091; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1092; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1093; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1094; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1095; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1096; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1097; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1098; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1099; SKIP-CACHE-INV-NEXT:    s_endpgm
1100    i32 addrspace(1)* %out, i32 %in) {
1101entry:
1102  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
  ; Storing %val back to %out keeps the atomic's result live, so the
  ; value-returning form of the swap is what the checks above expect.
1103  store i32 %val, i32 addrspace(1)* %out, align 4
1104  ret void
1105}
1106
; Checks the code generated for a volatile `atomicrmw xchg` with `seq_cst`
; ordering (no explicit syncscope) whose result is consumed by a later store.
; Expected shape in the sequences below:
;   * before the swap: `s_waitcnt vmcnt(0) lgkmcnt(0)` (plus
;     `s_waitcnt_vscnt null, 0x0` on the gfx1010 runs);
;   * the swap carries the `glc` modifier;
;   * after the swap: `s_waitcnt vmcnt(0) lgkmcnt(0)`, a cache invalidate, then
;     the store of the swapped-out value;
;   * in the run that passes -amdgcn-skip-cache-invalidations the trailing
;     wait is followed directly by the store, with no invalidate in between.
1107define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
1108; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw:
1109; GFX6:       ; %bb.0: ; %entry
1110; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1111; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
1112; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1113; GFX6-NEXT:    s_mov_b32 s6, -1
1114; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1115; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1116; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1117; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1118; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1119; GFX6-NEXT:    buffer_wbinvl1
1120; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1121; GFX6-NEXT:    s_endpgm
1122;
1123; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
1124; GFX7:       ; %bb.0: ; %entry
1125; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1126; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
1127; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1128; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1129; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1130; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1131; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1132; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
1133; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1134; GFX7-NEXT:    buffer_wbinvl1_vol
1135; GFX7-NEXT:    flat_store_dword v[0:1], v2
1136; GFX7-NEXT:    s_endpgm
1137;
1138; GFX10-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
1139; GFX10-WGP:       ; %bb.0: ; %entry
1140; GFX10-WGP-NEXT:    s_clause 0x1
1141; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
1142; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1143; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
1144; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1145; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
1146; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1147; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1148; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1149; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1150; GFX10-WGP-NEXT:    buffer_gl0_inv
1151; GFX10-WGP-NEXT:    buffer_gl1_inv
1152; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
1153; GFX10-WGP-NEXT:    s_endpgm
1154;
1155; GFX10-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
1156; GFX10-CU:       ; %bb.0: ; %entry
1157; GFX10-CU-NEXT:    s_clause 0x1
1158; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
1159; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1160; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
1161; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1162; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
1163; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1164; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1165; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
1166; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1167; GFX10-CU-NEXT:    buffer_gl0_inv
1168; GFX10-CU-NEXT:    buffer_gl1_inv
1169; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
1170; GFX10-CU-NEXT:    s_endpgm
1171;
1172; SKIP-CACHE-INV-LABEL: global_system_seq_cst_ret_atomicrmw:
1173; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1174; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1175; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
1176; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1177; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1178; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1179; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1180; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1181; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
1182; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1183; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1184; SKIP-CACHE-INV-NEXT:    s_endpgm
1185    i32 addrspace(1)* %out, i32 %in) {
1186entry:
1187  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
  ; Storing %val back to %out keeps the atomic's result live, so the
  ; value-returning form of the swap is what the checks above expect.
1188  store i32 %val, i32 addrspace(1)* %out, align 4
1189  ret void
1190}
1191
; Checks the code generated for a volatile `cmpxchg` on an addrspace(1)
; pointer with `monotonic` success and failure orderings and no explicit
; syncscope.
; In every expected sequence below the compare-and-swap is followed directly
; by `s_endpgm`, and the only wait before it is the `s_waitcnt lgkmcnt(0)` that
; follows the scalar loads: no extra wait and no cache invalidate is expected.
1192define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
1193; GFX6-LABEL: global_system_monotonic_monotonic_cmpxchg:
1194; GFX6:       ; %bb.0: ; %entry
1195; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1196; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1197; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1198; GFX6-NEXT:    s_mov_b32 s6, -1
1199; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1201; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1202; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1203; GFX6-NEXT:    s_endpgm
1204;
1205; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
1206; GFX7:       ; %bb.0: ; %entry
1207; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1208; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1209; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1210; GFX7-NEXT:    s_add_u32 s0, s0, 16
1211; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1212; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1213; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1214; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1215; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1216; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1217; GFX7-NEXT:    s_endpgm
1218;
1219; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
1220; GFX10-WGP:       ; %bb.0: ; %entry
1221; GFX10-WGP-NEXT:    s_clause 0x1
1222; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1223; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1224; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1225; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1226; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1227; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1228; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1229; GFX10-WGP-NEXT:    s_endpgm
1230;
1231; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
1232; GFX10-CU:       ; %bb.0: ; %entry
1233; GFX10-CU-NEXT:    s_clause 0x1
1234; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1235; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1236; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1237; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1239; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1240; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1241; GFX10-CU-NEXT:    s_endpgm
1242;
1243; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg:
1244; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1245; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1246; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1247; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1248; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1249; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1250; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1251; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1252; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1253; SKIP-CACHE-INV-NEXT:    s_endpgm
1254    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1255entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement; it appears in
  ; the checks above either as `offset:16` on the atomic or as an
  ; `s_add_u32`/`s_addc_u32` of 16 on the base address.
1256  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never used; the expected cmpswap instructions above carry no `glc`
  ; modifier.
1257  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in monotonic monotonic
1258  ret void
1259}
1260
; Checks the code generated for a volatile `cmpxchg` on an addrspace(1)
; pointer with `acquire` success ordering, `monotonic` failure ordering and no
; explicit syncscope.
; Expected shape in the sequences below:
;   * the only wait before the compare-and-swap is the `s_waitcnt lgkmcnt(0)`
;     that follows the scalar loads;
;   * after it: a wait (`s_waitcnt vmcnt(0) lgkmcnt(0)`, or
;     `s_waitcnt lgkmcnt(0)` + `s_waitcnt_vscnt null, 0x0` on the gfx1010
;     runs), then a cache invalidate;
;   * in the run that passes -amdgcn-skip-cache-invalidations the trailing
;     wait is still expected, but no invalidate instruction follows it.
1261define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
1262; GFX6-LABEL: global_system_acquire_monotonic_cmpxchg:
1263; GFX6:       ; %bb.0: ; %entry
1264; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1265; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1266; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1267; GFX6-NEXT:    s_mov_b32 s6, -1
1268; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1269; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1270; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1271; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1272; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1273; GFX6-NEXT:    buffer_wbinvl1
1274; GFX6-NEXT:    s_endpgm
1275;
1276; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
1277; GFX7:       ; %bb.0: ; %entry
1278; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1279; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1280; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1281; GFX7-NEXT:    s_add_u32 s0, s0, 16
1282; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1283; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1284; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1285; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1286; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1287; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1288; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1289; GFX7-NEXT:    buffer_wbinvl1_vol
1290; GFX7-NEXT:    s_endpgm
1291;
1292; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
1293; GFX10-WGP:       ; %bb.0: ; %entry
1294; GFX10-WGP-NEXT:    s_clause 0x1
1295; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1296; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1297; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1298; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1299; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1300; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1301; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1302; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1303; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1304; GFX10-WGP-NEXT:    buffer_gl0_inv
1305; GFX10-WGP-NEXT:    buffer_gl1_inv
1306; GFX10-WGP-NEXT:    s_endpgm
1307;
1308; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
1309; GFX10-CU:       ; %bb.0: ; %entry
1310; GFX10-CU-NEXT:    s_clause 0x1
1311; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1312; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1313; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1314; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1315; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1316; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1317; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1318; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1319; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1320; GFX10-CU-NEXT:    buffer_gl0_inv
1321; GFX10-CU-NEXT:    buffer_gl1_inv
1322; GFX10-CU-NEXT:    s_endpgm
1323;
1324; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_cmpxchg:
1325; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1326; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1327; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1328; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1329; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1330; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1331; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1332; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1333; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1334; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1335; SKIP-CACHE-INV-NEXT:    s_endpgm
1336    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1337entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement; it appears in
  ; the checks above either as `offset:16` on the atomic or as an
  ; `s_add_u32`/`s_addc_u32` of 16 on the base address.
1338  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never used; the expected cmpswap instructions above carry no `glc`
  ; modifier.
1339  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic
1340  ret void
1341}
1342
; Checks the code generated for a volatile `cmpxchg` on an addrspace(1)
; pointer with `release` success ordering, `monotonic` failure ordering and no
; explicit syncscope.
; In every expected sequence below the compare-and-swap is preceded by
; `s_waitcnt vmcnt(0) lgkmcnt(0)` (plus `s_waitcnt_vscnt null, 0x0` on the
; gfx1010 runs) and is followed directly by `s_endpgm`: no wait and no cache
; invalidate is expected after the atomic.
1343define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
1344; GFX6-LABEL: global_system_release_monotonic_cmpxchg:
1345; GFX6:       ; %bb.0: ; %entry
1346; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1347; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1348; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1349; GFX6-NEXT:    s_mov_b32 s6, -1
1350; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1351; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1352; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1353; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1354; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1355; GFX6-NEXT:    s_endpgm
1356;
1357; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
1358; GFX7:       ; %bb.0: ; %entry
1359; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1360; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1361; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1362; GFX7-NEXT:    s_add_u32 s0, s0, 16
1363; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1364; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1365; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1366; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1367; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1368; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1369; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1370; GFX7-NEXT:    s_endpgm
1371;
1372; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg:
1373; GFX10-WGP:       ; %bb.0: ; %entry
1374; GFX10-WGP-NEXT:    s_clause 0x1
1375; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1376; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1377; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1378; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1379; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1380; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1381; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1382; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1383; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1384; GFX10-WGP-NEXT:    s_endpgm
1385;
1386; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg:
1387; GFX10-CU:       ; %bb.0: ; %entry
1388; GFX10-CU-NEXT:    s_clause 0x1
1389; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1390; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1391; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1392; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1393; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1394; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1395; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1396; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1397; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1398; GFX10-CU-NEXT:    s_endpgm
1399;
1400; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg:
1401; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1402; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1403; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1404; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1405; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1406; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1407; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1408; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1409; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1410; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1411; SKIP-CACHE-INV-NEXT:    s_endpgm
1412    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1413entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement; it appears in
  ; the checks above either as `offset:16` on the atomic or as an
  ; `s_add_u32`/`s_addc_u32` of 16 on the base address.
1414  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never used; the expected cmpswap instructions above carry no `glc`
  ; modifier.
1415  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release monotonic
1416  ret void
1417}
1418
; Checks the code generated for a volatile `cmpxchg` on an addrspace(1)
; pointer with `acq_rel` success ordering, `monotonic` failure ordering and no
; explicit syncscope.
; Expected shape in the sequences below:
;   * before the compare-and-swap: `s_waitcnt vmcnt(0) lgkmcnt(0)` (plus
;     `s_waitcnt_vscnt null, 0x0` on the gfx1010 runs);
;   * after it: another wait, then a cache invalidate
;     (`buffer_wbinvl1`, `buffer_wbinvl1_vol`, or
;     `buffer_gl0_inv` + `buffer_gl1_inv`, depending on the run);
;   * in the run that passes -amdgcn-skip-cache-invalidations the trailing
;     wait is still expected, but no invalidate instruction follows it.
1419define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
1420; GFX6-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1421; GFX6:       ; %bb.0: ; %entry
1422; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1423; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1424; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1425; GFX6-NEXT:    s_mov_b32 s6, -1
1426; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1427; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1428; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1429; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1430; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1431; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1432; GFX6-NEXT:    buffer_wbinvl1
1433; GFX6-NEXT:    s_endpgm
1434;
1435; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1436; GFX7:       ; %bb.0: ; %entry
1437; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1438; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1439; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1440; GFX7-NEXT:    s_add_u32 s0, s0, 16
1441; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1442; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1443; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1444; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1445; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1446; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1447; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1448; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1449; GFX7-NEXT:    buffer_wbinvl1_vol
1450; GFX7-NEXT:    s_endpgm
1451;
1452; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1453; GFX10-WGP:       ; %bb.0: ; %entry
1454; GFX10-WGP-NEXT:    s_clause 0x1
1455; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1456; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1457; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1458; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1459; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1460; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1461; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1462; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1463; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1464; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1465; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1466; GFX10-WGP-NEXT:    buffer_gl0_inv
1467; GFX10-WGP-NEXT:    buffer_gl1_inv
1468; GFX10-WGP-NEXT:    s_endpgm
1469;
1470; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1471; GFX10-CU:       ; %bb.0: ; %entry
1472; GFX10-CU-NEXT:    s_clause 0x1
1473; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1474; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1475; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1476; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1477; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1478; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1479; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1480; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1481; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1482; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1483; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1484; GFX10-CU-NEXT:    buffer_gl0_inv
1485; GFX10-CU-NEXT:    buffer_gl1_inv
1486; GFX10-CU-NEXT:    s_endpgm
1487;
1488; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_cmpxchg:
1489; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1490; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1491; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1492; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1493; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1494; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1495; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1496; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1497; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1498; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1499; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1500; SKIP-CACHE-INV-NEXT:    s_endpgm
1501    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1502entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement; it appears in
  ; the checks above either as `offset:16` on the atomic or as an
  ; `s_add_u32`/`s_addc_u32` of 16 on the base address.
1503  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never used; the expected cmpswap instructions above carry no `glc`
  ; modifier.
1504  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic
1505  ret void
1506}
1507
; Volatile cmpxchg on an i32 in addrspace(1) with success ordering seq_cst and
; failure ordering monotonic. No syncscope is written and the result is unused.
; In the autogenerated checks below, every prefix expects an s_waitcnt sequence
; both before and after the atomic. The GFX6, GFX7 and GFX10 prefixes then
; expect cache invalidation (buffer_wbinvl1, buffer_wbinvl1_vol, or
; buffer_gl0_inv followed by buffer_gl1_inv), while the SKIP-CACHE-INV prefix
; expects the kernel to end right after the wait.
1508define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
1509; GFX6-LABEL: global_system_seq_cst_monotonic_cmpxchg:
1510; GFX6:       ; %bb.0: ; %entry
1511; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1512; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1513; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1514; GFX6-NEXT:    s_mov_b32 s6, -1
1515; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1516; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1517; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1518; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1519; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1520; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1521; GFX6-NEXT:    buffer_wbinvl1
1522; GFX6-NEXT:    s_endpgm
1523;
1524; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
1525; GFX7:       ; %bb.0: ; %entry
1526; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1527; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1528; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1529; GFX7-NEXT:    s_add_u32 s0, s0, 16
1530; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1531; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1532; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1533; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1534; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1535; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1536; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1537; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1538; GFX7-NEXT:    buffer_wbinvl1_vol
1539; GFX7-NEXT:    s_endpgm
1540;
1541; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
1542; GFX10-WGP:       ; %bb.0: ; %entry
1543; GFX10-WGP-NEXT:    s_clause 0x1
1544; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1545; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1546; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1547; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1548; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1549; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1550; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1551; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1552; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1553; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1554; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1555; GFX10-WGP-NEXT:    buffer_gl0_inv
1556; GFX10-WGP-NEXT:    buffer_gl1_inv
1557; GFX10-WGP-NEXT:    s_endpgm
1558;
1559; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
1560; GFX10-CU:       ; %bb.0: ; %entry
1561; GFX10-CU-NEXT:    s_clause 0x1
1562; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1563; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1564; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1565; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1566; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1567; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1568; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1569; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1570; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1571; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1572; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1573; GFX10-CU-NEXT:    buffer_gl0_inv
1574; GFX10-CU-NEXT:    buffer_gl1_inv
1575; GFX10-CU-NEXT:    s_endpgm
1576;
1577; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_cmpxchg:
1578; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1579; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1580; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1581; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1582; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1583; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1584; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1585; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1586; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1587; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1588; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1589; SKIP-CACHE-INV-NEXT:    s_endpgm
1590    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1591entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement, which the
  ; checks show either as an immediate offset of 16 or as an s_add_u32 of 16.
1592  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, and the expected atomics carry no glc modifier.
1593  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic
1594  ret void
1595}
1596
; Volatile cmpxchg on an i32 in addrspace(1) with acquire as both the success
; and the failure ordering. No syncscope is written and the result is unused.
; In the autogenerated checks below, no prefix expects an s_waitcnt directly
; before the atomic. After the atomic every prefix expects a wait. The GFX6,
; GFX7 and GFX10 prefixes then expect cache invalidation (buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv followed by buffer_gl1_inv), while the
; SKIP-CACHE-INV prefix expects the kernel to end right after the wait.
1597define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
1598; GFX6-LABEL: global_system_acquire_acquire_cmpxchg:
1599; GFX6:       ; %bb.0: ; %entry
1600; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1601; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1602; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1603; GFX6-NEXT:    s_mov_b32 s6, -1
1604; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1605; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1606; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1607; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1608; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1609; GFX6-NEXT:    buffer_wbinvl1
1610; GFX6-NEXT:    s_endpgm
1611;
1612; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
1613; GFX7:       ; %bb.0: ; %entry
1614; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1615; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1616; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1617; GFX7-NEXT:    s_add_u32 s0, s0, 16
1618; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1619; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1620; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1621; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1622; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1623; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1624; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1625; GFX7-NEXT:    buffer_wbinvl1_vol
1626; GFX7-NEXT:    s_endpgm
1627;
1628; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
1629; GFX10-WGP:       ; %bb.0: ; %entry
1630; GFX10-WGP-NEXT:    s_clause 0x1
1631; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1632; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1633; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1634; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1635; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1636; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1637; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1638; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1639; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1640; GFX10-WGP-NEXT:    buffer_gl0_inv
1641; GFX10-WGP-NEXT:    buffer_gl1_inv
1642; GFX10-WGP-NEXT:    s_endpgm
1643;
1644; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg:
1645; GFX10-CU:       ; %bb.0: ; %entry
1646; GFX10-CU-NEXT:    s_clause 0x1
1647; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1648; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1649; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1650; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1651; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1652; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1653; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1654; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1655; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1656; GFX10-CU-NEXT:    buffer_gl0_inv
1657; GFX10-CU-NEXT:    buffer_gl1_inv
1658; GFX10-CU-NEXT:    s_endpgm
1659;
1660; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_cmpxchg:
1661; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1662; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1663; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1664; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1665; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1666; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1667; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1668; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1669; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1670; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1671; SKIP-CACHE-INV-NEXT:    s_endpgm
1672    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1673entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement, which the
  ; checks show either as an immediate offset of 16 or as an s_add_u32 of 16.
1674  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, and the expected atomics carry no glc modifier.
1675  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire
1676  ret void
1677}
1678
; Volatile cmpxchg on an i32 in addrspace(1) with success ordering release and
; failure ordering acquire. No syncscope is written and the result is unused.
; In the autogenerated checks below, every prefix expects an s_waitcnt sequence
; both before and after the atomic. The GFX6, GFX7 and GFX10 prefixes then
; expect cache invalidation (buffer_wbinvl1, buffer_wbinvl1_vol, or
; buffer_gl0_inv followed by buffer_gl1_inv), while the SKIP-CACHE-INV prefix
; expects the kernel to end right after the wait.
1679define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
1680; GFX6-LABEL: global_system_release_acquire_cmpxchg:
1681; GFX6:       ; %bb.0: ; %entry
1682; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1683; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1684; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1685; GFX6-NEXT:    s_mov_b32 s6, -1
1686; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1687; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1688; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1689; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1690; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1691; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1692; GFX6-NEXT:    buffer_wbinvl1
1693; GFX6-NEXT:    s_endpgm
1694;
1695; GFX7-LABEL: global_system_release_acquire_cmpxchg:
1696; GFX7:       ; %bb.0: ; %entry
1697; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1698; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1699; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1700; GFX7-NEXT:    s_add_u32 s0, s0, 16
1701; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1702; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1703; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1704; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1705; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1706; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1707; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1708; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1709; GFX7-NEXT:    buffer_wbinvl1_vol
1710; GFX7-NEXT:    s_endpgm
1711;
1712; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg:
1713; GFX10-WGP:       ; %bb.0: ; %entry
1714; GFX10-WGP-NEXT:    s_clause 0x1
1715; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1716; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1717; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1718; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1720; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1721; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1722; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1723; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1724; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1725; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1726; GFX10-WGP-NEXT:    buffer_gl0_inv
1727; GFX10-WGP-NEXT:    buffer_gl1_inv
1728; GFX10-WGP-NEXT:    s_endpgm
1729;
1730; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg:
1731; GFX10-CU:       ; %bb.0: ; %entry
1732; GFX10-CU-NEXT:    s_clause 0x1
1733; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1734; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1735; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1736; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1737; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1738; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1739; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1740; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1741; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1742; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1743; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1744; GFX10-CU-NEXT:    buffer_gl0_inv
1745; GFX10-CU-NEXT:    buffer_gl1_inv
1746; GFX10-CU-NEXT:    s_endpgm
1747;
1748; SKIP-CACHE-INV-LABEL: global_system_release_acquire_cmpxchg:
1749; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1750; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1751; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1752; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1753; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1754; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1755; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1756; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1757; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1758; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1759; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1760; SKIP-CACHE-INV-NEXT:    s_endpgm
1761    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1762entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement, which the
  ; checks show either as an immediate offset of 16 or as an s_add_u32 of 16.
1763  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, and the expected atomics carry no glc modifier.
1764  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire
1765  ret void
1766}
1767
; Volatile cmpxchg on an i32 in addrspace(1) with success ordering acq_rel and
; failure ordering acquire. No syncscope is written and the result is unused.
; In the autogenerated checks below, every prefix expects an s_waitcnt sequence
; both before and after the atomic. The GFX6, GFX7 and GFX10 prefixes then
; expect cache invalidation (buffer_wbinvl1, buffer_wbinvl1_vol, or
; buffer_gl0_inv followed by buffer_gl1_inv), while the SKIP-CACHE-INV prefix
; expects the kernel to end right after the wait.
1768define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
1769; GFX6-LABEL: global_system_acq_rel_acquire_cmpxchg:
1770; GFX6:       ; %bb.0: ; %entry
1771; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1772; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1773; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1774; GFX6-NEXT:    s_mov_b32 s6, -1
1775; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1776; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1777; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1778; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1779; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1780; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1781; GFX6-NEXT:    buffer_wbinvl1
1782; GFX6-NEXT:    s_endpgm
1783;
1784; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
1785; GFX7:       ; %bb.0: ; %entry
1786; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1787; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1788; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1789; GFX7-NEXT:    s_add_u32 s0, s0, 16
1790; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1791; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1792; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1793; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1794; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1795; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1796; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1797; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1798; GFX7-NEXT:    buffer_wbinvl1_vol
1799; GFX7-NEXT:    s_endpgm
1800;
1801; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
1802; GFX10-WGP:       ; %bb.0: ; %entry
1803; GFX10-WGP-NEXT:    s_clause 0x1
1804; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1805; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1806; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1807; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1808; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1809; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1810; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1811; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1812; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1813; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1814; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1815; GFX10-WGP-NEXT:    buffer_gl0_inv
1816; GFX10-WGP-NEXT:    buffer_gl1_inv
1817; GFX10-WGP-NEXT:    s_endpgm
1818;
1819; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
1820; GFX10-CU:       ; %bb.0: ; %entry
1821; GFX10-CU-NEXT:    s_clause 0x1
1822; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1823; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1824; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1825; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1826; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1827; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1828; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1829; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1830; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1831; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1832; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1833; GFX10-CU-NEXT:    buffer_gl0_inv
1834; GFX10-CU-NEXT:    buffer_gl1_inv
1835; GFX10-CU-NEXT:    s_endpgm
1836;
1837; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_cmpxchg:
1838; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1839; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1840; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1841; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1842; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1843; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1844; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1845; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1846; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1847; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1848; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1849; SKIP-CACHE-INV-NEXT:    s_endpgm
1850    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1851entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement, which the
  ; checks show either as an immediate offset of 16 or as an s_add_u32 of 16.
1852  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, and the expected atomics carry no glc modifier.
1853  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire
1854  ret void
1855}
1856
; Volatile cmpxchg on an i32 in addrspace(1) with success ordering seq_cst and
; failure ordering acquire. No syncscope is written and the result is unused.
; In the autogenerated checks below, every prefix expects an s_waitcnt sequence
; both before and after the atomic. The GFX6, GFX7 and GFX10 prefixes then
; expect cache invalidation (buffer_wbinvl1, buffer_wbinvl1_vol, or
; buffer_gl0_inv followed by buffer_gl1_inv), while the SKIP-CACHE-INV prefix
; expects the kernel to end right after the wait.
1857define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
1858; GFX6-LABEL: global_system_seq_cst_acquire_cmpxchg:
1859; GFX6:       ; %bb.0: ; %entry
1860; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1861; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1862; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1863; GFX6-NEXT:    s_mov_b32 s6, -1
1864; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1865; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1866; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1867; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1868; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1869; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1870; GFX6-NEXT:    buffer_wbinvl1
1871; GFX6-NEXT:    s_endpgm
1872;
1873; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
1874; GFX7:       ; %bb.0: ; %entry
1875; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1876; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1877; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1878; GFX7-NEXT:    s_add_u32 s0, s0, 16
1879; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1880; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1881; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1882; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1883; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1884; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1885; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1886; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1887; GFX7-NEXT:    buffer_wbinvl1_vol
1888; GFX7-NEXT:    s_endpgm
1889;
1890; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
1891; GFX10-WGP:       ; %bb.0: ; %entry
1892; GFX10-WGP-NEXT:    s_clause 0x1
1893; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1894; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1895; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1896; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1897; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1898; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1899; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1900; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1901; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1902; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1903; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1904; GFX10-WGP-NEXT:    buffer_gl0_inv
1905; GFX10-WGP-NEXT:    buffer_gl1_inv
1906; GFX10-WGP-NEXT:    s_endpgm
1907;
1908; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
1909; GFX10-CU:       ; %bb.0: ; %entry
1910; GFX10-CU-NEXT:    s_clause 0x1
1911; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1912; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1913; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
1914; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1915; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
1916; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
1917; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1918; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1919; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1920; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
1921; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
1922; GFX10-CU-NEXT:    buffer_gl0_inv
1923; GFX10-CU-NEXT:    buffer_gl1_inv
1924; GFX10-CU-NEXT:    s_endpgm
1925;
1926; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_cmpxchg:
1927; SKIP-CACHE-INV:       ; %bb.0: ; %entry
1928; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1929; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1930; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
1931; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
1932; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
1933; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
1934; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
1935; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1936; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1937; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1938; SKIP-CACHE-INV-NEXT:    s_endpgm
1939    i32 addrspace(1)* %out, i32 %in, i32 %old) {
1940entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement, which the
  ; checks show either as an immediate offset of 16 or as an s_add_u32 of 16.
1941  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, and the expected atomics carry no glc modifier.
1942  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire
1943  ret void
1944}
1945
; Volatile cmpxchg on an i32 in addrspace(1) with seq_cst as both the success
; and the failure ordering. No syncscope is written and the result is unused.
; In the autogenerated checks below, every prefix expects an s_waitcnt sequence
; both before and after the atomic. The GFX6, GFX7 and GFX10 prefixes then
; expect cache invalidation (buffer_wbinvl1, buffer_wbinvl1_vol, or
; buffer_gl0_inv followed by buffer_gl1_inv), while the SKIP-CACHE-INV prefix
; expects the kernel to end right after the wait.
1946define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
1947; GFX6-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
1948; GFX6:       ; %bb.0: ; %entry
1949; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1950; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
1951; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1952; GFX6-NEXT:    s_mov_b32 s6, -1
1953; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1954; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1955; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1956; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1957; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
1958; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1959; GFX6-NEXT:    buffer_wbinvl1
1960; GFX6-NEXT:    s_endpgm
1961;
1962; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
1963; GFX7:       ; %bb.0: ; %entry
1964; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1965; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
1966; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1967; GFX7-NEXT:    s_add_u32 s0, s0, 16
1968; GFX7-NEXT:    s_addc_u32 s1, s1, 0
1969; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1970; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1971; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1972; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1973; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1974; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
1975; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1976; GFX7-NEXT:    buffer_wbinvl1_vol
1977; GFX7-NEXT:    s_endpgm
1978;
1979; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
1980; GFX10-WGP:       ; %bb.0: ; %entry
1981; GFX10-WGP-NEXT:    s_clause 0x1
1982; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1983; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1984; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
1985; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1986; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
1987; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
1988; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1989; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1990; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
1991; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1992; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
1993; GFX10-WGP-NEXT:    buffer_gl0_inv
1994; GFX10-WGP-NEXT:    buffer_gl1_inv
1995; GFX10-WGP-NEXT:    s_endpgm
1996;
1997; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
1998; GFX10-CU:       ; %bb.0: ; %entry
1999; GFX10-CU-NEXT:    s_clause 0x1
2000; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2001; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2002; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2003; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2004; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2005; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2006; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2007; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2008; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
2009; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2010; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2011; GFX10-CU-NEXT:    buffer_gl0_inv
2012; GFX10-CU-NEXT:    buffer_gl1_inv
2013; GFX10-CU-NEXT:    s_endpgm
2014;
2015; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
2016; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2017; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2018; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2019; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2020; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2021; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2022; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2023; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2024; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2025; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
2026; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2027; SKIP-CACHE-INV-NEXT:    s_endpgm
2028    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2029entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement, which the
  ; checks show either as an immediate offset of 16 or as an s_add_u32 of 16.
2030  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; %val is never read, and the expected atomics carry no glc modifier.
2031  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
2032  ret void
2033}
2034
; Returning variant. A volatile cmpxchg on an i32 in addrspace(1) with success
; ordering acquire and failure ordering monotonic, whose loaded value (field 0
; of the result pair) is stored back to %out. No syncscope is written.
; In the autogenerated checks below, every prefix expects the atomic to carry
; the glc modifier, expects no s_waitcnt directly before it, and expects a wait
; after it. The GFX6, GFX7 and GFX10 prefixes then expect cache invalidation
; (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv followed by
; buffer_gl1_inv) ahead of the final store, while the SKIP-CACHE-INV prefix
; expects the store to follow the wait directly.
2035define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
2036; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2037; GFX6:       ; %bb.0: ; %entry
2038; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2039; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2040; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2041; GFX6-NEXT:    s_mov_b32 s6, -1
2042; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2043; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2044; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2045; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2046; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2047; GFX6-NEXT:    buffer_wbinvl1
2048; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2049; GFX6-NEXT:    s_endpgm
2050;
2051; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2052; GFX7:       ; %bb.0: ; %entry
2053; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2054; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2055; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2056; GFX7-NEXT:    s_add_u32 s4, s0, 16
2057; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2058; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2059; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2060; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2061; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2062; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2063; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2064; GFX7-NEXT:    buffer_wbinvl1_vol
2065; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2066; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2067; GFX7-NEXT:    flat_store_dword v[0:1], v2
2068; GFX7-NEXT:    s_endpgm
2069;
2070; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2071; GFX10-WGP:       ; %bb.0: ; %entry
2072; GFX10-WGP-NEXT:    s_clause 0x1
2073; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2074; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2075; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2076; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2077; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2078; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2079; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2080; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2081; GFX10-WGP-NEXT:    buffer_gl0_inv
2082; GFX10-WGP-NEXT:    buffer_gl1_inv
2083; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2084; GFX10-WGP-NEXT:    s_endpgm
2085;
2086; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2087; GFX10-CU:       ; %bb.0: ; %entry
2088; GFX10-CU-NEXT:    s_clause 0x1
2089; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2090; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2091; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2092; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2093; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2094; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2095; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2096; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2097; GFX10-CU-NEXT:    buffer_gl0_inv
2098; GFX10-CU-NEXT:    buffer_gl1_inv
2099; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2100; GFX10-CU-NEXT:    s_endpgm
2101;
2102; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
2103; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2104; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2105; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2106; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2107; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2108; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2109; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2110; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2111; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2112; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2113; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2114; SKIP-CACHE-INV-NEXT:    s_endpgm
2115    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2116entry:
  ; Element index 4 of an i32 pointer is a 16-byte displacement, which the
  ; checks show either as an immediate offset of 16 or as an s_add_u32 of 16.
2117  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2118  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic
  ; Field 0 of the cmpxchg result is the i32 value. It is written to %out
  ; itself, not to the %gep address that the atomic operated on.
2119  %val0 = extractvalue { i32, i1 } %val, 0
2120  store i32 %val0, i32 addrspace(1)* %out, align 4
2121  ret void
2122}
2123
; Returning cmpxchg on global memory (addrspace(1)) with success ordering
; acq_rel and failure ordering monotonic; no syncscope is given on the
; instruction. The value loaded by the cmpxchg is written back to %out.
; Expected code, as pinned by the checks below:
;   - GFX6 / GFX7 runs expect s_waitcnt vmcnt(0) lgkmcnt(0) directly before and
;     directly after the glc atomic, then buffer_wbinvl1 (GFX6) or
;     buffer_wbinvl1_vol (GFX7).
;   - GFX10-WGP / GFX10-CU runs additionally expect s_waitcnt_vscnt null, 0x0
;     before the atomic, and buffer_gl0_inv + buffer_gl1_inv after it.
;   - The SKIP-CACHE-INV run expects the same waits as GFX6 but no cache
;     invalidate instruction.
2124define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
2125; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2126; GFX6:       ; %bb.0: ; %entry
2127; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2128; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2129; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2130; GFX6-NEXT:    s_mov_b32 s6, -1
2131; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2132; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2133; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2134; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2135; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2136; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2137; GFX6-NEXT:    buffer_wbinvl1
2138; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2139; GFX6-NEXT:    s_endpgm
2140;
2141; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2142; GFX7:       ; %bb.0: ; %entry
2143; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2144; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2145; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2146; GFX7-NEXT:    s_add_u32 s4, s0, 16
2147; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2148; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2149; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2150; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2151; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2152; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2153; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2154; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2155; GFX7-NEXT:    buffer_wbinvl1_vol
2156; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2157; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2158; GFX7-NEXT:    flat_store_dword v[0:1], v2
2159; GFX7-NEXT:    s_endpgm
2160;
2161; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2162; GFX10-WGP:       ; %bb.0: ; %entry
2163; GFX10-WGP-NEXT:    s_clause 0x1
2164; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2165; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2166; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2167; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2168; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2169; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2170; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2171; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2172; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2173; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2174; GFX10-WGP-NEXT:    buffer_gl0_inv
2175; GFX10-WGP-NEXT:    buffer_gl1_inv
2176; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2177; GFX10-WGP-NEXT:    s_endpgm
2178;
2179; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2180; GFX10-CU:       ; %bb.0: ; %entry
2181; GFX10-CU-NEXT:    s_clause 0x1
2182; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2183; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2184; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2185; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2186; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2187; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2188; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2189; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2190; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2191; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2192; GFX10-CU-NEXT:    buffer_gl0_inv
2193; GFX10-CU-NEXT:    buffer_gl1_inv
2194; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2195; GFX10-CU-NEXT:    s_endpgm
2196;
2197; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
2198; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2199; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2200; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2201; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2202; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2203; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2204; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2205; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2206; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2207; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2208; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2209; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2210; SKIP-CACHE-INV-NEXT:    s_endpgm
2211    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2212entry:
  ; %gep is %out advanced by 4 i32 elements; this is the 16-byte offset seen in
  ; the expected atomics above.
2213  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2214  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic
  ; Element 0 of the cmpxchg result is the value that was loaded from %gep.
2215  %val0 = extractvalue { i32, i1 } %val, 0
2216  store i32 %val0, i32 addrspace(1)* %out, align 4
2217  ret void
2218}
2219
; Returning cmpxchg on global memory (addrspace(1)) with success ordering
; seq_cst and failure ordering monotonic; no syncscope is given on the
; instruction. The value loaded by the cmpxchg is written back to %out.
; Expected code, as pinned by the checks below:
;   - GFX6 / GFX7 runs expect s_waitcnt vmcnt(0) lgkmcnt(0) directly before and
;     directly after the glc atomic, then buffer_wbinvl1 (GFX6) or
;     buffer_wbinvl1_vol (GFX7).
;   - GFX10-WGP / GFX10-CU runs additionally expect s_waitcnt_vscnt null, 0x0
;     before the atomic, and buffer_gl0_inv + buffer_gl1_inv after it.
;   - The SKIP-CACHE-INV run expects the same waits as GFX6 but no cache
;     invalidate instruction.
2220define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
2221; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2222; GFX6:       ; %bb.0: ; %entry
2223; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2224; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2225; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2226; GFX6-NEXT:    s_mov_b32 s6, -1
2227; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2228; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2229; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2230; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2231; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2232; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2233; GFX6-NEXT:    buffer_wbinvl1
2234; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2235; GFX6-NEXT:    s_endpgm
2236;
2237; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2238; GFX7:       ; %bb.0: ; %entry
2239; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2240; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2241; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2242; GFX7-NEXT:    s_add_u32 s4, s0, 16
2243; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2244; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2245; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2246; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2247; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2248; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2249; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2250; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2251; GFX7-NEXT:    buffer_wbinvl1_vol
2252; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2253; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2254; GFX7-NEXT:    flat_store_dword v[0:1], v2
2255; GFX7-NEXT:    s_endpgm
2256;
2257; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2258; GFX10-WGP:       ; %bb.0: ; %entry
2259; GFX10-WGP-NEXT:    s_clause 0x1
2260; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2261; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2262; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2263; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2264; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2265; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2266; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2267; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2268; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2269; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2270; GFX10-WGP-NEXT:    buffer_gl0_inv
2271; GFX10-WGP-NEXT:    buffer_gl1_inv
2272; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2273; GFX10-WGP-NEXT:    s_endpgm
2274;
2275; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2276; GFX10-CU:       ; %bb.0: ; %entry
2277; GFX10-CU-NEXT:    s_clause 0x1
2278; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2279; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2280; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2281; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2282; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2283; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2284; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2285; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2286; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2287; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2288; GFX10-CU-NEXT:    buffer_gl0_inv
2289; GFX10-CU-NEXT:    buffer_gl1_inv
2290; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2291; GFX10-CU-NEXT:    s_endpgm
2292;
2293; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
2294; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2295; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2296; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2297; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2298; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2299; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2300; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2301; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2302; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2303; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2304; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2305; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2306; SKIP-CACHE-INV-NEXT:    s_endpgm
2307    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2308entry:
  ; %gep is %out advanced by 4 i32 elements; this is the 16-byte offset seen in
  ; the expected atomics above.
2309  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2310  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic
  ; Element 0 of the cmpxchg result is the value that was loaded from %gep.
2311  %val0 = extractvalue { i32, i1 } %val, 0
2312  store i32 %val0, i32 addrspace(1)* %out, align 4
2313  ret void
2314}
2315
; Returning cmpxchg on global memory (addrspace(1)) with success ordering
; acquire and failure ordering acquire; no syncscope is given on the
; instruction. The value loaded by the cmpxchg is written back to %out.
; Expected code, as pinned by the checks below:
;   - No s_waitcnt vmcnt(0) lgkmcnt(0) is expected directly before the glc
;     atomic for any run (unlike the orderings with a release component).
;   - After the atomic, every run expects s_waitcnt vmcnt(0) lgkmcnt(0).
;   - Cache invalidate after that wait is buffer_wbinvl1 on GFX6,
;     buffer_wbinvl1_vol on GFX7, buffer_gl0_inv + buffer_gl1_inv on both GFX10
;     runs, and absent for the SKIP-CACHE-INV run.
2316define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
2317; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg:
2318; GFX6:       ; %bb.0: ; %entry
2319; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2320; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2321; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2322; GFX6-NEXT:    s_mov_b32 s6, -1
2323; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2324; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2325; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2326; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2327; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2328; GFX6-NEXT:    buffer_wbinvl1
2329; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2330; GFX6-NEXT:    s_endpgm
2331;
2332; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
2333; GFX7:       ; %bb.0: ; %entry
2334; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2335; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2336; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2337; GFX7-NEXT:    s_add_u32 s4, s0, 16
2338; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2339; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2340; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2341; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2342; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2343; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2344; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2345; GFX7-NEXT:    buffer_wbinvl1_vol
2346; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2347; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2348; GFX7-NEXT:    flat_store_dword v[0:1], v2
2349; GFX7-NEXT:    s_endpgm
2350;
2351; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
2352; GFX10-WGP:       ; %bb.0: ; %entry
2353; GFX10-WGP-NEXT:    s_clause 0x1
2354; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2355; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2356; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2357; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2358; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2359; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2360; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2361; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2362; GFX10-WGP-NEXT:    buffer_gl0_inv
2363; GFX10-WGP-NEXT:    buffer_gl1_inv
2364; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2365; GFX10-WGP-NEXT:    s_endpgm
2366;
2367; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
2368; GFX10-CU:       ; %bb.0: ; %entry
2369; GFX10-CU-NEXT:    s_clause 0x1
2370; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2371; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2372; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2373; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2374; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2375; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2376; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2377; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2378; GFX10-CU-NEXT:    buffer_gl0_inv
2379; GFX10-CU-NEXT:    buffer_gl1_inv
2380; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2381; GFX10-CU-NEXT:    s_endpgm
2382;
2383; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg:
2384; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2385; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2386; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2387; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2388; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2389; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2390; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2391; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2392; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2393; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2394; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2395; SKIP-CACHE-INV-NEXT:    s_endpgm
2396    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2397entry:
  ; %gep is %out advanced by 4 i32 elements; this is the 16-byte offset seen in
  ; the expected atomics above.
2398  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2399  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire
  ; Element 0 of the cmpxchg result is the value that was loaded from %gep.
2400  %val0 = extractvalue { i32, i1 } %val, 0
2401  store i32 %val0, i32 addrspace(1)* %out, align 4
2402  ret void
2403}
2404
; Returning cmpxchg on global memory (addrspace(1)) with success ordering
; release and failure ordering acquire; no syncscope is given on the
; instruction. The value loaded by the cmpxchg is written back to %out.
; Expected code, as pinned by the checks below:
;   - GFX6 / GFX7 runs expect s_waitcnt vmcnt(0) lgkmcnt(0) directly before and
;     directly after the glc atomic, then buffer_wbinvl1 (GFX6) or
;     buffer_wbinvl1_vol (GFX7).
;   - GFX10-WGP / GFX10-CU runs additionally expect s_waitcnt_vscnt null, 0x0
;     before the atomic, and buffer_gl0_inv + buffer_gl1_inv after it.
;   - The SKIP-CACHE-INV run expects the same waits as GFX6 but no cache
;     invalidate instruction.
2405define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
2406; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg:
2407; GFX6:       ; %bb.0: ; %entry
2408; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2409; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2410; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2411; GFX6-NEXT:    s_mov_b32 s6, -1
2412; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2413; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2414; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2415; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2416; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2417; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2418; GFX6-NEXT:    buffer_wbinvl1
2419; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2420; GFX6-NEXT:    s_endpgm
2421;
2422; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
2423; GFX7:       ; %bb.0: ; %entry
2424; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2425; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2426; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2427; GFX7-NEXT:    s_add_u32 s4, s0, 16
2428; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2429; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2430; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2431; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2432; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2433; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2434; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2435; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2436; GFX7-NEXT:    buffer_wbinvl1_vol
2437; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2438; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2439; GFX7-NEXT:    flat_store_dword v[0:1], v2
2440; GFX7-NEXT:    s_endpgm
2441;
2442; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
2443; GFX10-WGP:       ; %bb.0: ; %entry
2444; GFX10-WGP-NEXT:    s_clause 0x1
2445; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2446; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2447; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2448; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2449; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2450; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2451; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2452; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2453; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2454; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2455; GFX10-WGP-NEXT:    buffer_gl0_inv
2456; GFX10-WGP-NEXT:    buffer_gl1_inv
2457; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2458; GFX10-WGP-NEXT:    s_endpgm
2459;
2460; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
2461; GFX10-CU:       ; %bb.0: ; %entry
2462; GFX10-CU-NEXT:    s_clause 0x1
2463; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2464; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2465; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2466; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2467; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2468; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2469; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2470; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2471; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2472; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2473; GFX10-CU-NEXT:    buffer_gl0_inv
2474; GFX10-CU-NEXT:    buffer_gl1_inv
2475; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2476; GFX10-CU-NEXT:    s_endpgm
2477;
2478; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg:
2479; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2480; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2481; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2482; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2483; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2484; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2485; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2486; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2487; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2488; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2489; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2490; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2491; SKIP-CACHE-INV-NEXT:    s_endpgm
2492    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2493entry:
  ; %gep is %out advanced by 4 i32 elements; this is the 16-byte offset seen in
  ; the expected atomics above.
2494  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2495  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire
  ; Element 0 of the cmpxchg result is the value that was loaded from %gep.
2496  %val0 = extractvalue { i32, i1 } %val, 0
2497  store i32 %val0, i32 addrspace(1)* %out, align 4
2498  ret void
2499}
2500
; Returning cmpxchg on global memory (addrspace(1)) with success ordering
; acq_rel and failure ordering acquire; no syncscope is given on the
; instruction. The value loaded by the cmpxchg is written back to %out.
; Expected code, as pinned by the checks below:
;   - GFX6 / GFX7 runs expect s_waitcnt vmcnt(0) lgkmcnt(0) directly before and
;     directly after the glc atomic, then buffer_wbinvl1 (GFX6) or
;     buffer_wbinvl1_vol (GFX7).
;   - GFX10-WGP / GFX10-CU runs additionally expect s_waitcnt_vscnt null, 0x0
;     before the atomic, and buffer_gl0_inv + buffer_gl1_inv after it.
;   - The SKIP-CACHE-INV run expects the same waits as GFX6 but no cache
;     invalidate instruction.
2501define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
2502; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
2503; GFX6:       ; %bb.0: ; %entry
2504; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2505; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2506; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2507; GFX6-NEXT:    s_mov_b32 s6, -1
2508; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2509; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2510; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2511; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2512; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2513; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2514; GFX6-NEXT:    buffer_wbinvl1
2515; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2516; GFX6-NEXT:    s_endpgm
2517;
2518; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
2519; GFX7:       ; %bb.0: ; %entry
2520; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2521; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2522; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2523; GFX7-NEXT:    s_add_u32 s4, s0, 16
2524; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2525; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2526; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2527; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2528; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2529; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2530; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2531; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2532; GFX7-NEXT:    buffer_wbinvl1_vol
2533; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2534; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2535; GFX7-NEXT:    flat_store_dword v[0:1], v2
2536; GFX7-NEXT:    s_endpgm
2537;
2538; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
2539; GFX10-WGP:       ; %bb.0: ; %entry
2540; GFX10-WGP-NEXT:    s_clause 0x1
2541; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2542; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2543; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2544; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2545; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2546; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2547; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2548; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2549; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2550; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2551; GFX10-WGP-NEXT:    buffer_gl0_inv
2552; GFX10-WGP-NEXT:    buffer_gl1_inv
2553; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2554; GFX10-WGP-NEXT:    s_endpgm
2555;
2556; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
2557; GFX10-CU:       ; %bb.0: ; %entry
2558; GFX10-CU-NEXT:    s_clause 0x1
2559; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2560; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2561; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2562; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2563; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2564; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2565; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2566; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2567; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2568; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2569; GFX10-CU-NEXT:    buffer_gl0_inv
2570; GFX10-CU-NEXT:    buffer_gl1_inv
2571; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2572; GFX10-CU-NEXT:    s_endpgm
2573;
2574; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
2575; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2576; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2577; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2578; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2579; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2580; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2581; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2582; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2583; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2584; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2585; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2586; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2587; SKIP-CACHE-INV-NEXT:    s_endpgm
2588    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2589entry:
  ; %gep is %out advanced by 4 i32 elements; this is the 16-byte offset seen in
  ; the expected atomics above.
2590  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2591  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire
  ; Element 0 of the cmpxchg result is the value that was loaded from %gep.
2592  %val0 = extractvalue { i32, i1 } %val, 0
2593  store i32 %val0, i32 addrspace(1)* %out, align 4
2594  ret void
2595}
2596
; Returning cmpxchg on global memory (addrspace(1)) with success ordering
; seq_cst and failure ordering acquire; no syncscope is given on the
; instruction. The value loaded by the cmpxchg is written back to %out.
; Expected code, as pinned by the checks below:
;   - GFX6 / GFX7 runs expect s_waitcnt vmcnt(0) lgkmcnt(0) directly before and
;     directly after the glc atomic, then buffer_wbinvl1 (GFX6) or
;     buffer_wbinvl1_vol (GFX7).
;   - GFX10-WGP / GFX10-CU runs additionally expect s_waitcnt_vscnt null, 0x0
;     before the atomic, and buffer_gl0_inv + buffer_gl1_inv after it.
;   - The SKIP-CACHE-INV run expects the same waits as GFX6 but no cache
;     invalidate instruction.
2597define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
2598; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
2599; GFX6:       ; %bb.0: ; %entry
2600; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2601; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2602; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2603; GFX6-NEXT:    s_mov_b32 s6, -1
2604; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2605; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2606; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2607; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2608; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2609; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2610; GFX6-NEXT:    buffer_wbinvl1
2611; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2612; GFX6-NEXT:    s_endpgm
2613;
2614; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
2615; GFX7:       ; %bb.0: ; %entry
2616; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2617; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2618; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2619; GFX7-NEXT:    s_add_u32 s4, s0, 16
2620; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2621; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2622; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2623; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2624; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2625; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2626; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2627; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2628; GFX7-NEXT:    buffer_wbinvl1_vol
2629; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2630; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2631; GFX7-NEXT:    flat_store_dword v[0:1], v2
2632; GFX7-NEXT:    s_endpgm
2633;
2634; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
2635; GFX10-WGP:       ; %bb.0: ; %entry
2636; GFX10-WGP-NEXT:    s_clause 0x1
2637; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2638; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2639; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2640; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2641; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2642; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2643; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2644; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2645; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2646; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2647; GFX10-WGP-NEXT:    buffer_gl0_inv
2648; GFX10-WGP-NEXT:    buffer_gl1_inv
2649; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2650; GFX10-WGP-NEXT:    s_endpgm
2651;
2652; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
2653; GFX10-CU:       ; %bb.0: ; %entry
2654; GFX10-CU-NEXT:    s_clause 0x1
2655; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2656; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2657; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2658; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2659; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2660; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2661; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2662; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2663; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2664; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2665; GFX10-CU-NEXT:    buffer_gl0_inv
2666; GFX10-CU-NEXT:    buffer_gl1_inv
2667; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2668; GFX10-CU-NEXT:    s_endpgm
2669;
2670; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
2671; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2672; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2673; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2674; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2675; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2676; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2677; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2678; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2679; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2680; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2681; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2682; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2683; SKIP-CACHE-INV-NEXT:    s_endpgm
2684    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2685entry:
  ; %gep is %out advanced by 4 i32 elements; this is the 16-byte offset seen in
  ; the expected atomics above.
2686  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2687  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire
  ; Element 0 of the cmpxchg result is the value that was loaded from %gep.
2688  %val0 = extractvalue { i32, i1 } %val, 0
2689  store i32 %val0, i32 addrspace(1)* %out, align 4
2690  ret void
2691}
2692
; Returning cmpxchg on global memory (addrspace(1)) with success ordering
; seq_cst and failure ordering seq_cst; no syncscope is given on the
; instruction. The value loaded by the cmpxchg is written back to %out.
; Expected code, as pinned by the checks below:
;   - GFX6 / GFX7 runs expect s_waitcnt vmcnt(0) lgkmcnt(0) directly before and
;     directly after the glc atomic, then buffer_wbinvl1 (GFX6) or
;     buffer_wbinvl1_vol (GFX7).
;   - GFX10-WGP / GFX10-CU runs additionally expect s_waitcnt_vscnt null, 0x0
;     before the atomic, and buffer_gl0_inv + buffer_gl1_inv after it.
;   - The SKIP-CACHE-INV run expects the same waits as GFX6 but no cache
;     invalidate instruction.
2693define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
2694; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
2695; GFX6:       ; %bb.0: ; %entry
2696; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2697; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2698; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2699; GFX6-NEXT:    s_mov_b32 s6, -1
2700; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2701; GFX6-NEXT:    v_mov_b32_e32 v0, s0
2702; GFX6-NEXT:    v_mov_b32_e32 v1, s1
2703; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2704; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2705; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2706; GFX6-NEXT:    buffer_wbinvl1
2707; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2708; GFX6-NEXT:    s_endpgm
2709;
2710; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
2711; GFX7:       ; %bb.0: ; %entry
2712; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2713; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
2714; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2715; GFX7-NEXT:    s_add_u32 s4, s0, 16
2716; GFX7-NEXT:    s_addc_u32 s5, s1, 0
2717; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2718; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2719; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2720; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2721; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2722; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
2723; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2724; GFX7-NEXT:    buffer_wbinvl1_vol
2725; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2726; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2727; GFX7-NEXT:    flat_store_dword v[0:1], v2
2728; GFX7-NEXT:    s_endpgm
2729;
2730; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
2731; GFX10-WGP:       ; %bb.0: ; %entry
2732; GFX10-WGP-NEXT:    s_clause 0x1
2733; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2734; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2735; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
2736; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2737; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
2738; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
2739; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2740; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
2741; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2742; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2743; GFX10-WGP-NEXT:    buffer_gl0_inv
2744; GFX10-WGP-NEXT:    buffer_gl1_inv
2745; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
2746; GFX10-WGP-NEXT:    s_endpgm
2747;
2748; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
2749; GFX10-CU:       ; %bb.0: ; %entry
2750; GFX10-CU-NEXT:    s_clause 0x1
2751; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2752; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2753; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
2754; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2755; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
2756; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
2757; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2758; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
2759; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
2760; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2761; GFX10-CU-NEXT:    buffer_gl0_inv
2762; GFX10-CU-NEXT:    buffer_gl1_inv
2763; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
2764; GFX10-CU-NEXT:    s_endpgm
2765;
2766; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
2767; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2768; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2769; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
2770; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
2771; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
2772; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2773; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
2774; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
2775; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2776; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
2777; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2778; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2779; SKIP-CACHE-INV-NEXT:    s_endpgm
2780    i32 addrspace(1)* %out, i32 %in, i32 %old) {
2781entry:
  ; %gep is %out advanced by 4 i32 elements; this is the 16-byte offset seen in
  ; the expected atomics above.
2782  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
2783  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
  ; Element 0 of the cmpxchg result is the value that was loaded from %gep.
2784  %val0 = extractvalue { i32, i1 } %val, 0
2785  store i32 %val0, i32 addrspace(1)* %out, align 4
2786  ret void
2787}
2788
; Atomic i32 load from global memory (addrspace(1)) with syncscope("one-as")
; and unordered ordering; the loaded value is then written to %out with a
; plain (non-atomic) store.
; Expected code, as pinned by the checks below: every run expects an ordinary
; load with no glc modifier, a single s_waitcnt vmcnt(0) ahead of the dependent
; store, and no cache invalidate instruction.
2789define amdgpu_kernel void @global_system_one_as_unordered_load(
2790; GFX6-LABEL: global_system_one_as_unordered_load:
2791; GFX6:       ; %bb.0: ; %entry
2792; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2793; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2794; GFX6-NEXT:    s_mov_b32 s2, -1
2795; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2796; GFX6-NEXT:    s_mov_b32 s0, s4
2797; GFX6-NEXT:    s_mov_b32 s1, s5
2798; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2799; GFX6-NEXT:    s_mov_b32 s4, s6
2800; GFX6-NEXT:    s_mov_b32 s5, s7
2801; GFX6-NEXT:    s_mov_b32 s6, s2
2802; GFX6-NEXT:    s_mov_b32 s7, s3
2803; GFX6-NEXT:    s_waitcnt vmcnt(0)
2804; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2805; GFX6-NEXT:    s_endpgm
2806;
2807; GFX7-LABEL: global_system_one_as_unordered_load:
2808; GFX7:       ; %bb.0: ; %entry
2809; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2810; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2811; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2812; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2813; GFX7-NEXT:    flat_load_dword v0, v[0:1]
2814; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2815; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2816; GFX7-NEXT:    s_waitcnt vmcnt(0)
2817; GFX7-NEXT:    flat_store_dword v[2:3], v0
2818; GFX7-NEXT:    s_endpgm
2819;
2820; GFX10-WGP-LABEL: global_system_one_as_unordered_load:
2821; GFX10-WGP:       ; %bb.0: ; %entry
2822; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2823; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2824; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2825; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1]
2826; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2827; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
2828; GFX10-WGP-NEXT:    s_endpgm
2829;
2830; GFX10-CU-LABEL: global_system_one_as_unordered_load:
2831; GFX10-CU:       ; %bb.0: ; %entry
2832; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2833; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2834; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2835; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1]
2836; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2837; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
2838; GFX10-CU-NEXT:    s_endpgm
2839;
2840; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load:
2841; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2842; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2843; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
2844; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
2845; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2846; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2847; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2848; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0
2849; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
2850; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
2851; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
2852; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
2853; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2854; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2855; SKIP-CACHE-INV-NEXT:    s_endpgm
2856    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
2857entry:
2858  %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4
  ; The store of the loaded value carries no atomic ordering or syncscope.
2859  store i32 %val, i32 addrspace(1)* %out
2860  ret void
2861}
2862
; Monotonic atomic i32 load from global memory (addrspace 1) at syncscope
; "one-as"; the loaded value is then written to %out with a plain store.
; Expected output per the checks below: the load carries "glc" on the
; gfx600/gfx700 runs and "glc dlc" on both gfx1010 runs, and no
; cache-invalidate instruction follows it.
2863define amdgpu_kernel void @global_system_one_as_monotonic_load(
2864; GFX6-LABEL: global_system_one_as_monotonic_load:
2865; GFX6:       ; %bb.0: ; %entry
2866; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2867; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2868; GFX6-NEXT:    s_mov_b32 s2, -1
2869; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2870; GFX6-NEXT:    s_mov_b32 s0, s4
2871; GFX6-NEXT:    s_mov_b32 s1, s5
2872; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
2873; GFX6-NEXT:    s_mov_b32 s4, s6
2874; GFX6-NEXT:    s_mov_b32 s5, s7
2875; GFX6-NEXT:    s_mov_b32 s6, s2
2876; GFX6-NEXT:    s_mov_b32 s7, s3
2877; GFX6-NEXT:    s_waitcnt vmcnt(0)
2878; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2879; GFX6-NEXT:    s_endpgm
2880;
2881; GFX7-LABEL: global_system_one_as_monotonic_load:
2882; GFX7:       ; %bb.0: ; %entry
2883; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2884; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2885; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2886; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2887; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
2888; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2889; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2890; GFX7-NEXT:    s_waitcnt vmcnt(0)
2891; GFX7-NEXT:    flat_store_dword v[2:3], v0
2892; GFX7-NEXT:    s_endpgm
2893;
2894; GFX10-WGP-LABEL: global_system_one_as_monotonic_load:
2895; GFX10-WGP:       ; %bb.0: ; %entry
2896; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2897; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2898; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2899; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
2900; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2901; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
2902; GFX10-WGP-NEXT:    s_endpgm
2903;
2904; GFX10-CU-LABEL: global_system_one_as_monotonic_load:
2905; GFX10-CU:       ; %bb.0: ; %entry
2906; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2907; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2908; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2909; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
2910; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2911; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
2912; GFX10-CU-NEXT:    s_endpgm
2913;
2914; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load:
2915; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2916; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2917; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
2918; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
2919; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
2920; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
2921; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
2922; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
2923; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
2924; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
2925; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
2926; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
2927; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
2928; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2929; SKIP-CACHE-INV-NEXT:    s_endpgm
2930    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
2931entry:
  ; Operation under test: atomic load with monotonic ordering.
2932  %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4
  ; Non-atomic store of the loaded value to %out.
2933  store i32 %val, i32 addrspace(1)* %out
2934  ret void
2935}
2936
; Acquire atomic i32 load from global memory (addrspace 1) at syncscope
; "one-as"; the loaded value is then written to %out with a plain store.
; Expected output per the checks below: the load ("glc", or "glc dlc" on
; gfx1010) is followed by "s_waitcnt vmcnt(0)" and a cache invalidate:
; buffer_wbinvl1 on gfx600, buffer_wbinvl1_vol on gfx700/amdhsa, and
; buffer_gl0_inv + buffer_gl1_inv on both gfx1010 runs. The run with
; -amdgcn-skip-cache-invalidations keeps the wait but has no invalidate.
2937define amdgpu_kernel void @global_system_one_as_acquire_load(
2938; GFX6-LABEL: global_system_one_as_acquire_load:
2939; GFX6:       ; %bb.0: ; %entry
2940; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2941; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2942; GFX6-NEXT:    s_mov_b32 s2, -1
2943; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2944; GFX6-NEXT:    s_mov_b32 s0, s4
2945; GFX6-NEXT:    s_mov_b32 s1, s5
2946; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
2947; GFX6-NEXT:    s_waitcnt vmcnt(0)
2948; GFX6-NEXT:    buffer_wbinvl1
2949; GFX6-NEXT:    s_mov_b32 s4, s6
2950; GFX6-NEXT:    s_mov_b32 s5, s7
2951; GFX6-NEXT:    s_mov_b32 s6, s2
2952; GFX6-NEXT:    s_mov_b32 s7, s3
2953; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2954; GFX6-NEXT:    s_endpgm
2955;
2956; GFX7-LABEL: global_system_one_as_acquire_load:
2957; GFX7:       ; %bb.0: ; %entry
2958; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2959; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2960; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2961; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2962; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
2963; GFX7-NEXT:    s_waitcnt vmcnt(0)
2964; GFX7-NEXT:    buffer_wbinvl1_vol
2965; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2966; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2967; GFX7-NEXT:    flat_store_dword v[2:3], v0
2968; GFX7-NEXT:    s_endpgm
2969;
2970; GFX10-WGP-LABEL: global_system_one_as_acquire_load:
2971; GFX10-WGP:       ; %bb.0: ; %entry
2972; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2973; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
2974; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
2975; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
2976; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
2977; GFX10-WGP-NEXT:    buffer_gl0_inv
2978; GFX10-WGP-NEXT:    buffer_gl1_inv
2979; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
2980; GFX10-WGP-NEXT:    s_endpgm
2981;
2982; GFX10-CU-LABEL: global_system_one_as_acquire_load:
2983; GFX10-CU:       ; %bb.0: ; %entry
2984; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2985; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
2986; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
2987; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
2988; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
2989; GFX10-CU-NEXT:    buffer_gl0_inv
2990; GFX10-CU-NEXT:    buffer_gl1_inv
2991; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
2992; GFX10-CU-NEXT:    s_endpgm
2993;
2994; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load:
2995; SKIP-CACHE-INV:       ; %bb.0: ; %entry
2996; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2997; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
2998; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
2999; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3000; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3001; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3002; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3003; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3004; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3005; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3006; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3007; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3008; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3009; SKIP-CACHE-INV-NEXT:    s_endpgm
3010    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3011entry:
  ; Operation under test: atomic load with acquire ordering.
3012  %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4
  ; Non-atomic store of the loaded value to %out.
3013  store i32 %val, i32 addrspace(1)* %out
3014  ret void
3015}
3016
; Sequentially consistent atomic i32 load from global memory (addrspace 1)
; at syncscope "one-as"; the loaded value is then written to %out with a
; plain store.
; Expected output per the checks below: a wait is placed BEFORE the load
; ("s_waitcnt vmcnt(0)" on gfx600/gfx700; "s_waitcnt vmcnt(0) lgkmcnt(0)"
; plus "s_waitcnt_vscnt null, 0x0" on gfx1010), and the load is followed by
; "s_waitcnt vmcnt(0)" and a cache invalidate (buffer_wbinvl1,
; buffer_wbinvl1_vol, or buffer_gl0_inv + buffer_gl1_inv). The run with
; -amdgcn-skip-cache-invalidations keeps both waits but has no invalidate.
3017define amdgpu_kernel void @global_system_one_as_seq_cst_load(
3018; GFX6-LABEL: global_system_one_as_seq_cst_load:
3019; GFX6:       ; %bb.0: ; %entry
3020; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3021; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3022; GFX6-NEXT:    s_mov_b32 s2, -1
3023; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3024; GFX6-NEXT:    s_mov_b32 s0, s4
3025; GFX6-NEXT:    s_mov_b32 s1, s5
3026; GFX6-NEXT:    s_waitcnt vmcnt(0)
3027; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3028; GFX6-NEXT:    s_waitcnt vmcnt(0)
3029; GFX6-NEXT:    buffer_wbinvl1
3030; GFX6-NEXT:    s_mov_b32 s4, s6
3031; GFX6-NEXT:    s_mov_b32 s5, s7
3032; GFX6-NEXT:    s_mov_b32 s6, s2
3033; GFX6-NEXT:    s_mov_b32 s7, s3
3034; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3035; GFX6-NEXT:    s_endpgm
3036;
3037; GFX7-LABEL: global_system_one_as_seq_cst_load:
3038; GFX7:       ; %bb.0: ; %entry
3039; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3040; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3041; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3042; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3043; GFX7-NEXT:    s_waitcnt vmcnt(0)
3044; GFX7-NEXT:    flat_load_dword v0, v[0:1] glc
3045; GFX7-NEXT:    s_waitcnt vmcnt(0)
3046; GFX7-NEXT:    buffer_wbinvl1_vol
3047; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3048; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3049; GFX7-NEXT:    flat_store_dword v[2:3], v0
3050; GFX7-NEXT:    s_endpgm
3051;
3052; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load:
3053; GFX10-WGP:       ; %bb.0: ; %entry
3054; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3055; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3056; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3057; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3058; GFX10-WGP-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3059; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3060; GFX10-WGP-NEXT:    buffer_gl0_inv
3061; GFX10-WGP-NEXT:    buffer_gl1_inv
3062; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
3063; GFX10-WGP-NEXT:    s_endpgm
3064;
3065; GFX10-CU-LABEL: global_system_one_as_seq_cst_load:
3066; GFX10-CU:       ; %bb.0: ; %entry
3067; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3068; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3069; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3070; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3071; GFX10-CU-NEXT:    global_load_dword v1, v0, s[0:1] glc dlc
3072; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3073; GFX10-CU-NEXT:    buffer_gl0_inv
3074; GFX10-CU-NEXT:    buffer_gl1_inv
3075; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
3076; GFX10-CU-NEXT:    s_endpgm
3077;
3078; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load:
3079; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3080; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3081; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3082; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3083; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3084; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s4
3085; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s5
3086; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3087; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
3088; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3089; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s6
3090; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s7
3091; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, s2
3092; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
3093; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3094; SKIP-CACHE-INV-NEXT:    s_endpgm
3095    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
3096entry:
  ; Operation under test: atomic load with seq_cst ordering.
3097  %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4
  ; Non-atomic store of the loaded value to %out.
3098  store i32 %val, i32 addrspace(1)* %out
3099  ret void
3100}
3101
; Unordered atomic i32 store of the kernel argument %in to global memory
; (addrspace 1) at syncscope "one-as".
; Expected output per the checks below: a single store instruction
; (buffer_store_dword / flat_store_dword / global_store_dword) preceded only
; by "s_waitcnt lgkmcnt(0)"; no vmcnt/vscnt wait is expected before it.
3102define amdgpu_kernel void @global_system_one_as_unordered_store(
3103; GFX6-LABEL: global_system_one_as_unordered_store:
3104; GFX6:       ; %bb.0: ; %entry
3105; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
3106; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3107; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3108; GFX6-NEXT:    s_mov_b32 s2, -1
3109; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3110; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3111; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3112; GFX6-NEXT:    s_endpgm
3113;
3114; GFX7-LABEL: global_system_one_as_unordered_store:
3115; GFX7:       ; %bb.0: ; %entry
3116; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3117; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3118; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3119; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3120; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3121; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3122; GFX7-NEXT:    flat_store_dword v[0:1], v2
3123; GFX7-NEXT:    s_endpgm
3124;
3125; GFX10-WGP-LABEL: global_system_one_as_unordered_store:
3126; GFX10-WGP:       ; %bb.0: ; %entry
3127; GFX10-WGP-NEXT:    s_clause 0x1
3128; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3129; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3130; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3131; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3132; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3133; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3134; GFX10-WGP-NEXT:    s_endpgm
3135;
3136; GFX10-CU-LABEL: global_system_one_as_unordered_store:
3137; GFX10-CU:       ; %bb.0: ; %entry
3138; GFX10-CU-NEXT:    s_clause 0x1
3139; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3140; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3141; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3142; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3143; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3144; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3145; GFX10-CU-NEXT:    s_endpgm
3146;
3147; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_store:
3148; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3149; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
3150; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3151; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3152; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3153; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3154; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3155; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3156; SKIP-CACHE-INV-NEXT:    s_endpgm
3157    i32 %in, i32 addrspace(1)* %out) {
3158entry:
  ; Operation under test: atomic store with unordered ordering.
3159  store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4
3160  ret void
3161}
3162
; Monotonic atomic i32 store of the kernel argument %in to global memory
; (addrspace 1) at syncscope "one-as".
; Expected output per the checks below: a single store instruction
; (buffer_store_dword / flat_store_dword / global_store_dword) preceded only
; by "s_waitcnt lgkmcnt(0)"; no vmcnt/vscnt wait is expected before it.
3163define amdgpu_kernel void @global_system_one_as_monotonic_store(
3164; GFX6-LABEL: global_system_one_as_monotonic_store:
3165; GFX6:       ; %bb.0: ; %entry
3166; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
3167; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3168; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3169; GFX6-NEXT:    s_mov_b32 s2, -1
3170; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3171; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3172; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3173; GFX6-NEXT:    s_endpgm
3174;
3175; GFX7-LABEL: global_system_one_as_monotonic_store:
3176; GFX7:       ; %bb.0: ; %entry
3177; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3178; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3179; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3180; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3181; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3182; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3183; GFX7-NEXT:    flat_store_dword v[0:1], v2
3184; GFX7-NEXT:    s_endpgm
3185;
3186; GFX10-WGP-LABEL: global_system_one_as_monotonic_store:
3187; GFX10-WGP:       ; %bb.0: ; %entry
3188; GFX10-WGP-NEXT:    s_clause 0x1
3189; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3190; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3191; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3192; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3193; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3194; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3195; GFX10-WGP-NEXT:    s_endpgm
3196;
3197; GFX10-CU-LABEL: global_system_one_as_monotonic_store:
3198; GFX10-CU:       ; %bb.0: ; %entry
3199; GFX10-CU-NEXT:    s_clause 0x1
3200; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3201; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3202; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3203; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3204; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3205; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3206; GFX10-CU-NEXT:    s_endpgm
3207;
3208; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_store:
3209; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3210; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
3211; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3212; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3213; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3214; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3215; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3216; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3217; SKIP-CACHE-INV-NEXT:    s_endpgm
3218    i32 %in, i32 addrspace(1)* %out) {
3219entry:
  ; Operation under test: atomic store with monotonic ordering.
3220  store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4
3221  ret void
3222}
3223
; Release atomic i32 store of the kernel argument %in to global memory
; (addrspace 1) at syncscope "one-as".
; Expected output per the checks below: "s_waitcnt vmcnt(0)" is placed
; immediately before the store on the gfx600/gfx700 runs; on both gfx1010
; runs "s_waitcnt vmcnt(0)" and "s_waitcnt_vscnt null, 0x0" precede it.
; Nothing is expected between the store and s_endpgm.
3224define amdgpu_kernel void @global_system_one_as_release_store(
3225; GFX6-LABEL: global_system_one_as_release_store:
3226; GFX6:       ; %bb.0: ; %entry
3227; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
3228; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3229; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3230; GFX6-NEXT:    s_mov_b32 s2, -1
3231; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3232; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3233; GFX6-NEXT:    s_waitcnt vmcnt(0)
3234; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3235; GFX6-NEXT:    s_endpgm
3236;
3237; GFX7-LABEL: global_system_one_as_release_store:
3238; GFX7:       ; %bb.0: ; %entry
3239; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3240; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3241; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3242; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3243; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3244; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3245; GFX7-NEXT:    s_waitcnt vmcnt(0)
3246; GFX7-NEXT:    flat_store_dword v[0:1], v2
3247; GFX7-NEXT:    s_endpgm
3248;
3249; GFX10-WGP-LABEL: global_system_one_as_release_store:
3250; GFX10-WGP:       ; %bb.0: ; %entry
3251; GFX10-WGP-NEXT:    s_clause 0x1
3252; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3253; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3254; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3255; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3256; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3257; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3258; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3259; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3260; GFX10-WGP-NEXT:    s_endpgm
3261;
3262; GFX10-CU-LABEL: global_system_one_as_release_store:
3263; GFX10-CU:       ; %bb.0: ; %entry
3264; GFX10-CU-NEXT:    s_clause 0x1
3265; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3266; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3267; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3268; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3269; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3270; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3271; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3272; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3273; GFX10-CU-NEXT:    s_endpgm
3274;
3275; SKIP-CACHE-INV-LABEL: global_system_one_as_release_store:
3276; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3277; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
3278; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3279; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3280; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3281; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3282; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3283; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3284; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3285; SKIP-CACHE-INV-NEXT:    s_endpgm
3286    i32 %in, i32 addrspace(1)* %out) {
3287entry:
  ; Operation under test: atomic store with release ordering.
3288  store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4
3289  ret void
3290}
3291
; Sequentially consistent atomic i32 store of the kernel argument %in to
; global memory (addrspace 1) at syncscope "one-as".
; Expected output per the checks below: "s_waitcnt vmcnt(0)" is placed
; immediately before the store on the gfx600/gfx700 runs; on both gfx1010
; runs "s_waitcnt vmcnt(0)" and "s_waitcnt_vscnt null, 0x0" precede it.
; Nothing is expected between the store and s_endpgm.
3292define amdgpu_kernel void @global_system_one_as_seq_cst_store(
3293; GFX6-LABEL: global_system_one_as_seq_cst_store:
3294; GFX6:       ; %bb.0: ; %entry
3295; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
3296; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3297; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3298; GFX6-NEXT:    s_mov_b32 s2, -1
3299; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3300; GFX6-NEXT:    v_mov_b32_e32 v0, s4
3301; GFX6-NEXT:    s_waitcnt vmcnt(0)
3302; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3303; GFX6-NEXT:    s_endpgm
3304;
3305; GFX7-LABEL: global_system_one_as_seq_cst_store:
3306; GFX7:       ; %bb.0: ; %entry
3307; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
3308; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
3309; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3310; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3311; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3312; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3313; GFX7-NEXT:    s_waitcnt vmcnt(0)
3314; GFX7-NEXT:    flat_store_dword v[0:1], v2
3315; GFX7-NEXT:    s_endpgm
3316;
3317; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store:
3318; GFX10-WGP:       ; %bb.0: ; %entry
3319; GFX10-WGP-NEXT:    s_clause 0x1
3320; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
3321; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3322; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3323; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3324; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3325; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3326; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3327; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3328; GFX10-WGP-NEXT:    s_endpgm
3329;
3330; GFX10-CU-LABEL: global_system_one_as_seq_cst_store:
3331; GFX10-CU:       ; %bb.0: ; %entry
3332; GFX10-CU-NEXT:    s_clause 0x1
3333; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
3334; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3335; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3336; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3337; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3338; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3339; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3340; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3341; GFX10-CU-NEXT:    s_endpgm
3342;
3343; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_store:
3344; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3345; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
3346; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3347; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
3348; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
3349; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3350; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
3351; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3352; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3353; SKIP-CACHE-INV-NEXT:    s_endpgm
3354    i32 %in, i32 addrspace(1)* %out) {
3355entry:
  ; Operation under test: atomic store with seq_cst ordering.
3356  store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4
3357  ret void
3358}
3359
; Monotonic volatile atomicrmw xchg of %in into global memory (addrspace 1)
; at syncscope "one-as"; the returned old value (%val) is not used.
; Expected output per the checks below: a single swap instruction
; (buffer_atomic_swap / flat_atomic_swap / global_atomic_swap) preceded only
; by "s_waitcnt lgkmcnt(0)", with no vmcnt/vscnt wait and no cache
; invalidate after it.
3360define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
3361; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw:
3362; GFX6:       ; %bb.0: ; %entry
3363; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3364; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3365; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3366; GFX6-NEXT:    s_mov_b32 s6, -1
3367; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3368; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3369; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3370; GFX6-NEXT:    s_endpgm
3371;
3372; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
3373; GFX7:       ; %bb.0: ; %entry
3374; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3375; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3376; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3377; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3378; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3379; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3380; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3381; GFX7-NEXT:    s_endpgm
3382;
3383; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
3384; GFX10-WGP:       ; %bb.0: ; %entry
3385; GFX10-WGP-NEXT:    s_clause 0x1
3386; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3387; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3388; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3389; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3390; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3391; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3392; GFX10-WGP-NEXT:    s_endpgm
3393;
3394; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
3395; GFX10-CU:       ; %bb.0: ; %entry
3396; GFX10-CU-NEXT:    s_clause 0x1
3397; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3398; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3399; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3400; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3401; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3402; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3403; GFX10-CU-NEXT:    s_endpgm
3404;
3405; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw:
3406; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3407; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3408; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3409; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3410; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3411; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3412; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3413; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3414; SKIP-CACHE-INV-NEXT:    s_endpgm
3415    i32 addrspace(1)* %out, i32 %in) {
3416entry:
  ; Operation under test: atomic exchange with monotonic ordering.
3417  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic
3418  ret void
3419}
3420
; Acquire volatile atomicrmw xchg of %in into global memory (addrspace 1) at
; syncscope "one-as"; the returned old value (%val) is not used.
; Expected output per the checks below: the swap is followed by a wait
; ("s_waitcnt vmcnt(0)" on gfx600/gfx700, "s_waitcnt_vscnt null, 0x0" on
; gfx1010) and then a cache invalidate: buffer_wbinvl1 on gfx600,
; buffer_wbinvl1_vol on gfx700/amdhsa, buffer_gl0_inv + buffer_gl1_inv on
; both gfx1010 runs. The run with -amdgcn-skip-cache-invalidations keeps the
; wait but has no invalidate.
3421define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
3422; GFX6-LABEL: global_system_one_as_acquire_atomicrmw:
3423; GFX6:       ; %bb.0: ; %entry
3424; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3425; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3426; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3427; GFX6-NEXT:    s_mov_b32 s6, -1
3428; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3429; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3430; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3431; GFX6-NEXT:    s_waitcnt vmcnt(0)
3432; GFX6-NEXT:    buffer_wbinvl1
3433; GFX6-NEXT:    s_endpgm
3434;
3435; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
3436; GFX7:       ; %bb.0: ; %entry
3437; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3438; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3439; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3440; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3441; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3442; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3443; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3444; GFX7-NEXT:    s_waitcnt vmcnt(0)
3445; GFX7-NEXT:    buffer_wbinvl1_vol
3446; GFX7-NEXT:    s_endpgm
3447;
3448; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
3449; GFX10-WGP:       ; %bb.0: ; %entry
3450; GFX10-WGP-NEXT:    s_clause 0x1
3451; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3452; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3453; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3454; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3455; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3456; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3457; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3458; GFX10-WGP-NEXT:    buffer_gl0_inv
3459; GFX10-WGP-NEXT:    buffer_gl1_inv
3460; GFX10-WGP-NEXT:    s_endpgm
3461;
3462; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw:
3463; GFX10-CU:       ; %bb.0: ; %entry
3464; GFX10-CU-NEXT:    s_clause 0x1
3465; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3466; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3467; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3468; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3469; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3470; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3471; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3472; GFX10-CU-NEXT:    buffer_gl0_inv
3473; GFX10-CU-NEXT:    buffer_gl1_inv
3474; GFX10-CU-NEXT:    s_endpgm
3475;
3476; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw:
3477; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3478; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3479; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3480; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3481; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3482; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3483; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3484; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3485; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3486; SKIP-CACHE-INV-NEXT:    s_endpgm
3487    i32 addrspace(1)* %out, i32 %in) {
3488entry:
  ; Operation under test: atomic exchange with acquire ordering.
3489  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
3490  ret void
3491}
3492
; Release volatile atomicrmw xchg of %in into global memory (addrspace 1) at
; syncscope "one-as"; the returned old value (%val) is not used.
; Expected output per the checks below: "s_waitcnt vmcnt(0)" is placed
; immediately before the swap on the gfx600/gfx700 runs; on both gfx1010
; runs "s_waitcnt vmcnt(0)" and "s_waitcnt_vscnt null, 0x0" precede it.
; Nothing is expected between the swap and s_endpgm.
3493define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
3494; GFX6-LABEL: global_system_one_as_release_atomicrmw:
3495; GFX6:       ; %bb.0: ; %entry
3496; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3497; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3498; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3499; GFX6-NEXT:    s_mov_b32 s6, -1
3500; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3501; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3502; GFX6-NEXT:    s_waitcnt vmcnt(0)
3503; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3504; GFX6-NEXT:    s_endpgm
3505;
3506; GFX7-LABEL: global_system_one_as_release_atomicrmw:
3507; GFX7:       ; %bb.0: ; %entry
3508; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3509; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3510; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3511; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3512; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3513; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3514; GFX7-NEXT:    s_waitcnt vmcnt(0)
3515; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3516; GFX7-NEXT:    s_endpgm
3517;
3518; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw:
3519; GFX10-WGP:       ; %bb.0: ; %entry
3520; GFX10-WGP-NEXT:    s_clause 0x1
3521; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3522; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3523; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3524; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3525; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3526; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3527; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3528; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3529; GFX10-WGP-NEXT:    s_endpgm
3530;
3531; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw:
3532; GFX10-CU:       ; %bb.0: ; %entry
3533; GFX10-CU-NEXT:    s_clause 0x1
3534; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3535; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3536; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3537; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3538; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3539; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3540; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3541; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3542; GFX10-CU-NEXT:    s_endpgm
3543;
3544; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw:
3545; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3546; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3547; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3548; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3549; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3550; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3551; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3552; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3553; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3554; SKIP-CACHE-INV-NEXT:    s_endpgm
3555    i32 addrspace(1)* %out, i32 %in) {
3556entry:
  ; Operation under test: atomic exchange with release ordering.
3557  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release
3558  ret void
3559}
3560
; Test: acq_rel "atomicrmw volatile xchg" on a global (addrspace(1)) pointer
; with syncscope("one-as"); the value returned by the atomic is unused.
; Every check prefix expects an s_waitcnt before the swap and another after it.
; All prefixes except SKIP-CACHE-INV additionally expect invalidate
; instructions (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv plus
; buffer_gl1_inv) between the trailing wait and s_endpgm.
3561define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
3562; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw:
3563; GFX6:       ; %bb.0: ; %entry
3564; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3565; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3566; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3567; GFX6-NEXT:    s_mov_b32 s6, -1
3568; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3569; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3570; GFX6-NEXT:    s_waitcnt vmcnt(0)
3571; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3572; GFX6-NEXT:    s_waitcnt vmcnt(0)
3573; GFX6-NEXT:    buffer_wbinvl1
3574; GFX6-NEXT:    s_endpgm
3575;
3576; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
3577; GFX7:       ; %bb.0: ; %entry
3578; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3579; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3580; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3581; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3582; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3583; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3584; GFX7-NEXT:    s_waitcnt vmcnt(0)
3585; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3586; GFX7-NEXT:    s_waitcnt vmcnt(0)
3587; GFX7-NEXT:    buffer_wbinvl1_vol
3588; GFX7-NEXT:    s_endpgm
3589;
3590; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
3591; GFX10-WGP:       ; %bb.0: ; %entry
3592; GFX10-WGP-NEXT:    s_clause 0x1
3593; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3594; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3595; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3596; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3597; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3598; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3599; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3600; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3601; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3602; GFX10-WGP-NEXT:    buffer_gl0_inv
3603; GFX10-WGP-NEXT:    buffer_gl1_inv
3604; GFX10-WGP-NEXT:    s_endpgm
3605;
3606; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
3607; GFX10-CU:       ; %bb.0: ; %entry
3608; GFX10-CU-NEXT:    s_clause 0x1
3609; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3610; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3611; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3612; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3613; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3614; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3615; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3616; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3617; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3618; GFX10-CU-NEXT:    buffer_gl0_inv
3619; GFX10-CU-NEXT:    buffer_gl1_inv
3620; GFX10-CU-NEXT:    s_endpgm
3621;
3622; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw:
3623; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3624; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3625; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3626; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3627; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3628; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3629; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3630; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3631; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3632; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3633; SKIP-CACHE-INV-NEXT:    s_endpgm
3634    i32 addrspace(1)* %out, i32 %in) {
3635entry:
3636  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
3637  ret void
3638}
3639
; Test: seq_cst "atomicrmw volatile xchg" on a global (addrspace(1)) pointer
; with syncscope("one-as"); the value returned by the atomic is unused.
; Every check prefix expects an s_waitcnt before the swap and another after it.
; All prefixes except SKIP-CACHE-INV additionally expect invalidate
; instructions (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv plus
; buffer_gl1_inv) between the trailing wait and s_endpgm.
3640define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
3641; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw:
3642; GFX6:       ; %bb.0: ; %entry
3643; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3644; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3645; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3646; GFX6-NEXT:    s_mov_b32 s6, -1
3647; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3648; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3649; GFX6-NEXT:    s_waitcnt vmcnt(0)
3650; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3651; GFX6-NEXT:    s_waitcnt vmcnt(0)
3652; GFX6-NEXT:    buffer_wbinvl1
3653; GFX6-NEXT:    s_endpgm
3654;
3655; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
3656; GFX7:       ; %bb.0: ; %entry
3657; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3658; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3659; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3660; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3661; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3662; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3663; GFX7-NEXT:    s_waitcnt vmcnt(0)
3664; GFX7-NEXT:    flat_atomic_swap v[0:1], v2
3665; GFX7-NEXT:    s_waitcnt vmcnt(0)
3666; GFX7-NEXT:    buffer_wbinvl1_vol
3667; GFX7-NEXT:    s_endpgm
3668;
3669; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
3670; GFX10-WGP:       ; %bb.0: ; %entry
3671; GFX10-WGP-NEXT:    s_clause 0x1
3672; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3673; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3674; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3675; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3676; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3677; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3678; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3679; GFX10-WGP-NEXT:    global_atomic_swap v0, v1, s[0:1]
3680; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3681; GFX10-WGP-NEXT:    buffer_gl0_inv
3682; GFX10-WGP-NEXT:    buffer_gl1_inv
3683; GFX10-WGP-NEXT:    s_endpgm
3684;
3685; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
3686; GFX10-CU:       ; %bb.0: ; %entry
3687; GFX10-CU-NEXT:    s_clause 0x1
3688; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3689; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3690; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3691; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3692; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3693; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3694; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3695; GFX10-CU-NEXT:    global_atomic_swap v0, v1, s[0:1]
3696; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3697; GFX10-CU-NEXT:    buffer_gl0_inv
3698; GFX10-CU-NEXT:    buffer_gl1_inv
3699; GFX10-CU-NEXT:    s_endpgm
3700;
3701; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw:
3702; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3703; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3704; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3705; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3706; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3707; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3708; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3709; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3710; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0
3711; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3712; SKIP-CACHE-INV-NEXT:    s_endpgm
3713    i32 addrspace(1)* %out, i32 %in) {
3714entry:
3715  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
3716  ret void
3717}
3718
; Test: acquire "atomicrmw volatile xchg" on a global (addrspace(1)) pointer
; with syncscope("one-as"); the value returned by the atomic is stored back
; to %out, so the checks expect the swap to carry the glc modifier.
; No vmcnt wait is expected before the swap; every prefix expects
; s_waitcnt vmcnt(0) after it. All prefixes except SKIP-CACHE-INV then expect
; invalidate instructions before the final store.
3719define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
3720; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw:
3721; GFX6:       ; %bb.0: ; %entry
3722; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3723; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3724; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3725; GFX6-NEXT:    s_mov_b32 s6, -1
3726; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3727; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3728; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3729; GFX6-NEXT:    s_waitcnt vmcnt(0)
3730; GFX6-NEXT:    buffer_wbinvl1
3731; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3732; GFX6-NEXT:    s_endpgm
3733;
3734; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
3735; GFX7:       ; %bb.0: ; %entry
3736; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3737; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3738; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3739; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3740; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3741; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3742; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3743; GFX7-NEXT:    s_waitcnt vmcnt(0)
3744; GFX7-NEXT:    buffer_wbinvl1_vol
3745; GFX7-NEXT:    flat_store_dword v[0:1], v2
3746; GFX7-NEXT:    s_endpgm
3747;
3748; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
3749; GFX10-WGP:       ; %bb.0: ; %entry
3750; GFX10-WGP-NEXT:    s_clause 0x1
3751; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3752; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3753; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3754; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3755; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3756; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3757; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3758; GFX10-WGP-NEXT:    buffer_gl0_inv
3759; GFX10-WGP-NEXT:    buffer_gl1_inv
3760; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3761; GFX10-WGP-NEXT:    s_endpgm
3762;
3763; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
3764; GFX10-CU:       ; %bb.0: ; %entry
3765; GFX10-CU-NEXT:    s_clause 0x1
3766; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3767; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3768; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3769; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3770; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3771; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3772; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3773; GFX10-CU-NEXT:    buffer_gl0_inv
3774; GFX10-CU-NEXT:    buffer_gl1_inv
3775; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3776; GFX10-CU-NEXT:    s_endpgm
3777;
3778; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw:
3779; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3780; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3781; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3782; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3783; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3784; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3785; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3786; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3787; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3788; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3789; SKIP-CACHE-INV-NEXT:    s_endpgm
3790    i32 addrspace(1)* %out, i32 %in) {
3791entry:
3792  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
3793  store i32 %val, i32 addrspace(1)* %out, align 4
3794  ret void
3795}
3796
; Test: acq_rel "atomicrmw volatile xchg" on a global (addrspace(1)) pointer
; with syncscope("one-as"); the value returned by the atomic is stored back
; to %out, so the checks expect the swap to carry the glc modifier.
; Every prefix expects an s_waitcnt before the swap and s_waitcnt vmcnt(0)
; after it. All prefixes except SKIP-CACHE-INV then expect invalidate
; instructions before the final store.
3797define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
3798; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
3799; GFX6:       ; %bb.0: ; %entry
3800; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3801; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3802; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3803; GFX6-NEXT:    s_mov_b32 s6, -1
3804; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3805; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3806; GFX6-NEXT:    s_waitcnt vmcnt(0)
3807; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3808; GFX6-NEXT:    s_waitcnt vmcnt(0)
3809; GFX6-NEXT:    buffer_wbinvl1
3810; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3811; GFX6-NEXT:    s_endpgm
3812;
3813; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
3814; GFX7:       ; %bb.0: ; %entry
3815; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3816; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3817; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3818; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3819; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3820; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3821; GFX7-NEXT:    s_waitcnt vmcnt(0)
3822; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3823; GFX7-NEXT:    s_waitcnt vmcnt(0)
3824; GFX7-NEXT:    buffer_wbinvl1_vol
3825; GFX7-NEXT:    flat_store_dword v[0:1], v2
3826; GFX7-NEXT:    s_endpgm
3827;
3828; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
3829; GFX10-WGP:       ; %bb.0: ; %entry
3830; GFX10-WGP-NEXT:    s_clause 0x1
3831; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3832; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3833; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3834; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3835; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3836; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3837; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3838; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3839; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3840; GFX10-WGP-NEXT:    buffer_gl0_inv
3841; GFX10-WGP-NEXT:    buffer_gl1_inv
3842; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3843; GFX10-WGP-NEXT:    s_endpgm
3844;
3845; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
3846; GFX10-CU:       ; %bb.0: ; %entry
3847; GFX10-CU-NEXT:    s_clause 0x1
3848; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3849; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3850; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3851; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3852; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3853; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3854; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3855; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3856; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3857; GFX10-CU-NEXT:    buffer_gl0_inv
3858; GFX10-CU-NEXT:    buffer_gl1_inv
3859; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3860; GFX10-CU-NEXT:    s_endpgm
3861;
3862; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
3863; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3864; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3865; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3866; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3867; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3868; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3869; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3870; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3871; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3872; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3873; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3874; SKIP-CACHE-INV-NEXT:    s_endpgm
3875    i32 addrspace(1)* %out, i32 %in) {
3876entry:
3877  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
3878  store i32 %val, i32 addrspace(1)* %out, align 4
3879  ret void
3880}
3881
; Test: seq_cst "atomicrmw volatile xchg" on a global (addrspace(1)) pointer
; with syncscope("one-as"); the value returned by the atomic is stored back
; to %out, so the checks expect the swap to carry the glc modifier.
; Every prefix expects an s_waitcnt before the swap and s_waitcnt vmcnt(0)
; after it. All prefixes except SKIP-CACHE-INV then expect invalidate
; instructions before the final store.
3882define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
3883; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
3884; GFX6:       ; %bb.0: ; %entry
3885; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3886; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3887; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3888; GFX6-NEXT:    s_mov_b32 s6, -1
3889; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3890; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3891; GFX6-NEXT:    s_waitcnt vmcnt(0)
3892; GFX6-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3893; GFX6-NEXT:    s_waitcnt vmcnt(0)
3894; GFX6-NEXT:    buffer_wbinvl1
3895; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3896; GFX6-NEXT:    s_endpgm
3897;
3898; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
3899; GFX7:       ; %bb.0: ; %entry
3900; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3901; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
3902; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3903; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3904; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3905; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3906; GFX7-NEXT:    s_waitcnt vmcnt(0)
3907; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc
3908; GFX7-NEXT:    s_waitcnt vmcnt(0)
3909; GFX7-NEXT:    buffer_wbinvl1_vol
3910; GFX7-NEXT:    flat_store_dword v[0:1], v2
3911; GFX7-NEXT:    s_endpgm
3912;
3913; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
3914; GFX10-WGP:       ; %bb.0: ; %entry
3915; GFX10-WGP-NEXT:    s_clause 0x1
3916; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
3917; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3918; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
3919; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
3920; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
3921; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3922; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
3923; GFX10-WGP-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3924; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
3925; GFX10-WGP-NEXT:    buffer_gl0_inv
3926; GFX10-WGP-NEXT:    buffer_gl1_inv
3927; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[0:1]
3928; GFX10-WGP-NEXT:    s_endpgm
3929;
3930; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
3931; GFX10-CU:       ; %bb.0: ; %entry
3932; GFX10-CU-NEXT:    s_clause 0x1
3933; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
3934; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3935; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
3936; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
3937; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
3938; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3939; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
3940; GFX10-CU-NEXT:    global_atomic_swap v1, v0, v1, s[0:1] glc
3941; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
3942; GFX10-CU-NEXT:    buffer_gl0_inv
3943; GFX10-CU-NEXT:    buffer_gl1_inv
3944; GFX10-CU-NEXT:    global_store_dword v0, v1, s[0:1]
3945; GFX10-CU-NEXT:    s_endpgm
3946;
3947; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
3948; SKIP-CACHE-INV:       ; %bb.0: ; %entry
3949; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3950; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
3951; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
3952; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
3953; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
3954; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
3955; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3956; SKIP-CACHE-INV-NEXT:    buffer_atomic_swap v0, off, s[4:7], 0 glc
3957; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
3958; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3959; SKIP-CACHE-INV-NEXT:    s_endpgm
3960    i32 addrspace(1)* %out, i32 %in) {
3961entry:
3962  %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
3963  store i32 %val, i32 addrspace(1)* %out, align 4
3964  ret void
3965}
3966
; Test: "cmpxchg volatile" with monotonic success and failure orderings and
; syncscope("one-as") on element 4 of the global (addrspace(1)) i32 pointer
; %out; the cmpxchg result is unused.
; The checks expect a 16-byte offset on the cmpswap (folded into the
; instruction, or added to the address with s_add_u32/s_addc_u32 for the flat
; form) and no vmcnt/vscnt waits or invalidate instructions around it; the
; expectations for SKIP-CACHE-INV are the same as for the gfx600 prefix.
3967define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
3968; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
3969; GFX6:       ; %bb.0: ; %entry
3970; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3971; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3972; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3973; GFX6-NEXT:    s_mov_b32 s6, -1
3974; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3975; GFX6-NEXT:    v_mov_b32_e32 v0, s0
3976; GFX6-NEXT:    v_mov_b32_e32 v1, s1
3977; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
3978; GFX6-NEXT:    s_endpgm
3979;
3980; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
3981; GFX7:       ; %bb.0: ; %entry
3982; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3983; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
3984; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3985; GFX7-NEXT:    s_add_u32 s0, s0, 16
3986; GFX7-NEXT:    s_addc_u32 s1, s1, 0
3987; GFX7-NEXT:    v_mov_b32_e32 v0, s0
3988; GFX7-NEXT:    v_mov_b32_e32 v2, s2
3989; GFX7-NEXT:    v_mov_b32_e32 v1, s1
3990; GFX7-NEXT:    v_mov_b32_e32 v3, s3
3991; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
3992; GFX7-NEXT:    s_endpgm
3993;
3994; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
3995; GFX10-WGP:       ; %bb.0: ; %entry
3996; GFX10-WGP-NEXT:    s_clause 0x1
3997; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
3998; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
3999; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4000; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4001; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4002; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4003; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4004; GFX10-WGP-NEXT:    s_endpgm
4005;
4006; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
4007; GFX10-CU:       ; %bb.0: ; %entry
4008; GFX10-CU-NEXT:    s_clause 0x1
4009; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4010; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4011; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4012; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4013; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4014; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4015; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4016; GFX10-CU-NEXT:    s_endpgm
4017;
4018; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
4019; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4020; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4021; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4022; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4023; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4024; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4025; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4026; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4027; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4028; SKIP-CACHE-INV-NEXT:    s_endpgm
4029    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4030entry:
4031  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4032  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
4033  ret void
4034}
4035
; Test: "cmpxchg volatile" with acquire success / monotonic failure orderings
; and syncscope("one-as") on element 4 of the global (addrspace(1)) i32
; pointer %out; the cmpxchg result is unused.
; The checks expect no vmcnt/vscnt wait before the cmpswap and a wait after it
; (s_waitcnt vmcnt(0), or s_waitcnt_vscnt for the gfx1010 prefixes). All
; prefixes except SKIP-CACHE-INV then expect invalidate instructions before
; s_endpgm.
4036define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
4037; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
4038; GFX6:       ; %bb.0: ; %entry
4039; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4040; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4041; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4042; GFX6-NEXT:    s_mov_b32 s6, -1
4043; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4044; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4045; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4046; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4047; GFX6-NEXT:    s_waitcnt vmcnt(0)
4048; GFX6-NEXT:    buffer_wbinvl1
4049; GFX6-NEXT:    s_endpgm
4050;
4051; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
4052; GFX7:       ; %bb.0: ; %entry
4053; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4054; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4055; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4056; GFX7-NEXT:    s_add_u32 s0, s0, 16
4057; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4058; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4059; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4060; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4061; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4062; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4063; GFX7-NEXT:    s_waitcnt vmcnt(0)
4064; GFX7-NEXT:    buffer_wbinvl1_vol
4065; GFX7-NEXT:    s_endpgm
4066;
4067; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
4068; GFX10-WGP:       ; %bb.0: ; %entry
4069; GFX10-WGP-NEXT:    s_clause 0x1
4070; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4071; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4072; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4073; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4074; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4075; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4076; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4077; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4078; GFX10-WGP-NEXT:    buffer_gl0_inv
4079; GFX10-WGP-NEXT:    buffer_gl1_inv
4080; GFX10-WGP-NEXT:    s_endpgm
4081;
4082; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
4083; GFX10-CU:       ; %bb.0: ; %entry
4084; GFX10-CU-NEXT:    s_clause 0x1
4085; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4086; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4087; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4088; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4089; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4090; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4091; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4092; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4093; GFX10-CU-NEXT:    buffer_gl0_inv
4094; GFX10-CU-NEXT:    buffer_gl1_inv
4095; GFX10-CU-NEXT:    s_endpgm
4096;
4097; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
4098; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4099; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4100; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4101; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4102; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4103; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4104; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4105; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4106; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4107; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4108; SKIP-CACHE-INV-NEXT:    s_endpgm
4109    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4110entry:
4111  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4112  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
4113  ret void
4114}
4115
; Test: "cmpxchg volatile" with release success / monotonic failure orderings
; and syncscope("one-as") on element 4 of the global (addrspace(1)) i32
; pointer %out; the cmpxchg result is unused.
; The checks expect a wait before the cmpswap (s_waitcnt vmcnt(0), plus
; s_waitcnt_vscnt for the gfx1010 prefixes) and nothing between the cmpswap
; and s_endpgm; the expectations for SKIP-CACHE-INV are the same as for the
; gfx600 prefix.
4116define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
4117; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg:
4118; GFX6:       ; %bb.0: ; %entry
4119; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4120; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4121; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4122; GFX6-NEXT:    s_mov_b32 s6, -1
4123; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4124; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4125; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4126; GFX6-NEXT:    s_waitcnt vmcnt(0)
4127; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4128; GFX6-NEXT:    s_endpgm
4129;
4130; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
4131; GFX7:       ; %bb.0: ; %entry
4132; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4133; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4134; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4135; GFX7-NEXT:    s_add_u32 s0, s0, 16
4136; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4137; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4138; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4139; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4140; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4141; GFX7-NEXT:    s_waitcnt vmcnt(0)
4142; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4143; GFX7-NEXT:    s_endpgm
4144;
4145; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
4146; GFX10-WGP:       ; %bb.0: ; %entry
4147; GFX10-WGP-NEXT:    s_clause 0x1
4148; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4149; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4150; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4151; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4152; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4153; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4154; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4155; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4156; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4157; GFX10-WGP-NEXT:    s_endpgm
4158;
4159; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
4160; GFX10-CU:       ; %bb.0: ; %entry
4161; GFX10-CU-NEXT:    s_clause 0x1
4162; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4163; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4164; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4165; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4166; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4167; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4168; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4169; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4170; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4171; GFX10-CU-NEXT:    s_endpgm
4172;
4173; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg:
4174; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4175; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4176; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4177; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4178; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4179; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4180; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4181; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4182; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4183; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4184; SKIP-CACHE-INV-NEXT:    s_endpgm
4185    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4186entry:
4187  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4188  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
4189  ret void
4190}
4191
; Test: "cmpxchg volatile" with acq_rel success / monotonic failure orderings
; and syncscope("one-as") on element 4 of the global (addrspace(1)) i32
; pointer %out; the cmpxchg result is unused.
; The checks expect a wait both before and after the cmpswap. All prefixes
; except SKIP-CACHE-INV additionally expect invalidate instructions
; (buffer_wbinvl1, buffer_wbinvl1_vol, or buffer_gl0_inv plus buffer_gl1_inv)
; between the trailing wait and s_endpgm.
4192define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
4193; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
4194; GFX6:       ; %bb.0: ; %entry
4195; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4196; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4197; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4198; GFX6-NEXT:    s_mov_b32 s6, -1
4199; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4200; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4201; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4202; GFX6-NEXT:    s_waitcnt vmcnt(0)
4203; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4204; GFX6-NEXT:    s_waitcnt vmcnt(0)
4205; GFX6-NEXT:    buffer_wbinvl1
4206; GFX6-NEXT:    s_endpgm
4207;
4208; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
4209; GFX7:       ; %bb.0: ; %entry
4210; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4211; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4212; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4213; GFX7-NEXT:    s_add_u32 s0, s0, 16
4214; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4215; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4216; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4217; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4218; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4219; GFX7-NEXT:    s_waitcnt vmcnt(0)
4220; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4221; GFX7-NEXT:    s_waitcnt vmcnt(0)
4222; GFX7-NEXT:    buffer_wbinvl1_vol
4223; GFX7-NEXT:    s_endpgm
4224;
4225; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
4226; GFX10-WGP:       ; %bb.0: ; %entry
4227; GFX10-WGP-NEXT:    s_clause 0x1
4228; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4229; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4230; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4231; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4232; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4233; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4234; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4235; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4236; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4237; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4238; GFX10-WGP-NEXT:    buffer_gl0_inv
4239; GFX10-WGP-NEXT:    buffer_gl1_inv
4240; GFX10-WGP-NEXT:    s_endpgm
4241;
4242; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
4243; GFX10-CU:       ; %bb.0: ; %entry
4244; GFX10-CU-NEXT:    s_clause 0x1
4245; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4246; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4247; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4248; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4249; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4250; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4251; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4252; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4253; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4254; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4255; GFX10-CU-NEXT:    buffer_gl0_inv
4256; GFX10-CU-NEXT:    buffer_gl1_inv
4257; GFX10-CU-NEXT:    s_endpgm
4258;
4259; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
4260; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4261; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4262; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4263; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4264; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4265; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4266; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4267; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4268; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4269; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4270; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4271; SKIP-CACHE-INV-NEXT:    s_endpgm
4272    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4273entry:
4274  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
4275  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
4276  ret void
4277}
4278
4279define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
4280; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
4281; GFX6:       ; %bb.0: ; %entry
4282; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4283; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4284; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4285; GFX6-NEXT:    s_mov_b32 s6, -1
4286; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4287; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4288; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4289; GFX6-NEXT:    s_waitcnt vmcnt(0)
4290; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4291; GFX6-NEXT:    s_waitcnt vmcnt(0)
4292; GFX6-NEXT:    buffer_wbinvl1
4293; GFX6-NEXT:    s_endpgm
4294;
4295; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
4296; GFX7:       ; %bb.0: ; %entry
4297; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4298; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4299; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4300; GFX7-NEXT:    s_add_u32 s0, s0, 16
4301; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4302; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4303; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4304; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4305; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4306; GFX7-NEXT:    s_waitcnt vmcnt(0)
4307; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4308; GFX7-NEXT:    s_waitcnt vmcnt(0)
4309; GFX7-NEXT:    buffer_wbinvl1_vol
4310; GFX7-NEXT:    s_endpgm
4311;
4312; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
4313; GFX10-WGP:       ; %bb.0: ; %entry
4314; GFX10-WGP-NEXT:    s_clause 0x1
4315; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4316; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4317; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4318; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4319; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4320; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4321; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4322; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4323; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4324; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4325; GFX10-WGP-NEXT:    buffer_gl0_inv
4326; GFX10-WGP-NEXT:    buffer_gl1_inv
4327; GFX10-WGP-NEXT:    s_endpgm
4328;
4329; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
4330; GFX10-CU:       ; %bb.0: ; %entry
4331; GFX10-CU-NEXT:    s_clause 0x1
4332; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4333; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4334; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4335; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4336; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4337; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4338; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4339; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4340; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4341; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4342; GFX10-CU-NEXT:    buffer_gl0_inv
4343; GFX10-CU-NEXT:    buffer_gl1_inv
4344; GFX10-CU-NEXT:    s_endpgm
4345;
4346; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
4347; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4348; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4349; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4350; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4351; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4352; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4353; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4354; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4355; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4356; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4357; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4358; SKIP-CACHE-INV-NEXT:    s_endpgm
4359    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4360entry:
  ; Exercises a cmpxchg on global memory (addrspace(1)) with
  ; syncscope("one-as"), success ordering seq_cst and failure ordering
  ; monotonic. The result %val is never used.
  ; The gep steps %out forward by 4 i32 elements; this appears as offset:16 in
  ; the buffer/global checks and as an s_add_u32 of 16 in the GFX7 flat checks.
4361  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; All check blocks expect a wait immediately before the atomic and another
  ; one right after it. Every block except the SKIP-CACHE-INV one then expects
  ; cache invalidate instruction(s) before s_endpgm.
4362  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
4363  ret void
4364}
4365
4366define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
4367; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
4368; GFX6:       ; %bb.0: ; %entry
4369; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4370; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4371; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4372; GFX6-NEXT:    s_mov_b32 s6, -1
4373; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4374; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4375; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4376; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4377; GFX6-NEXT:    s_waitcnt vmcnt(0)
4378; GFX6-NEXT:    buffer_wbinvl1
4379; GFX6-NEXT:    s_endpgm
4380;
4381; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
4382; GFX7:       ; %bb.0: ; %entry
4383; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4384; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4385; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4386; GFX7-NEXT:    s_add_u32 s0, s0, 16
4387; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4388; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4389; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4390; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4391; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4392; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4393; GFX7-NEXT:    s_waitcnt vmcnt(0)
4394; GFX7-NEXT:    buffer_wbinvl1_vol
4395; GFX7-NEXT:    s_endpgm
4396;
4397; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
4398; GFX10-WGP:       ; %bb.0: ; %entry
4399; GFX10-WGP-NEXT:    s_clause 0x1
4400; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4401; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4402; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4403; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4404; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4405; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4406; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4407; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4408; GFX10-WGP-NEXT:    buffer_gl0_inv
4409; GFX10-WGP-NEXT:    buffer_gl1_inv
4410; GFX10-WGP-NEXT:    s_endpgm
4411;
4412; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
4413; GFX10-CU:       ; %bb.0: ; %entry
4414; GFX10-CU-NEXT:    s_clause 0x1
4415; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4416; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4417; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4418; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4419; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4420; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4421; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4422; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4423; GFX10-CU-NEXT:    buffer_gl0_inv
4424; GFX10-CU-NEXT:    buffer_gl1_inv
4425; GFX10-CU-NEXT:    s_endpgm
4426;
4427; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
4428; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4429; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4430; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4431; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4432; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4433; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4434; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4435; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4436; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4437; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4438; SKIP-CACHE-INV-NEXT:    s_endpgm
4439    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4440entry:
  ; Exercises a cmpxchg on global memory (addrspace(1)) with
  ; syncscope("one-as"), success ordering acquire and failure ordering acquire.
  ; The result %val is never used.
  ; The gep steps %out forward by 4 i32 elements; this appears as offset:16 in
  ; the buffer/global checks and as an s_add_u32 of 16 in the GFX7 flat checks.
4441  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; The checks expect no vmcnt/vscnt wait immediately before the atomic, only a
  ; wait right after it. Every block except the SKIP-CACHE-INV one then expects
  ; cache invalidate instruction(s) before s_endpgm.
4442  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
4443  ret void
4444}
4445
4446define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
4447; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg:
4448; GFX6:       ; %bb.0: ; %entry
4449; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4450; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4451; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4452; GFX6-NEXT:    s_mov_b32 s6, -1
4453; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4454; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4455; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4456; GFX6-NEXT:    s_waitcnt vmcnt(0)
4457; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4458; GFX6-NEXT:    s_waitcnt vmcnt(0)
4459; GFX6-NEXT:    buffer_wbinvl1
4460; GFX6-NEXT:    s_endpgm
4461;
4462; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
4463; GFX7:       ; %bb.0: ; %entry
4464; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4465; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4466; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4467; GFX7-NEXT:    s_add_u32 s0, s0, 16
4468; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4469; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4470; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4471; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4472; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4473; GFX7-NEXT:    s_waitcnt vmcnt(0)
4474; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4475; GFX7-NEXT:    s_waitcnt vmcnt(0)
4476; GFX7-NEXT:    buffer_wbinvl1_vol
4477; GFX7-NEXT:    s_endpgm
4478;
4479; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
4480; GFX10-WGP:       ; %bb.0: ; %entry
4481; GFX10-WGP-NEXT:    s_clause 0x1
4482; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4483; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4484; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4485; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4486; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4487; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4488; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4489; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4490; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4491; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4492; GFX10-WGP-NEXT:    buffer_gl0_inv
4493; GFX10-WGP-NEXT:    buffer_gl1_inv
4494; GFX10-WGP-NEXT:    s_endpgm
4495;
4496; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
4497; GFX10-CU:       ; %bb.0: ; %entry
4498; GFX10-CU-NEXT:    s_clause 0x1
4499; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4500; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4501; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4502; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4503; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4504; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4505; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4506; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4507; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4508; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4509; GFX10-CU-NEXT:    buffer_gl0_inv
4510; GFX10-CU-NEXT:    buffer_gl1_inv
4511; GFX10-CU-NEXT:    s_endpgm
4512;
4513; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg:
4514; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4515; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4516; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4517; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4518; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4519; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4520; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4521; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4522; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4523; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4524; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4525; SKIP-CACHE-INV-NEXT:    s_endpgm
4526    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4527entry:
  ; Exercises a cmpxchg on global memory (addrspace(1)) with
  ; syncscope("one-as"), success ordering release and failure ordering acquire.
  ; The result %val is never used.
  ; The gep steps %out forward by 4 i32 elements; this appears as offset:16 in
  ; the buffer/global checks and as an s_add_u32 of 16 in the GFX7 flat checks.
4528  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; All check blocks expect a wait immediately before the atomic and another
  ; one right after it. Every block except the SKIP-CACHE-INV one then expects
  ; cache invalidate instruction(s) before s_endpgm.
4529  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
4530  ret void
4531}
4532
4533define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
4534; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
4535; GFX6:       ; %bb.0: ; %entry
4536; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4537; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4538; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4539; GFX6-NEXT:    s_mov_b32 s6, -1
4540; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4541; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4542; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4543; GFX6-NEXT:    s_waitcnt vmcnt(0)
4544; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4545; GFX6-NEXT:    s_waitcnt vmcnt(0)
4546; GFX6-NEXT:    buffer_wbinvl1
4547; GFX6-NEXT:    s_endpgm
4548;
4549; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
4550; GFX7:       ; %bb.0: ; %entry
4551; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4552; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4553; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4554; GFX7-NEXT:    s_add_u32 s0, s0, 16
4555; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4556; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4557; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4558; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4559; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4560; GFX7-NEXT:    s_waitcnt vmcnt(0)
4561; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4562; GFX7-NEXT:    s_waitcnt vmcnt(0)
4563; GFX7-NEXT:    buffer_wbinvl1_vol
4564; GFX7-NEXT:    s_endpgm
4565;
4566; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
4567; GFX10-WGP:       ; %bb.0: ; %entry
4568; GFX10-WGP-NEXT:    s_clause 0x1
4569; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4570; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4571; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4572; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4573; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4574; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4575; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4576; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4577; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4578; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4579; GFX10-WGP-NEXT:    buffer_gl0_inv
4580; GFX10-WGP-NEXT:    buffer_gl1_inv
4581; GFX10-WGP-NEXT:    s_endpgm
4582;
4583; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
4584; GFX10-CU:       ; %bb.0: ; %entry
4585; GFX10-CU-NEXT:    s_clause 0x1
4586; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4587; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4588; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4589; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4590; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4591; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4592; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4593; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4594; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4595; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4596; GFX10-CU-NEXT:    buffer_gl0_inv
4597; GFX10-CU-NEXT:    buffer_gl1_inv
4598; GFX10-CU-NEXT:    s_endpgm
4599;
4600; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
4601; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4602; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4603; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4604; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4605; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4606; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4607; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4608; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4609; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4610; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4611; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4612; SKIP-CACHE-INV-NEXT:    s_endpgm
4613    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4614entry:
  ; Exercises a cmpxchg on global memory (addrspace(1)) with
  ; syncscope("one-as"), success ordering acq_rel and failure ordering acquire.
  ; The result %val is never used.
  ; The gep steps %out forward by 4 i32 elements; this appears as offset:16 in
  ; the buffer/global checks and as an s_add_u32 of 16 in the GFX7 flat checks.
4615  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; All check blocks expect a wait immediately before the atomic and another
  ; one right after it. Every block except the SKIP-CACHE-INV one then expects
  ; cache invalidate instruction(s) before s_endpgm.
4616  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
4617  ret void
4618}
4619
4620define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
4621; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
4622; GFX6:       ; %bb.0: ; %entry
4623; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4624; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4625; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4626; GFX6-NEXT:    s_mov_b32 s6, -1
4627; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4628; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4629; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4630; GFX6-NEXT:    s_waitcnt vmcnt(0)
4631; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4632; GFX6-NEXT:    s_waitcnt vmcnt(0)
4633; GFX6-NEXT:    buffer_wbinvl1
4634; GFX6-NEXT:    s_endpgm
4635;
4636; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
4637; GFX7:       ; %bb.0: ; %entry
4638; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4639; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4640; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4641; GFX7-NEXT:    s_add_u32 s0, s0, 16
4642; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4643; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4644; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4645; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4646; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4647; GFX7-NEXT:    s_waitcnt vmcnt(0)
4648; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4649; GFX7-NEXT:    s_waitcnt vmcnt(0)
4650; GFX7-NEXT:    buffer_wbinvl1_vol
4651; GFX7-NEXT:    s_endpgm
4652;
4653; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
4654; GFX10-WGP:       ; %bb.0: ; %entry
4655; GFX10-WGP-NEXT:    s_clause 0x1
4656; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4657; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4658; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4659; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4660; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4661; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4662; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4663; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4664; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4665; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4666; GFX10-WGP-NEXT:    buffer_gl0_inv
4667; GFX10-WGP-NEXT:    buffer_gl1_inv
4668; GFX10-WGP-NEXT:    s_endpgm
4669;
4670; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
4671; GFX10-CU:       ; %bb.0: ; %entry
4672; GFX10-CU-NEXT:    s_clause 0x1
4673; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4674; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4675; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4676; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4677; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4678; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4679; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4680; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4681; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4682; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4683; GFX10-CU-NEXT:    buffer_gl0_inv
4684; GFX10-CU-NEXT:    buffer_gl1_inv
4685; GFX10-CU-NEXT:    s_endpgm
4686;
4687; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
4688; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4689; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4690; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4691; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4692; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4693; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4694; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4695; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4696; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4697; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4698; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4699; SKIP-CACHE-INV-NEXT:    s_endpgm
4700    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4701entry:
  ; Exercises a cmpxchg on global memory (addrspace(1)) with
  ; syncscope("one-as"), success ordering seq_cst and failure ordering acquire.
  ; The result %val is never used.
  ; The gep steps %out forward by 4 i32 elements; this appears as offset:16 in
  ; the buffer/global checks and as an s_add_u32 of 16 in the GFX7 flat checks.
4702  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; All check blocks expect a wait immediately before the atomic and another
  ; one right after it. Every block except the SKIP-CACHE-INV one then expects
  ; cache invalidate instruction(s) before s_endpgm.
4703  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
4704  ret void
4705}
4706
4707define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
4708; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
4709; GFX6:       ; %bb.0: ; %entry
4710; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4711; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4712; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4713; GFX6-NEXT:    s_mov_b32 s6, -1
4714; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4715; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4716; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4717; GFX6-NEXT:    s_waitcnt vmcnt(0)
4718; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4719; GFX6-NEXT:    s_waitcnt vmcnt(0)
4720; GFX6-NEXT:    buffer_wbinvl1
4721; GFX6-NEXT:    s_endpgm
4722;
4723; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
4724; GFX7:       ; %bb.0: ; %entry
4725; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4726; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4727; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4728; GFX7-NEXT:    s_add_u32 s0, s0, 16
4729; GFX7-NEXT:    s_addc_u32 s1, s1, 0
4730; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4731; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4732; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4733; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4734; GFX7-NEXT:    s_waitcnt vmcnt(0)
4735; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3]
4736; GFX7-NEXT:    s_waitcnt vmcnt(0)
4737; GFX7-NEXT:    buffer_wbinvl1_vol
4738; GFX7-NEXT:    s_endpgm
4739;
4740; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
4741; GFX10-WGP:       ; %bb.0: ; %entry
4742; GFX10-WGP-NEXT:    s_clause 0x1
4743; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4744; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4745; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4746; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4747; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4748; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4749; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4750; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4751; GFX10-WGP-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4752; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4753; GFX10-WGP-NEXT:    buffer_gl0_inv
4754; GFX10-WGP-NEXT:    buffer_gl1_inv
4755; GFX10-WGP-NEXT:    s_endpgm
4756;
4757; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
4758; GFX10-CU:       ; %bb.0: ; %entry
4759; GFX10-CU-NEXT:    s_clause 0x1
4760; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4761; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4762; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4763; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4764; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4765; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4766; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4767; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4768; GFX10-CU-NEXT:    global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
4769; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4770; GFX10-CU-NEXT:    buffer_gl0_inv
4771; GFX10-CU-NEXT:    buffer_gl1_inv
4772; GFX10-CU-NEXT:    s_endpgm
4773;
4774; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
4775; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4776; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4777; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4778; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4779; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4780; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4781; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4782; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4783; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4784; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4785; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4786; SKIP-CACHE-INV-NEXT:    s_endpgm
4787    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4788entry:
  ; Exercises a cmpxchg on global memory (addrspace(1)) with
  ; syncscope("one-as"), success ordering seq_cst and failure ordering seq_cst.
  ; The result %val is never used.
  ; The gep steps %out forward by 4 i32 elements; this appears as offset:16 in
  ; the buffer/global checks and as an s_add_u32 of 16 in the GFX7 flat checks.
4789  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; All check blocks expect a wait immediately before the atomic and another
  ; one right after it. Every block except the SKIP-CACHE-INV one then expects
  ; cache invalidate instruction(s) before s_endpgm.
4790  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
4791  ret void
4792}
4793
4794define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
4795; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
4796; GFX6:       ; %bb.0: ; %entry
4797; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4798; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4799; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4800; GFX6-NEXT:    s_mov_b32 s6, -1
4801; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4802; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4803; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4804; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4805; GFX6-NEXT:    s_waitcnt vmcnt(0)
4806; GFX6-NEXT:    buffer_wbinvl1
4807; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4808; GFX6-NEXT:    s_endpgm
4809;
4810; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
4811; GFX7:       ; %bb.0: ; %entry
4812; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4813; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4814; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4815; GFX7-NEXT:    s_add_u32 s4, s0, 16
4816; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4817; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4818; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4819; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4820; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4821; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4822; GFX7-NEXT:    s_waitcnt vmcnt(0)
4823; GFX7-NEXT:    buffer_wbinvl1_vol
4824; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4825; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4826; GFX7-NEXT:    flat_store_dword v[0:1], v2
4827; GFX7-NEXT:    s_endpgm
4828;
4829; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
4830; GFX10-WGP:       ; %bb.0: ; %entry
4831; GFX10-WGP-NEXT:    s_clause 0x1
4832; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4833; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4834; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4835; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4836; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4837; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4838; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4839; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4840; GFX10-WGP-NEXT:    buffer_gl0_inv
4841; GFX10-WGP-NEXT:    buffer_gl1_inv
4842; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
4843; GFX10-WGP-NEXT:    s_endpgm
4844;
4845; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
4846; GFX10-CU:       ; %bb.0: ; %entry
4847; GFX10-CU-NEXT:    s_clause 0x1
4848; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4849; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4850; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4851; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4852; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4853; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4854; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4855; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4856; GFX10-CU-NEXT:    buffer_gl0_inv
4857; GFX10-CU-NEXT:    buffer_gl1_inv
4858; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
4859; GFX10-CU-NEXT:    s_endpgm
4860;
4861; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
4862; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4863; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4864; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4865; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4866; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4867; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4868; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4869; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4870; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4871; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4872; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4873; SKIP-CACHE-INV-NEXT:    s_endpgm
4874    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4875entry:
  ; Exercises a cmpxchg on global memory (addrspace(1)) with
  ; syncscope("one-as"), success ordering acquire and failure ordering
  ; monotonic, where the loaded value is consumed afterwards ("ret" variant).
  ; The gep steps %out forward by 4 i32 elements; this appears as offset:16 in
  ; the buffer/global checks and as an s_add_u32 of 16 in the GFX7 flat checks.
4876  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; The checks expect the glc form of the atomic with no vmcnt/vscnt wait in
  ; front of it, then s_waitcnt vmcnt(0). Every block except the SKIP-CACHE-INV
  ; one also expects cache invalidate instruction(s) before the store.
4877  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
  ; Element 0 of the { i32, i1 } result is written back to %out itself (not to
  ; %gep), which is the final store in each check block.
4878  %val0 = extractvalue { i32, i1 } %val, 0
4879  store i32 %val0, i32 addrspace(1)* %out, align 4
4880  ret void
4881}
4882
4883define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
4884; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4885; GFX6:       ; %bb.0: ; %entry
4886; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4887; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4888; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4889; GFX6-NEXT:    s_mov_b32 s6, -1
4890; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4891; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4892; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4893; GFX6-NEXT:    s_waitcnt vmcnt(0)
4894; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4895; GFX6-NEXT:    s_waitcnt vmcnt(0)
4896; GFX6-NEXT:    buffer_wbinvl1
4897; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4898; GFX6-NEXT:    s_endpgm
4899;
4900; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4901; GFX7:       ; %bb.0: ; %entry
4902; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4903; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
4904; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
4905; GFX7-NEXT:    s_add_u32 s4, s0, 16
4906; GFX7-NEXT:    s_addc_u32 s5, s1, 0
4907; GFX7-NEXT:    v_mov_b32_e32 v0, s4
4908; GFX7-NEXT:    v_mov_b32_e32 v2, s2
4909; GFX7-NEXT:    v_mov_b32_e32 v1, s5
4910; GFX7-NEXT:    v_mov_b32_e32 v3, s3
4911; GFX7-NEXT:    s_waitcnt vmcnt(0)
4912; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4913; GFX7-NEXT:    s_waitcnt vmcnt(0)
4914; GFX7-NEXT:    buffer_wbinvl1_vol
4915; GFX7-NEXT:    v_mov_b32_e32 v0, s0
4916; GFX7-NEXT:    v_mov_b32_e32 v1, s1
4917; GFX7-NEXT:    flat_store_dword v[0:1], v2
4918; GFX7-NEXT:    s_endpgm
4919;
4920; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4921; GFX10-WGP:       ; %bb.0: ; %entry
4922; GFX10-WGP-NEXT:    s_clause 0x1
4923; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4924; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4925; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
4926; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
4927; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
4928; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
4929; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4930; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
4931; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4932; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
4933; GFX10-WGP-NEXT:    buffer_gl0_inv
4934; GFX10-WGP-NEXT:    buffer_gl1_inv
4935; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
4936; GFX10-WGP-NEXT:    s_endpgm
4937;
4938; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4939; GFX10-CU:       ; %bb.0: ; %entry
4940; GFX10-CU-NEXT:    s_clause 0x1
4941; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4942; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4943; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
4944; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
4945; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
4946; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
4947; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4948; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
4949; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
4950; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
4951; GFX10-CU-NEXT:    buffer_gl0_inv
4952; GFX10-CU-NEXT:    buffer_gl1_inv
4953; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
4954; GFX10-CU-NEXT:    s_endpgm
4955;
4956; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
4957; SKIP-CACHE-INV:       ; %bb.0: ; %entry
4958; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4959; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4960; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
4961; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
4962; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
4963; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
4964; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
4965; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4966; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4967; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
4968; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4969; SKIP-CACHE-INV-NEXT:    s_endpgm
4970    i32 addrspace(1)* %out, i32 %in, i32 %old) {
4971entry:
  ; Exercises a cmpxchg on global memory (addrspace(1)) with
  ; syncscope("one-as"), success ordering acq_rel and failure ordering
  ; monotonic, where the loaded value is consumed afterwards ("ret" variant).
  ; The gep steps %out forward by 4 i32 elements; this appears as offset:16 in
  ; the buffer/global checks and as an s_add_u32 of 16 in the GFX7 flat checks.
4972  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  ; The checks expect a wait immediately before the glc form of the atomic and
  ; s_waitcnt vmcnt(0) right after it. Every block except the SKIP-CACHE-INV
  ; one also expects cache invalidate instruction(s) before the store.
4973  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
  ; Element 0 of the { i32, i1 } result is written back to %out itself (not to
  ; %gep), which is the final store in each check block.
4974  %val0 = extractvalue { i32, i1 } %val, 0
4975  store i32 %val0, i32 addrspace(1)* %out, align 4
4976  ret void
4977}
4978
; Test: volatile cmpxchg on global (addrspace(1)) memory with
; syncscope("one-as"), success ordering seq_cst and failure ordering
; monotonic, whose loaded value is consumed (the "ret" variant).
; Expected code: the returning (glc) compare-swap is bracketed by
; "s_waitcnt vmcnt(0)" before and after it; the gfx1010 configurations also
; expect "s_waitcnt_vscnt null, 0x0" before it. After the trailing wait the
; gfx600/gfx700 configurations expect buffer_wbinvl1 / buffer_wbinvl1_vol and
; the gfx1010 configurations expect buffer_gl0_inv + buffer_gl1_inv, while the
; configuration built with -amdgcn-skip-cache-invalidations expects no
; invalidate instruction at all.
4979define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
4980; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
4981; GFX6:       ; %bb.0: ; %entry
4982; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4983; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4984; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4985; GFX6-NEXT:    s_mov_b32 s6, -1
4986; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4987; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4988; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4989; GFX6-NEXT:    s_waitcnt vmcnt(0)
4990; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4991; GFX6-NEXT:    s_waitcnt vmcnt(0)
4992; GFX6-NEXT:    buffer_wbinvl1
4993; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4994; GFX6-NEXT:    s_endpgm
4995;
4996; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
4997; GFX7:       ; %bb.0: ; %entry
4998; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4999; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5000; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5001; GFX7-NEXT:    s_add_u32 s4, s0, 16
5002; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5003; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5004; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5005; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5006; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5007; GFX7-NEXT:    s_waitcnt vmcnt(0)
5008; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5009; GFX7-NEXT:    s_waitcnt vmcnt(0)
5010; GFX7-NEXT:    buffer_wbinvl1_vol
5011; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5012; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5013; GFX7-NEXT:    flat_store_dword v[0:1], v2
5014; GFX7-NEXT:    s_endpgm
5015;
5016; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
5017; GFX10-WGP:       ; %bb.0: ; %entry
5018; GFX10-WGP-NEXT:    s_clause 0x1
5019; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5020; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5021; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5022; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5023; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5024; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5025; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5026; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5027; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5028; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5029; GFX10-WGP-NEXT:    buffer_gl0_inv
5030; GFX10-WGP-NEXT:    buffer_gl1_inv
5031; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
5032; GFX10-WGP-NEXT:    s_endpgm
5033;
5034; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
5035; GFX10-CU:       ; %bb.0: ; %entry
5036; GFX10-CU-NEXT:    s_clause 0x1
5037; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5038; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5039; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5040; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5041; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5042; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5043; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5044; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5045; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5046; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5047; GFX10-CU-NEXT:    buffer_gl0_inv
5048; GFX10-CU-NEXT:    buffer_gl1_inv
5049; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
5050; GFX10-CU-NEXT:    s_endpgm
5051;
5052; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
5053; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5054; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5055; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5056; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5057; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5058; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5059; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5060; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5061; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5062; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5063; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5064; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5065; SKIP-CACHE-INV-NEXT:    s_endpgm
5066    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5067entry:
  ; %gep addresses element 4 of %out, i.e. 16 bytes past %out; this is the
  ; "offset:16" / "s_add_u32 ..., 16" seen in the expected output.
5068  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5069  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
  ; Field 0 of the cmpxchg result is the value loaded from memory; the i1
  ; success flag (field 1) is not used. Storing the loaded value keeps the
  ; atomic's result live, which is what makes this the returning form.
5070  %val0 = extractvalue { i32, i1 } %val, 0
5071  store i32 %val0, i32 addrspace(1)* %out, align 4
5072  ret void
5073}
5074
; Test: volatile cmpxchg on global (addrspace(1)) memory with
; syncscope("one-as"), success ordering acquire and failure ordering acquire,
; whose loaded value is consumed (the "ret" variant).
; Expected code: no wait is emitted between the operand moves and the
; returning (glc) compare-swap; only "s_waitcnt vmcnt(0)" after it. That wait
; is followed by buffer_wbinvl1 / buffer_wbinvl1_vol on the gfx600/gfx700
; configurations and by buffer_gl0_inv + buffer_gl1_inv on the gfx1010
; configurations; the configuration built with
; -amdgcn-skip-cache-invalidations expects no invalidate instruction.
5075define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
5076; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
5077; GFX6:       ; %bb.0: ; %entry
5078; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5079; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5080; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5081; GFX6-NEXT:    s_mov_b32 s6, -1
5082; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5083; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5084; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5085; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5086; GFX6-NEXT:    s_waitcnt vmcnt(0)
5087; GFX6-NEXT:    buffer_wbinvl1
5088; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5089; GFX6-NEXT:    s_endpgm
5090;
5091; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
5092; GFX7:       ; %bb.0: ; %entry
5093; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5094; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5095; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5096; GFX7-NEXT:    s_add_u32 s4, s0, 16
5097; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5098; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5099; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5100; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5101; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5102; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5103; GFX7-NEXT:    s_waitcnt vmcnt(0)
5104; GFX7-NEXT:    buffer_wbinvl1_vol
5105; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5106; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5107; GFX7-NEXT:    flat_store_dword v[0:1], v2
5108; GFX7-NEXT:    s_endpgm
5109;
5110; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
5111; GFX10-WGP:       ; %bb.0: ; %entry
5112; GFX10-WGP-NEXT:    s_clause 0x1
5113; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5114; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5115; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5116; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5117; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5118; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5119; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5120; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5121; GFX10-WGP-NEXT:    buffer_gl0_inv
5122; GFX10-WGP-NEXT:    buffer_gl1_inv
5123; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
5124; GFX10-WGP-NEXT:    s_endpgm
5125;
5126; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
5127; GFX10-CU:       ; %bb.0: ; %entry
5128; GFX10-CU-NEXT:    s_clause 0x1
5129; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5130; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5131; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5132; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5133; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5134; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5135; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5136; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5137; GFX10-CU-NEXT:    buffer_gl0_inv
5138; GFX10-CU-NEXT:    buffer_gl1_inv
5139; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
5140; GFX10-CU-NEXT:    s_endpgm
5141;
5142; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
5143; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5144; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5145; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5146; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5147; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5148; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5149; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5150; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5151; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5152; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5153; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5154; SKIP-CACHE-INV-NEXT:    s_endpgm
5155    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5156entry:
  ; %gep addresses element 4 of %out, i.e. 16 bytes past %out; this is the
  ; "offset:16" / "s_add_u32 ..., 16" seen in the expected output.
5157  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5158  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
  ; Field 0 of the cmpxchg result is the value loaded from memory; the i1
  ; success flag (field 1) is not used. Storing the loaded value keeps the
  ; atomic's result live, which is what makes this the returning form.
5159  %val0 = extractvalue { i32, i1 } %val, 0
5160  store i32 %val0, i32 addrspace(1)* %out, align 4
5161  ret void
5162}
5163
; Test: volatile cmpxchg on global (addrspace(1)) memory with
; syncscope("one-as"), success ordering release and failure ordering acquire,
; whose loaded value is consumed (the "ret" variant).
; Expected code: the returning (glc) compare-swap is bracketed by
; "s_waitcnt vmcnt(0)" before and after it; the gfx1010 configurations also
; expect "s_waitcnt_vscnt null, 0x0" before it. After the trailing wait the
; gfx600/gfx700 configurations expect buffer_wbinvl1 / buffer_wbinvl1_vol and
; the gfx1010 configurations expect buffer_gl0_inv + buffer_gl1_inv, while the
; configuration built with -amdgcn-skip-cache-invalidations expects no
; invalidate instruction at all.
5164define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
5165; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
5166; GFX6:       ; %bb.0: ; %entry
5167; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5168; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5169; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5170; GFX6-NEXT:    s_mov_b32 s6, -1
5171; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5172; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5173; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5174; GFX6-NEXT:    s_waitcnt vmcnt(0)
5175; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5176; GFX6-NEXT:    s_waitcnt vmcnt(0)
5177; GFX6-NEXT:    buffer_wbinvl1
5178; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5179; GFX6-NEXT:    s_endpgm
5180;
5181; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
5182; GFX7:       ; %bb.0: ; %entry
5183; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5184; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5185; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5186; GFX7-NEXT:    s_add_u32 s4, s0, 16
5187; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5188; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5189; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5190; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5191; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5192; GFX7-NEXT:    s_waitcnt vmcnt(0)
5193; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5194; GFX7-NEXT:    s_waitcnt vmcnt(0)
5195; GFX7-NEXT:    buffer_wbinvl1_vol
5196; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5197; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5198; GFX7-NEXT:    flat_store_dword v[0:1], v2
5199; GFX7-NEXT:    s_endpgm
5200;
5201; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
5202; GFX10-WGP:       ; %bb.0: ; %entry
5203; GFX10-WGP-NEXT:    s_clause 0x1
5204; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5205; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5206; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5207; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5208; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5209; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5210; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5211; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5212; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5213; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5214; GFX10-WGP-NEXT:    buffer_gl0_inv
5215; GFX10-WGP-NEXT:    buffer_gl1_inv
5216; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
5217; GFX10-WGP-NEXT:    s_endpgm
5218;
5219; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
5220; GFX10-CU:       ; %bb.0: ; %entry
5221; GFX10-CU-NEXT:    s_clause 0x1
5222; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5223; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5224; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5225; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5226; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5227; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5228; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5229; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5230; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5231; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5232; GFX10-CU-NEXT:    buffer_gl0_inv
5233; GFX10-CU-NEXT:    buffer_gl1_inv
5234; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
5235; GFX10-CU-NEXT:    s_endpgm
5236;
5237; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
5238; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5239; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5240; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5241; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5242; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5243; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5244; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5245; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5246; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5247; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5248; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5249; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5250; SKIP-CACHE-INV-NEXT:    s_endpgm
5251    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5252entry:
  ; %gep addresses element 4 of %out, i.e. 16 bytes past %out; this is the
  ; "offset:16" / "s_add_u32 ..., 16" seen in the expected output.
5253  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5254  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire
  ; Field 0 of the cmpxchg result is the value loaded from memory; the i1
  ; success flag (field 1) is not used. Storing the loaded value keeps the
  ; atomic's result live, which is what makes this the returning form.
5255  %val0 = extractvalue { i32, i1 } %val, 0
5256  store i32 %val0, i32 addrspace(1)* %out, align 4
5257  ret void
5258}
5259
; Test: volatile cmpxchg on global (addrspace(1)) memory with
; syncscope("one-as"), success ordering acq_rel and failure ordering acquire,
; whose loaded value is consumed (the "ret" variant).
; Expected code: the returning (glc) compare-swap is bracketed by
; "s_waitcnt vmcnt(0)" before and after it; the gfx1010 configurations also
; expect "s_waitcnt_vscnt null, 0x0" before it. After the trailing wait the
; gfx600/gfx700 configurations expect buffer_wbinvl1 / buffer_wbinvl1_vol and
; the gfx1010 configurations expect buffer_gl0_inv + buffer_gl1_inv, while the
; configuration built with -amdgcn-skip-cache-invalidations expects no
; invalidate instruction at all.
5260define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
5261; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
5262; GFX6:       ; %bb.0: ; %entry
5263; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5264; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5265; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5266; GFX6-NEXT:    s_mov_b32 s6, -1
5267; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5268; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5269; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5270; GFX6-NEXT:    s_waitcnt vmcnt(0)
5271; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5272; GFX6-NEXT:    s_waitcnt vmcnt(0)
5273; GFX6-NEXT:    buffer_wbinvl1
5274; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5275; GFX6-NEXT:    s_endpgm
5276;
5277; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
5278; GFX7:       ; %bb.0: ; %entry
5279; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5280; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5281; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5282; GFX7-NEXT:    s_add_u32 s4, s0, 16
5283; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5284; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5285; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5286; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5287; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5288; GFX7-NEXT:    s_waitcnt vmcnt(0)
5289; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5290; GFX7-NEXT:    s_waitcnt vmcnt(0)
5291; GFX7-NEXT:    buffer_wbinvl1_vol
5292; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5293; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5294; GFX7-NEXT:    flat_store_dword v[0:1], v2
5295; GFX7-NEXT:    s_endpgm
5296;
5297; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
5298; GFX10-WGP:       ; %bb.0: ; %entry
5299; GFX10-WGP-NEXT:    s_clause 0x1
5300; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5301; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5302; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5303; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5304; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5305; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5306; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5307; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5308; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5309; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5310; GFX10-WGP-NEXT:    buffer_gl0_inv
5311; GFX10-WGP-NEXT:    buffer_gl1_inv
5312; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
5313; GFX10-WGP-NEXT:    s_endpgm
5314;
5315; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
5316; GFX10-CU:       ; %bb.0: ; %entry
5317; GFX10-CU-NEXT:    s_clause 0x1
5318; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5319; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5320; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5321; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5322; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5323; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5324; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5325; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5326; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5327; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5328; GFX10-CU-NEXT:    buffer_gl0_inv
5329; GFX10-CU-NEXT:    buffer_gl1_inv
5330; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
5331; GFX10-CU-NEXT:    s_endpgm
5332;
5333; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
5334; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5335; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5336; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5337; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5338; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5339; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5340; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5341; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5342; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5343; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5344; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5345; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5346; SKIP-CACHE-INV-NEXT:    s_endpgm
5347    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5348entry:
  ; %gep addresses element 4 of %out, i.e. 16 bytes past %out; this is the
  ; "offset:16" / "s_add_u32 ..., 16" seen in the expected output.
5349  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5350  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
  ; Field 0 of the cmpxchg result is the value loaded from memory; the i1
  ; success flag (field 1) is not used. Storing the loaded value keeps the
  ; atomic's result live, which is what makes this the returning form.
5351  %val0 = extractvalue { i32, i1 } %val, 0
5352  store i32 %val0, i32 addrspace(1)* %out, align 4
5353  ret void
5354}
5355
; Test: volatile cmpxchg on global (addrspace(1)) memory with
; syncscope("one-as"), success ordering seq_cst and failure ordering acquire,
; whose loaded value is consumed (the "ret" variant).
; Expected code: the returning (glc) compare-swap is bracketed by
; "s_waitcnt vmcnt(0)" before and after it; the gfx1010 configurations also
; expect "s_waitcnt_vscnt null, 0x0" before it. After the trailing wait the
; gfx600/gfx700 configurations expect buffer_wbinvl1 / buffer_wbinvl1_vol and
; the gfx1010 configurations expect buffer_gl0_inv + buffer_gl1_inv, while the
; configuration built with -amdgcn-skip-cache-invalidations expects no
; invalidate instruction at all.
5356define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
5357; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
5358; GFX6:       ; %bb.0: ; %entry
5359; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5360; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5361; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5362; GFX6-NEXT:    s_mov_b32 s6, -1
5363; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5364; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5365; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5366; GFX6-NEXT:    s_waitcnt vmcnt(0)
5367; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5368; GFX6-NEXT:    s_waitcnt vmcnt(0)
5369; GFX6-NEXT:    buffer_wbinvl1
5370; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5371; GFX6-NEXT:    s_endpgm
5372;
5373; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
5374; GFX7:       ; %bb.0: ; %entry
5375; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5376; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5377; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5378; GFX7-NEXT:    s_add_u32 s4, s0, 16
5379; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5380; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5381; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5382; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5383; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5384; GFX7-NEXT:    s_waitcnt vmcnt(0)
5385; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5386; GFX7-NEXT:    s_waitcnt vmcnt(0)
5387; GFX7-NEXT:    buffer_wbinvl1_vol
5388; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5389; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5390; GFX7-NEXT:    flat_store_dword v[0:1], v2
5391; GFX7-NEXT:    s_endpgm
5392;
5393; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
5394; GFX10-WGP:       ; %bb.0: ; %entry
5395; GFX10-WGP-NEXT:    s_clause 0x1
5396; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5397; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5398; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5399; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5400; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5401; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5402; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5403; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5404; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5405; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5406; GFX10-WGP-NEXT:    buffer_gl0_inv
5407; GFX10-WGP-NEXT:    buffer_gl1_inv
5408; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
5409; GFX10-WGP-NEXT:    s_endpgm
5410;
5411; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
5412; GFX10-CU:       ; %bb.0: ; %entry
5413; GFX10-CU-NEXT:    s_clause 0x1
5414; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5415; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5416; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5417; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5418; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5419; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5420; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5421; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5422; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5423; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5424; GFX10-CU-NEXT:    buffer_gl0_inv
5425; GFX10-CU-NEXT:    buffer_gl1_inv
5426; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
5427; GFX10-CU-NEXT:    s_endpgm
5428;
5429; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
5430; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5431; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5432; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5433; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5434; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5435; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5436; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5437; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5438; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5439; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5440; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5441; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5442; SKIP-CACHE-INV-NEXT:    s_endpgm
5443    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5444entry:
  ; %gep addresses element 4 of %out, i.e. 16 bytes past %out; this is the
  ; "offset:16" / "s_add_u32 ..., 16" seen in the expected output.
5445  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5446  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
  ; Field 0 of the cmpxchg result is the value loaded from memory; the i1
  ; success flag (field 1) is not used. Storing the loaded value keeps the
  ; atomic's result live, which is what makes this the returning form.
5447  %val0 = extractvalue { i32, i1 } %val, 0
5448  store i32 %val0, i32 addrspace(1)* %out, align 4
5449  ret void
5450}
5451
; Test: volatile cmpxchg on global (addrspace(1)) memory with
; syncscope("one-as"), success ordering seq_cst and failure ordering seq_cst,
; whose loaded value is consumed (the "ret" variant).
; Expected code: the returning (glc) compare-swap is bracketed by
; "s_waitcnt vmcnt(0)" before and after it; the gfx1010 configurations also
; expect "s_waitcnt_vscnt null, 0x0" before it. After the trailing wait the
; gfx600/gfx700 configurations expect buffer_wbinvl1 / buffer_wbinvl1_vol and
; the gfx1010 configurations expect buffer_gl0_inv + buffer_gl1_inv, while the
; configuration built with -amdgcn-skip-cache-invalidations expects no
; invalidate instruction at all.
5452define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
5453; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
5454; GFX6:       ; %bb.0: ; %entry
5455; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5456; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5457; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5458; GFX6-NEXT:    s_mov_b32 s6, -1
5459; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5460; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5461; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5462; GFX6-NEXT:    s_waitcnt vmcnt(0)
5463; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5464; GFX6-NEXT:    s_waitcnt vmcnt(0)
5465; GFX6-NEXT:    buffer_wbinvl1
5466; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5467; GFX6-NEXT:    s_endpgm
5468;
5469; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
5470; GFX7:       ; %bb.0: ; %entry
5471; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5472; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
5473; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
5474; GFX7-NEXT:    s_add_u32 s4, s0, 16
5475; GFX7-NEXT:    s_addc_u32 s5, s1, 0
5476; GFX7-NEXT:    v_mov_b32_e32 v0, s4
5477; GFX7-NEXT:    v_mov_b32_e32 v2, s2
5478; GFX7-NEXT:    v_mov_b32_e32 v1, s5
5479; GFX7-NEXT:    v_mov_b32_e32 v3, s3
5480; GFX7-NEXT:    s_waitcnt vmcnt(0)
5481; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5482; GFX7-NEXT:    s_waitcnt vmcnt(0)
5483; GFX7-NEXT:    buffer_wbinvl1_vol
5484; GFX7-NEXT:    v_mov_b32_e32 v0, s0
5485; GFX7-NEXT:    v_mov_b32_e32 v1, s1
5486; GFX7-NEXT:    flat_store_dword v[0:1], v2
5487; GFX7-NEXT:    s_endpgm
5488;
5489; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
5490; GFX10-WGP:       ; %bb.0: ; %entry
5491; GFX10-WGP-NEXT:    s_clause 0x1
5492; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5493; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5494; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
5495; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
5496; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
5497; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
5498; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5499; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
5500; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5501; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
5502; GFX10-WGP-NEXT:    buffer_gl0_inv
5503; GFX10-WGP-NEXT:    buffer_gl1_inv
5504; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
5505; GFX10-WGP-NEXT:    s_endpgm
5506;
5507; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
5508; GFX10-CU:       ; %bb.0: ; %entry
5509; GFX10-CU-NEXT:    s_clause 0x1
5510; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5511; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5512; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
5513; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
5514; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
5515; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
5516; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5517; GFX10-CU-NEXT:    s_waitcnt_vscnt null, 0x0
5518; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
5519; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
5520; GFX10-CU-NEXT:    buffer_gl0_inv
5521; GFX10-CU-NEXT:    buffer_gl1_inv
5522; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
5523; GFX10-CU-NEXT:    s_endpgm
5524;
5525; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
5526; SKIP-CACHE-INV:       ; %bb.0: ; %entry
5527; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5528; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5529; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
5530; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
5531; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
5532; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
5533; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
5534; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5535; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
5536; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
5537; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5538; SKIP-CACHE-INV-NEXT:    s_endpgm
5539    i32 addrspace(1)* %out, i32 %in, i32 %old) {
5540entry:
  ; %gep addresses element 4 of %out, i.e. 16 bytes past %out; this is the
  ; "offset:16" / "s_add_u32 ..., 16" seen in the expected output.
5541  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
5542  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
  ; Field 0 of the cmpxchg result is the value loaded from memory; the i1
  ; success flag (field 1) is not used. Storing the loaded value keeps the
  ; atomic's result live, which is what makes this the returning form.
5543  %val0 = extractvalue { i32, i1 } %val, 0
5544  store i32 %val0, i32 addrspace(1)* %out, align 4
5545  ret void
5546}
5547
5548