; Summary (derived from the RUN lines and the kernels visible in this file):
; each kernel performs one syncscope("workgroup") atomic load, atomic store or
; atomicrmw xchg through an addrspace(1) pointer, and the llc output is checked
; for five configurations, one FileCheck prefix each:
;   GFX6            -mtriple=amdgcn-amd-        -mcpu=gfx600
;   GFX7            -mtriple=amdgcn-amd-amdhsa  -mcpu=gfx700
;   GFX10-WGP       -mtriple=amdgcn-amd-amdhsa  -mcpu=gfx1010
;   GFX10-CU        -mtriple=amdgcn-amd-amdhsa  -mcpu=gfx1010 -mattr=+cumode
;   SKIP-CACHE-INV  -mtriple=amdgcn-amd-amdpal  -mcpu=gfx700
;                   -amdgcn-skip-cache-invalidations
; The assertion lines are generated by utils/update_llc_test_checks.py (see the
; NOTE line); regenerate them with that script rather than editing them by hand.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s 5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s 6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s 7 8define amdgpu_kernel void @global_workgroup_unordered_load( 9; GFX6-LABEL: global_workgroup_unordered_load: 10; GFX6: ; %bb.0: ; %entry 11; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 12; GFX6-NEXT: s_mov_b32 s3, 0xf000 13; GFX6-NEXT: s_mov_b32 s2, -1 14; GFX6-NEXT: s_waitcnt lgkmcnt(0) 15; GFX6-NEXT: s_mov_b32 s0, s4 16; GFX6-NEXT: s_mov_b32 s1, s5 17; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 18; GFX6-NEXT: s_mov_b32 s4, s6 19; GFX6-NEXT: s_mov_b32 s5, s7 20; GFX6-NEXT: s_mov_b32 s6, s2 21; GFX6-NEXT: s_mov_b32 s7, s3 22; GFX6-NEXT: s_waitcnt vmcnt(0) 23; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 24; GFX6-NEXT: s_endpgm 25; 26; GFX7-LABEL: global_workgroup_unordered_load: 27; GFX7: ; %bb.0: ; %entry 28; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 29; GFX7-NEXT: s_waitcnt lgkmcnt(0) 30; GFX7-NEXT: v_mov_b32_e32 v0, s0 31; GFX7-NEXT: v_mov_b32_e32 v1, s1 32; GFX7-NEXT: flat_load_dword v0, v[0:1] 33; GFX7-NEXT: v_mov_b32_e32 v2, s2 34; GFX7-NEXT: v_mov_b32_e32 v3, s3 35; GFX7-NEXT: s_waitcnt vmcnt(0) 36; GFX7-NEXT: flat_store_dword v[2:3], v0 37; GFX7-NEXT: s_endpgm 38; 39; GFX10-WGP-LABEL: global_workgroup_unordered_load: 40; GFX10-WGP: ; %bb.0: ; %entry 41; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 42; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, 0 43; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 44; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] 45; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 46; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] 47; GFX10-WGP-NEXT: s_endpgm 48; 49; GFX10-CU-LABEL: global_workgroup_unordered_load: 50; GFX10-CU: ; %bb.0: ; %entry 51; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 52; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 53; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 54; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] 55; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 56; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] 57; GFX10-CU-NEXT: s_endpgm 58; 59; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_load: 60; SKIP-CACHE-INV: ; %bb.0: ; %entry 61; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 62; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 63; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 64; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 65; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 66; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 67; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 68; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 69; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 70; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 71; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 72; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 73; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 74; SKIP-CACHE-INV-NEXT: s_endpgm 75 i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 76entry: 77 %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4 78 store i32 %val, i32 addrspace(1)* %out 79 ret void 80} 81 82define amdgpu_kernel void @global_workgroup_monotonic_load( 83; GFX6-LABEL: global_workgroup_monotonic_load: 84; GFX6: ; %bb.0: ; %entry 85; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 86; GFX6-NEXT: s_mov_b32 s3, 0xf000 87; GFX6-NEXT: s_mov_b32 s2, -1 88; GFX6-NEXT: s_waitcnt lgkmcnt(0) 89; GFX6-NEXT: s_mov_b32 s0, s4 90; GFX6-NEXT: s_mov_b32 s1, s5 91; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 
0 92; GFX6-NEXT: s_mov_b32 s4, s6 93; GFX6-NEXT: s_mov_b32 s5, s7 94; GFX6-NEXT: s_mov_b32 s6, s2 95; GFX6-NEXT: s_mov_b32 s7, s3 96; GFX6-NEXT: s_waitcnt vmcnt(0) 97; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 98; GFX6-NEXT: s_endpgm 99; 100; GFX7-LABEL: global_workgroup_monotonic_load: 101; GFX7: ; %bb.0: ; %entry 102; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 103; GFX7-NEXT: s_waitcnt lgkmcnt(0) 104; GFX7-NEXT: v_mov_b32_e32 v0, s0 105; GFX7-NEXT: v_mov_b32_e32 v1, s1 106; GFX7-NEXT: flat_load_dword v0, v[0:1] 107; GFX7-NEXT: v_mov_b32_e32 v2, s2 108; GFX7-NEXT: v_mov_b32_e32 v3, s3 109; GFX7-NEXT: s_waitcnt vmcnt(0) 110; GFX7-NEXT: flat_store_dword v[2:3], v0 111; GFX7-NEXT: s_endpgm 112; 113; GFX10-WGP-LABEL: global_workgroup_monotonic_load: 114; GFX10-WGP: ; %bb.0: ; %entry 115; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 116; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 117; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 118; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc 119; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 120; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] 121; GFX10-WGP-NEXT: s_endpgm 122; 123; GFX10-CU-LABEL: global_workgroup_monotonic_load: 124; GFX10-CU: ; %bb.0: ; %entry 125; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 126; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 127; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 128; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] 129; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 130; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] 131; GFX10-CU-NEXT: s_endpgm 132; 133; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_load: 134; SKIP-CACHE-INV: ; %bb.0: ; %entry 135; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 136; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 137; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 138; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 139; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 140; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 141; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 142; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 143; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 144; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 145; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 146; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 147; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 148; SKIP-CACHE-INV-NEXT: s_endpgm 149 i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 150entry: 151 %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4 152 store i32 %val, i32 addrspace(1)* %out 153 ret void 154} 155 156define amdgpu_kernel void @global_workgroup_acquire_load( 157; GFX6-LABEL: global_workgroup_acquire_load: 158; GFX6: ; %bb.0: ; %entry 159; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 160; GFX6-NEXT: s_mov_b32 s3, 0xf000 161; GFX6-NEXT: s_mov_b32 s2, -1 162; GFX6-NEXT: s_waitcnt lgkmcnt(0) 163; GFX6-NEXT: s_mov_b32 s0, s4 164; GFX6-NEXT: s_mov_b32 s1, s5 165; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 166; GFX6-NEXT: s_mov_b32 s4, s6 167; GFX6-NEXT: s_mov_b32 s5, s7 168; GFX6-NEXT: s_mov_b32 s6, s2 169; GFX6-NEXT: s_mov_b32 s7, s3 170; GFX6-NEXT: s_waitcnt vmcnt(0) 171; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 172; GFX6-NEXT: s_endpgm 173; 174; GFX7-LABEL: global_workgroup_acquire_load: 175; GFX7: ; %bb.0: ; %entry 176; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 177; GFX7-NEXT: s_waitcnt lgkmcnt(0) 178; GFX7-NEXT: v_mov_b32_e32 v0, s0 179; GFX7-NEXT: v_mov_b32_e32 v1, s1 180; GFX7-NEXT: flat_load_dword v0, v[0:1] 181; GFX7-NEXT: v_mov_b32_e32 v2, s2 182; GFX7-NEXT: v_mov_b32_e32 v3, s3 183; GFX7-NEXT: s_waitcnt vmcnt(0) 184; GFX7-NEXT: flat_store_dword v[2:3], v0 185; GFX7-NEXT: s_endpgm 186; 187; GFX10-WGP-LABEL: global_workgroup_acquire_load: 188; GFX10-WGP: ; %bb.0: ; %entry 189; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 190; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 191; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 192; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc 193; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 194; 
GFX10-WGP-NEXT: buffer_gl0_inv 195; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] 196; GFX10-WGP-NEXT: s_endpgm 197; 198; GFX10-CU-LABEL: global_workgroup_acquire_load: 199; GFX10-CU: ; %bb.0: ; %entry 200; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 201; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 202; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 203; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] 204; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 205; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] 206; GFX10-CU-NEXT: s_endpgm 207; 208; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_load: 209; SKIP-CACHE-INV: ; %bb.0: ; %entry 210; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 211; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 212; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 213; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 214; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 215; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 216; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 217; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 218; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 219; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 220; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 221; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 222; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 223; SKIP-CACHE-INV-NEXT: s_endpgm 224 i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 225entry: 226 %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4 227 store i32 %val, i32 addrspace(1)* %out 228 ret void 229} 230 231define amdgpu_kernel void @global_workgroup_seq_cst_load( 232; GFX6-LABEL: global_workgroup_seq_cst_load: 233; GFX6: ; %bb.0: ; %entry 234; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 235; GFX6-NEXT: s_mov_b32 s3, 0xf000 236; GFX6-NEXT: s_mov_b32 s2, -1 237; GFX6-NEXT: s_waitcnt lgkmcnt(0) 238; GFX6-NEXT: s_mov_b32 s0, s4 239; GFX6-NEXT: s_mov_b32 s1, s5 240; GFX6-NEXT: s_waitcnt lgkmcnt(0) 241; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 242; GFX6-NEXT: s_mov_b32 s4, s6 243; 
GFX6-NEXT: s_mov_b32 s5, s7 244; GFX6-NEXT: s_mov_b32 s6, s2 245; GFX6-NEXT: s_mov_b32 s7, s3 246; GFX6-NEXT: s_waitcnt vmcnt(0) 247; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 248; GFX6-NEXT: s_endpgm 249; 250; GFX7-LABEL: global_workgroup_seq_cst_load: 251; GFX7: ; %bb.0: ; %entry 252; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 253; GFX7-NEXT: s_waitcnt lgkmcnt(0) 254; GFX7-NEXT: v_mov_b32_e32 v0, s0 255; GFX7-NEXT: v_mov_b32_e32 v1, s1 256; GFX7-NEXT: s_waitcnt lgkmcnt(0) 257; GFX7-NEXT: flat_load_dword v0, v[0:1] 258; GFX7-NEXT: v_mov_b32_e32 v2, s2 259; GFX7-NEXT: v_mov_b32_e32 v3, s3 260; GFX7-NEXT: s_waitcnt vmcnt(0) 261; GFX7-NEXT: flat_store_dword v[2:3], v0 262; GFX7-NEXT: s_endpgm 263; 264; GFX10-WGP-LABEL: global_workgroup_seq_cst_load: 265; GFX10-WGP: ; %bb.0: ; %entry 266; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 267; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 268; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 269; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 270; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc 271; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 272; GFX10-WGP-NEXT: buffer_gl0_inv 273; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] 274; GFX10-WGP-NEXT: s_endpgm 275; 276; GFX10-CU-LABEL: global_workgroup_seq_cst_load: 277; GFX10-CU: ; %bb.0: ; %entry 278; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 279; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 280; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 281; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] 282; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 283; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] 284; GFX10-CU-NEXT: s_endpgm 285; 286; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_load: 287; SKIP-CACHE-INV: ; %bb.0: ; %entry 288; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 289; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 290; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 291; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 292; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 293; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, 
s5 294; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 295; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 296; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 297; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 298; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 299; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 300; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 301; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 302; SKIP-CACHE-INV-NEXT: s_endpgm 303 i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 304entry: 305 %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4 306 store i32 %val, i32 addrspace(1)* %out 307 ret void 308} 309 310define amdgpu_kernel void @global_workgroup_unordered_store( 311; GFX6-LABEL: global_workgroup_unordered_store: 312; GFX6: ; %bb.0: ; %entry 313; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 314; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 315; GFX6-NEXT: s_mov_b32 s3, 0xf000 316; GFX6-NEXT: s_mov_b32 s2, -1 317; GFX6-NEXT: s_waitcnt lgkmcnt(0) 318; GFX6-NEXT: v_mov_b32_e32 v0, s4 319; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 320; GFX6-NEXT: s_endpgm 321; 322; GFX7-LABEL: global_workgroup_unordered_store: 323; GFX7: ; %bb.0: ; %entry 324; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 325; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 326; GFX7-NEXT: s_waitcnt lgkmcnt(0) 327; GFX7-NEXT: v_mov_b32_e32 v2, s2 328; GFX7-NEXT: v_mov_b32_e32 v0, s0 329; GFX7-NEXT: v_mov_b32_e32 v1, s1 330; GFX7-NEXT: flat_store_dword v[0:1], v2 331; GFX7-NEXT: s_endpgm 332; 333; GFX10-WGP-LABEL: global_workgroup_unordered_store: 334; GFX10-WGP: ; %bb.0: ; %entry 335; GFX10-WGP-NEXT: s_clause 0x1 336; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 337; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 338; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 339; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 340; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 341; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 342; GFX10-WGP-NEXT: s_endpgm 343; 344; GFX10-CU-LABEL: 
global_workgroup_unordered_store: 345; GFX10-CU: ; %bb.0: ; %entry 346; GFX10-CU-NEXT: s_clause 0x1 347; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 348; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 349; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 350; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 351; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 352; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 353; GFX10-CU-NEXT: s_endpgm 354; 355; SKIP-CACHE-INV-LABEL: global_workgroup_unordered_store: 356; SKIP-CACHE-INV: ; %bb.0: ; %entry 357; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 358; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 359; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 360; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 361; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 362; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 363; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 364; SKIP-CACHE-INV-NEXT: s_endpgm 365 i32 %in, i32 addrspace(1)* %out) { 366entry: 367 store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4 368 ret void 369} 370 371define amdgpu_kernel void @global_workgroup_monotonic_store( 372; GFX6-LABEL: global_workgroup_monotonic_store: 373; GFX6: ; %bb.0: ; %entry 374; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 375; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 376; GFX6-NEXT: s_mov_b32 s3, 0xf000 377; GFX6-NEXT: s_mov_b32 s2, -1 378; GFX6-NEXT: s_waitcnt lgkmcnt(0) 379; GFX6-NEXT: v_mov_b32_e32 v0, s4 380; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 381; GFX6-NEXT: s_endpgm 382; 383; GFX7-LABEL: global_workgroup_monotonic_store: 384; GFX7: ; %bb.0: ; %entry 385; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 386; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 387; GFX7-NEXT: s_waitcnt lgkmcnt(0) 388; GFX7-NEXT: v_mov_b32_e32 v2, s2 389; GFX7-NEXT: v_mov_b32_e32 v0, s0 390; GFX7-NEXT: v_mov_b32_e32 v1, s1 391; GFX7-NEXT: flat_store_dword v[0:1], v2 392; GFX7-NEXT: s_endpgm 393; 394; GFX10-WGP-LABEL: global_workgroup_monotonic_store: 
395; GFX10-WGP: ; %bb.0: ; %entry 396; GFX10-WGP-NEXT: s_clause 0x1 397; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 398; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 399; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 400; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 401; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 402; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 403; GFX10-WGP-NEXT: s_endpgm 404; 405; GFX10-CU-LABEL: global_workgroup_monotonic_store: 406; GFX10-CU: ; %bb.0: ; %entry 407; GFX10-CU-NEXT: s_clause 0x1 408; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 409; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 410; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 411; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 412; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 413; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 414; GFX10-CU-NEXT: s_endpgm 415; 416; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_store: 417; SKIP-CACHE-INV: ; %bb.0: ; %entry 418; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 419; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 420; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 421; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 422; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 423; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 424; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 425; SKIP-CACHE-INV-NEXT: s_endpgm 426 i32 %in, i32 addrspace(1)* %out) { 427entry: 428 store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4 429 ret void 430} 431 432define amdgpu_kernel void @global_workgroup_release_store( 433; GFX6-LABEL: global_workgroup_release_store: 434; GFX6: ; %bb.0: ; %entry 435; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 436; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 437; GFX6-NEXT: s_mov_b32 s3, 0xf000 438; GFX6-NEXT: s_mov_b32 s2, -1 439; GFX6-NEXT: s_waitcnt lgkmcnt(0) 440; GFX6-NEXT: v_mov_b32_e32 v0, s4 441; GFX6-NEXT: s_waitcnt lgkmcnt(0) 442; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 443; GFX6-NEXT: s_endpgm 444; 445; 
GFX7-LABEL: global_workgroup_release_store: 446; GFX7: ; %bb.0: ; %entry 447; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 448; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 449; GFX7-NEXT: s_waitcnt lgkmcnt(0) 450; GFX7-NEXT: v_mov_b32_e32 v2, s2 451; GFX7-NEXT: v_mov_b32_e32 v0, s0 452; GFX7-NEXT: v_mov_b32_e32 v1, s1 453; GFX7-NEXT: s_waitcnt lgkmcnt(0) 454; GFX7-NEXT: flat_store_dword v[0:1], v2 455; GFX7-NEXT: s_endpgm 456; 457; GFX10-WGP-LABEL: global_workgroup_release_store: 458; GFX10-WGP: ; %bb.0: ; %entry 459; GFX10-WGP-NEXT: s_clause 0x1 460; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 461; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 462; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 463; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 464; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 465; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 466; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 467; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 468; GFX10-WGP-NEXT: s_endpgm 469; 470; GFX10-CU-LABEL: global_workgroup_release_store: 471; GFX10-CU: ; %bb.0: ; %entry 472; GFX10-CU-NEXT: s_clause 0x1 473; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 474; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 475; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 476; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 477; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 478; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 479; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 480; GFX10-CU-NEXT: s_endpgm 481; 482; SKIP-CACHE-INV-LABEL: global_workgroup_release_store: 483; SKIP-CACHE-INV: ; %bb.0: ; %entry 484; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 485; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 486; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 487; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 488; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 489; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 490; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 491; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 492; SKIP-CACHE-INV-NEXT: s_endpgm 493 i32 %in, 
i32 addrspace(1)* %out) { 494entry: 495 store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4 496 ret void 497} 498 499define amdgpu_kernel void @global_workgroup_seq_cst_store( 500; GFX6-LABEL: global_workgroup_seq_cst_store: 501; GFX6: ; %bb.0: ; %entry 502; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 503; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 504; GFX6-NEXT: s_mov_b32 s3, 0xf000 505; GFX6-NEXT: s_mov_b32 s2, -1 506; GFX6-NEXT: s_waitcnt lgkmcnt(0) 507; GFX6-NEXT: v_mov_b32_e32 v0, s4 508; GFX6-NEXT: s_waitcnt lgkmcnt(0) 509; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 510; GFX6-NEXT: s_endpgm 511; 512; GFX7-LABEL: global_workgroup_seq_cst_store: 513; GFX7: ; %bb.0: ; %entry 514; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 515; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 516; GFX7-NEXT: s_waitcnt lgkmcnt(0) 517; GFX7-NEXT: v_mov_b32_e32 v2, s2 518; GFX7-NEXT: v_mov_b32_e32 v0, s0 519; GFX7-NEXT: v_mov_b32_e32 v1, s1 520; GFX7-NEXT: s_waitcnt lgkmcnt(0) 521; GFX7-NEXT: flat_store_dword v[0:1], v2 522; GFX7-NEXT: s_endpgm 523; 524; GFX10-WGP-LABEL: global_workgroup_seq_cst_store: 525; GFX10-WGP: ; %bb.0: ; %entry 526; GFX10-WGP-NEXT: s_clause 0x1 527; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 528; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 529; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 530; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 531; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 532; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 533; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 534; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 535; GFX10-WGP-NEXT: s_endpgm 536; 537; GFX10-CU-LABEL: global_workgroup_seq_cst_store: 538; GFX10-CU: ; %bb.0: ; %entry 539; GFX10-CU-NEXT: s_clause 0x1 540; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 541; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 542; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 543; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 544; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 545; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) 546; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 547; GFX10-CU-NEXT: s_endpgm 548; 549; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_store: 550; SKIP-CACHE-INV: ; %bb.0: ; %entry 551; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 552; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 553; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 554; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 555; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 556; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 557; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 558; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 559; SKIP-CACHE-INV-NEXT: s_endpgm 560 i32 %in, i32 addrspace(1)* %out) { 561entry: 562 store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4 563 ret void 564} 565 566define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( 567; GFX6-LABEL: global_workgroup_monotonic_atomicrmw: 568; GFX6: ; %bb.0: ; %entry 569; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 570; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 571; GFX6-NEXT: s_mov_b32 s7, 0xf000 572; GFX6-NEXT: s_mov_b32 s6, -1 573; GFX6-NEXT: s_waitcnt lgkmcnt(0) 574; GFX6-NEXT: v_mov_b32_e32 v0, s0 575; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 576; GFX6-NEXT: s_endpgm 577; 578; GFX7-LABEL: global_workgroup_monotonic_atomicrmw: 579; GFX7: ; %bb.0: ; %entry 580; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 581; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 582; GFX7-NEXT: s_waitcnt lgkmcnt(0) 583; GFX7-NEXT: v_mov_b32_e32 v0, s0 584; GFX7-NEXT: v_mov_b32_e32 v1, s1 585; GFX7-NEXT: v_mov_b32_e32 v2, s2 586; GFX7-NEXT: flat_atomic_swap v[0:1], v2 587; GFX7-NEXT: s_endpgm 588; 589; GFX10-WGP-LABEL: global_workgroup_monotonic_atomicrmw: 590; GFX10-WGP: ; %bb.0: ; %entry 591; GFX10-WGP-NEXT: s_clause 0x1 592; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 593; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 594; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 595; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 
596; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 597; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 598; GFX10-WGP-NEXT: s_endpgm 599; 600; GFX10-CU-LABEL: global_workgroup_monotonic_atomicrmw: 601; GFX10-CU: ; %bb.0: ; %entry 602; GFX10-CU-NEXT: s_clause 0x1 603; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 604; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 605; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 606; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 607; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 608; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 609; GFX10-CU-NEXT: s_endpgm 610; 611; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_atomicrmw: 612; SKIP-CACHE-INV: ; %bb.0: ; %entry 613; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 614; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 615; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 616; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 617; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 618; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 619; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 620; SKIP-CACHE-INV-NEXT: s_endpgm 621 i32 addrspace(1)* %out, i32 %in) { 622entry: 623 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic 624 ret void 625} 626 627define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( 628; GFX6-LABEL: global_workgroup_acquire_atomicrmw: 629; GFX6: ; %bb.0: ; %entry 630; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 631; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 632; GFX6-NEXT: s_mov_b32 s7, 0xf000 633; GFX6-NEXT: s_mov_b32 s6, -1 634; GFX6-NEXT: s_waitcnt lgkmcnt(0) 635; GFX6-NEXT: v_mov_b32_e32 v0, s0 636; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 637; GFX6-NEXT: s_waitcnt lgkmcnt(0) 638; GFX6-NEXT: s_endpgm 639; 640; GFX7-LABEL: global_workgroup_acquire_atomicrmw: 641; GFX7: ; %bb.0: ; %entry 642; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 643; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 644; GFX7-NEXT: s_waitcnt lgkmcnt(0) 645; GFX7-NEXT: v_mov_b32_e32 
v0, s0 646; GFX7-NEXT: v_mov_b32_e32 v1, s1 647; GFX7-NEXT: v_mov_b32_e32 v2, s2 648; GFX7-NEXT: flat_atomic_swap v[0:1], v2 649; GFX7-NEXT: s_waitcnt lgkmcnt(0) 650; GFX7-NEXT: s_endpgm 651; 652; GFX10-WGP-LABEL: global_workgroup_acquire_atomicrmw: 653; GFX10-WGP: ; %bb.0: ; %entry 654; GFX10-WGP-NEXT: s_clause 0x1 655; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 656; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 657; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 658; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 659; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 660; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 661; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 662; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 663; GFX10-WGP-NEXT: buffer_gl0_inv 664; GFX10-WGP-NEXT: s_endpgm 665; 666; GFX10-CU-LABEL: global_workgroup_acquire_atomicrmw: 667; GFX10-CU: ; %bb.0: ; %entry 668; GFX10-CU-NEXT: s_clause 0x1 669; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 670; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 671; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 672; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 673; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 674; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 675; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 676; GFX10-CU-NEXT: s_endpgm 677; 678; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_atomicrmw: 679; SKIP-CACHE-INV: ; %bb.0: ; %entry 680; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 681; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 682; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 683; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 684; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 685; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 686; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 687; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 688; SKIP-CACHE-INV-NEXT: s_endpgm 689 i32 addrspace(1)* %out, i32 %in) { 690entry: 691 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire 692 ret void 693} 694 695define amdgpu_kernel void 
@global_workgroup_release_atomicrmw( 696; GFX6-LABEL: global_workgroup_release_atomicrmw: 697; GFX6: ; %bb.0: ; %entry 698; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 699; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 700; GFX6-NEXT: s_mov_b32 s7, 0xf000 701; GFX6-NEXT: s_mov_b32 s6, -1 702; GFX6-NEXT: s_waitcnt lgkmcnt(0) 703; GFX6-NEXT: v_mov_b32_e32 v0, s0 704; GFX6-NEXT: s_waitcnt lgkmcnt(0) 705; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 706; GFX6-NEXT: s_endpgm 707; 708; GFX7-LABEL: global_workgroup_release_atomicrmw: 709; GFX7: ; %bb.0: ; %entry 710; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 711; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 712; GFX7-NEXT: s_waitcnt lgkmcnt(0) 713; GFX7-NEXT: v_mov_b32_e32 v0, s0 714; GFX7-NEXT: v_mov_b32_e32 v1, s1 715; GFX7-NEXT: v_mov_b32_e32 v2, s2 716; GFX7-NEXT: s_waitcnt lgkmcnt(0) 717; GFX7-NEXT: flat_atomic_swap v[0:1], v2 718; GFX7-NEXT: s_endpgm 719; 720; GFX10-WGP-LABEL: global_workgroup_release_atomicrmw: 721; GFX10-WGP: ; %bb.0: ; %entry 722; GFX10-WGP-NEXT: s_clause 0x1 723; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 724; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 725; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 726; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 727; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 728; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 729; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 730; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 731; GFX10-WGP-NEXT: s_endpgm 732; 733; GFX10-CU-LABEL: global_workgroup_release_atomicrmw: 734; GFX10-CU: ; %bb.0: ; %entry 735; GFX10-CU-NEXT: s_clause 0x1 736; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 737; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 738; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 739; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 740; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 741; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 742; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 743; GFX10-CU-NEXT: s_endpgm 744; 745; SKIP-CACHE-INV-LABEL: 
global_workgroup_release_atomicrmw: 746; SKIP-CACHE-INV: ; %bb.0: ; %entry 747; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 748; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 749; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 750; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 751; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 752; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 753; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 754; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 755; SKIP-CACHE-INV-NEXT: s_endpgm 756 i32 addrspace(1)* %out, i32 %in) { 757entry: 758 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release 759 ret void 760} 761 762define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( 763; GFX6-LABEL: global_workgroup_acq_rel_atomicrmw: 764; GFX6: ; %bb.0: ; %entry 765; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 766; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 767; GFX6-NEXT: s_mov_b32 s7, 0xf000 768; GFX6-NEXT: s_mov_b32 s6, -1 769; GFX6-NEXT: s_waitcnt lgkmcnt(0) 770; GFX6-NEXT: v_mov_b32_e32 v0, s0 771; GFX6-NEXT: s_waitcnt lgkmcnt(0) 772; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 773; GFX6-NEXT: s_waitcnt lgkmcnt(0) 774; GFX6-NEXT: s_endpgm 775; 776; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: 777; GFX7: ; %bb.0: ; %entry 778; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 779; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 780; GFX7-NEXT: s_waitcnt lgkmcnt(0) 781; GFX7-NEXT: v_mov_b32_e32 v0, s0 782; GFX7-NEXT: v_mov_b32_e32 v1, s1 783; GFX7-NEXT: v_mov_b32_e32 v2, s2 784; GFX7-NEXT: s_waitcnt lgkmcnt(0) 785; GFX7-NEXT: flat_atomic_swap v[0:1], v2 786; GFX7-NEXT: s_waitcnt lgkmcnt(0) 787; GFX7-NEXT: s_endpgm 788; 789; GFX10-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: 790; GFX10-WGP: ; %bb.0: ; %entry 791; GFX10-WGP-NEXT: s_clause 0x1 792; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 793; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 794; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 795; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 796; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 797; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 798; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 799; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 800; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 801; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 802; GFX10-WGP-NEXT: buffer_gl0_inv 803; GFX10-WGP-NEXT: s_endpgm 804; 805; GFX10-CU-LABEL: global_workgroup_acq_rel_atomicrmw: 806; GFX10-CU: ; %bb.0: ; %entry 807; GFX10-CU-NEXT: s_clause 0x1 808; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 809; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 810; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 811; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 812; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 813; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 814; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 815; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 816; GFX10-CU-NEXT: s_endpgm 817; 818; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw: 819; SKIP-CACHE-INV: ; %bb.0: ; %entry 820; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 821; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 822; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 823; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 824; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 825; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 826; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 827; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 828; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 829; SKIP-CACHE-INV-NEXT: s_endpgm 830 i32 addrspace(1)* %out, i32 %in) { 831entry: 832 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel 833 ret void 834} 835 836define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( 837; GFX6-LABEL: global_workgroup_seq_cst_atomicrmw: 838; GFX6: ; %bb.0: ; %entry 839; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 840; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 841; GFX6-NEXT: s_mov_b32 s7, 0xf000 842; GFX6-NEXT: s_mov_b32 s6, -1 843; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) 844; GFX6-NEXT: v_mov_b32_e32 v0, s0 845; GFX6-NEXT: s_waitcnt lgkmcnt(0) 846; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 847; GFX6-NEXT: s_waitcnt lgkmcnt(0) 848; GFX6-NEXT: s_endpgm 849; 850; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: 851; GFX7: ; %bb.0: ; %entry 852; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 853; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 854; GFX7-NEXT: s_waitcnt lgkmcnt(0) 855; GFX7-NEXT: v_mov_b32_e32 v0, s0 856; GFX7-NEXT: v_mov_b32_e32 v1, s1 857; GFX7-NEXT: v_mov_b32_e32 v2, s2 858; GFX7-NEXT: s_waitcnt lgkmcnt(0) 859; GFX7-NEXT: flat_atomic_swap v[0:1], v2 860; GFX7-NEXT: s_waitcnt lgkmcnt(0) 861; GFX7-NEXT: s_endpgm 862; 863; GFX10-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: 864; GFX10-WGP: ; %bb.0: ; %entry 865; GFX10-WGP-NEXT: s_clause 0x1 866; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 867; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 868; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 869; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 870; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 871; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 872; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 873; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 874; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 875; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 876; GFX10-WGP-NEXT: buffer_gl0_inv 877; GFX10-WGP-NEXT: s_endpgm 878; 879; GFX10-CU-LABEL: global_workgroup_seq_cst_atomicrmw: 880; GFX10-CU: ; %bb.0: ; %entry 881; GFX10-CU-NEXT: s_clause 0x1 882; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 883; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 884; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 885; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 886; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 887; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 888; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 889; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 890; GFX10-CU-NEXT: s_endpgm 891; 892; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw: 893; SKIP-CACHE-INV: ; %bb.0: ; %entry 894; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 895; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 896; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 897; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 898; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 899; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 900; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 901; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 902; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 903; SKIP-CACHE-INV-NEXT: s_endpgm 904 i32 addrspace(1)* %out, i32 %in) { 905entry: 906 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst 907 ret void 908} 909 910define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( 911; GFX6-LABEL: global_workgroup_acquire_ret_atomicrmw: 912; GFX6: ; %bb.0: ; %entry 913; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 914; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 915; GFX6-NEXT: s_mov_b32 s7, 0xf000 916; GFX6-NEXT: s_mov_b32 s6, -1 917; GFX6-NEXT: s_waitcnt lgkmcnt(0) 918; GFX6-NEXT: v_mov_b32_e32 v0, s0 919; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 920; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 921; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 922; GFX6-NEXT: s_endpgm 923; 924; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw: 925; GFX7: ; %bb.0: ; %entry 926; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 927; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 928; GFX7-NEXT: s_waitcnt lgkmcnt(0) 929; GFX7-NEXT: v_mov_b32_e32 v0, s0 930; GFX7-NEXT: v_mov_b32_e32 v1, s1 931; GFX7-NEXT: v_mov_b32_e32 v2, s2 932; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 933; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 934; GFX7-NEXT: flat_store_dword v[0:1], v2 935; GFX7-NEXT: s_endpgm 936; 937; GFX10-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw: 938; GFX10-WGP: ; %bb.0: ; %entry 939; GFX10-WGP-NEXT: s_clause 0x1 940; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 941; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 942; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, 0 943; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 944; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 945; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 946; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 947; GFX10-WGP-NEXT: buffer_gl0_inv 948; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 949; GFX10-WGP-NEXT: s_endpgm 950; 951; GFX10-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: 952; GFX10-CU: ; %bb.0: ; %entry 953; GFX10-CU-NEXT: s_clause 0x1 954; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 955; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 956; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 957; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 958; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 959; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 960; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 961; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 962; GFX10-CU-NEXT: s_endpgm 963; 964; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_ret_atomicrmw: 965; SKIP-CACHE-INV: ; %bb.0: ; %entry 966; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 967; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 968; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 969; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 970; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 971; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 972; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 973; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 974; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 975; SKIP-CACHE-INV-NEXT: s_endpgm 976 i32 addrspace(1)* %out, i32 %in) { 977entry: 978 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire 979 store i32 %val, i32 addrspace(1)* %out, align 4 980 ret void 981} 982 983define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( 984; GFX6-LABEL: global_workgroup_acq_rel_ret_atomicrmw: 985; GFX6: ; %bb.0: ; %entry 986; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 987; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 
988; GFX6-NEXT: s_mov_b32 s7, 0xf000 989; GFX6-NEXT: s_mov_b32 s6, -1 990; GFX6-NEXT: s_waitcnt lgkmcnt(0) 991; GFX6-NEXT: v_mov_b32_e32 v0, s0 992; GFX6-NEXT: s_waitcnt lgkmcnt(0) 993; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 994; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 995; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 996; GFX6-NEXT: s_endpgm 997; 998; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw: 999; GFX7: ; %bb.0: ; %entry 1000; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1001; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1002; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1003; GFX7-NEXT: v_mov_b32_e32 v0, s0 1004; GFX7-NEXT: v_mov_b32_e32 v1, s1 1005; GFX7-NEXT: v_mov_b32_e32 v2, s2 1006; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1008; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1009; GFX7-NEXT: flat_store_dword v[0:1], v2 1010; GFX7-NEXT: s_endpgm 1011; 1012; GFX10-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw: 1013; GFX10-WGP: ; %bb.0: ; %entry 1014; GFX10-WGP-NEXT: s_clause 0x1 1015; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1016; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1017; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 1018; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1020; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1021; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1022; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 1023; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1024; GFX10-WGP-NEXT: buffer_gl0_inv 1025; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 1026; GFX10-WGP-NEXT: s_endpgm 1027; 1028; GFX10-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: 1029; GFX10-CU: ; %bb.0: ; %entry 1030; GFX10-CU-NEXT: s_clause 0x1 1031; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1032; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1033; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 1034; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1035; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 
1036; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1037; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 1038; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1039; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 1040; GFX10-CU-NEXT: s_endpgm 1041; 1042; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_ret_atomicrmw: 1043; SKIP-CACHE-INV: ; %bb.0: ; %entry 1044; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1045; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1046; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1047; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1048; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1049; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1050; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1051; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 1052; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1053; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 1054; SKIP-CACHE-INV-NEXT: s_endpgm 1055 i32 addrspace(1)* %out, i32 %in) { 1056entry: 1057 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel 1058 store i32 %val, i32 addrspace(1)* %out, align 4 1059 ret void 1060} 1061 1062define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( 1063; GFX6-LABEL: global_workgroup_seq_cst_ret_atomicrmw: 1064; GFX6: ; %bb.0: ; %entry 1065; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1066; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 1067; GFX6-NEXT: s_mov_b32 s7, 0xf000 1068; GFX6-NEXT: s_mov_b32 s6, -1 1069; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX6-NEXT: v_mov_b32_e32 v0, s0 1071; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1072; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 1073; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1074; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1075; GFX6-NEXT: s_endpgm 1076; 1077; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw: 1078; GFX7: ; %bb.0: ; %entry 1079; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1080; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 1081; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX7-NEXT: v_mov_b32_e32 v0, s0 1083; GFX7-NEXT: v_mov_b32_e32 v1, s1 1084; GFX7-NEXT: v_mov_b32_e32 v2, s2 1085; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 1087; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1088; GFX7-NEXT: flat_store_dword v[0:1], v2 1089; GFX7-NEXT: s_endpgm 1090; 1091; GFX10-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw: 1092; GFX10-WGP: ; %bb.0: ; %entry 1093; GFX10-WGP-NEXT: s_clause 0x1 1094; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 1095; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1096; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 1097; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1098; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 1099; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1100; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1101; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 1102; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1103; GFX10-WGP-NEXT: buffer_gl0_inv 1104; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 1105; GFX10-WGP-NEXT: s_endpgm 1106; 1107; GFX10-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: 1108; GFX10-CU: ; %bb.0: ; %entry 1109; GFX10-CU-NEXT: s_clause 0x1 1110; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 1111; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1112; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 1113; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 1115; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 1117; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1118; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 1119; GFX10-CU-NEXT: s_endpgm 1120; 1121; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_ret_atomicrmw: 1122; SKIP-CACHE-INV: ; %bb.0: ; %entry 1123; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1124; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 1125; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1126; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 
-1 1127; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1128; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1129; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1130; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 1131; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1132; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 1133; SKIP-CACHE-INV-NEXT: s_endpgm 1134 i32 addrspace(1)* %out, i32 %in) { 1135entry: 1136 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst 1137 store i32 %val, i32 addrspace(1)* %out, align 4 1138 ret void 1139} 1140 1141define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( 1142; GFX6-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: 1143; GFX6: ; %bb.0: ; %entry 1144; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1145; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1146; GFX6-NEXT: s_mov_b32 s7, 0xf000 1147; GFX6-NEXT: s_mov_b32 s6, -1 1148; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX6-NEXT: v_mov_b32_e32 v0, s0 1150; GFX6-NEXT: v_mov_b32_e32 v1, s1 1151; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1152; GFX6-NEXT: s_endpgm 1153; 1154; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: 1155; GFX7: ; %bb.0: ; %entry 1156; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1157; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1158; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1159; GFX7-NEXT: s_add_u32 s0, s0, 16 1160; GFX7-NEXT: s_addc_u32 s1, s1, 0 1161; GFX7-NEXT: v_mov_b32_e32 v0, s0 1162; GFX7-NEXT: v_mov_b32_e32 v2, s2 1163; GFX7-NEXT: v_mov_b32_e32 v1, s1 1164; GFX7-NEXT: v_mov_b32_e32 v3, s3 1165; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1166; GFX7-NEXT: s_endpgm 1167; 1168; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: 1169; GFX10-WGP: ; %bb.0: ; %entry 1170; GFX10-WGP-NEXT: s_clause 0x1 1171; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1172; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1173; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, 0 1174; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1175; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1176; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1177; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1178; GFX10-WGP-NEXT: s_endpgm 1179; 1180; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: 1181; GFX10-CU: ; %bb.0: ; %entry 1182; GFX10-CU-NEXT: s_clause 0x1 1183; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1184; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1185; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1186; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1188; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1189; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1190; GFX10-CU-NEXT: s_endpgm 1191; 1192; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: 1193; SKIP-CACHE-INV: ; %bb.0: ; %entry 1194; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1195; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1196; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1197; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1198; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1199; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1200; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1201; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1202; SKIP-CACHE-INV-NEXT: s_endpgm 1203 i32 addrspace(1)* %out, i32 %in, i32 %old) { 1204entry: 1205 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1206 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic 1207 ret void 1208} 1209 1210define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( 1211; GFX6-LABEL: global_workgroup_acquire_monotonic_cmpxchg: 1212; GFX6: ; %bb.0: ; %entry 1213; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1214; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1215; GFX6-NEXT: s_mov_b32 s7, 0xf000 1216; GFX6-NEXT: s_mov_b32 s6, -1 1217; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) 1218; GFX6-NEXT: v_mov_b32_e32 v0, s0 1219; GFX6-NEXT: v_mov_b32_e32 v1, s1 1220; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1221; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX6-NEXT: s_endpgm 1223; 1224; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: 1225; GFX7: ; %bb.0: ; %entry 1226; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1227; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1228; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1229; GFX7-NEXT: s_add_u32 s0, s0, 16 1230; GFX7-NEXT: s_addc_u32 s1, s1, 0 1231; GFX7-NEXT: v_mov_b32_e32 v0, s0 1232; GFX7-NEXT: v_mov_b32_e32 v2, s2 1233; GFX7-NEXT: v_mov_b32_e32 v1, s1 1234; GFX7-NEXT: v_mov_b32_e32 v3, s3 1235; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1236; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1237; GFX7-NEXT: s_endpgm 1238; 1239; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: 1240; GFX10-WGP: ; %bb.0: ; %entry 1241; GFX10-WGP-NEXT: s_clause 0x1 1242; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1243; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1244; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1245; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1247; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1248; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1249; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1250; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1251; GFX10-WGP-NEXT: buffer_gl0_inv 1252; GFX10-WGP-NEXT: s_endpgm 1253; 1254; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: 1255; GFX10-CU: ; %bb.0: ; %entry 1256; GFX10-CU-NEXT: s_clause 0x1 1257; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1258; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1259; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1260; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1262; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1263; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1264; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) 1265; GFX10-CU-NEXT: s_endpgm 1266; 1267; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_cmpxchg: 1268; SKIP-CACHE-INV: ; %bb.0: ; %entry 1269; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1270; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1271; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1272; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1273; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1274; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1275; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1276; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1277; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1278; SKIP-CACHE-INV-NEXT: s_endpgm 1279 i32 addrspace(1)* %out, i32 %in, i32 %old) { 1280entry: 1281 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1282 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic 1283 ret void 1284} 1285 1286define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( 1287; GFX6-LABEL: global_workgroup_release_monotonic_cmpxchg: 1288; GFX6: ; %bb.0: ; %entry 1289; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1290; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1291; GFX6-NEXT: s_mov_b32 s7, 0xf000 1292; GFX6-NEXT: s_mov_b32 s6, -1 1293; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX6-NEXT: v_mov_b32_e32 v0, s0 1295; GFX6-NEXT: v_mov_b32_e32 v1, s1 1296; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1297; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1298; GFX6-NEXT: s_endpgm 1299; 1300; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg: 1301; GFX7: ; %bb.0: ; %entry 1302; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1303; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1304; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1305; GFX7-NEXT: s_add_u32 s0, s0, 16 1306; GFX7-NEXT: s_addc_u32 s1, s1, 0 1307; GFX7-NEXT: v_mov_b32_e32 v0, s0 1308; GFX7-NEXT: v_mov_b32_e32 v2, s2 1309; GFX7-NEXT: v_mov_b32_e32 v1, s1 1310; GFX7-NEXT: v_mov_b32_e32 
v3, s3 1311; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1312; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1313; GFX7-NEXT: s_endpgm 1314; 1315; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: 1316; GFX10-WGP: ; %bb.0: ; %entry 1317; GFX10-WGP-NEXT: s_clause 0x1 1318; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1319; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1320; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1321; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1322; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1323; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1324; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1325; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1326; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1327; GFX10-WGP-NEXT: s_endpgm 1328; 1329; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: 1330; GFX10-CU: ; %bb.0: ; %entry 1331; GFX10-CU-NEXT: s_clause 0x1 1332; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1333; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1334; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1335; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1336; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1337; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1338; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1339; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1340; GFX10-CU-NEXT: s_endpgm 1341; 1342; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg: 1343; SKIP-CACHE-INV: ; %bb.0: ; %entry 1344; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1345; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1346; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1347; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1348; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1349; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1350; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1351; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1352; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1353; SKIP-CACHE-INV-NEXT: s_endpgm 1354 i32 addrspace(1)* %out, i32 %in, 
i32 %old) { 1355entry: 1356 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1357 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic 1358 ret void 1359} 1360 1361define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( 1362; GFX6-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: 1363; GFX6: ; %bb.0: ; %entry 1364; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1365; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1366; GFX6-NEXT: s_mov_b32 s7, 0xf000 1367; GFX6-NEXT: s_mov_b32 s6, -1 1368; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1369; GFX6-NEXT: v_mov_b32_e32 v0, s0 1370; GFX6-NEXT: v_mov_b32_e32 v1, s1 1371; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1372; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1373; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1374; GFX6-NEXT: s_endpgm 1375; 1376; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: 1377; GFX7: ; %bb.0: ; %entry 1378; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1379; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1380; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1381; GFX7-NEXT: s_add_u32 s0, s0, 16 1382; GFX7-NEXT: s_addc_u32 s1, s1, 0 1383; GFX7-NEXT: v_mov_b32_e32 v0, s0 1384; GFX7-NEXT: v_mov_b32_e32 v2, s2 1385; GFX7-NEXT: v_mov_b32_e32 v1, s1 1386; GFX7-NEXT: v_mov_b32_e32 v3, s3 1387; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1388; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1389; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1390; GFX7-NEXT: s_endpgm 1391; 1392; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: 1393; GFX10-WGP: ; %bb.0: ; %entry 1394; GFX10-WGP-NEXT: s_clause 0x1 1395; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1396; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1397; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1398; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1399; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1400; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1401; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1402; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 
0x0 1403; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1404; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1405; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1406; GFX10-WGP-NEXT: buffer_gl0_inv 1407; GFX10-WGP-NEXT: s_endpgm 1408; 1409; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: 1410; GFX10-CU: ; %bb.0: ; %entry 1411; GFX10-CU-NEXT: s_clause 0x1 1412; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1413; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1414; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1415; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1417; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1418; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1419; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1420; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1421; GFX10-CU-NEXT: s_endpgm 1422; 1423; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: 1424; SKIP-CACHE-INV: ; %bb.0: ; %entry 1425; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1426; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1427; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1428; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1429; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1430; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1431; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1432; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1433; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1434; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1435; SKIP-CACHE-INV-NEXT: s_endpgm 1436 i32 addrspace(1)* %out, i32 %in, i32 %old) { 1437entry: 1438 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1439 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic 1440 ret void 1441} 1442 1443define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( 1444; GFX6-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: 1445; GFX6: ; %bb.0: ; %entry 1446; GFX6-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 1447; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1448; GFX6-NEXT: s_mov_b32 s7, 0xf000 1449; GFX6-NEXT: s_mov_b32 s6, -1 1450; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX6-NEXT: v_mov_b32_e32 v0, s0 1452; GFX6-NEXT: v_mov_b32_e32 v1, s1 1453; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1454; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1455; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1456; GFX6-NEXT: s_endpgm 1457; 1458; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: 1459; GFX7: ; %bb.0: ; %entry 1460; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1461; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1462; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1463; GFX7-NEXT: s_add_u32 s0, s0, 16 1464; GFX7-NEXT: s_addc_u32 s1, s1, 0 1465; GFX7-NEXT: v_mov_b32_e32 v0, s0 1466; GFX7-NEXT: v_mov_b32_e32 v2, s2 1467; GFX7-NEXT: v_mov_b32_e32 v1, s1 1468; GFX7-NEXT: v_mov_b32_e32 v3, s3 1469; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1470; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1471; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1472; GFX7-NEXT: s_endpgm 1473; 1474; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: 1475; GFX10-WGP: ; %bb.0: ; %entry 1476; GFX10-WGP-NEXT: s_clause 0x1 1477; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1478; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1479; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1480; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1481; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1482; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1483; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1484; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1485; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1486; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1487; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1488; GFX10-WGP-NEXT: buffer_gl0_inv 1489; GFX10-WGP-NEXT: s_endpgm 1490; 1491; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: 1492; GFX10-CU: ; %bb.0: ; %entry 1493; GFX10-CU-NEXT: s_clause 0x1 1494; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x8 1495; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1496; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1497; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1498; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1499; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1500; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1501; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1502; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1503; GFX10-CU-NEXT: s_endpgm 1504; 1505; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: 1506; SKIP-CACHE-INV: ; %bb.0: ; %entry 1507; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1508; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1509; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1510; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1511; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1512; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1513; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1514; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1515; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1516; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1517; SKIP-CACHE-INV-NEXT: s_endpgm 1518 i32 addrspace(1)* %out, i32 %in, i32 %old) { 1519entry: 1520 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1521 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic 1522 ret void 1523} 1524 1525define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( 1526; GFX6-LABEL: global_workgroup_acquire_acquire_cmpxchg: 1527; GFX6: ; %bb.0: ; %entry 1528; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1529; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1530; GFX6-NEXT: s_mov_b32 s7, 0xf000 1531; GFX6-NEXT: s_mov_b32 s6, -1 1532; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1533; GFX6-NEXT: v_mov_b32_e32 v0, s0 1534; GFX6-NEXT: v_mov_b32_e32 v1, s1 1535; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1536; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1537; GFX6-NEXT: s_endpgm 1538; 1539; GFX7-LABEL: 
global_workgroup_acquire_acquire_cmpxchg: 1540; GFX7: ; %bb.0: ; %entry 1541; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1542; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1543; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1544; GFX7-NEXT: s_add_u32 s0, s0, 16 1545; GFX7-NEXT: s_addc_u32 s1, s1, 0 1546; GFX7-NEXT: v_mov_b32_e32 v0, s0 1547; GFX7-NEXT: v_mov_b32_e32 v2, s2 1548; GFX7-NEXT: v_mov_b32_e32 v1, s1 1549; GFX7-NEXT: v_mov_b32_e32 v3, s3 1550; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1551; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1552; GFX7-NEXT: s_endpgm 1553; 1554; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: 1555; GFX10-WGP: ; %bb.0: ; %entry 1556; GFX10-WGP-NEXT: s_clause 0x1 1557; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1558; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1559; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1560; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1561; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1562; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1563; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1564; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1565; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1566; GFX10-WGP-NEXT: buffer_gl0_inv 1567; GFX10-WGP-NEXT: s_endpgm 1568; 1569; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: 1570; GFX10-CU: ; %bb.0: ; %entry 1571; GFX10-CU-NEXT: s_clause 0x1 1572; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1573; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1574; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1575; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1576; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1577; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1578; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1579; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1580; GFX10-CU-NEXT: s_endpgm 1581; 1582; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_cmpxchg: 1583; SKIP-CACHE-INV: ; %bb.0: ; %entry 1584; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1585; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0xb 1586; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1587; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1588; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1589; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1590; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1591; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1592; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1593; SKIP-CACHE-INV-NEXT: s_endpgm 1594 i32 addrspace(1)* %out, i32 %in, i32 %old) { 1595entry: 1596 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1597 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire 1598 ret void 1599} 1600 1601define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( 1602; GFX6-LABEL: global_workgroup_release_acquire_cmpxchg: 1603; GFX6: ; %bb.0: ; %entry 1604; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1605; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1606; GFX6-NEXT: s_mov_b32 s7, 0xf000 1607; GFX6-NEXT: s_mov_b32 s6, -1 1608; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1609; GFX6-NEXT: v_mov_b32_e32 v0, s0 1610; GFX6-NEXT: v_mov_b32_e32 v1, s1 1611; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1612; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1613; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1614; GFX6-NEXT: s_endpgm 1615; 1616; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: 1617; GFX7: ; %bb.0: ; %entry 1618; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1619; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1620; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1621; GFX7-NEXT: s_add_u32 s0, s0, 16 1622; GFX7-NEXT: s_addc_u32 s1, s1, 0 1623; GFX7-NEXT: v_mov_b32_e32 v0, s0 1624; GFX7-NEXT: v_mov_b32_e32 v2, s2 1625; GFX7-NEXT: v_mov_b32_e32 v1, s1 1626; GFX7-NEXT: v_mov_b32_e32 v3, s3 1627; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1628; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1629; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1630; GFX7-NEXT: s_endpgm 1631; 1632; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: 
1633; GFX10-WGP: ; %bb.0: ; %entry 1634; GFX10-WGP-NEXT: s_clause 0x1 1635; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1636; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1637; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1638; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1639; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1640; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1641; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1642; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1643; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1644; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1645; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1646; GFX10-WGP-NEXT: buffer_gl0_inv 1647; GFX10-WGP-NEXT: s_endpgm 1648; 1649; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg: 1650; GFX10-CU: ; %bb.0: ; %entry 1651; GFX10-CU-NEXT: s_clause 0x1 1652; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1653; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1654; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1655; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1656; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1657; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1658; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1659; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1660; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1661; GFX10-CU-NEXT: s_endpgm 1662; 1663; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg: 1664; SKIP-CACHE-INV: ; %bb.0: ; %entry 1665; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1666; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1667; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1668; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1669; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1670; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1671; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1672; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1673; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1674; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1675; SKIP-CACHE-INV-NEXT: s_endpgm 1676 i32 
addrspace(1)* %out, i32 %in, i32 %old) { 1677entry: 1678 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1679 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire 1680 ret void 1681} 1682 1683define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( 1684; GFX6-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: 1685; GFX6: ; %bb.0: ; %entry 1686; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1687; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1688; GFX6-NEXT: s_mov_b32 s7, 0xf000 1689; GFX6-NEXT: s_mov_b32 s6, -1 1690; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1691; GFX6-NEXT: v_mov_b32_e32 v0, s0 1692; GFX6-NEXT: v_mov_b32_e32 v1, s1 1693; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1694; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1695; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1696; GFX6-NEXT: s_endpgm 1697; 1698; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: 1699; GFX7: ; %bb.0: ; %entry 1700; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1701; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1702; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1703; GFX7-NEXT: s_add_u32 s0, s0, 16 1704; GFX7-NEXT: s_addc_u32 s1, s1, 0 1705; GFX7-NEXT: v_mov_b32_e32 v0, s0 1706; GFX7-NEXT: v_mov_b32_e32 v2, s2 1707; GFX7-NEXT: v_mov_b32_e32 v1, s1 1708; GFX7-NEXT: v_mov_b32_e32 v3, s3 1709; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1710; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1711; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1712; GFX7-NEXT: s_endpgm 1713; 1714; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: 1715; GFX10-WGP: ; %bb.0: ; %entry 1716; GFX10-WGP-NEXT: s_clause 0x1 1717; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1718; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1719; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1720; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1721; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1722; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1723; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1724; GFX10-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 1725; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1726; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1728; GFX10-WGP-NEXT: buffer_gl0_inv 1729; GFX10-WGP-NEXT: s_endpgm 1730; 1731; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: 1732; GFX10-CU: ; %bb.0: ; %entry 1733; GFX10-CU-NEXT: s_clause 0x1 1734; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1735; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1736; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1737; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1738; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1739; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1740; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1741; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1742; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1743; GFX10-CU-NEXT: s_endpgm 1744; 1745; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: 1746; SKIP-CACHE-INV: ; %bb.0: ; %entry 1747; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1748; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1749; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1750; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1751; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1752; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1753; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1754; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1755; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1756; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1757; SKIP-CACHE-INV-NEXT: s_endpgm 1758 i32 addrspace(1)* %out, i32 %in, i32 %old) { 1759entry: 1760 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1761 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire 1762 ret void 1763} 1764 1765define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( 1766; GFX6-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: 1767; GFX6: ; %bb.0: ; %entry 1768; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x9 1769; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1770; GFX6-NEXT: s_mov_b32 s7, 0xf000 1771; GFX6-NEXT: s_mov_b32 s6, -1 1772; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1773; GFX6-NEXT: v_mov_b32_e32 v0, s0 1774; GFX6-NEXT: v_mov_b32_e32 v1, s1 1775; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1776; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1777; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1778; GFX6-NEXT: s_endpgm 1779; 1780; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: 1781; GFX7: ; %bb.0: ; %entry 1782; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1783; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1784; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1785; GFX7-NEXT: s_add_u32 s0, s0, 16 1786; GFX7-NEXT: s_addc_u32 s1, s1, 0 1787; GFX7-NEXT: v_mov_b32_e32 v0, s0 1788; GFX7-NEXT: v_mov_b32_e32 v2, s2 1789; GFX7-NEXT: v_mov_b32_e32 v1, s1 1790; GFX7-NEXT: v_mov_b32_e32 v3, s3 1791; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1792; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1793; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1794; GFX7-NEXT: s_endpgm 1795; 1796; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: 1797; GFX10-WGP: ; %bb.0: ; %entry 1798; GFX10-WGP-NEXT: s_clause 0x1 1799; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1800; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1801; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1802; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1803; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1804; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1805; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1806; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1807; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1808; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1809; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1810; GFX10-WGP-NEXT: buffer_gl0_inv 1811; GFX10-WGP-NEXT: s_endpgm 1812; 1813; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: 1814; GFX10-CU: ; %bb.0: ; %entry 1815; GFX10-CU-NEXT: s_clause 0x1 1816; GFX10-CU-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 1817; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1818; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1819; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1820; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1821; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1822; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1823; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1824; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1825; GFX10-CU-NEXT: s_endpgm 1826; 1827; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: 1828; SKIP-CACHE-INV: ; %bb.0: ; %entry 1829; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1830; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1831; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1832; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1833; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1834; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1835; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1836; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1837; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1838; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1839; SKIP-CACHE-INV-NEXT: s_endpgm 1840 i32 addrspace(1)* %out, i32 %in, i32 %old) { 1841entry: 1842 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1843 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire 1844 ret void 1845} 1846 1847define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( 1848; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: 1849; GFX6: ; %bb.0: ; %entry 1850; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1851; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1852; GFX6-NEXT: s_mov_b32 s7, 0xf000 1853; GFX6-NEXT: s_mov_b32 s6, -1 1854; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1855; GFX6-NEXT: v_mov_b32_e32 v0, s0 1856; GFX6-NEXT: v_mov_b32_e32 v1, s1 1857; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1858; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1859; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1860; GFX6-NEXT: 
s_endpgm 1861; 1862; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: 1863; GFX7: ; %bb.0: ; %entry 1864; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1865; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1866; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1867; GFX7-NEXT: s_add_u32 s0, s0, 16 1868; GFX7-NEXT: s_addc_u32 s1, s1, 0 1869; GFX7-NEXT: v_mov_b32_e32 v0, s0 1870; GFX7-NEXT: v_mov_b32_e32 v2, s2 1871; GFX7-NEXT: v_mov_b32_e32 v1, s1 1872; GFX7-NEXT: v_mov_b32_e32 v3, s3 1873; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1874; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 1875; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1876; GFX7-NEXT: s_endpgm 1877; 1878; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: 1879; GFX10-WGP: ; %bb.0: ; %entry 1880; GFX10-WGP-NEXT: s_clause 0x1 1881; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1882; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1883; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1884; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1885; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1886; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1887; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1888; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1889; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1890; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1891; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 1892; GFX10-WGP-NEXT: buffer_gl0_inv 1893; GFX10-WGP-NEXT: s_endpgm 1894; 1895; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: 1896; GFX10-CU: ; %bb.0: ; %entry 1897; GFX10-CU-NEXT: s_clause 0x1 1898; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1899; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1900; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1901; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1902; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1903; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1904; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1905; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 1906; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1907; GFX10-CU-NEXT: s_endpgm 1908; 
1909; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: 1910; SKIP-CACHE-INV: ; %bb.0: ; %entry 1911; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1912; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1913; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1914; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1915; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1916; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 1917; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 1918; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1919; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 1920; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1921; SKIP-CACHE-INV-NEXT: s_endpgm 1922 i32 addrspace(1)* %out, i32 %in, i32 %old) { 1923entry: 1924 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 1925 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst 1926 ret void 1927} 1928 1929define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( 1930; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: 1931; GFX6: ; %bb.0: ; %entry 1932; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1933; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1934; GFX6-NEXT: s_mov_b32 s7, 0xf000 1935; GFX6-NEXT: s_mov_b32 s6, -1 1936; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1937; GFX6-NEXT: v_mov_b32_e32 v0, s0 1938; GFX6-NEXT: v_mov_b32_e32 v1, s1 1939; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 1940; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1941; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1942; GFX6-NEXT: s_endpgm 1943; 1944; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: 1945; GFX7: ; %bb.0: ; %entry 1946; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1947; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 1948; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX7-NEXT: s_add_u32 s4, s0, 16 1950; GFX7-NEXT: s_addc_u32 s5, s1, 0 1951; GFX7-NEXT: v_mov_b32_e32 v0, s4 1952; GFX7-NEXT: v_mov_b32_e32 
v2, s2 1953; GFX7-NEXT: v_mov_b32_e32 v1, s5 1954; GFX7-NEXT: v_mov_b32_e32 v3, s3 1955; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1956; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1957; GFX7-NEXT: v_mov_b32_e32 v0, s0 1958; GFX7-NEXT: v_mov_b32_e32 v1, s1 1959; GFX7-NEXT: s_waitcnt vmcnt(0) 1960; GFX7-NEXT: flat_store_dword v[0:1], v2 1961; GFX7-NEXT: s_endpgm 1962; 1963; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: 1964; GFX10-WGP: ; %bb.0: ; %entry 1965; GFX10-WGP-NEXT: s_clause 0x1 1966; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1967; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1968; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 1969; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 1970; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 1971; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 1972; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 1973; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1974; GFX10-WGP-NEXT: buffer_gl0_inv 1975; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 1976; GFX10-WGP-NEXT: s_endpgm 1977; 1978; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: 1979; GFX10-CU: ; %bb.0: ; %entry 1980; GFX10-CU-NEXT: s_clause 0x1 1981; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1982; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1983; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 1984; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 1985; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 1986; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 1987; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 1988; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1989; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 1990; GFX10-CU-NEXT: s_endpgm 1991; 1992; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: 1993; SKIP-CACHE-INV: ; %bb.0: ; %entry 1994; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1995; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 1996; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 1997; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 1998; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 1999; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2000; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2001; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2002; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2003; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2004; SKIP-CACHE-INV-NEXT: s_endpgm 2005 i32 addrspace(1)* %out, i32 %in, i32 %old) { 2006entry: 2007 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 2008 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic 2009 %val0 = extractvalue { i32, i1 } %val, 0 2010 store i32 %val0, i32 addrspace(1)* %out, align 4 2011 ret void 2012} 2013 2014define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( 2015; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: 2016; GFX6: ; %bb.0: ; %entry 2017; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2018; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2019; GFX6-NEXT: s_mov_b32 s7, 0xf000 2020; GFX6-NEXT: s_mov_b32 s6, -1 2021; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2022; GFX6-NEXT: v_mov_b32_e32 v0, s0 2023; GFX6-NEXT: v_mov_b32_e32 v1, s1 2024; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2025; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2026; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2027; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2028; GFX6-NEXT: s_endpgm 2029; 2030; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: 2031; GFX7: ; %bb.0: ; %entry 2032; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2033; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2034; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2035; GFX7-NEXT: s_add_u32 s4, s0, 16 2036; GFX7-NEXT: s_addc_u32 s5, s1, 0 2037; GFX7-NEXT: v_mov_b32_e32 v0, s4 2038; GFX7-NEXT: v_mov_b32_e32 v2, s2 2039; GFX7-NEXT: v_mov_b32_e32 v1, s5 2040; GFX7-NEXT: v_mov_b32_e32 v3, s3 2041; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) 2042; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2043; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2044; GFX7-NEXT: v_mov_b32_e32 v0, s0 2045; GFX7-NEXT: v_mov_b32_e32 v1, s1 2046; GFX7-NEXT: s_waitcnt vmcnt(0) 2047; GFX7-NEXT: flat_store_dword v[0:1], v2 2048; GFX7-NEXT: s_endpgm 2049; 2050; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: 2051; GFX10-WGP: ; %bb.0: ; %entry 2052; GFX10-WGP-NEXT: s_clause 0x1 2053; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2054; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2055; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 2056; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2057; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2058; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2059; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2060; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2061; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2062; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2063; GFX10-WGP-NEXT: buffer_gl0_inv 2064; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 2065; GFX10-WGP-NEXT: s_endpgm 2066; 2067; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: 2068; GFX10-CU: ; %bb.0: ; %entry 2069; GFX10-CU-NEXT: s_clause 0x1 2070; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2071; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2072; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 2073; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2074; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2075; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2076; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2077; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2078; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2079; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 2080; GFX10-CU-NEXT: s_endpgm 2081; 2082; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: 2083; SKIP-CACHE-INV: ; %bb.0: ; %entry 2084; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2085; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xb 2086; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 2087; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 2088; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2089; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2090; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2091; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2092; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2093; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2094; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2095; SKIP-CACHE-INV-NEXT: s_endpgm 2096 i32 addrspace(1)* %out, i32 %in, i32 %old) { 2097entry: 2098 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 2099 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic 2100 %val0 = extractvalue { i32, i1 } %val, 0 2101 store i32 %val0, i32 addrspace(1)* %out, align 4 2102 ret void 2103} 2104 2105define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( 2106; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: 2107; GFX6: ; %bb.0: ; %entry 2108; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2109; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2110; GFX6-NEXT: s_mov_b32 s7, 0xf000 2111; GFX6-NEXT: s_mov_b32 s6, -1 2112; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2113; GFX6-NEXT: v_mov_b32_e32 v0, s0 2114; GFX6-NEXT: v_mov_b32_e32 v1, s1 2115; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2116; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2117; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2118; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2119; GFX6-NEXT: s_endpgm 2120; 2121; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: 2122; GFX7: ; %bb.0: ; %entry 2123; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2124; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2125; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2126; GFX7-NEXT: s_add_u32 s4, s0, 16 2127; GFX7-NEXT: s_addc_u32 s5, s1, 0 2128; GFX7-NEXT: v_mov_b32_e32 v0, s4 2129; GFX7-NEXT: v_mov_b32_e32 v2, 
s2 2130; GFX7-NEXT: v_mov_b32_e32 v1, s5 2131; GFX7-NEXT: v_mov_b32_e32 v3, s3 2132; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2133; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2134; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2135; GFX7-NEXT: v_mov_b32_e32 v0, s0 2136; GFX7-NEXT: v_mov_b32_e32 v1, s1 2137; GFX7-NEXT: s_waitcnt vmcnt(0) 2138; GFX7-NEXT: flat_store_dword v[0:1], v2 2139; GFX7-NEXT: s_endpgm 2140; 2141; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: 2142; GFX10-WGP: ; %bb.0: ; %entry 2143; GFX10-WGP-NEXT: s_clause 0x1 2144; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2145; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2146; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 2147; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2148; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2149; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2150; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2151; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2152; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2153; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2154; GFX10-WGP-NEXT: buffer_gl0_inv 2155; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 2156; GFX10-WGP-NEXT: s_endpgm 2157; 2158; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: 2159; GFX10-CU: ; %bb.0: ; %entry 2160; GFX10-CU-NEXT: s_clause 0x1 2161; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2162; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2163; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 2164; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2165; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2166; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2167; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2168; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2169; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2170; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 2171; GFX10-CU-NEXT: s_endpgm 2172; 2173; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: 2174; SKIP-CACHE-INV: ; %bb.0: ; %entry 2175; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2176; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2177; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 2178; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 2179; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2180; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2181; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2182; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2183; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2184; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2185; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2186; SKIP-CACHE-INV-NEXT: s_endpgm 2187 i32 addrspace(1)* %out, i32 %in, i32 %old) { 2188entry: 2189 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 2190 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic 2191 %val0 = extractvalue { i32, i1 } %val, 0 2192 store i32 %val0, i32 addrspace(1)* %out, align 4 2193 ret void 2194} 2195 2196define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( 2197; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: 2198; GFX6: ; %bb.0: ; %entry 2199; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2200; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2201; GFX6-NEXT: s_mov_b32 s7, 0xf000 2202; GFX6-NEXT: s_mov_b32 s6, -1 2203; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2204; GFX6-NEXT: v_mov_b32_e32 v0, s0 2205; GFX6-NEXT: v_mov_b32_e32 v1, s1 2206; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2207; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2208; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2209; GFX6-NEXT: s_endpgm 2210; 2211; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: 2212; GFX7: ; %bb.0: ; %entry 2213; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2214; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2215; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2216; GFX7-NEXT: s_add_u32 s4, s0, 16 2217; GFX7-NEXT: s_addc_u32 s5, s1, 0 2218; 
GFX7-NEXT: v_mov_b32_e32 v0, s4 2219; GFX7-NEXT: v_mov_b32_e32 v2, s2 2220; GFX7-NEXT: v_mov_b32_e32 v1, s5 2221; GFX7-NEXT: v_mov_b32_e32 v3, s3 2222; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2223; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2224; GFX7-NEXT: v_mov_b32_e32 v0, s0 2225; GFX7-NEXT: v_mov_b32_e32 v1, s1 2226; GFX7-NEXT: s_waitcnt vmcnt(0) 2227; GFX7-NEXT: flat_store_dword v[0:1], v2 2228; GFX7-NEXT: s_endpgm 2229; 2230; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: 2231; GFX10-WGP: ; %bb.0: ; %entry 2232; GFX10-WGP-NEXT: s_clause 0x1 2233; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2234; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2235; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 2236; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2237; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2238; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2239; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2240; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2241; GFX10-WGP-NEXT: buffer_gl0_inv 2242; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 2243; GFX10-WGP-NEXT: s_endpgm 2244; 2245; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: 2246; GFX10-CU: ; %bb.0: ; %entry 2247; GFX10-CU-NEXT: s_clause 0x1 2248; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2249; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2250; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 2251; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2252; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2253; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2254; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2255; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2256; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 2257; GFX10-CU-NEXT: s_endpgm 2258; 2259; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: 2260; SKIP-CACHE-INV: ; %bb.0: ; %entry 2261; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2262; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 
2263; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 2264; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 2265; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2266; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2267; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2268; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2269; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2270; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2271; SKIP-CACHE-INV-NEXT: s_endpgm 2272 i32 addrspace(1)* %out, i32 %in, i32 %old) { 2273entry: 2274 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 2275 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire 2276 %val0 = extractvalue { i32, i1 } %val, 0 2277 store i32 %val0, i32 addrspace(1)* %out, align 4 2278 ret void 2279} 2280 2281define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( 2282; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg: 2283; GFX6: ; %bb.0: ; %entry 2284; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2285; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2286; GFX6-NEXT: s_mov_b32 s7, 0xf000 2287; GFX6-NEXT: s_mov_b32 s6, -1 2288; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2289; GFX6-NEXT: v_mov_b32_e32 v0, s0 2290; GFX6-NEXT: v_mov_b32_e32 v1, s1 2291; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2292; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2293; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2294; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2295; GFX6-NEXT: s_endpgm 2296; 2297; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: 2298; GFX7: ; %bb.0: ; %entry 2299; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2300; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2301; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX7-NEXT: s_add_u32 s4, s0, 16 2303; GFX7-NEXT: s_addc_u32 s5, s1, 0 2304; GFX7-NEXT: v_mov_b32_e32 v0, s4 2305; GFX7-NEXT: v_mov_b32_e32 v2, s2 2306; GFX7-NEXT: v_mov_b32_e32 v1, s5 2307; GFX7-NEXT: 
v_mov_b32_e32 v3, s3 2308; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2309; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2310; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2311; GFX7-NEXT: v_mov_b32_e32 v0, s0 2312; GFX7-NEXT: v_mov_b32_e32 v1, s1 2313; GFX7-NEXT: s_waitcnt vmcnt(0) 2314; GFX7-NEXT: flat_store_dword v[0:1], v2 2315; GFX7-NEXT: s_endpgm 2316; 2317; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: 2318; GFX10-WGP: ; %bb.0: ; %entry 2319; GFX10-WGP-NEXT: s_clause 0x1 2320; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2321; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2322; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 2323; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2324; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2325; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2326; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2327; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2328; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2329; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2330; GFX10-WGP-NEXT: buffer_gl0_inv 2331; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 2332; GFX10-WGP-NEXT: s_endpgm 2333; 2334; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: 2335; GFX10-CU: ; %bb.0: ; %entry 2336; GFX10-CU-NEXT: s_clause 0x1 2337; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2338; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2339; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 2340; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2341; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2342; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2343; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2344; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2345; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2346; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 2347; GFX10-CU-NEXT: s_endpgm 2348; 2349; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg: 2350; SKIP-CACHE-INV: ; %bb.0: ; %entry 2351; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2352; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2353; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 2354; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 2355; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2356; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2357; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2358; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2359; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2360; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2361; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2362; SKIP-CACHE-INV-NEXT: s_endpgm 2363 i32 addrspace(1)* %out, i32 %in, i32 %old) { 2364entry: 2365 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 2366 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire 2367 %val0 = extractvalue { i32, i1 } %val, 0 2368 store i32 %val0, i32 addrspace(1)* %out, align 4 2369 ret void 2370} 2371 2372define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( 2373; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: 2374; GFX6: ; %bb.0: ; %entry 2375; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2376; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2377; GFX6-NEXT: s_mov_b32 s7, 0xf000 2378; GFX6-NEXT: s_mov_b32 s6, -1 2379; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2380; GFX6-NEXT: v_mov_b32_e32 v0, s0 2381; GFX6-NEXT: v_mov_b32_e32 v1, s1 2382; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2383; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2384; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2385; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2386; GFX6-NEXT: s_endpgm 2387; 2388; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: 2389; GFX7: ; %bb.0: ; %entry 2390; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2391; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2392; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2393; GFX7-NEXT: s_add_u32 s4, s0, 16 2394; GFX7-NEXT: s_addc_u32 s5, s1, 0 2395; GFX7-NEXT: v_mov_b32_e32 v0, s4 
2396; GFX7-NEXT: v_mov_b32_e32 v2, s2 2397; GFX7-NEXT: v_mov_b32_e32 v1, s5 2398; GFX7-NEXT: v_mov_b32_e32 v3, s3 2399; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2400; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2401; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2402; GFX7-NEXT: v_mov_b32_e32 v0, s0 2403; GFX7-NEXT: v_mov_b32_e32 v1, s1 2404; GFX7-NEXT: s_waitcnt vmcnt(0) 2405; GFX7-NEXT: flat_store_dword v[0:1], v2 2406; GFX7-NEXT: s_endpgm 2407; 2408; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: 2409; GFX10-WGP: ; %bb.0: ; %entry 2410; GFX10-WGP-NEXT: s_clause 0x1 2411; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2412; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2413; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 2414; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2415; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2416; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2417; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2418; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2419; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2420; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2421; GFX10-WGP-NEXT: buffer_gl0_inv 2422; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 2423; GFX10-WGP-NEXT: s_endpgm 2424; 2425; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: 2426; GFX10-CU: ; %bb.0: ; %entry 2427; GFX10-CU-NEXT: s_clause 0x1 2428; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2429; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2430; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 2431; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2432; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2433; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2434; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2435; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2436; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2437; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 2438; GFX10-CU-NEXT: s_endpgm 2439; 2440; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: 2441; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 2442; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2443; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2444; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 2445; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 2446; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2447; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2448; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2449; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2450; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2451; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2452; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2453; SKIP-CACHE-INV-NEXT: s_endpgm 2454 i32 addrspace(1)* %out, i32 %in, i32 %old) { 2455entry: 2456 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 2457 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire 2458 %val0 = extractvalue { i32, i1 } %val, 0 2459 store i32 %val0, i32 addrspace(1)* %out, align 4 2460 ret void 2461} 2462 2463define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( 2464; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: 2465; GFX6: ; %bb.0: ; %entry 2466; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2467; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2468; GFX6-NEXT: s_mov_b32 s7, 0xf000 2469; GFX6-NEXT: s_mov_b32 s6, -1 2470; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2471; GFX6-NEXT: v_mov_b32_e32 v0, s0 2472; GFX6-NEXT: v_mov_b32_e32 v1, s1 2473; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2474; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2475; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2476; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2477; GFX6-NEXT: s_endpgm 2478; 2479; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: 2480; GFX7: ; %bb.0: ; %entry 2481; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2482; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2483; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2484; 
GFX7-NEXT: s_add_u32 s4, s0, 16 2485; GFX7-NEXT: s_addc_u32 s5, s1, 0 2486; GFX7-NEXT: v_mov_b32_e32 v0, s4 2487; GFX7-NEXT: v_mov_b32_e32 v2, s2 2488; GFX7-NEXT: v_mov_b32_e32 v1, s5 2489; GFX7-NEXT: v_mov_b32_e32 v3, s3 2490; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2491; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2492; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2493; GFX7-NEXT: v_mov_b32_e32 v0, s0 2494; GFX7-NEXT: v_mov_b32_e32 v1, s1 2495; GFX7-NEXT: s_waitcnt vmcnt(0) 2496; GFX7-NEXT: flat_store_dword v[0:1], v2 2497; GFX7-NEXT: s_endpgm 2498; 2499; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: 2500; GFX10-WGP: ; %bb.0: ; %entry 2501; GFX10-WGP-NEXT: s_clause 0x1 2502; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2503; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2504; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 2505; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2506; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2507; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2508; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2509; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2510; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2511; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2512; GFX10-WGP-NEXT: buffer_gl0_inv 2513; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 2514; GFX10-WGP-NEXT: s_endpgm 2515; 2516; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: 2517; GFX10-CU: ; %bb.0: ; %entry 2518; GFX10-CU-NEXT: s_clause 0x1 2519; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2520; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2521; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 2522; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2523; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2524; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2525; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2526; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2527; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2528; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 2529; GFX10-CU-NEXT: 
s_endpgm 2530; 2531; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: 2532; SKIP-CACHE-INV: ; %bb.0: ; %entry 2533; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2534; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2535; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 2536; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 2537; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2538; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2539; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2540; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2541; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2542; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2543; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2544; SKIP-CACHE-INV-NEXT: s_endpgm 2545 i32 addrspace(1)* %out, i32 %in, i32 %old) { 2546entry: 2547 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 2548 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire 2549 %val0 = extractvalue { i32, i1 } %val, 0 2550 store i32 %val0, i32 addrspace(1)* %out, align 4 2551 ret void 2552} 2553 2554define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( 2555; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: 2556; GFX6: ; %bb.0: ; %entry 2557; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2558; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2559; GFX6-NEXT: s_mov_b32 s7, 0xf000 2560; GFX6-NEXT: s_mov_b32 s6, -1 2561; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2562; GFX6-NEXT: v_mov_b32_e32 v0, s0 2563; GFX6-NEXT: v_mov_b32_e32 v1, s1 2564; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2565; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2566; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2567; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2568; GFX6-NEXT: s_endpgm 2569; 2570; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: 2571; GFX7: ; %bb.0: ; %entry 2572; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2573; 
GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 2574; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2575; GFX7-NEXT: s_add_u32 s4, s0, 16 2576; GFX7-NEXT: s_addc_u32 s5, s1, 0 2577; GFX7-NEXT: v_mov_b32_e32 v0, s4 2578; GFX7-NEXT: v_mov_b32_e32 v2, s2 2579; GFX7-NEXT: v_mov_b32_e32 v1, s5 2580; GFX7-NEXT: v_mov_b32_e32 v3, s3 2581; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2582; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2583; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2584; GFX7-NEXT: v_mov_b32_e32 v0, s0 2585; GFX7-NEXT: v_mov_b32_e32 v1, s1 2586; GFX7-NEXT: s_waitcnt vmcnt(0) 2587; GFX7-NEXT: flat_store_dword v[0:1], v2 2588; GFX7-NEXT: s_endpgm 2589; 2590; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: 2591; GFX10-WGP: ; %bb.0: ; %entry 2592; GFX10-WGP-NEXT: s_clause 0x1 2593; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2594; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2595; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 2596; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2597; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 2598; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 2599; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2600; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2601; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2602; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2603; GFX10-WGP-NEXT: buffer_gl0_inv 2604; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 2605; GFX10-WGP-NEXT: s_endpgm 2606; 2607; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: 2608; GFX10-CU: ; %bb.0: ; %entry 2609; GFX10-CU-NEXT: s_clause 0x1 2610; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2611; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2612; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 2613; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2614; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 2615; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 2616; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2617; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 2618; GFX10-CU-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 2619; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 2620; GFX10-CU-NEXT: s_endpgm 2621; 2622; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: 2623; SKIP-CACHE-INV: ; %bb.0: ; %entry 2624; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2625; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2626; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 2627; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 2628; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2629; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 2630; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 2631; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2632; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 2633; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2634; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2635; SKIP-CACHE-INV-NEXT: s_endpgm 2636 i32 addrspace(1)* %out, i32 %in, i32 %old) { 2637entry: 2638 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 2639 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst 2640 %val0 = extractvalue { i32, i1 } %val, 0 2641 store i32 %val0, i32 addrspace(1)* %out, align 4 2642 ret void 2643} 2644 2645define amdgpu_kernel void @global_workgroup_one_as_unordered_load( 2646; GFX6-LABEL: global_workgroup_one_as_unordered_load: 2647; GFX6: ; %bb.0: ; %entry 2648; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2649; GFX6-NEXT: s_mov_b32 s3, 0xf000 2650; GFX6-NEXT: s_mov_b32 s2, -1 2651; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2652; GFX6-NEXT: s_mov_b32 s0, s4 2653; GFX6-NEXT: s_mov_b32 s1, s5 2654; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 2655; GFX6-NEXT: s_mov_b32 s4, s6 2656; GFX6-NEXT: s_mov_b32 s5, s7 2657; GFX6-NEXT: s_mov_b32 s6, s2 2658; GFX6-NEXT: s_mov_b32 s7, s3 2659; GFX6-NEXT: s_waitcnt vmcnt(0) 2660; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2661; GFX6-NEXT: s_endpgm 2662; 2663; GFX7-LABEL: global_workgroup_one_as_unordered_load: 2664; 
GFX7: ; %bb.0: ; %entry 2665; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2666; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2667; GFX7-NEXT: v_mov_b32_e32 v0, s0 2668; GFX7-NEXT: v_mov_b32_e32 v1, s1 2669; GFX7-NEXT: flat_load_dword v0, v[0:1] 2670; GFX7-NEXT: v_mov_b32_e32 v2, s2 2671; GFX7-NEXT: v_mov_b32_e32 v3, s3 2672; GFX7-NEXT: s_waitcnt vmcnt(0) 2673; GFX7-NEXT: flat_store_dword v[2:3], v0 2674; GFX7-NEXT: s_endpgm 2675; 2676; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load: 2677; GFX10-WGP: ; %bb.0: ; %entry 2678; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2679; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 2680; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2681; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] 2682; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 2683; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] 2684; GFX10-WGP-NEXT: s_endpgm 2685; 2686; GFX10-CU-LABEL: global_workgroup_one_as_unordered_load: 2687; GFX10-CU: ; %bb.0: ; %entry 2688; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2689; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 2690; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2691; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] 2692; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2693; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] 2694; GFX10-CU-NEXT: s_endpgm 2695; 2696; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_load: 2697; SKIP-CACHE-INV: ; %bb.0: ; %entry 2698; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2699; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 2700; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 2701; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2702; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 2703; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 2704; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 2705; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 2706; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 2707; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 2708; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 2709; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2710; SKIP-CACHE-INV-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 2711; SKIP-CACHE-INV-NEXT: s_endpgm 2712 i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 2713entry: 2714 %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4 2715 store i32 %val, i32 addrspace(1)* %out 2716 ret void 2717} 2718 2719define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( 2720; GFX6-LABEL: global_workgroup_one_as_monotonic_load: 2721; GFX6: ; %bb.0: ; %entry 2722; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2723; GFX6-NEXT: s_mov_b32 s3, 0xf000 2724; GFX6-NEXT: s_mov_b32 s2, -1 2725; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2726; GFX6-NEXT: s_mov_b32 s0, s4 2727; GFX6-NEXT: s_mov_b32 s1, s5 2728; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 2729; GFX6-NEXT: s_mov_b32 s4, s6 2730; GFX6-NEXT: s_mov_b32 s5, s7 2731; GFX6-NEXT: s_mov_b32 s6, s2 2732; GFX6-NEXT: s_mov_b32 s7, s3 2733; GFX6-NEXT: s_waitcnt vmcnt(0) 2734; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2735; GFX6-NEXT: s_endpgm 2736; 2737; GFX7-LABEL: global_workgroup_one_as_monotonic_load: 2738; GFX7: ; %bb.0: ; %entry 2739; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2740; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2741; GFX7-NEXT: v_mov_b32_e32 v0, s0 2742; GFX7-NEXT: v_mov_b32_e32 v1, s1 2743; GFX7-NEXT: flat_load_dword v0, v[0:1] 2744; GFX7-NEXT: v_mov_b32_e32 v2, s2 2745; GFX7-NEXT: v_mov_b32_e32 v3, s3 2746; GFX7-NEXT: s_waitcnt vmcnt(0) 2747; GFX7-NEXT: flat_store_dword v[2:3], v0 2748; GFX7-NEXT: s_endpgm 2749; 2750; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load: 2751; GFX10-WGP: ; %bb.0: ; %entry 2752; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2753; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 2754; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2755; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc 2756; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 2757; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] 2758; GFX10-WGP-NEXT: s_endpgm 2759; 2760; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_load: 2761; 
GFX10-CU: ; %bb.0: ; %entry 2762; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2763; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 2764; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2765; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] 2766; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2767; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] 2768; GFX10-CU-NEXT: s_endpgm 2769; 2770; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_load: 2771; SKIP-CACHE-INV: ; %bb.0: ; %entry 2772; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2773; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 2774; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 2775; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2776; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 2777; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 2778; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 2779; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 2780; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 2781; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 2782; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 2783; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2784; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2785; SKIP-CACHE-INV-NEXT: s_endpgm 2786 i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 2787entry: 2788 %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4 2789 store i32 %val, i32 addrspace(1)* %out 2790 ret void 2791} 2792 2793define amdgpu_kernel void @global_workgroup_one_as_acquire_load( 2794; GFX6-LABEL: global_workgroup_one_as_acquire_load: 2795; GFX6: ; %bb.0: ; %entry 2796; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2797; GFX6-NEXT: s_mov_b32 s3, 0xf000 2798; GFX6-NEXT: s_mov_b32 s2, -1 2799; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2800; GFX6-NEXT: s_mov_b32 s0, s4 2801; GFX6-NEXT: s_mov_b32 s1, s5 2802; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 2803; GFX6-NEXT: s_mov_b32 s4, s6 2804; GFX6-NEXT: s_mov_b32 s5, s7 2805; GFX6-NEXT: s_mov_b32 s6, s2 2806; GFX6-NEXT: s_mov_b32 s7, s3 2807; GFX6-NEXT: s_waitcnt vmcnt(0) 2808; GFX6-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 2809; GFX6-NEXT: s_endpgm 2810; 2811; GFX7-LABEL: global_workgroup_one_as_acquire_load: 2812; GFX7: ; %bb.0: ; %entry 2813; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2814; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2815; GFX7-NEXT: v_mov_b32_e32 v0, s0 2816; GFX7-NEXT: v_mov_b32_e32 v1, s1 2817; GFX7-NEXT: flat_load_dword v0, v[0:1] 2818; GFX7-NEXT: v_mov_b32_e32 v2, s2 2819; GFX7-NEXT: v_mov_b32_e32 v3, s3 2820; GFX7-NEXT: s_waitcnt vmcnt(0) 2821; GFX7-NEXT: flat_store_dword v[2:3], v0 2822; GFX7-NEXT: s_endpgm 2823; 2824; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load: 2825; GFX10-WGP: ; %bb.0: ; %entry 2826; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2827; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 2828; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2829; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc 2830; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 2831; GFX10-WGP-NEXT: buffer_gl0_inv 2832; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] 2833; GFX10-WGP-NEXT: s_endpgm 2834; 2835; GFX10-CU-LABEL: global_workgroup_one_as_acquire_load: 2836; GFX10-CU: ; %bb.0: ; %entry 2837; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2838; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 2839; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2840; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] 2841; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2842; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] 2843; GFX10-CU-NEXT: s_endpgm 2844; 2845; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_load: 2846; SKIP-CACHE-INV: ; %bb.0: ; %entry 2847; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2848; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 2849; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 2850; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2851; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 2852; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 2853; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 2854; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 2855; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 2856; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 2857; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 2858; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2859; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2860; SKIP-CACHE-INV-NEXT: s_endpgm 2861 i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 2862entry: 2863 %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4 2864 store i32 %val, i32 addrspace(1)* %out 2865 ret void 2866} 2867 2868define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( 2869; GFX6-LABEL: global_workgroup_one_as_seq_cst_load: 2870; GFX6: ; %bb.0: ; %entry 2871; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2872; GFX6-NEXT: s_mov_b32 s3, 0xf000 2873; GFX6-NEXT: s_mov_b32 s2, -1 2874; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2875; GFX6-NEXT: s_mov_b32 s0, s4 2876; GFX6-NEXT: s_mov_b32 s1, s5 2877; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 2878; GFX6-NEXT: s_mov_b32 s4, s6 2879; GFX6-NEXT: s_mov_b32 s5, s7 2880; GFX6-NEXT: s_mov_b32 s6, s2 2881; GFX6-NEXT: s_mov_b32 s7, s3 2882; GFX6-NEXT: s_waitcnt vmcnt(0) 2883; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 2884; GFX6-NEXT: s_endpgm 2885; 2886; GFX7-LABEL: global_workgroup_one_as_seq_cst_load: 2887; GFX7: ; %bb.0: ; %entry 2888; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2889; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2890; GFX7-NEXT: v_mov_b32_e32 v0, s0 2891; GFX7-NEXT: v_mov_b32_e32 v1, s1 2892; GFX7-NEXT: flat_load_dword v0, v[0:1] 2893; GFX7-NEXT: v_mov_b32_e32 v2, s2 2894; GFX7-NEXT: v_mov_b32_e32 v3, s3 2895; GFX7-NEXT: s_waitcnt vmcnt(0) 2896; GFX7-NEXT: flat_store_dword v[2:3], v0 2897; GFX7-NEXT: s_endpgm 2898; 2899; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load: 2900; GFX10-WGP: ; %bb.0: ; %entry 2901; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2902; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 2903; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2904; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 2905; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc 
2906; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 2907; GFX10-WGP-NEXT: buffer_gl0_inv 2908; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] 2909; GFX10-WGP-NEXT: s_endpgm 2910; 2911; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_load: 2912; GFX10-CU: ; %bb.0: ; %entry 2913; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2914; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 2915; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2916; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] 2917; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 2918; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] 2919; GFX10-CU-NEXT: s_endpgm 2920; 2921; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_load: 2922; SKIP-CACHE-INV: ; %bb.0: ; %entry 2923; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 2924; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 2925; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 2926; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2927; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4 2928; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5 2929; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 2930; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6 2931; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7 2932; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2 2933; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 2934; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 2935; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 2936; SKIP-CACHE-INV-NEXT: s_endpgm 2937 i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 2938entry: 2939 %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4 2940 store i32 %val, i32 addrspace(1)* %out 2941 ret void 2942} 2943 2944define amdgpu_kernel void @global_workgroup_one_as_unordered_store( 2945; GFX6-LABEL: global_workgroup_one_as_unordered_store: 2946; GFX6: ; %bb.0: ; %entry 2947; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 2948; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2949; GFX6-NEXT: s_mov_b32 s3, 0xf000 2950; GFX6-NEXT: s_mov_b32 s2, -1 2951; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2952; GFX6-NEXT: 
v_mov_b32_e32 v0, s4 2953; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 2954; GFX6-NEXT: s_endpgm 2955; 2956; GFX7-LABEL: global_workgroup_one_as_unordered_store: 2957; GFX7: ; %bb.0: ; %entry 2958; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 2959; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 2960; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2961; GFX7-NEXT: v_mov_b32_e32 v2, s2 2962; GFX7-NEXT: v_mov_b32_e32 v0, s0 2963; GFX7-NEXT: v_mov_b32_e32 v1, s1 2964; GFX7-NEXT: flat_store_dword v[0:1], v2 2965; GFX7-NEXT: s_endpgm 2966; 2967; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_store: 2968; GFX10-WGP: ; %bb.0: ; %entry 2969; GFX10-WGP-NEXT: s_clause 0x1 2970; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 2971; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2972; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 2973; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 2974; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 2975; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 2976; GFX10-WGP-NEXT: s_endpgm 2977; 2978; GFX10-CU-LABEL: global_workgroup_one_as_unordered_store: 2979; GFX10-CU: ; %bb.0: ; %entry 2980; GFX10-CU-NEXT: s_clause 0x1 2981; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 2982; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2983; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 2984; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 2985; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 2986; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 2987; GFX10-CU-NEXT: s_endpgm 2988; 2989; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_unordered_store: 2990; SKIP-CACHE-INV: ; %bb.0: ; %entry 2991; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 2992; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 2993; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 2994; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 2995; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 2996; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 2997; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 2998; SKIP-CACHE-INV-NEXT: s_endpgm 2999 i32 %in, i32 addrspace(1)* %out) { 
3000entry: 3001 store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4 3002 ret void 3003} 3004 3005define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( 3006; GFX6-LABEL: global_workgroup_one_as_monotonic_store: 3007; GFX6: ; %bb.0: ; %entry 3008; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 3009; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3010; GFX6-NEXT: s_mov_b32 s3, 0xf000 3011; GFX6-NEXT: s_mov_b32 s2, -1 3012; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3013; GFX6-NEXT: v_mov_b32_e32 v0, s4 3014; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 3015; GFX6-NEXT: s_endpgm 3016; 3017; GFX7-LABEL: global_workgroup_one_as_monotonic_store: 3018; GFX7: ; %bb.0: ; %entry 3019; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 3020; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 3021; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3022; GFX7-NEXT: v_mov_b32_e32 v2, s2 3023; GFX7-NEXT: v_mov_b32_e32 v0, s0 3024; GFX7-NEXT: v_mov_b32_e32 v1, s1 3025; GFX7-NEXT: flat_store_dword v[0:1], v2 3026; GFX7-NEXT: s_endpgm 3027; 3028; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_store: 3029; GFX10-WGP: ; %bb.0: ; %entry 3030; GFX10-WGP-NEXT: s_clause 0x1 3031; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 3032; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3033; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3034; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3035; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3036; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 3037; GFX10-WGP-NEXT: s_endpgm 3038; 3039; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_store: 3040; GFX10-CU: ; %bb.0: ; %entry 3041; GFX10-CU-NEXT: s_clause 0x1 3042; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 3043; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3044; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3045; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3046; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3047; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 3048; GFX10-CU-NEXT: s_endpgm 3049; 3050; SKIP-CACHE-INV-LABEL: 
global_workgroup_one_as_monotonic_store: 3051; SKIP-CACHE-INV: ; %bb.0: ; %entry 3052; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 3053; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3054; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 3055; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 3056; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3057; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3058; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 3059; SKIP-CACHE-INV-NEXT: s_endpgm 3060 i32 %in, i32 addrspace(1)* %out) { 3061entry: 3062 store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4 3063 ret void 3064} 3065 3066define amdgpu_kernel void @global_workgroup_one_as_release_store( 3067; GFX6-LABEL: global_workgroup_one_as_release_store: 3068; GFX6: ; %bb.0: ; %entry 3069; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 3070; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3071; GFX6-NEXT: s_mov_b32 s3, 0xf000 3072; GFX6-NEXT: s_mov_b32 s2, -1 3073; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3074; GFX6-NEXT: v_mov_b32_e32 v0, s4 3075; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 3076; GFX6-NEXT: s_endpgm 3077; 3078; GFX7-LABEL: global_workgroup_one_as_release_store: 3079; GFX7: ; %bb.0: ; %entry 3080; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 3081; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 3082; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3083; GFX7-NEXT: v_mov_b32_e32 v2, s2 3084; GFX7-NEXT: v_mov_b32_e32 v0, s0 3085; GFX7-NEXT: v_mov_b32_e32 v1, s1 3086; GFX7-NEXT: flat_store_dword v[0:1], v2 3087; GFX7-NEXT: s_endpgm 3088; 3089; GFX10-WGP-LABEL: global_workgroup_one_as_release_store: 3090; GFX10-WGP: ; %bb.0: ; %entry 3091; GFX10-WGP-NEXT: s_clause 0x1 3092; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 3093; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3094; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3095; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3096; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3097; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3098; GFX10-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 3099; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 3100; GFX10-WGP-NEXT: s_endpgm 3101; 3102; GFX10-CU-LABEL: global_workgroup_one_as_release_store: 3103; GFX10-CU: ; %bb.0: ; %entry 3104; GFX10-CU-NEXT: s_clause 0x1 3105; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 3106; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3107; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3108; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3109; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3110; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 3111; GFX10-CU-NEXT: s_endpgm 3112; 3113; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_store: 3114; SKIP-CACHE-INV: ; %bb.0: ; %entry 3115; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 3116; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3117; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 3118; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 3119; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3120; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3121; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 3122; SKIP-CACHE-INV-NEXT: s_endpgm 3123 i32 %in, i32 addrspace(1)* %out) { 3124entry: 3125 store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4 3126 ret void 3127} 3128 3129define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( 3130; GFX6-LABEL: global_workgroup_one_as_seq_cst_store: 3131; GFX6: ; %bb.0: ; %entry 3132; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 3133; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3134; GFX6-NEXT: s_mov_b32 s3, 0xf000 3135; GFX6-NEXT: s_mov_b32 s2, -1 3136; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3137; GFX6-NEXT: v_mov_b32_e32 v0, s4 3138; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 3139; GFX6-NEXT: s_endpgm 3140; 3141; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: 3142; GFX7: ; %bb.0: ; %entry 3143; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 3144; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 3145; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3146; GFX7-NEXT: 
v_mov_b32_e32 v2, s2 3147; GFX7-NEXT: v_mov_b32_e32 v0, s0 3148; GFX7-NEXT: v_mov_b32_e32 v1, s1 3149; GFX7-NEXT: flat_store_dword v[0:1], v2 3150; GFX7-NEXT: s_endpgm 3151; 3152; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_store: 3153; GFX10-WGP: ; %bb.0: ; %entry 3154; GFX10-WGP-NEXT: s_clause 0x1 3155; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 3156; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3157; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3158; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3159; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3160; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3161; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3162; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 3163; GFX10-WGP-NEXT: s_endpgm 3164; 3165; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_store: 3166; GFX10-CU: ; %bb.0: ; %entry 3167; GFX10-CU-NEXT: s_clause 0x1 3168; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 3169; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3170; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3171; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3172; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3173; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 3174; GFX10-CU-NEXT: s_endpgm 3175; 3176; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_store: 3177; SKIP-CACHE-INV: ; %bb.0: ; %entry 3178; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9 3179; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3180; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 3181; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 3182; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3183; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 3184; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 3185; SKIP-CACHE-INV-NEXT: s_endpgm 3186 i32 %in, i32 addrspace(1)* %out) { 3187entry: 3188 store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4 3189 ret void 3190} 3191 3192define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( 3193; GFX6-LABEL: 
global_workgroup_one_as_monotonic_atomicrmw: 3194; GFX6: ; %bb.0: ; %entry 3195; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3196; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3197; GFX6-NEXT: s_mov_b32 s7, 0xf000 3198; GFX6-NEXT: s_mov_b32 s6, -1 3199; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3200; GFX6-NEXT: v_mov_b32_e32 v0, s0 3201; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3202; GFX6-NEXT: s_endpgm 3203; 3204; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: 3205; GFX7: ; %bb.0: ; %entry 3206; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3207; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3208; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3209; GFX7-NEXT: v_mov_b32_e32 v0, s0 3210; GFX7-NEXT: v_mov_b32_e32 v1, s1 3211; GFX7-NEXT: v_mov_b32_e32 v2, s2 3212; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3213; GFX7-NEXT: s_endpgm 3214; 3215; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: 3216; GFX10-WGP: ; %bb.0: ; %entry 3217; GFX10-WGP-NEXT: s_clause 0x1 3218; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3219; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3220; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3221; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3222; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3223; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 3224; GFX10-WGP-NEXT: s_endpgm 3225; 3226; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: 3227; GFX10-CU: ; %bb.0: ; %entry 3228; GFX10-CU-NEXT: s_clause 0x1 3229; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3230; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3231; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3232; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3233; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3234; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 3235; GFX10-CU-NEXT: s_endpgm 3236; 3237; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_atomicrmw: 3238; SKIP-CACHE-INV: ; %bb.0: ; %entry 3239; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3240; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3241; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3242; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3243; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3244; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3245; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3246; SKIP-CACHE-INV-NEXT: s_endpgm 3247 i32 addrspace(1)* %out, i32 %in) { 3248entry: 3249 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic 3250 ret void 3251} 3252 3253define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( 3254; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw: 3255; GFX6: ; %bb.0: ; %entry 3256; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3257; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3258; GFX6-NEXT: s_mov_b32 s7, 0xf000 3259; GFX6-NEXT: s_mov_b32 s6, -1 3260; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3261; GFX6-NEXT: v_mov_b32_e32 v0, s0 3262; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3263; GFX6-NEXT: s_endpgm 3264; 3265; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: 3266; GFX7: ; %bb.0: ; %entry 3267; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3268; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3269; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3270; GFX7-NEXT: v_mov_b32_e32 v0, s0 3271; GFX7-NEXT: v_mov_b32_e32 v1, s1 3272; GFX7-NEXT: v_mov_b32_e32 v2, s2 3273; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3274; GFX7-NEXT: s_endpgm 3275; 3276; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: 3277; GFX10-WGP: ; %bb.0: ; %entry 3278; GFX10-WGP-NEXT: s_clause 0x1 3279; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3280; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3281; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3282; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3283; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3284; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 3285; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3286; GFX10-WGP-NEXT: buffer_gl0_inv 3287; GFX10-WGP-NEXT: s_endpgm 3288; 3289; GFX10-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: 
3290; GFX10-CU: ; %bb.0: ; %entry 3291; GFX10-CU-NEXT: s_clause 0x1 3292; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3293; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3294; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3295; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3296; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3297; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 3298; GFX10-CU-NEXT: s_endpgm 3299; 3300; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_atomicrmw: 3301; SKIP-CACHE-INV: ; %bb.0: ; %entry 3302; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3303; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3304; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3305; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3306; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3307; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3308; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3309; SKIP-CACHE-INV-NEXT: s_endpgm 3310 i32 addrspace(1)* %out, i32 %in) { 3311entry: 3312 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire 3313 ret void 3314} 3315 3316define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( 3317; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw: 3318; GFX6: ; %bb.0: ; %entry 3319; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3320; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3321; GFX6-NEXT: s_mov_b32 s7, 0xf000 3322; GFX6-NEXT: s_mov_b32 s6, -1 3323; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3324; GFX6-NEXT: v_mov_b32_e32 v0, s0 3325; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3326; GFX6-NEXT: s_endpgm 3327; 3328; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: 3329; GFX7: ; %bb.0: ; %entry 3330; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3331; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3332; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3333; GFX7-NEXT: v_mov_b32_e32 v0, s0 3334; GFX7-NEXT: v_mov_b32_e32 v1, s1 3335; GFX7-NEXT: v_mov_b32_e32 v2, s2 3336; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3337; GFX7-NEXT: s_endpgm 
3338; 3339; GFX10-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: 3340; GFX10-WGP: ; %bb.0: ; %entry 3341; GFX10-WGP-NEXT: s_clause 0x1 3342; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3343; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3344; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3345; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3346; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3347; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3348; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3349; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 3350; GFX10-WGP-NEXT: s_endpgm 3351; 3352; GFX10-CU-LABEL: global_workgroup_one_as_release_atomicrmw: 3353; GFX10-CU: ; %bb.0: ; %entry 3354; GFX10-CU-NEXT: s_clause 0x1 3355; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3356; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3357; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3358; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3359; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3360; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 3361; GFX10-CU-NEXT: s_endpgm 3362; 3363; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_atomicrmw: 3364; SKIP-CACHE-INV: ; %bb.0: ; %entry 3365; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3366; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3367; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3368; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3369; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3370; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3371; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3372; SKIP-CACHE-INV-NEXT: s_endpgm 3373 i32 addrspace(1)* %out, i32 %in) { 3374entry: 3375 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release 3376 ret void 3377} 3378 3379define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( 3380; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: 3381; GFX6: ; %bb.0: ; %entry 3382; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3383; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3384; GFX6-NEXT: 
s_mov_b32 s7, 0xf000 3385; GFX6-NEXT: s_mov_b32 s6, -1 3386; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3387; GFX6-NEXT: v_mov_b32_e32 v0, s0 3388; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3389; GFX6-NEXT: s_endpgm 3390; 3391; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: 3392; GFX7: ; %bb.0: ; %entry 3393; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3394; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3395; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3396; GFX7-NEXT: v_mov_b32_e32 v0, s0 3397; GFX7-NEXT: v_mov_b32_e32 v1, s1 3398; GFX7-NEXT: v_mov_b32_e32 v2, s2 3399; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3400; GFX7-NEXT: s_endpgm 3401; 3402; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: 3403; GFX10-WGP: ; %bb.0: ; %entry 3404; GFX10-WGP-NEXT: s_clause 0x1 3405; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3406; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3407; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3408; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3409; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3410; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3411; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3412; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 3413; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3414; GFX10-WGP-NEXT: buffer_gl0_inv 3415; GFX10-WGP-NEXT: s_endpgm 3416; 3417; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: 3418; GFX10-CU: ; %bb.0: ; %entry 3419; GFX10-CU-NEXT: s_clause 0x1 3420; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3421; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3422; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3423; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3424; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3425; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 3426; GFX10-CU-NEXT: s_endpgm 3427; 3428; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: 3429; SKIP-CACHE-INV: ; %bb.0: ; %entry 3430; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3431; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3432; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 
0xf000 3433; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3434; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3435; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3436; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3437; SKIP-CACHE-INV-NEXT: s_endpgm 3438 i32 addrspace(1)* %out, i32 %in) { 3439entry: 3440 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel 3441 ret void 3442} 3443 3444define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( 3445; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: 3446; GFX6: ; %bb.0: ; %entry 3447; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3448; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3449; GFX6-NEXT: s_mov_b32 s7, 0xf000 3450; GFX6-NEXT: s_mov_b32 s6, -1 3451; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3452; GFX6-NEXT: v_mov_b32_e32 v0, s0 3453; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3454; GFX6-NEXT: s_endpgm 3455; 3456; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: 3457; GFX7: ; %bb.0: ; %entry 3458; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3459; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3460; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3461; GFX7-NEXT: v_mov_b32_e32 v0, s0 3462; GFX7-NEXT: v_mov_b32_e32 v1, s1 3463; GFX7-NEXT: v_mov_b32_e32 v2, s2 3464; GFX7-NEXT: flat_atomic_swap v[0:1], v2 3465; GFX7-NEXT: s_endpgm 3466; 3467; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: 3468; GFX10-WGP: ; %bb.0: ; %entry 3469; GFX10-WGP-NEXT: s_clause 0x1 3470; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3471; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3472; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3473; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3474; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3475; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3476; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3477; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[0:1] 3478; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3479; GFX10-WGP-NEXT: buffer_gl0_inv 3480; GFX10-WGP-NEXT: s_endpgm 3481; 3482; 
GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: 3483; GFX10-CU: ; %bb.0: ; %entry 3484; GFX10-CU-NEXT: s_clause 0x1 3485; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3486; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3487; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3488; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3489; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3490; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[0:1] 3491; GFX10-CU-NEXT: s_endpgm 3492; 3493; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: 3494; SKIP-CACHE-INV: ; %bb.0: ; %entry 3495; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3496; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3497; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3498; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3499; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3500; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3501; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 3502; SKIP-CACHE-INV-NEXT: s_endpgm 3503 i32 addrspace(1)* %out, i32 %in) { 3504entry: 3505 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst 3506 ret void 3507} 3508 3509define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( 3510; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: 3511; GFX6: ; %bb.0: ; %entry 3512; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3513; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3514; GFX6-NEXT: s_mov_b32 s7, 0xf000 3515; GFX6-NEXT: s_mov_b32 s6, -1 3516; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3517; GFX6-NEXT: v_mov_b32_e32 v0, s0 3518; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 3519; GFX6-NEXT: s_waitcnt vmcnt(0) 3520; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 3521; GFX6-NEXT: s_endpgm 3522; 3523; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: 3524; GFX7: ; %bb.0: ; %entry 3525; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3526; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3527; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3528; GFX7-NEXT: 
v_mov_b32_e32 v0, s0 3529; GFX7-NEXT: v_mov_b32_e32 v1, s1 3530; GFX7-NEXT: v_mov_b32_e32 v2, s2 3531; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3532; GFX7-NEXT: s_waitcnt vmcnt(0) 3533; GFX7-NEXT: flat_store_dword v[0:1], v2 3534; GFX7-NEXT: s_endpgm 3535; 3536; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: 3537; GFX10-WGP: ; %bb.0: ; %entry 3538; GFX10-WGP-NEXT: s_clause 0x1 3539; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3540; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3541; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3542; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3543; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3544; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 3545; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3546; GFX10-WGP-NEXT: buffer_gl0_inv 3547; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 3548; GFX10-WGP-NEXT: s_endpgm 3549; 3550; GFX10-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: 3551; GFX10-CU: ; %bb.0: ; %entry 3552; GFX10-CU-NEXT: s_clause 0x1 3553; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3554; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3555; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3556; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3557; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3558; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 3559; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3560; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 3561; GFX10-CU-NEXT: s_endpgm 3562; 3563; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: 3564; SKIP-CACHE-INV: ; %bb.0: ; %entry 3565; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3566; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3567; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3568; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3569; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3570; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3571; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 3572; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3573; SKIP-CACHE-INV-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 3574; SKIP-CACHE-INV-NEXT: s_endpgm 3575 i32 addrspace(1)* %out, i32 %in) { 3576entry: 3577 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire 3578 store i32 %val, i32 addrspace(1)* %out, align 4 3579 ret void 3580} 3581 3582define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( 3583; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: 3584; GFX6: ; %bb.0: ; %entry 3585; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3586; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3587; GFX6-NEXT: s_mov_b32 s7, 0xf000 3588; GFX6-NEXT: s_mov_b32 s6, -1 3589; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3590; GFX6-NEXT: v_mov_b32_e32 v0, s0 3591; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 3592; GFX6-NEXT: s_waitcnt vmcnt(0) 3593; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 3594; GFX6-NEXT: s_endpgm 3595; 3596; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: 3597; GFX7: ; %bb.0: ; %entry 3598; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3599; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3600; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3601; GFX7-NEXT: v_mov_b32_e32 v0, s0 3602; GFX7-NEXT: v_mov_b32_e32 v1, s1 3603; GFX7-NEXT: v_mov_b32_e32 v2, s2 3604; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3605; GFX7-NEXT: s_waitcnt vmcnt(0) 3606; GFX7-NEXT: flat_store_dword v[0:1], v2 3607; GFX7-NEXT: s_endpgm 3608; 3609; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: 3610; GFX10-WGP: ; %bb.0: ; %entry 3611; GFX10-WGP-NEXT: s_clause 0x1 3612; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3613; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3614; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3615; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3616; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3617; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3618; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3619; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 3620; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3621; 
GFX10-WGP-NEXT: buffer_gl0_inv 3622; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 3623; GFX10-WGP-NEXT: s_endpgm 3624; 3625; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: 3626; GFX10-CU: ; %bb.0: ; %entry 3627; GFX10-CU-NEXT: s_clause 0x1 3628; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3629; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3630; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3631; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3632; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3633; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 3634; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3635; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 3636; GFX10-CU-NEXT: s_endpgm 3637; 3638; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: 3639; SKIP-CACHE-INV: ; %bb.0: ; %entry 3640; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3641; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3642; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3643; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3644; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3645; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3646; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 3647; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3648; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 3649; SKIP-CACHE-INV-NEXT: s_endpgm 3650 i32 addrspace(1)* %out, i32 %in) { 3651entry: 3652 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel 3653 store i32 %val, i32 addrspace(1)* %out, align 4 3654 ret void 3655} 3656 3657define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( 3658; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: 3659; GFX6: ; %bb.0: ; %entry 3660; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3661; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3662; GFX6-NEXT: s_mov_b32 s7, 0xf000 3663; GFX6-NEXT: s_mov_b32 s6, -1 3664; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3665; GFX6-NEXT: v_mov_b32_e32 v0, s0 3666; GFX6-NEXT: 
buffer_atomic_swap v0, off, s[4:7], 0 glc 3667; GFX6-NEXT: s_waitcnt vmcnt(0) 3668; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 3669; GFX6-NEXT: s_endpgm 3670; 3671; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: 3672; GFX7: ; %bb.0: ; %entry 3673; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3674; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 3675; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3676; GFX7-NEXT: v_mov_b32_e32 v0, s0 3677; GFX7-NEXT: v_mov_b32_e32 v1, s1 3678; GFX7-NEXT: v_mov_b32_e32 v2, s2 3679; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc 3680; GFX7-NEXT: s_waitcnt vmcnt(0) 3681; GFX7-NEXT: flat_store_dword v[0:1], v2 3682; GFX7-NEXT: s_endpgm 3683; 3684; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: 3685; GFX10-WGP: ; %bb.0: ; %entry 3686; GFX10-WGP-NEXT: s_clause 0x1 3687; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 3688; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3689; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 3690; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3691; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 3692; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3693; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3694; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 3695; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3696; GFX10-WGP-NEXT: buffer_gl0_inv 3697; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1] 3698; GFX10-WGP-NEXT: s_endpgm 3699; 3700; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: 3701; GFX10-CU: ; %bb.0: ; %entry 3702; GFX10-CU-NEXT: s_clause 0x1 3703; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 3704; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3705; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 3706; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3707; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 3708; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc 3709; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 3710; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1] 3711; GFX10-CU-NEXT: s_endpgm 3712; 3713; SKIP-CACHE-INV-LABEL: 
global_workgroup_one_as_seq_cst_ret_atomicrmw: 3714; SKIP-CACHE-INV: ; %bb.0: ; %entry 3715; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3716; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb 3717; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3718; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3719; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3720; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3721; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc 3722; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 3723; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 3724; SKIP-CACHE-INV-NEXT: s_endpgm 3725 i32 addrspace(1)* %out, i32 %in) { 3726entry: 3727 %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst 3728 store i32 %val, i32 addrspace(1)* %out, align 4 3729 ret void 3730} 3731 3732define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( 3733; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: 3734; GFX6: ; %bb.0: ; %entry 3735; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3736; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3737; GFX6-NEXT: s_mov_b32 s7, 0xf000 3738; GFX6-NEXT: s_mov_b32 s6, -1 3739; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3740; GFX6-NEXT: v_mov_b32_e32 v0, s0 3741; GFX6-NEXT: v_mov_b32_e32 v1, s1 3742; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 3743; GFX6-NEXT: s_endpgm 3744; 3745; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: 3746; GFX7: ; %bb.0: ; %entry 3747; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3748; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3749; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3750; GFX7-NEXT: s_add_u32 s0, s0, 16 3751; GFX7-NEXT: s_addc_u32 s1, s1, 0 3752; GFX7-NEXT: v_mov_b32_e32 v0, s0 3753; GFX7-NEXT: v_mov_b32_e32 v2, s2 3754; GFX7-NEXT: v_mov_b32_e32 v1, s1 3755; GFX7-NEXT: v_mov_b32_e32 v3, s3 3756; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3757; GFX7-NEXT: s_endpgm 3758; 3759; 
GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: 3760; GFX10-WGP: ; %bb.0: ; %entry 3761; GFX10-WGP-NEXT: s_clause 0x1 3762; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3763; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 3764; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 3765; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3766; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3767; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3768; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 3769; GFX10-WGP-NEXT: s_endpgm 3770; 3771; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: 3772; GFX10-CU: ; %bb.0: ; %entry 3773; GFX10-CU-NEXT: s_clause 0x1 3774; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3775; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 3776; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 3777; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3778; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3779; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3780; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 3781; GFX10-CU-NEXT: s_endpgm 3782; 3783; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: 3784; SKIP-CACHE-INV: ; %bb.0: ; %entry 3785; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3786; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3787; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3788; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3789; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3790; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3791; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3792; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 3793; SKIP-CACHE-INV-NEXT: s_endpgm 3794 i32 addrspace(1)* %out, i32 %in, i32 %old) { 3795entry: 3796 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 3797 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic 3798 ret void 3799} 3800 3801define amdgpu_kernel void 
@global_workgroup_one_as_acquire_monotonic_cmpxchg( 3802; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: 3803; GFX6: ; %bb.0: ; %entry 3804; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3805; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3806; GFX6-NEXT: s_mov_b32 s7, 0xf000 3807; GFX6-NEXT: s_mov_b32 s6, -1 3808; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3809; GFX6-NEXT: v_mov_b32_e32 v0, s0 3810; GFX6-NEXT: v_mov_b32_e32 v1, s1 3811; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 3812; GFX6-NEXT: s_endpgm 3813; 3814; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: 3815; GFX7: ; %bb.0: ; %entry 3816; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3817; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3818; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3819; GFX7-NEXT: s_add_u32 s0, s0, 16 3820; GFX7-NEXT: s_addc_u32 s1, s1, 0 3821; GFX7-NEXT: v_mov_b32_e32 v0, s0 3822; GFX7-NEXT: v_mov_b32_e32 v2, s2 3823; GFX7-NEXT: v_mov_b32_e32 v1, s1 3824; GFX7-NEXT: v_mov_b32_e32 v3, s3 3825; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3826; GFX7-NEXT: s_endpgm 3827; 3828; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: 3829; GFX10-WGP: ; %bb.0: ; %entry 3830; GFX10-WGP-NEXT: s_clause 0x1 3831; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3832; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 3833; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 3834; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3835; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3836; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3837; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 3838; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3839; GFX10-WGP-NEXT: buffer_gl0_inv 3840; GFX10-WGP-NEXT: s_endpgm 3841; 3842; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: 3843; GFX10-CU: ; %bb.0: ; %entry 3844; GFX10-CU-NEXT: s_clause 0x1 3845; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3846; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 3847; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, 0 3848; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3849; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3850; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3851; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 3852; GFX10-CU-NEXT: s_endpgm 3853; 3854; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: 3855; SKIP-CACHE-INV: ; %bb.0: ; %entry 3856; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3857; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3858; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3859; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3860; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3861; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3862; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3863; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 3864; SKIP-CACHE-INV-NEXT: s_endpgm 3865 i32 addrspace(1)* %out, i32 %in, i32 %old) { 3866entry: 3867 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 3868 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic 3869 ret void 3870} 3871 3872define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( 3873; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: 3874; GFX6: ; %bb.0: ; %entry 3875; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3876; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3877; GFX6-NEXT: s_mov_b32 s7, 0xf000 3878; GFX6-NEXT: s_mov_b32 s6, -1 3879; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3880; GFX6-NEXT: v_mov_b32_e32 v0, s0 3881; GFX6-NEXT: v_mov_b32_e32 v1, s1 3882; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 3883; GFX6-NEXT: s_endpgm 3884; 3885; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: 3886; GFX7: ; %bb.0: ; %entry 3887; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3888; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3889; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3890; GFX7-NEXT: s_add_u32 s0, s0, 16 3891; GFX7-NEXT: s_addc_u32 
s1, s1, 0 3892; GFX7-NEXT: v_mov_b32_e32 v0, s0 3893; GFX7-NEXT: v_mov_b32_e32 v2, s2 3894; GFX7-NEXT: v_mov_b32_e32 v1, s1 3895; GFX7-NEXT: v_mov_b32_e32 v3, s3 3896; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3897; GFX7-NEXT: s_endpgm 3898; 3899; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: 3900; GFX10-WGP: ; %bb.0: ; %entry 3901; GFX10-WGP-NEXT: s_clause 0x1 3902; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3903; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 3904; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 3905; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3906; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3907; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3908; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3909; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3910; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 3911; GFX10-WGP-NEXT: s_endpgm 3912; 3913; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: 3914; GFX10-CU: ; %bb.0: ; %entry 3915; GFX10-CU-NEXT: s_clause 0x1 3916; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3917; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 3918; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 3919; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3920; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3921; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3922; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 3923; GFX10-CU-NEXT: s_endpgm 3924; 3925; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: 3926; SKIP-CACHE-INV: ; %bb.0: ; %entry 3927; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3928; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3929; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 3930; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 3931; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 3932; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 3933; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 3934; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 3935; SKIP-CACHE-INV-NEXT: s_endpgm 
3936 i32 addrspace(1)* %out, i32 %in, i32 %old) { 3937entry: 3938 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 3939 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic 3940 ret void 3941} 3942 3943define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( 3944; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: 3945; GFX6: ; %bb.0: ; %entry 3946; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3947; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3948; GFX6-NEXT: s_mov_b32 s7, 0xf000 3949; GFX6-NEXT: s_mov_b32 s6, -1 3950; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3951; GFX6-NEXT: v_mov_b32_e32 v0, s0 3952; GFX6-NEXT: v_mov_b32_e32 v1, s1 3953; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 3954; GFX6-NEXT: s_endpgm 3955; 3956; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: 3957; GFX7: ; %bb.0: ; %entry 3958; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3959; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 3960; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3961; GFX7-NEXT: s_add_u32 s0, s0, 16 3962; GFX7-NEXT: s_addc_u32 s1, s1, 0 3963; GFX7-NEXT: v_mov_b32_e32 v0, s0 3964; GFX7-NEXT: v_mov_b32_e32 v2, s2 3965; GFX7-NEXT: v_mov_b32_e32 v1, s1 3966; GFX7-NEXT: v_mov_b32_e32 v3, s3 3967; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 3968; GFX7-NEXT: s_endpgm 3969; 3970; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: 3971; GFX10-WGP: ; %bb.0: ; %entry 3972; GFX10-WGP-NEXT: s_clause 0x1 3973; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3974; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 3975; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 3976; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 3977; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 3978; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 3979; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 3980; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3981; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 3982; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 3983; GFX10-WGP-NEXT: buffer_gl0_inv 3984; GFX10-WGP-NEXT: s_endpgm 3985; 3986; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: 3987; GFX10-CU: ; %bb.0: ; %entry 3988; GFX10-CU-NEXT: s_clause 0x1 3989; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 3990; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 3991; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 3992; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 3993; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 3994; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 3995; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 3996; GFX10-CU-NEXT: s_endpgm 3997; 3998; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: 3999; SKIP-CACHE-INV: ; %bb.0: ; %entry 4000; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4001; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4002; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 4003; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 4004; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4005; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4006; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4007; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4008; SKIP-CACHE-INV-NEXT: s_endpgm 4009 i32 addrspace(1)* %out, i32 %in, i32 %old) { 4010entry: 4011 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 4012 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic 4013 ret void 4014} 4015 4016define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( 4017; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4018; GFX6: ; %bb.0: ; %entry 4019; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4020; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4021; GFX6-NEXT: s_mov_b32 s7, 0xf000 4022; GFX6-NEXT: s_mov_b32 s6, -1 4023; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4024; GFX6-NEXT: v_mov_b32_e32 v0, s0 4025; GFX6-NEXT: v_mov_b32_e32 v1, s1 4026; GFX6-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4027; GFX6-NEXT: s_endpgm 4028; 4029; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4030; GFX7: ; %bb.0: ; %entry 4031; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4032; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4033; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4034; GFX7-NEXT: s_add_u32 s0, s0, 16 4035; GFX7-NEXT: s_addc_u32 s1, s1, 0 4036; GFX7-NEXT: v_mov_b32_e32 v0, s0 4037; GFX7-NEXT: v_mov_b32_e32 v2, s2 4038; GFX7-NEXT: v_mov_b32_e32 v1, s1 4039; GFX7-NEXT: v_mov_b32_e32 v3, s3 4040; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4041; GFX7-NEXT: s_endpgm 4042; 4043; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4044; GFX10-WGP: ; %bb.0: ; %entry 4045; GFX10-WGP-NEXT: s_clause 0x1 4046; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4047; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4048; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 4049; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4050; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4051; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4052; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4053; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4054; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4055; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4056; GFX10-WGP-NEXT: buffer_gl0_inv 4057; GFX10-WGP-NEXT: s_endpgm 4058; 4059; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4060; GFX10-CU: ; %bb.0: ; %entry 4061; GFX10-CU-NEXT: s_clause 0x1 4062; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4063; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4064; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 4065; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4066; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4067; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4068; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4069; GFX10-CU-NEXT: s_endpgm 4070; 4071; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: 4072; SKIP-CACHE-INV: ; %bb.0: ; %entry 
4073; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4074; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4075; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 4076; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 4077; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4078; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4079; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4080; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4081; SKIP-CACHE-INV-NEXT: s_endpgm 4082 i32 addrspace(1)* %out, i32 %in, i32 %old) { 4083entry: 4084 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 4085 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic 4086 ret void 4087} 4088 4089define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( 4090; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: 4091; GFX6: ; %bb.0: ; %entry 4092; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4093; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4094; GFX6-NEXT: s_mov_b32 s7, 0xf000 4095; GFX6-NEXT: s_mov_b32 s6, -1 4096; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4097; GFX6-NEXT: v_mov_b32_e32 v0, s0 4098; GFX6-NEXT: v_mov_b32_e32 v1, s1 4099; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4100; GFX6-NEXT: s_endpgm 4101; 4102; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: 4103; GFX7: ; %bb.0: ; %entry 4104; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4105; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4106; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4107; GFX7-NEXT: s_add_u32 s0, s0, 16 4108; GFX7-NEXT: s_addc_u32 s1, s1, 0 4109; GFX7-NEXT: v_mov_b32_e32 v0, s0 4110; GFX7-NEXT: v_mov_b32_e32 v2, s2 4111; GFX7-NEXT: v_mov_b32_e32 v1, s1 4112; GFX7-NEXT: v_mov_b32_e32 v3, s3 4113; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4114; GFX7-NEXT: s_endpgm 4115; 4116; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: 4117; GFX10-WGP: ; %bb.0: ; %entry 4118; GFX10-WGP-NEXT: 
s_clause 0x1 4119; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4120; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4121; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 4122; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4123; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4124; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4125; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4126; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4127; GFX10-WGP-NEXT: buffer_gl0_inv 4128; GFX10-WGP-NEXT: s_endpgm 4129; 4130; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: 4131; GFX10-CU: ; %bb.0: ; %entry 4132; GFX10-CU-NEXT: s_clause 0x1 4133; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4134; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4135; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 4136; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4137; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4138; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4139; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4140; GFX10-CU-NEXT: s_endpgm 4141; 4142; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: 4143; SKIP-CACHE-INV: ; %bb.0: ; %entry 4144; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4145; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4146; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 4147; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 4148; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4149; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4150; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4151; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4152; SKIP-CACHE-INV-NEXT: s_endpgm 4153 i32 addrspace(1)* %out, i32 %in, i32 %old) { 4154entry: 4155 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 4156 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire 4157 ret void 4158} 4159 4160define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( 4161; GFX6-LABEL: 
global_workgroup_one_as_release_acquire_cmpxchg: 4162; GFX6: ; %bb.0: ; %entry 4163; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4164; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4165; GFX6-NEXT: s_mov_b32 s7, 0xf000 4166; GFX6-NEXT: s_mov_b32 s6, -1 4167; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4168; GFX6-NEXT: v_mov_b32_e32 v0, s0 4169; GFX6-NEXT: v_mov_b32_e32 v1, s1 4170; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4171; GFX6-NEXT: s_endpgm 4172; 4173; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: 4174; GFX7: ; %bb.0: ; %entry 4175; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4176; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4177; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4178; GFX7-NEXT: s_add_u32 s0, s0, 16 4179; GFX7-NEXT: s_addc_u32 s1, s1, 0 4180; GFX7-NEXT: v_mov_b32_e32 v0, s0 4181; GFX7-NEXT: v_mov_b32_e32 v2, s2 4182; GFX7-NEXT: v_mov_b32_e32 v1, s1 4183; GFX7-NEXT: v_mov_b32_e32 v3, s3 4184; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4185; GFX7-NEXT: s_endpgm 4186; 4187; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: 4188; GFX10-WGP: ; %bb.0: ; %entry 4189; GFX10-WGP-NEXT: s_clause 0x1 4190; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4191; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4192; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 4193; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4194; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4195; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4196; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4197; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4198; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4199; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4200; GFX10-WGP-NEXT: buffer_gl0_inv 4201; GFX10-WGP-NEXT: s_endpgm 4202; 4203; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: 4204; GFX10-CU: ; %bb.0: ; %entry 4205; GFX10-CU-NEXT: s_clause 0x1 4206; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4207; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4208; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 4209; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4210; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4211; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4212; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4213; GFX10-CU-NEXT: s_endpgm 4214; 4215; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: 4216; SKIP-CACHE-INV: ; %bb.0: ; %entry 4217; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4218; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4219; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 4220; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 4221; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4222; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4223; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4224; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4225; SKIP-CACHE-INV-NEXT: s_endpgm 4226 i32 addrspace(1)* %out, i32 %in, i32 %old) { 4227entry: 4228 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 4229 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire 4230 ret void 4231} 4232 4233define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( 4234; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: 4235; GFX6: ; %bb.0: ; %entry 4236; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4237; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4238; GFX6-NEXT: s_mov_b32 s7, 0xf000 4239; GFX6-NEXT: s_mov_b32 s6, -1 4240; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4241; GFX6-NEXT: v_mov_b32_e32 v0, s0 4242; GFX6-NEXT: v_mov_b32_e32 v1, s1 4243; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4244; GFX6-NEXT: s_endpgm 4245; 4246; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: 4247; GFX7: ; %bb.0: ; %entry 4248; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4249; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4250; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4251; GFX7-NEXT: s_add_u32 s0, s0, 16 4252; GFX7-NEXT: 
s_addc_u32 s1, s1, 0 4253; GFX7-NEXT: v_mov_b32_e32 v0, s0 4254; GFX7-NEXT: v_mov_b32_e32 v2, s2 4255; GFX7-NEXT: v_mov_b32_e32 v1, s1 4256; GFX7-NEXT: v_mov_b32_e32 v3, s3 4257; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4258; GFX7-NEXT: s_endpgm 4259; 4260; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: 4261; GFX10-WGP: ; %bb.0: ; %entry 4262; GFX10-WGP-NEXT: s_clause 0x1 4263; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4264; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4265; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 4266; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4267; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4268; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4269; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4270; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4271; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4272; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4273; GFX10-WGP-NEXT: buffer_gl0_inv 4274; GFX10-WGP-NEXT: s_endpgm 4275; 4276; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: 4277; GFX10-CU: ; %bb.0: ; %entry 4278; GFX10-CU-NEXT: s_clause 0x1 4279; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4280; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4281; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 4282; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4283; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4284; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4285; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4286; GFX10-CU-NEXT: s_endpgm 4287; 4288; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: 4289; SKIP-CACHE-INV: ; %bb.0: ; %entry 4290; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4291; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4292; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 4293; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 4294; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4295; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4296; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4297; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4298; SKIP-CACHE-INV-NEXT: s_endpgm 4299 i32 addrspace(1)* %out, i32 %in, i32 %old) { 4300entry: 4301 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 4302 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire 4303 ret void 4304} 4305 4306define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( 4307; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: 4308; GFX6: ; %bb.0: ; %entry 4309; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4310; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4311; GFX6-NEXT: s_mov_b32 s7, 0xf000 4312; GFX6-NEXT: s_mov_b32 s6, -1 4313; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4314; GFX6-NEXT: v_mov_b32_e32 v0, s0 4315; GFX6-NEXT: v_mov_b32_e32 v1, s1 4316; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4317; GFX6-NEXT: s_endpgm 4318; 4319; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: 4320; GFX7: ; %bb.0: ; %entry 4321; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4322; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4323; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4324; GFX7-NEXT: s_add_u32 s0, s0, 16 4325; GFX7-NEXT: s_addc_u32 s1, s1, 0 4326; GFX7-NEXT: v_mov_b32_e32 v0, s0 4327; GFX7-NEXT: v_mov_b32_e32 v2, s2 4328; GFX7-NEXT: v_mov_b32_e32 v1, s1 4329; GFX7-NEXT: v_mov_b32_e32 v3, s3 4330; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4331; GFX7-NEXT: s_endpgm 4332; 4333; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: 4334; GFX10-WGP: ; %bb.0: ; %entry 4335; GFX10-WGP-NEXT: s_clause 0x1 4336; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4337; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4338; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 4339; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4340; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4341; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4342; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4343; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4344; 
GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4345; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4346; GFX10-WGP-NEXT: buffer_gl0_inv 4347; GFX10-WGP-NEXT: s_endpgm 4348; 4349; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: 4350; GFX10-CU: ; %bb.0: ; %entry 4351; GFX10-CU-NEXT: s_clause 0x1 4352; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4353; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4354; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 4355; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4356; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4357; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4358; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4359; GFX10-CU-NEXT: s_endpgm 4360; 4361; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: 4362; SKIP-CACHE-INV: ; %bb.0: ; %entry 4363; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4364; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4365; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 4366; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 4367; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4368; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4369; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4370; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4371; SKIP-CACHE-INV-NEXT: s_endpgm 4372 i32 addrspace(1)* %out, i32 %in, i32 %old) { 4373entry: 4374 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 4375 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire 4376 ret void 4377} 4378 4379define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( 4380; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 4381; GFX6: ; %bb.0: ; %entry 4382; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4383; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4384; GFX6-NEXT: s_mov_b32 s7, 0xf000 4385; GFX6-NEXT: s_mov_b32 s6, -1 4386; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4387; GFX6-NEXT: v_mov_b32_e32 v0, 
s0 4388; GFX6-NEXT: v_mov_b32_e32 v1, s1 4389; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4390; GFX6-NEXT: s_endpgm 4391; 4392; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 4393; GFX7: ; %bb.0: ; %entry 4394; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4395; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4396; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4397; GFX7-NEXT: s_add_u32 s0, s0, 16 4398; GFX7-NEXT: s_addc_u32 s1, s1, 0 4399; GFX7-NEXT: v_mov_b32_e32 v0, s0 4400; GFX7-NEXT: v_mov_b32_e32 v2, s2 4401; GFX7-NEXT: v_mov_b32_e32 v1, s1 4402; GFX7-NEXT: v_mov_b32_e32 v3, s3 4403; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 4404; GFX7-NEXT: s_endpgm 4405; 4406; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 4407; GFX10-WGP: ; %bb.0: ; %entry 4408; GFX10-WGP-NEXT: s_clause 0x1 4409; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4410; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4411; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 4412; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4413; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4414; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4415; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4416; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4417; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4418; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 4419; GFX10-WGP-NEXT: buffer_gl0_inv 4420; GFX10-WGP-NEXT: s_endpgm 4421; 4422; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 4423; GFX10-CU: ; %bb.0: ; %entry 4424; GFX10-CU-NEXT: s_clause 0x1 4425; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4426; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4427; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 4428; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4429; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4430; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4431; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 4432; GFX10-CU-NEXT: s_endpgm 4433; 4434; SKIP-CACHE-INV-LABEL: 
global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: 4435; SKIP-CACHE-INV: ; %bb.0: ; %entry 4436; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4437; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4438; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 4439; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 4440; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4441; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4442; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 4443; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 4444; SKIP-CACHE-INV-NEXT: s_endpgm 4445 i32 addrspace(1)* %out, i32 %in, i32 %old) { 4446entry: 4447 %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 4448 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst 4449 ret void 4450} 4451 4452define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg( 4453; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 4454; GFX6: ; %bb.0: ; %entry 4455; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4456; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4457; GFX6-NEXT: s_mov_b32 s7, 0xf000 4458; GFX6-NEXT: s_mov_b32 s6, -1 4459; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4460; GFX6-NEXT: v_mov_b32_e32 v0, s0 4461; GFX6-NEXT: v_mov_b32_e32 v1, s1 4462; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc 4463; GFX6-NEXT: s_waitcnt vmcnt(0) 4464; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4465; GFX6-NEXT: s_endpgm 4466; 4467; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 4468; GFX7: ; %bb.0: ; %entry 4469; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4470; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 4471; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4472; GFX7-NEXT: s_add_u32 s4, s0, 16 4473; GFX7-NEXT: s_addc_u32 s5, s1, 0 4474; GFX7-NEXT: v_mov_b32_e32 v0, s4 4475; GFX7-NEXT: v_mov_b32_e32 v2, s2 4476; GFX7-NEXT: v_mov_b32_e32 v1, s5 4477; GFX7-NEXT: v_mov_b32_e32 v3, s3 4478; GFX7-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 4479; GFX7-NEXT: v_mov_b32_e32 v0, s0 4480; GFX7-NEXT: v_mov_b32_e32 v1, s1 4481; GFX7-NEXT: s_waitcnt vmcnt(0) 4482; GFX7-NEXT: flat_store_dword v[0:1], v2 4483; GFX7-NEXT: s_endpgm 4484; 4485; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 4486; GFX10-WGP: ; %bb.0: ; %entry 4487; GFX10-WGP-NEXT: s_clause 0x1 4488; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4489; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4490; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 4491; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 4492; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 4493; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 4494; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 4495; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 4496; GFX10-WGP-NEXT: buffer_gl0_inv 4497; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] 4498; GFX10-WGP-NEXT: s_endpgm 4499; 4500; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 4501; GFX10-CU: ; %bb.0: ; %entry 4502; GFX10-CU-NEXT: s_clause 0x1 4503; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4504; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4505; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 4506; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 4507; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 4508; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 4509; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 4510; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 4511; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] 4512; GFX10-CU-NEXT: s_endpgm 4513; 4514; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: 4515; SKIP-CACHE-INV: ; %bb.0: ; %entry 4516; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4517; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4518; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 4519; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 4520; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 4521; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 4522; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(1)* %out, align 4
  ret void
}

; Returning cmpxchg on a global (addrspace 1) pointer at syncscope
; "workgroup-one-as" with acq_rel success / monotonic failure ordering.
; The atomic targets %out + 4 i32 elements (byte offset 16 in the expected
; output) and the value it returns is stored to %out. Only the expected
; gfx1010 WGP-mode output brackets the atomic: vmcnt/vscnt waits before it
; and buffer_gl0_inv after it.
define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(1)* %out, align 4
  ret void
}

; Returning cmpxchg on a global (addrspace 1) pointer at syncscope
; "workgroup-one-as" with seq_cst success / monotonic failure ordering.
; The atomic targets %out + 4 i32 elements (byte offset 16 in the expected
; output) and the value it returns is stored to %out. Only the expected
; gfx1010 WGP-mode output brackets the atomic: vmcnt/vscnt waits before it
; and buffer_gl0_inv after it.
define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(1)* %out, align 4
  ret void
}

; Returning cmpxchg on a global (addrspace 1) pointer at syncscope
; "workgroup-one-as" with acquire success / acquire failure ordering.
; The atomic targets %out + 4 i32 elements (byte offset 16 in the expected
; output) and the value it returns is stored to %out. Only the expected
; gfx1010 WGP-mode output adds buffer_gl0_inv after the atomic; no extra
; waits are expected before it in any configuration.
define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(1)* %out, align 4
  ret void
}

; Returning cmpxchg on a global (addrspace 1) pointer at syncscope
; "workgroup-one-as" with release success / acquire failure ordering.
; The atomic targets %out + 4 i32 elements (byte offset 16 in the expected
; output) and the value it returns is stored to %out. Only the expected
; gfx1010 WGP-mode output brackets the atomic: vmcnt/vscnt waits before it
; and buffer_gl0_inv after it.
define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(1)* %out, align 4
  ret void
}

; Returning cmpxchg on a global (addrspace 1) pointer at syncscope
; "workgroup-one-as" with acq_rel success / acquire failure ordering.
; The atomic targets %out + 4 i32 elements (byte offset 16 in the expected
; output) and the value it returns is stored to %out. Only the expected
; gfx1010 WGP-mode output brackets the atomic: vmcnt/vscnt waits before it
; and buffer_gl0_inv after it.
define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(1)* %out, align 4
  ret void
}

; Returning cmpxchg on a global (addrspace 1) pointer at syncscope
; "workgroup-one-as" with seq_cst success / acquire failure ordering.
; The atomic targets %out + 4 i32 elements (byte offset 16 in the expected
; output) and the value it returns is stored to %out. Only the expected
; gfx1010 WGP-mode output brackets the atomic: vmcnt/vscnt waits before it
; and buffer_gl0_inv after it.
define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(1)* %out, align 4
  ret void
}

; Returning cmpxchg on a global (addrspace 1) pointer at syncscope
; "workgroup-one-as" with seq_cst success / seq_cst failure ordering.
; The atomic targets %out + 4 i32 elements (byte offset 16 in the expected
; output) and the value it returns is stored to %out. Only the expected
; gfx1010 WGP-mode output brackets the atomic: vmcnt/vscnt waits before it
; and buffer_gl0_inv after it.
define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s4, s0, 16
; GFX7-NEXT:    s_addc_u32 s5, s1, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v2, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_mov_b32_e32 v3, s3
; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    buffer_gl0_inv
; GFX10-WGP-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v2, v0, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
  %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32 addrspace(1)* %out, align 4
  ret void
}
