; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s

; Code generation test for loads on AMDGPU (gfx803 and gfx900, each with an
; empty OS component and with amdhsa in the triple).
;
; Part 1: atomic i32 loads through a flat pointer, one kernel per combination
;         of synchronization scope (default/system, singlethread, agent,
;         workgroup, wavefront) and ordering (unordered, monotonic, acquire,
;         seq_cst). Each kernel stores the loaded value to %out, so the checks
;         can tie the store's data register to the load's result register.
; Part 2: non-atomic i32 loads tagged with !nontemporal, one pair of kernels
;         per address space (5, 1, 3 and flat). The _0 variant loads straight
;         from the kernel argument; the _1 variant indexes it by the
;         workitem id first.
;
; Check prefixes used below: GCN is enabled by every invocation, GFX8 only by
; the gfx803 ones, GFX9 only by the gfx900 ones, and GFX89 by all four.

declare i32 @llvm.amdgcn.workitem.id.x()

;------------------------------------------------------------------------------
; Default (system) scope
;------------------------------------------------------------------------------

; unordered: plain flat load (nothing after the address operand), no
; "s_waitcnt vmcnt(0)" before or after it, and no buffer_wbinvl1_vol before
; the store.
; GCN-LABEL: {{^}}system_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @system_unordered(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in unordered, align 4
  store i32 %val, i32* %out
  ret void
}

; monotonic: the load must carry glc, still without any vmcnt wait or
; buffer_wbinvl1_vol around it.
; GCN-LABEL: {{^}}system_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @system_monotonic(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in monotonic, align 4
  store i32 %val, i32* %out
  ret void
}

; acquire: glc load, immediately followed by "s_waitcnt vmcnt(0)" and then
; buffer_wbinvl1_vol; no vmcnt wait before the load.
; GCN-LABEL:  {{^}}system_acquire:
; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NEXT: buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @system_acquire(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in acquire, align 4
  store i32 %val, i32* %out
  ret void
}

; seq_cst: same as acquire, plus a "s_waitcnt vmcnt(0)" on the line directly
; before the load.
; GCN-LABEL:  {{^}}system_seq_cst:
; GCN:        s_waitcnt vmcnt(0){{$}}
; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NEXT: buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @system_seq_cst(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in seq_cst, align 4
  store i32 %val, i32* %out
  ret void
}

;------------------------------------------------------------------------------
; syncscope("singlethread")
; Every ordering is expected to produce a plain flat load with no vmcnt wait
; and no buffer_wbinvl1_vol.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}singlethread_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @singlethread_unordered(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}singlethread_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @singlethread_monotonic(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}singlethread_acquire:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @singlethread_acquire(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}singlethread_seq_cst:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @singlethread_seq_cst(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
  store i32 %val, i32* %out
  ret void
}

;------------------------------------------------------------------------------
; syncscope("agent")
; The expectations mirror the default-scope kernels ordering by ordering.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}agent_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @agent_unordered(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}agent_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @agent_monotonic(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL:  {{^}}agent_acquire:
; GCN-NOT:    s_waitcnt vmcnt(0){{$}}
; GCN:        flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NEXT: buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @agent_acquire(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL:  {{^}}agent_seq_cst:
; GCN:        s_waitcnt vmcnt(0){{$}}
; GCN-NEXT:   flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; GCN-NEXT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NEXT: buffer_wbinvl1_vol
; GCN:        flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @agent_seq_cst(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
  store i32 %val, i32* %out
  ret void
}

;------------------------------------------------------------------------------
; syncscope("workgroup")
; Every ordering, including acquire and seq_cst, is expected to produce a
; plain flat load with no vmcnt wait and no buffer_wbinvl1_vol.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}workgroup_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @workgroup_unordered(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}workgroup_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @workgroup_monotonic(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}workgroup_acquire:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @workgroup_acquire(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}workgroup_seq_cst:
; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
; GFX89:     flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @workgroup_seq_cst(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
  store i32 %val, i32* %out
  ret void
}

;------------------------------------------------------------------------------
; syncscope("wavefront")
; Every ordering is expected to produce a plain flat load with no vmcnt wait
; and no buffer_wbinvl1_vol.
;------------------------------------------------------------------------------

; GCN-LABEL: {{^}}wavefront_unordered:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @wavefront_unordered(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}wavefront_monotonic:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @wavefront_monotonic(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}wavefront_acquire:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @wavefront_acquire(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}wavefront_seq_cst:
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GCN:       flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
; GCN-NOT:   s_waitcnt vmcnt(0){{$}}
; GFX89-NOT: buffer_wbinvl1_vol
; GCN:       flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
define amdgpu_kernel void @wavefront_seq_cst(
    i32* %in, i32* %out) {
entry:
  %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
  store i32 %val, i32* %out
  ret void
}

;------------------------------------------------------------------------------
; !nontemporal loads, by address space
; Only the load instruction is checked in these kernels.
;------------------------------------------------------------------------------

; addrspace(5): expects an offen buffer load carrying both glc and slc.
; GCN-LABEL: {{^}}nontemporal_private_0:
; GFX89:     buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_private_0(
    i32 addrspace(5)* %in, i32* %out) {
entry:
  %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}nontemporal_private_1:
; GFX89:     buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_private_1(
    i32 addrspace(5)* %in, i32* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
  %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0
  store i32 %val, i32* %out
  ret void
}

; addrspace(1), address taken directly from the kernel argument: expects a
; scalar s_load_dword at offset 0x0 with nothing after the offset.
; GCN-LABEL: {{^}}nontemporal_global_0:
; GCN:       s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}}
define amdgpu_kernel void @nontemporal_global_0(
    i32 addrspace(1)* %in, i32* %out) {
entry:
  %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
  store i32 %val, i32* %out
  ret void
}

; addrspace(1), address indexed by the workitem id: expects a glc slc vector
; load, spelled flat_load_dword on gfx803 and global_load_dword ... off on
; gfx900.
; GCN-LABEL: {{^}}nontemporal_global_1:
; GFX8:      flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
; GFX9:      global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
define amdgpu_kernel void @nontemporal_global_1(
    i32 addrspace(1)* %in, i32* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
  store i32 %val, i32* %out
  ret void
}

; addrspace(3): expects a ds_read_b32 with nothing after its two operands.
; GCN-LABEL: {{^}}nontemporal_local_0:
; GCN:       ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_local_0(
    i32 addrspace(3)* %in, i32* %out) {
entry:
  %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}nontemporal_local_1:
; GCN:       ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_local_1(
    i32 addrspace(3)* %in, i32* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
  %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
  store i32 %val, i32* %out
  ret void
}

; Flat pointer: expects a flat load carrying both glc and slc.
; GCN-LABEL: {{^}}nontemporal_flat_0:
; GFX89:     flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_flat_0(
    i32* %in, i32* %out) {
entry:
  %val = load i32, i32* %in, align 4, !nontemporal !0
  store i32 %val, i32* %out
  ret void
}

; GCN-LABEL: {{^}}nontemporal_flat_1:
; GFX89:     flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_flat_1(
    i32* %in, i32* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
  %val = load i32, i32* %val.gep, align 4, !nontemporal !0
  store i32 %val, i32* %out
  ret void
}

; Metadata node referenced by every !nontemporal load above.
!0 = !{i32 1}