; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

; Checks the machine code llc emits for global (addrspace(1)) loads and stores
; tagged with !nontemporal, under each of the five llc configurations above.
; The expected-output lines inside each function are autogenerated; regenerate
; them with the script named on the first line instead of editing them by hand.

; Nontemporal load from %in itself (no per-workitem offset); the loaded value is
; then written to %out with an ordinary store. In every configuration the
; expected load is an s_load_dword with no glc/slc modifier, and the store has
; no modifier either.
define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-LABEL: global_nontemporal_load_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_mov_b32 s4, s2
; GFX6-NEXT:    s_mov_b32 s5, s3
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_nontemporal_load_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_load_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_load_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_load_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
  %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Nontemporal load from %in indexed by the workitem id; the value is written to
; %out with an ordinary store. The expected load carries "glc slc" in the
; gfx600, gfx700 and amdpal configurations and "slc" alone in both gfx1010
; configurations; the store has no modifier.
define amdgpu_kernel void @global_nontemporal_load_1(
; GFX6-LABEL: global_nontemporal_load_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s0, s6
; GFX6-NEXT:    s_mov_b32 s1, s7
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    s_mov_b32 s7, s3
; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_nontemporal_load_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX7-NEXT:    flat_load_dword v2, v[2:3] glc slc
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_load_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    global_load_dword v0, v0, s[0:1] slc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_load_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    global_load_dword v0, v0, s[0:1] slc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_load_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s0, s6
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s3
; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Ordinary load from %in followed by a nontemporal store to %out itself (no
; per-workitem offset). The expected store carries "glc slc" in the gfx600,
; gfx700 and amdpal configurations and "slc" alone in both gfx1010
; configurations.
define amdgpu_kernel void @global_nontemporal_store_0(
; GFX6-LABEL: global_nontemporal_store_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_mov_b32 s4, s2
; GFX6-NEXT:    s_mov_b32 s5, s3
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0 glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_nontemporal_store_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_store_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3] slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_store_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3] slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s2
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, s3
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[4:7], 0 glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
  %val = load i32, i32 addrspace(1)* %in, align 4
  store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
  ret void
}

; Ordinary load from %in followed by a nontemporal store to %out indexed by the
; workitem id. The expected store carries "glc slc" in the gfx600, gfx700 and
; amdpal configurations and "slc" alone in both gfx1010 configurations.
define amdgpu_kernel void @global_nontemporal_store_1(
; GFX6-LABEL: global_nontemporal_store_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, 0
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, s0
; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: global_nontemporal_store_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dword v[0:1], v2 glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: global_nontemporal_store_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[2:3] slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: global_nontemporal_store_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    global_store_dword v0, v1, s[2:3] slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, 0
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, 0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val = load i32, i32 addrspace(1)* %in, align 4
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
  ret void
}

; Metadata node used as the operand of every !nontemporal attachment above.
!0 = !{i32 1}
; Intrinsic called by the two *_1 kernels to obtain the per-workitem index.
declare i32 @llvm.amdgcn.workitem.id.x()