; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

; Four kernels that tag an addrspace(3) load or store with !nontemporal, each
; compiled under the five llc configurations listed above. The generated
; assertions pin the complete instruction sequence of every kernel; the
; addrspace(3) access itself is expected as ds_read_b32 / ds_write_b32.
;
; The assertion lines are tool-generated. Regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.

; Nontemporal load through the addrspace(3) pointer argument, stored to %out.
define amdgpu_kernel void @local_nontemporal_load_0(i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
; GFX6-LABEL: local_nontemporal_load_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
entry:
  %lds.val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
  store i32 %lds.val, i32 addrspace(1)* %out
  ret void
}

; Same as local_nontemporal_load_0, but the loaded element is %in indexed by
; the workitem id in x.
define amdgpu_kernel void @local_nontemporal_load_1(i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
; GFX6-LABEL: local_nontemporal_load_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT: ds_read_b32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT: ds_read_b32 v0, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT: ds_read_b32 v0, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %lane
  %lds.val = load i32, i32 addrspace(3)* %elt.ptr, align 4, !nontemporal !0
  store i32 %lds.val, i32 addrspace(1)* %out
  ret void
}

; Plain load from the addrspace(1) argument, then a nontemporal store through
; the addrspace(3) pointer argument.
define amdgpu_kernel void @local_nontemporal_store_0(i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
; GFX6-LABEL: local_nontemporal_store_0:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
entry:
  %glb.val = load i32, i32 addrspace(1)* %in, align 4
  store i32 %glb.val, i32 addrspace(3)* %out, !nontemporal !0
  ret void
}

; Same as local_nontemporal_store_0, but the stored-to element is %out indexed
; by the workitem id in x.
define amdgpu_kernel void @local_nontemporal_store_1(i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
; GFX6-LABEL: local_nontemporal_store_1:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: ds_write_b32 v0, v1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: ds_write_b32 v0, v1
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %glb.val = load i32, i32 addrspace(1)* %in, align 4
  %slot.ptr = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %lane
  store i32 %glb.val, i32 addrspace(3)* %slot.ptr, !nontemporal !0
  ret void
}

; Operand node referenced by every !nontemporal attachment above.
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()