; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s

; Code generation checks for i32 loads and stores tagged with !nontemporal
; metadata whose pointer operand is in addrspace(5) (the "private" space named
; by the kernels below). The expected output in this file uses:
;   * "glc slc" on the scratch buffer access for the GFX6, GFX7 and
;     SKIP-CACHE-INV configurations;
;   * "slc" only for both GFX10 configurations (with and without +cumode).
; The assertion lines are autogenerated; regenerate them with the script named
; in the first line instead of editing them by hand.
; NOTE(review): the first llc invocation uses a triple with an empty OS
; component ("amdgcn-amd-"); this looks deliberate (non-HSA code path) but
; confirm before changing it.

; Nontemporal load from a kernel-argument addrspace(5) pointer; the loaded
; value is then stored (without !nontemporal) to an addrspace(1) pointer.
define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX6-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    s_mov_b32 s11, 0xe8f000
; GFX6-NEXT:    s_add_u32 s8, s8, s3
; GFX6-NEXT:    s_addc_u32 s9, s9, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_add_u32 s8, s8, s7
; GFX7-NEXT:    s_addc_u32 s9, s9, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[8:9]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s8, s8, s3
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s9, s9, 0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
  %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Same as private_nontemporal_load_0, but the addrspace(5) address is %in
; indexed by the workitem id (x), so the address is computed in a VGPR.
define amdgpu_kernel void @private_nontemporal_load_1(
; GFX6-LABEL: private_nontemporal_load_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX6-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    s_mov_b32 s11, 0xe8f000
; GFX6-NEXT:    s_add_u32 s8, s8, s3
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_addc_u32 s9, s9, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
; GFX7-NEXT:    s_add_u32 s8, s8, s7
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_addc_u32 s9, s9, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    buffer_load_dword v2, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_mov_b32_e32 v1, s1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[8:9]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s8, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s8, s8, s3
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s9, s9, 0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, 0xf000
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
  %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0
  %unused.placeholder.removed = add i32 0, 0
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Nontemporal store to a kernel-argument addrspace(5) pointer; the stored
; value comes from a plain (not !nontemporal) load of an addrspace(1) pointer.
define amdgpu_kernel void @private_nontemporal_store_0(
; GFX6-LABEL: private_nontemporal_store_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX6-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xe8f000
; GFX6-NEXT:    s_add_u32 s4, s4, s3
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    s_addc_u32 s5, s5, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT:    v_mov_b32_e32 v1, s0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s1
; GFX6-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_add_u32 s8, s8, s7
; GFX7-NEXT:    s_addc_u32 s9, s9, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_mov_b32_e32 v1, s2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
  %val = load i32, i32 addrspace(1)* %in, align 4
  store i32 %val, i32 addrspace(5)* %out, !nontemporal !0
  ret void
}

; Same as private_nontemporal_store_0, but the addrspace(5) address is %out
; indexed by the workitem id (x), so the address is computed in a VGPR.
define amdgpu_kernel void @private_nontemporal_store_1(
; GFX6-LABEL: private_nontemporal_store_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX6-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xe8f000
; GFX6-NEXT:    s_add_u32 s4, s4, s3
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_addc_u32 s5, s5, 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX7-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2
; GFX7-NEXT:    s_add_u32 s8, s8, s7
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    s_addc_u32 s9, s9, 0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v1, s0
; GFX7-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-WGP-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-WGP-NEXT:    s_clause 0x1
; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-WGP-NEXT:    s_add_u32 s8, s8, s7
; GFX10-WGP-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_mov_b64 s[10:11], s[2:3]
; GFX10-CU-NEXT:    s_mov_b64 s[8:9], s[0:1]
; GFX10-CU-NEXT:    s_clause 0x1
; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX10-CU-NEXT:    s_add_u32 s8, s8, s7
; GFX10-CU-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[4:5]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s4, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s4, s3
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s5, 0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v1, v0, s[4:7], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
    i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val = load i32, i32 addrspace(1)* %in, align 4
  %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
  store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0
  ret void
}

; Metadata node attached as !nontemporal to the loads/stores under test.
!0 = !{i32 1}
declare i32 @llvm.amdgcn.workitem.id.x()