1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s 3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s 4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-PAL %s 5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-PAL %s 6 7define amdgpu_kernel void @zero_init_kernel() { 8; GFX9-LABEL: zero_init_kernel: 9; GFX9: ; %bb.0: 10; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 11; GFX9-NEXT: s_mov_b32 s0, 0 12; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 13; GFX9-NEXT: s_mov_b32 s1, s0 14; GFX9-NEXT: s_mov_b32 s2, s0 15; GFX9-NEXT: s_mov_b32 s3, s0 16; GFX9-NEXT: v_mov_b32_e32 v0, s0 17; GFX9-NEXT: v_mov_b32_e32 v1, s1 18; GFX9-NEXT: v_mov_b32_e32 v2, s2 19; GFX9-NEXT: v_mov_b32_e32 v3, s3 20; GFX9-NEXT: s_mov_b32 vcc_hi, 0 21; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 22; GFX9-NEXT: s_mov_b32 vcc_hi, 0 23; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 24; GFX9-NEXT: s_mov_b32 vcc_hi, 0 25; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 26; GFX9-NEXT: s_mov_b32 vcc_hi, 0 27; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 28; GFX9-NEXT: s_endpgm 29; 30; GFX10-LABEL: zero_init_kernel: 31; GFX10: ; %bb.0: 32; GFX10-NEXT: s_add_u32 s0, s0, s3 33; GFX10-NEXT: s_addc_u32 s1, s1, 0 34; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 35; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 36; GFX10-NEXT: s_mov_b32 s0, 0 37; GFX10-NEXT: s_mov_b32 s1, s0 38; GFX10-NEXT: s_mov_b32 s2, s0 39; 
GFX10-NEXT: s_mov_b32 s3, s0 40; GFX10-NEXT: v_mov_b32_e32 v0, s0 41; GFX10-NEXT: v_mov_b32_e32 v1, s1 42; GFX10-NEXT: v_mov_b32_e32 v2, s2 43; GFX10-NEXT: v_mov_b32_e32 v3, s3 44; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 45; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 46; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 47; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 48; GFX10-NEXT: s_endpgm 49; 50; GFX9-PAL-LABEL: zero_init_kernel: 51; GFX9-PAL: ; %bb.0: 52; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 53; GFX9-PAL-NEXT: s_mov_b32 s2, s0 54; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 55; GFX9-PAL-NEXT: s_mov_b32 s0, 0 56; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 57; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 58; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 59; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 60; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 61; GFX9-PAL-NEXT: s_mov_b32 s1, s0 62; GFX9-PAL-NEXT: s_mov_b32 s2, s0 63; GFX9-PAL-NEXT: s_mov_b32 s3, s0 64; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 65; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 66; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 67; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 68; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 69; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 70; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 71; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 72; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 73; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 74; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 75; GFX9-PAL-NEXT: s_endpgm 76; 77; GFX10-PAL-LABEL: zero_init_kernel: 78; GFX10-PAL: ; %bb.0: 79; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 80; GFX10-PAL-NEXT: s_mov_b32 s2, s0 81; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 82; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 83; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 84; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 85; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 86; 
GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 87; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 88; GFX10-PAL-NEXT: s_mov_b32 s0, 0 89; GFX10-PAL-NEXT: s_mov_b32 s1, s0 90; GFX10-PAL-NEXT: s_mov_b32 s2, s0 91; GFX10-PAL-NEXT: s_mov_b32 s3, s0 92; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 93; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 94; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 95; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 96; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 97; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 98; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 99; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 100; GFX10-PAL-NEXT: s_endpgm 101 %alloca = alloca [32 x i16], align 2, addrspace(5) 102 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 103 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 104 ret void 105} 106 107define void @zero_init_foo() { 108; GFX9-LABEL: zero_init_foo: 109; GFX9: ; %bb.0: 110; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 111; GFX9-NEXT: s_mov_b32 s0, 0 112; GFX9-NEXT: s_mov_b32 s1, s0 113; GFX9-NEXT: s_mov_b32 s2, s0 114; GFX9-NEXT: s_mov_b32 s3, s0 115; GFX9-NEXT: v_mov_b32_e32 v0, s0 116; GFX9-NEXT: v_mov_b32_e32 v1, s1 117; GFX9-NEXT: v_mov_b32_e32 v2, s2 118; GFX9-NEXT: v_mov_b32_e32 v3, s3 119; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 120; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 121; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 122; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 123; GFX9-NEXT: s_waitcnt vmcnt(0) 124; GFX9-NEXT: s_setpc_b64 s[30:31] 125; 126; GFX10-LABEL: zero_init_foo: 127; GFX10: ; %bb.0: 128; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 129; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 130; GFX10-NEXT: s_mov_b32 s0, 0 131; GFX10-NEXT: s_mov_b32 s1, s0 132; GFX10-NEXT: s_mov_b32 
s2, s0 133; GFX10-NEXT: s_mov_b32 s3, s0 134; GFX10-NEXT: v_mov_b32_e32 v0, s0 135; GFX10-NEXT: v_mov_b32_e32 v1, s1 136; GFX10-NEXT: v_mov_b32_e32 v2, s2 137; GFX10-NEXT: v_mov_b32_e32 v3, s3 138; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 139; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 140; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 141; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 142; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 143; GFX10-NEXT: s_setpc_b64 s[30:31] 144; 145; GFX9-PAL-LABEL: zero_init_foo: 146; GFX9-PAL: ; %bb.0: 147; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 148; GFX9-PAL-NEXT: s_mov_b32 s0, 0 149; GFX9-PAL-NEXT: s_mov_b32 s1, s0 150; GFX9-PAL-NEXT: s_mov_b32 s2, s0 151; GFX9-PAL-NEXT: s_mov_b32 s3, s0 152; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 153; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 154; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 155; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 156; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 157; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 158; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 159; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 160; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 161; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 162; 163; GFX10-PAL-LABEL: zero_init_foo: 164; GFX10-PAL: ; %bb.0: 165; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 166; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 167; GFX10-PAL-NEXT: s_mov_b32 s0, 0 168; GFX10-PAL-NEXT: s_mov_b32 s1, s0 169; GFX10-PAL-NEXT: s_mov_b32 s2, s0 170; GFX10-PAL-NEXT: s_mov_b32 s3, s0 171; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 172; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 173; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 174; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 175; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 176; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 177; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], 
s32 offset:16 178; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 179; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 180; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 181 %alloca = alloca [32 x i16], align 2, addrspace(5) 182 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 183 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 184 ret void 185} 186 187define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { 188; GFX9-LABEL: store_load_sindex_kernel: 189; GFX9: ; %bb.0: ; %bb 190; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 191; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 192; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 193; GFX9-NEXT: v_mov_b32_e32 v0, 15 194; GFX9-NEXT: s_waitcnt lgkmcnt(0) 195; GFX9-NEXT: s_lshl_b32 s1, s0, 2 196; GFX9-NEXT: s_and_b32 s0, s0, 15 197; GFX9-NEXT: s_lshl_b32 s0, s0, 2 198; GFX9-NEXT: s_add_u32 s1, 4, s1 199; GFX9-NEXT: scratch_store_dword off, v0, s1 200; GFX9-NEXT: s_add_u32 s0, 4, s0 201; GFX9-NEXT: scratch_load_dword v0, off, s0 202; GFX9-NEXT: s_endpgm 203; 204; GFX10-LABEL: store_load_sindex_kernel: 205; GFX10: ; %bb.0: ; %bb 206; GFX10-NEXT: s_add_u32 s2, s2, s5 207; GFX10-NEXT: s_addc_u32 s3, s3, 0 208; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 209; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 210; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 211; GFX10-NEXT: v_mov_b32_e32 v0, 15 212; GFX10-NEXT: s_waitcnt lgkmcnt(0) 213; GFX10-NEXT: s_and_b32 s1, s0, 15 214; GFX10-NEXT: s_lshl_b32 s0, s0, 2 215; GFX10-NEXT: s_lshl_b32 s1, s1, 2 216; GFX10-NEXT: s_add_u32 s0, 4, s0 217; GFX10-NEXT: s_add_u32 s1, 4, s1 218; GFX10-NEXT: scratch_store_dword off, v0, s0 219; GFX10-NEXT: scratch_load_dword v0, off, s1 220; GFX10-NEXT: s_endpgm 221; 222; GFX9-PAL-LABEL: store_load_sindex_kernel: 223; GFX9-PAL: ; %bb.0: ; %bb 224; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 225; GFX9-PAL-NEXT: s_mov_b32 s4, s0 226; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 
0x0 227; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 228; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 229; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 230; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 231; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 232; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 233; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 234; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 235; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 236; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1 237; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 238; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 239; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 240; GFX9-PAL-NEXT: s_endpgm 241; 242; GFX10-PAL-LABEL: store_load_sindex_kernel: 243; GFX10-PAL: ; %bb.0: ; %bb 244; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 245; GFX10-PAL-NEXT: s_mov_b32 s4, s0 246; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 247; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 248; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 249; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 250; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 251; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 252; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 253; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 254; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 255; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 256; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 257; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 258; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 259; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 260; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1 261; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 262; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 263; GFX10-PAL-NEXT: s_endpgm 264bb: 265 %i = alloca [32 x float], align 4, addrspace(5) 266 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 267 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 268 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 269 store volatile i32 15, i32 addrspace(5)* %i8, align 4 270 %i9 = and i32 %idx, 15 271 %i10 = 
getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 272 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 273 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 274 ret void 275} 276 277define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { 278; GFX9-LABEL: store_load_sindex_foo: 279; GFX9: ; %bb.0: ; %bb 280; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 281; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 282; GFX9-NEXT: s_lshl_b32 s0, s2, 2 283; GFX9-NEXT: s_add_u32 s0, 4, s0 284; GFX9-NEXT: v_mov_b32_e32 v0, 15 285; GFX9-NEXT: scratch_store_dword off, v0, s0 286; GFX9-NEXT: s_and_b32 s0, s2, 15 287; GFX9-NEXT: s_lshl_b32 s0, s0, 2 288; GFX9-NEXT: s_add_u32 s0, 4, s0 289; GFX9-NEXT: scratch_load_dword v0, off, s0 290; GFX9-NEXT: s_endpgm 291; 292; GFX10-LABEL: store_load_sindex_foo: 293; GFX10: ; %bb.0: ; %bb 294; GFX10-NEXT: s_add_u32 s0, s0, s3 295; GFX10-NEXT: s_addc_u32 s1, s1, 0 296; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 297; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 298; GFX10-NEXT: s_and_b32 s0, s2, 15 299; GFX10-NEXT: v_mov_b32_e32 v0, 15 300; GFX10-NEXT: s_lshl_b32 s1, s2, 2 301; GFX10-NEXT: s_lshl_b32 s0, s0, 2 302; GFX10-NEXT: s_add_u32 s1, 4, s1 303; GFX10-NEXT: s_add_u32 s0, 4, s0 304; GFX10-NEXT: scratch_store_dword off, v0, s1 305; GFX10-NEXT: scratch_load_dword v0, off, s0 306; GFX10-NEXT: s_endpgm 307; 308; GFX9-PAL-LABEL: store_load_sindex_foo: 309; GFX9-PAL: ; %bb.0: ; %bb 310; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 311; GFX9-PAL-NEXT: s_mov_b32 s2, s0 312; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 313; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 314; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 315; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 316; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 317; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 318; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 319; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 320; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 321; 
GFX9-PAL-NEXT: s_add_u32 s1, 4, s1 322; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 323; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 324; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 325; GFX9-PAL-NEXT: s_endpgm 326; 327; GFX10-PAL-LABEL: store_load_sindex_foo: 328; GFX10-PAL: ; %bb.0: ; %bb 329; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 330; GFX10-PAL-NEXT: s_mov_b32 s2, s0 331; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 332; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 333; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 334; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 335; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 336; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 337; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 338; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 339; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 340; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 341; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 342; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 343; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1 344; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 345; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 346; GFX10-PAL-NEXT: s_endpgm 347bb: 348 %i = alloca [32 x float], align 4, addrspace(5) 349 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 350 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 351 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 352 store volatile i32 15, i32 addrspace(5)* %i8, align 4 353 %i9 = and i32 %idx, 15 354 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 355 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 356 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 357 ret void 358} 359 360define amdgpu_kernel void @store_load_vindex_kernel() { 361; GFX9-LABEL: store_load_vindex_kernel: 362; GFX9: ; %bb.0: ; %bb 363; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 364; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 365; GFX9-NEXT: v_mov_b32_e32 v1, 4 366; GFX9-NEXT: 
v_add_u32_e32 v2, v1, v0 367; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 368; GFX9-NEXT: v_mov_b32_e32 v3, 15 369; GFX9-NEXT: scratch_store_dword v2, v3, off 370; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 371; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 372; GFX9-NEXT: s_endpgm 373; 374; GFX10-LABEL: store_load_vindex_kernel: 375; GFX10: ; %bb.0: ; %bb 376; GFX10-NEXT: s_add_u32 s0, s0, s3 377; GFX10-NEXT: s_addc_u32 s1, s1, 0 378; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 379; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 380; GFX10-NEXT: v_mov_b32_e32 v1, 4 381; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 382; GFX10-NEXT: v_mov_b32_e32 v3, 15 383; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 384; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 385; GFX10-NEXT: scratch_store_dword v2, v3, off 386; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 387; GFX10-NEXT: s_endpgm 388; 389; GFX9-PAL-LABEL: store_load_vindex_kernel: 390; GFX9-PAL: ; %bb.0: ; %bb 391; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 392; GFX9-PAL-NEXT: s_mov_b32 s2, s0 393; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 394; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 395; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 396; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 397; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 398; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 399; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 400; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 401; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 402; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 403; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 404; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 405; GFX9-PAL-NEXT: s_endpgm 406; 407; GFX10-PAL-LABEL: store_load_vindex_kernel: 408; GFX10-PAL: ; %bb.0: ; %bb 409; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 410; GFX10-PAL-NEXT: s_mov_b32 s2, s0 411; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 412; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 413; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 414; GFX10-PAL-NEXT: 
s_add_u32 s2, s2, s1 415; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 416; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 417; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 418; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4 419; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 420; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 421; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 422; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 423; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off 424; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 425; GFX10-PAL-NEXT: s_endpgm 426bb: 427 %i = alloca [32 x float], align 4, addrspace(5) 428 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 429 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 430 %i3 = zext i32 %i2 to i64 431 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 432 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 433 store volatile i32 15, i32 addrspace(5)* %i8, align 4 434 %i9 = sub nsw i32 31, %i2 435 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 436 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 437 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 438 ret void 439} 440 441define void @store_load_vindex_foo(i32 %idx) { 442; GFX9-LABEL: store_load_vindex_foo: 443; GFX9: ; %bb.0: ; %bb 444; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 445; GFX9-NEXT: v_mov_b32_e32 v1, s32 446; GFX9-NEXT: v_mov_b32_e32 v3, 15 447; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 448; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 449; GFX9-NEXT: scratch_store_dword v2, v3, off 450; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 451; GFX9-NEXT: scratch_load_dword v0, v0, off 452; GFX9-NEXT: s_waitcnt vmcnt(0) 453; GFX9-NEXT: s_setpc_b64 s[30:31] 454; 455; GFX10-LABEL: store_load_vindex_foo: 456; GFX10: ; %bb.0: ; %bb 457; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 458; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 459; GFX10-NEXT: 
v_mov_b32_e32 v1, 15 460; GFX10-NEXT: v_mov_b32_e32 v2, s32 461; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 462; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 463; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 464; GFX10-NEXT: scratch_store_dword v0, v1, off 465; GFX10-NEXT: scratch_load_dword v0, v2, off 466; GFX10-NEXT: s_waitcnt vmcnt(0) 467; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 468; GFX10-NEXT: s_setpc_b64 s[30:31] 469; 470; GFX9-PAL-LABEL: store_load_vindex_foo: 471; GFX9-PAL: ; %bb.0: ; %bb 472; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 473; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32 474; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 475; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 476; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 477; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 478; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 479; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off 480; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 481; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 482; 483; GFX10-PAL-LABEL: store_load_vindex_foo: 484; GFX10-PAL: ; %bb.0: ; %bb 485; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 486; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 487; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 488; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s32 489; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 490; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 491; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 492; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 493; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off 494; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 495; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 496; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 497bb: 498 %i = alloca [32 x float], align 4, addrspace(5) 499 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 500 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 501 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 502 store volatile i32 15, i32 addrspace(5)* %i8, align 4 503 %i9 = and i32 %idx, 15 504 %i10 = 
getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 505 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 506 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 507 ret void 508} 509 510define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { 511; GFX9-LABEL: private_ptr_foo: 512; GFX9: ; %bb.0: 513; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 514; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 515; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 516; GFX9-NEXT: s_waitcnt vmcnt(0) 517; GFX9-NEXT: s_setpc_b64 s[30:31] 518; 519; GFX10-LABEL: private_ptr_foo: 520; GFX10: ; %bb.0: 521; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 522; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 523; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 524; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 525; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 526; GFX10-NEXT: s_setpc_b64 s[30:31] 527; 528; GFX9-PAL-LABEL: private_ptr_foo: 529; GFX9-PAL: ; %bb.0: 530; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 531; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 532; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 533; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 534; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 535; 536; GFX10-PAL-LABEL: private_ptr_foo: 537; GFX10-PAL: ; %bb.0: 538; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 539; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 540; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 541; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 542; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 543; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 544 %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 545 store float 1.000000e+01, float addrspace(5)* %gep, align 4 546 ret void 547} 548 549define amdgpu_kernel void @zero_init_small_offset_kernel() { 550; GFX9-LABEL: zero_init_small_offset_kernel: 551; GFX9: ; %bb.0: 552; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 553; GFX9-NEXT: s_addc_u32 
flat_scratch_hi, s1, 0 554; GFX9-NEXT: s_mov_b32 vcc_hi, 0 555; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 556; GFX9-NEXT: s_mov_b32 s0, 0 557; GFX9-NEXT: s_mov_b32 s1, s0 558; GFX9-NEXT: s_mov_b32 s2, s0 559; GFX9-NEXT: s_mov_b32 s3, s0 560; GFX9-NEXT: s_waitcnt vmcnt(0) 561; GFX9-NEXT: v_mov_b32_e32 v0, s0 562; GFX9-NEXT: v_mov_b32_e32 v1, s1 563; GFX9-NEXT: v_mov_b32_e32 v2, s2 564; GFX9-NEXT: v_mov_b32_e32 v3, s3 565; GFX9-NEXT: s_mov_b32 vcc_hi, 0 566; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 567; GFX9-NEXT: s_mov_b32 vcc_hi, 0 568; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 569; GFX9-NEXT: s_mov_b32 vcc_hi, 0 570; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 571; GFX9-NEXT: s_mov_b32 vcc_hi, 0 572; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 573; GFX9-NEXT: s_endpgm 574; 575; GFX10-LABEL: zero_init_small_offset_kernel: 576; GFX10: ; %bb.0: 577; GFX10-NEXT: s_add_u32 s0, s0, s3 578; GFX10-NEXT: s_addc_u32 s1, s1, 0 579; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 580; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 581; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 582; GFX10-NEXT: s_mov_b32 s0, 0 583; GFX10-NEXT: s_mov_b32 s1, s0 584; GFX10-NEXT: s_mov_b32 s2, s0 585; GFX10-NEXT: s_mov_b32 s3, s0 586; GFX10-NEXT: s_waitcnt vmcnt(0) 587; GFX10-NEXT: v_mov_b32_e32 v0, s0 588; GFX10-NEXT: v_mov_b32_e32 v1, s1 589; GFX10-NEXT: v_mov_b32_e32 v2, s2 590; GFX10-NEXT: v_mov_b32_e32 v3, s3 591; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 592; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 593; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 594; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 595; GFX10-NEXT: s_endpgm 596; 597; GFX9-PAL-LABEL: zero_init_small_offset_kernel: 598; GFX9-PAL: ; %bb.0: 599; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 600; GFX9-PAL-NEXT: s_mov_b32 s2, s0 601; 
GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 602; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 603; GFX9-PAL-NEXT: s_mov_b32 s0, 0 604; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 605; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 606; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 607; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 608; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 609; GFX9-PAL-NEXT: s_mov_b32 s1, s0 610; GFX9-PAL-NEXT: s_mov_b32 s2, s0 611; GFX9-PAL-NEXT: s_mov_b32 s3, s0 612; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 613; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 614; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 615; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 616; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 617; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 618; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 619; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 620; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 621; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 622; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 623; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 624; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 625; GFX9-PAL-NEXT: s_endpgm 626; 627; GFX10-PAL-LABEL: zero_init_small_offset_kernel: 628; GFX10-PAL: ; %bb.0: 629; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 630; GFX10-PAL-NEXT: s_mov_b32 s2, s0 631; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 632; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 633; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 634; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 635; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 636; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 637; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 638; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 639; GFX10-PAL-NEXT: s_mov_b32 s0, 0 640; GFX10-PAL-NEXT: s_mov_b32 s1, s0 641; GFX10-PAL-NEXT: s_mov_b32 s2, s0 642; GFX10-PAL-NEXT: s_mov_b32 s3, s0 643; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 644; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 645; 
GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 646; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 647; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 648; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 649; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 650; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 651; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 652; GFX10-PAL-NEXT: s_endpgm 653 %padding = alloca [64 x i32], align 4, addrspace(5) 654 %alloca = alloca [32 x i16], align 2, addrspace(5) 655 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 656 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 657 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 658 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 659 ret void 660} 661 662define void @zero_init_small_offset_foo() { 663; GFX9-LABEL: zero_init_small_offset_foo: 664; GFX9: ; %bb.0: 665; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 666; GFX9-NEXT: scratch_load_dword v0, off, s32 667; GFX9-NEXT: s_mov_b32 s0, 0 668; GFX9-NEXT: s_mov_b32 s1, s0 669; GFX9-NEXT: s_mov_b32 s2, s0 670; GFX9-NEXT: s_mov_b32 s3, s0 671; GFX9-NEXT: s_waitcnt vmcnt(0) 672; GFX9-NEXT: v_mov_b32_e32 v0, s0 673; GFX9-NEXT: v_mov_b32_e32 v1, s1 674; GFX9-NEXT: v_mov_b32_e32 v2, s2 675; GFX9-NEXT: v_mov_b32_e32 v3, s3 676; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 677; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 678; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 679; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 680; GFX9-NEXT: s_waitcnt vmcnt(0) 681; GFX9-NEXT: s_setpc_b64 s[30:31] 682; 683; GFX10-LABEL: zero_init_small_offset_foo: 684; GFX10: ; %bb.0: 685; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 686; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 687; GFX10-NEXT: scratch_load_dword 
v0, off, s32 688; GFX10-NEXT: s_mov_b32 s0, 0 689; GFX10-NEXT: s_mov_b32 s1, s0 690; GFX10-NEXT: s_mov_b32 s2, s0 691; GFX10-NEXT: s_mov_b32 s3, s0 692; GFX10-NEXT: s_waitcnt vmcnt(0) 693; GFX10-NEXT: v_mov_b32_e32 v0, s0 694; GFX10-NEXT: v_mov_b32_e32 v1, s1 695; GFX10-NEXT: v_mov_b32_e32 v2, s2 696; GFX10-NEXT: v_mov_b32_e32 v3, s3 697; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 698; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 699; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 700; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 701; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 702; GFX10-NEXT: s_setpc_b64 s[30:31] 703; 704; GFX9-PAL-LABEL: zero_init_small_offset_foo: 705; GFX9-PAL: ; %bb.0: 706; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 707; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 708; GFX9-PAL-NEXT: s_mov_b32 s0, 0 709; GFX9-PAL-NEXT: s_mov_b32 s1, s0 710; GFX9-PAL-NEXT: s_mov_b32 s2, s0 711; GFX9-PAL-NEXT: s_mov_b32 s3, s0 712; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 713; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 714; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 715; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 716; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 717; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 718; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 719; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 720; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 721; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 722; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 723; 724; GFX10-PAL-LABEL: zero_init_small_offset_foo: 725; GFX10-PAL: ; %bb.0: 726; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 727; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 728; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 729; GFX10-PAL-NEXT: s_mov_b32 s0, 0 730; GFX10-PAL-NEXT: s_mov_b32 s1, s0 731; GFX10-PAL-NEXT: s_mov_b32 s2, s0 732; GFX10-PAL-NEXT: s_mov_b32 s3, s0 733; GFX10-PAL-NEXT: 
s_waitcnt vmcnt(0) 734; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 735; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 736; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 737; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 738; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 739; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 740; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 741; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 742; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 743; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 744 %padding = alloca [64 x i32], align 4, addrspace(5) 745 %alloca = alloca [32 x i16], align 2, addrspace(5) 746 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 747 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 748 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 749 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 750 ret void 751} 752 753define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { 754; GFX9-LABEL: store_load_sindex_small_offset_kernel: 755; GFX9: ; %bb.0: ; %bb 756; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 757; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 758; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 759; GFX9-NEXT: s_mov_b32 vcc_hi, 0 760; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 761; GFX9-NEXT: s_waitcnt lgkmcnt(0) 762; GFX9-NEXT: s_lshl_b32 s1, s0, 2 763; GFX9-NEXT: s_and_b32 s0, s0, 15 764; GFX9-NEXT: s_lshl_b32 s0, s0, 2 765; GFX9-NEXT: s_waitcnt vmcnt(0) 766; GFX9-NEXT: v_mov_b32_e32 v0, 15 767; GFX9-NEXT: s_add_u32 s1, 0x104, s1 768; GFX9-NEXT: scratch_store_dword off, v0, s1 769; GFX9-NEXT: s_add_u32 s0, 0x104, s0 770; GFX9-NEXT: scratch_load_dword v0, off, s0 771; GFX9-NEXT: s_endpgm 772; 773; GFX10-LABEL: store_load_sindex_small_offset_kernel: 774; GFX10: ; %bb.0: ; %bb 775; GFX10-NEXT: s_add_u32 s2, s2, 
s5 776; GFX10-NEXT: s_addc_u32 s3, s3, 0 777; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 778; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 779; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 780; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 781; GFX10-NEXT: s_waitcnt vmcnt(0) 782; GFX10-NEXT: v_mov_b32_e32 v0, 15 783; GFX10-NEXT: s_waitcnt lgkmcnt(0) 784; GFX10-NEXT: s_and_b32 s1, s0, 15 785; GFX10-NEXT: s_lshl_b32 s0, s0, 2 786; GFX10-NEXT: s_lshl_b32 s1, s1, 2 787; GFX10-NEXT: s_add_u32 s0, 0x104, s0 788; GFX10-NEXT: s_add_u32 s1, 0x104, s1 789; GFX10-NEXT: scratch_store_dword off, v0, s0 790; GFX10-NEXT: scratch_load_dword v0, off, s1 791; GFX10-NEXT: s_endpgm 792; 793; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 794; GFX9-PAL: ; %bb.0: ; %bb 795; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 796; GFX9-PAL-NEXT: s_mov_b32 s4, s0 797; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 798; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 799; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 800; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 801; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 802; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 803; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 804; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 805; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 806; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 807; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 808; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 809; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 810; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 811; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 812; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 813; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 814; GFX9-PAL-NEXT: s_endpgm 815; 816; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel: 817; GFX10-PAL: ; %bb.0: ; %bb 818; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 819; GFX10-PAL-NEXT: s_mov_b32 s4, s0 820; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 821; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 822; GFX10-PAL-NEXT: s_and_b32 
s5, s5, 0xffff 823; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 824; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 825; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 826; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 827; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 828; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 829; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 830; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 831; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 832; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 833; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 834; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 835; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 836; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 837; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 838; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 839; GFX10-PAL-NEXT: s_endpgm 840bb: 841 %padding = alloca [64 x i32], align 4, addrspace(5) 842 %i = alloca [32 x float], align 4, addrspace(5) 843 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 844 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 845 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 846 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 847 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 848 store volatile i32 15, i32 addrspace(5)* %i8, align 4 849 %i9 = and i32 %idx, 15 850 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 851 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 852 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 853 ret void 854} 855 856define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { 857; GFX9-LABEL: store_load_sindex_small_offset_foo: 858; GFX9: ; %bb.0: ; %bb 859; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 860; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 861; GFX9-NEXT: s_mov_b32 vcc_hi, 0 862; GFX9-NEXT: s_lshl_b32 s0, s2, 2 863; 
GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 864; GFX9-NEXT: s_add_u32 s0, 0x104, s0 865; GFX9-NEXT: s_waitcnt vmcnt(0) 866; GFX9-NEXT: v_mov_b32_e32 v0, 15 867; GFX9-NEXT: scratch_store_dword off, v0, s0 868; GFX9-NEXT: s_and_b32 s0, s2, 15 869; GFX9-NEXT: s_lshl_b32 s0, s0, 2 870; GFX9-NEXT: s_add_u32 s0, 0x104, s0 871; GFX9-NEXT: scratch_load_dword v0, off, s0 872; GFX9-NEXT: s_endpgm 873; 874; GFX10-LABEL: store_load_sindex_small_offset_foo: 875; GFX10: ; %bb.0: ; %bb 876; GFX10-NEXT: s_add_u32 s0, s0, s3 877; GFX10-NEXT: s_addc_u32 s1, s1, 0 878; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 879; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 880; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 881; GFX10-NEXT: s_and_b32 s0, s2, 15 882; GFX10-NEXT: s_waitcnt vmcnt(0) 883; GFX10-NEXT: v_mov_b32_e32 v0, 15 884; GFX10-NEXT: s_lshl_b32 s1, s2, 2 885; GFX10-NEXT: s_lshl_b32 s0, s0, 2 886; GFX10-NEXT: s_add_u32 s1, 0x104, s1 887; GFX10-NEXT: s_add_u32 s0, 0x104, s0 888; GFX10-NEXT: scratch_store_dword off, v0, s1 889; GFX10-NEXT: scratch_load_dword v0, off, s0 890; GFX10-NEXT: s_endpgm 891; 892; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: 893; GFX9-PAL: ; %bb.0: ; %bb 894; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 895; GFX9-PAL-NEXT: s_mov_b32 s2, s0 896; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 897; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 898; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 899; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 900; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 901; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 902; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 903; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 904; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 905; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 906; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 907; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 908; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 909; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 910; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 
911; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 912; GFX9-PAL-NEXT: s_endpgm 913; 914; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo: 915; GFX10-PAL: ; %bb.0: ; %bb 916; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 917; GFX10-PAL-NEXT: s_mov_b32 s2, s0 918; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 919; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 920; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 921; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 922; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 923; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 924; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 925; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 926; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 927; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 928; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 929; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 930; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 931; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 932; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 933; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 934; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 935; GFX10-PAL-NEXT: s_endpgm 936bb: 937 %padding = alloca [64 x i32], align 4, addrspace(5) 938 %i = alloca [32 x float], align 4, addrspace(5) 939 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 940 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 941 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 942 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 943 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 944 store volatile i32 15, i32 addrspace(5)* %i8, align 4 945 %i9 = and i32 %idx, 15 946 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 947 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 948 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 949 ret void 950} 951 952define amdgpu_kernel void 
@store_load_vindex_small_offset_kernel() { 953; GFX9-LABEL: store_load_vindex_small_offset_kernel: 954; GFX9: ; %bb.0: ; %bb 955; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 956; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 957; GFX9-NEXT: s_mov_b32 vcc_hi, 0 958; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 959; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 960; GFX9-NEXT: s_waitcnt vmcnt(0) 961; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 962; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 963; GFX9-NEXT: v_mov_b32_e32 v3, 15 964; GFX9-NEXT: scratch_store_dword v2, v3, off 965; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 966; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 967; GFX9-NEXT: s_endpgm 968; 969; GFX10-LABEL: store_load_vindex_small_offset_kernel: 970; GFX10: ; %bb.0: ; %bb 971; GFX10-NEXT: s_add_u32 s0, s0, s3 972; GFX10-NEXT: s_addc_u32 s1, s1, 0 973; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 974; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 975; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 976; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 977; GFX10-NEXT: v_mov_b32_e32 v3, 15 978; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 979; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 980; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 981; GFX10-NEXT: scratch_store_dword v2, v3, off 982; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 983; GFX10-NEXT: s_endpgm 984; 985; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: 986; GFX9-PAL: ; %bb.0: ; %bb 987; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 988; GFX9-PAL-NEXT: s_mov_b32 s2, s0 989; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 990; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 991; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 992; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 993; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 994; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 995; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 996; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 997; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 998; 
GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 999; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1000; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 1001; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1002; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 1003; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 1004; GFX9-PAL-NEXT: s_endpgm 1005; 1006; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel: 1007; GFX10-PAL: ; %bb.0: ; %bb 1008; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1009; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1010; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1011; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1012; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1013; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1014; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1015; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1016; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1017; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1018; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1019; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 1020; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1021; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1022; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 1023; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off 1024; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 1025; GFX10-PAL-NEXT: s_endpgm 1026bb: 1027 %padding = alloca [64 x i32], align 4, addrspace(5) 1028 %i = alloca [32 x float], align 4, addrspace(5) 1029 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1030 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1031 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1032 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1033 %i3 = zext i32 %i2 to i64 1034 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1035 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1036 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1037 %i9 = sub nsw i32 
31, %i2 1038 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1039 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1040 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1041 ret void 1042} 1043 1044define void @store_load_vindex_small_offset_foo(i32 %idx) { 1045; GFX9-LABEL: store_load_vindex_small_offset_foo: 1046; GFX9: ; %bb.0: ; %bb 1047; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1048; GFX9-NEXT: scratch_load_dword v1, off, s32 1049; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 1050; GFX9-NEXT: s_waitcnt vmcnt(0) 1051; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1052; GFX9-NEXT: v_mov_b32_e32 v3, 15 1053; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1054; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 1055; GFX9-NEXT: scratch_store_dword v2, v3, off 1056; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1057; GFX9-NEXT: scratch_load_dword v0, v0, off 1058; GFX9-NEXT: s_waitcnt vmcnt(0) 1059; GFX9-NEXT: s_setpc_b64 s[30:31] 1060; 1061; GFX10-LABEL: store_load_vindex_small_offset_foo: 1062; GFX10: ; %bb.0: ; %bb 1063; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1064; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1065; GFX10-NEXT: v_mov_b32_e32 v1, 15 1066; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x100 1067; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo 1068; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 1069; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1070; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1071; GFX10-NEXT: scratch_load_dword v3, off, s32 1072; GFX10-NEXT: scratch_store_dword v0, v1, off 1073; GFX10-NEXT: scratch_load_dword v0, v2, off 1074; GFX10-NEXT: s_waitcnt vmcnt(0) 1075; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1076; GFX10-NEXT: s_setpc_b64 s[30:31] 1077; 1078; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: 1079; GFX9-PAL: ; %bb.0: ; %bb 1080; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1081; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 1082; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x100 1083; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 
1084; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 1085; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1086; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1087; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 1088; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1089; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1090; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off 1091; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1092; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1093; 1094; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: 1095; GFX10-PAL: ; %bb.0: ; %bb 1096; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1097; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1098; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1099; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x100 1100; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo 1101; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 1102; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1103; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1104; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 1105; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 1106; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off 1107; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1108; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1109; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1110bb: 1111 %padding = alloca [64 x i32], align 4, addrspace(5) 1112 %i = alloca [32 x float], align 4, addrspace(5) 1113 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1114 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1115 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1116 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1117 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1118 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1119 %i9 = and i32 %idx, 15 1120 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1121 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1122 %i12 = load 
volatile i32, i32 addrspace(5)* %i11, align 4 1123 ret void 1124} 1125 1126define amdgpu_kernel void @zero_init_large_offset_kernel() { 1127; GFX9-LABEL: zero_init_large_offset_kernel: 1128; GFX9: ; %bb.0: 1129; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1130; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1131; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1132; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1133; GFX9-NEXT: s_mov_b32 s0, 0 1134; GFX9-NEXT: s_mov_b32 s1, s0 1135; GFX9-NEXT: s_mov_b32 s2, s0 1136; GFX9-NEXT: s_mov_b32 s3, s0 1137; GFX9-NEXT: s_waitcnt vmcnt(0) 1138; GFX9-NEXT: v_mov_b32_e32 v0, s0 1139; GFX9-NEXT: v_mov_b32_e32 v1, s1 1140; GFX9-NEXT: v_mov_b32_e32 v2, s2 1141; GFX9-NEXT: v_mov_b32_e32 v3, s3 1142; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1143; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1144; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1145; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1146; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1147; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1148; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1149; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1150; GFX9-NEXT: s_endpgm 1151; 1152; GFX10-LABEL: zero_init_large_offset_kernel: 1153; GFX10: ; %bb.0: 1154; GFX10-NEXT: s_add_u32 s0, s0, s3 1155; GFX10-NEXT: s_addc_u32 s1, s1, 0 1156; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1157; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1158; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 1159; GFX10-NEXT: s_mov_b32 s0, 0 1160; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1161; GFX10-NEXT: s_mov_b32 s1, s0 1162; GFX10-NEXT: s_mov_b32 s2, s0 1163; GFX10-NEXT: s_mov_b32 s3, s0 1164; GFX10-NEXT: s_waitcnt vmcnt(0) 1165; GFX10-NEXT: v_mov_b32_e32 v0, s0 1166; GFX10-NEXT: v_mov_b32_e32 v1, s1 1167; GFX10-NEXT: v_mov_b32_e32 v2, s2 1168; GFX10-NEXT: v_mov_b32_e32 v3, s3 1169; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1170; GFX10-NEXT: s_movk_i32 
vcc_lo, 0x4010 1171; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1172; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1173; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1174; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1175; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1176; GFX10-NEXT: s_endpgm 1177; 1178; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 1179; GFX9-PAL: ; %bb.0: 1180; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1181; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1182; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1183; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1184; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1185; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1186; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1187; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1188; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1189; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1190; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1191; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1192; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1193; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1194; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1195; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1196; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1197; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1198; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1199; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1200; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1201; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1202; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1203; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1204; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1205; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1206; GFX9-PAL-NEXT: s_endpgm 1207; 1208; GFX10-PAL-LABEL: zero_init_large_offset_kernel: 1209; GFX10-PAL: ; %bb.0: 1210; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1211; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1212; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1213; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1214; 
GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1215; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1216; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1217; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1218; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1219; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 1220; GFX10-PAL-NEXT: s_mov_b32 s0, 0 1221; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1222; GFX10-PAL-NEXT: s_mov_b32 s1, s0 1223; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1224; GFX10-PAL-NEXT: s_mov_b32 s3, s0 1225; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1226; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 1227; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 1228; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 1229; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 1230; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1231; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1232; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1233; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1234; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1235; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1236; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1237; GFX10-PAL-NEXT: s_endpgm 1238 %padding = alloca [4096 x i32], align 4, addrspace(5) 1239 %alloca = alloca [32 x i16], align 2, addrspace(5) 1240 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1241 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1242 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1243 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1244 ret void 1245} 1246 1247define void @zero_init_large_offset_foo() { 1248; GFX9-LABEL: zero_init_large_offset_foo: 1249; GFX9: ; %bb.0: 1250; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1251; GFX9-NEXT: scratch_load_dword v0, off, s32 1252; GFX9-NEXT: s_mov_b32 s0, 0 1253; GFX9-NEXT: s_mov_b32 s1, s0 1254; 
GFX9-NEXT: s_mov_b32 s2, s0 1255; GFX9-NEXT: s_mov_b32 s3, s0 1256; GFX9-NEXT: s_waitcnt vmcnt(0) 1257; GFX9-NEXT: v_mov_b32_e32 v0, s0 1258; GFX9-NEXT: v_mov_b32_e32 v1, s1 1259; GFX9-NEXT: v_mov_b32_e32 v2, s2 1260; GFX9-NEXT: v_mov_b32_e32 v3, s3 1261; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1262; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1263; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1264; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1265; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1266; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1267; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1268; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1269; GFX9-NEXT: s_waitcnt vmcnt(0) 1270; GFX9-NEXT: s_setpc_b64 s[30:31] 1271; 1272; GFX10-LABEL: zero_init_large_offset_foo: 1273; GFX10: ; %bb.0: 1274; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1275; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1276; GFX10-NEXT: scratch_load_dword v0, off, s32 1277; GFX10-NEXT: s_mov_b32 s0, 0 1278; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1279; GFX10-NEXT: s_mov_b32 s1, s0 1280; GFX10-NEXT: s_mov_b32 s2, s0 1281; GFX10-NEXT: s_mov_b32 s3, s0 1282; GFX10-NEXT: s_waitcnt vmcnt(0) 1283; GFX10-NEXT: v_mov_b32_e32 v0, s0 1284; GFX10-NEXT: v_mov_b32_e32 v1, s1 1285; GFX10-NEXT: v_mov_b32_e32 v2, s2 1286; GFX10-NEXT: v_mov_b32_e32 v3, s3 1287; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1288; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1289; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1290; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1291; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1292; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1293; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1294; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1295; GFX10-NEXT: s_setpc_b64 s[30:31] 1296; 1297; GFX9-PAL-LABEL: zero_init_large_offset_foo: 1298; GFX9-PAL: ; %bb.0: 1299; GFX9-PAL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 1300; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 1301; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1302; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1303; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1304; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1305; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1306; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1307; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1308; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1309; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1310; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1311; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1312; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1313; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1314; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1315; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1316; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1317; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1318; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1319; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1320; 1321; GFX10-PAL-LABEL: zero_init_large_offset_foo: 1322; GFX10-PAL: ; %bb.0: 1323; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1324; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1325; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 1326; GFX10-PAL-NEXT: s_mov_b32 s0, 0 1327; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1328; GFX10-PAL-NEXT: s_mov_b32 s1, s0 1329; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1330; GFX10-PAL-NEXT: s_mov_b32 s3, s0 1331; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1332; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 1333; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 1334; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 1335; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 1336; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1337; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1338; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1339; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1340; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1341; 
GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1342; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1343; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1344; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1345 %padding = alloca [4096 x i32], align 4, addrspace(5) 1346 %alloca = alloca [32 x i16], align 2, addrspace(5) 1347 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1348 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1349 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1350 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1351 ret void 1352} 1353 1354define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 1355; GFX9-LABEL: store_load_sindex_large_offset_kernel: 1356; GFX9: ; %bb.0: ; %bb 1357; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1358; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1359; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1360; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1361; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1362; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1363; GFX9-NEXT: s_lshl_b32 s1, s0, 2 1364; GFX9-NEXT: s_and_b32 s0, s0, 15 1365; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1366; GFX9-NEXT: s_waitcnt vmcnt(0) 1367; GFX9-NEXT: v_mov_b32_e32 v0, 15 1368; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 1369; GFX9-NEXT: scratch_store_dword off, v0, s1 1370; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 1371; GFX9-NEXT: scratch_load_dword v0, off, s0 1372; GFX9-NEXT: s_endpgm 1373; 1374; GFX10-LABEL: store_load_sindex_large_offset_kernel: 1375; GFX10: ; %bb.0: ; %bb 1376; GFX10-NEXT: s_add_u32 s2, s2, s5 1377; GFX10-NEXT: s_addc_u32 s3, s3, 0 1378; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1379; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1380; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1381; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 1382; GFX10-NEXT: s_waitcnt vmcnt(0) 
1383; GFX10-NEXT: v_mov_b32_e32 v0, 15 1384; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1385; GFX10-NEXT: s_and_b32 s1, s0, 15 1386; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1387; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1388; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 1389; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 1390; GFX10-NEXT: scratch_store_dword off, v0, s0 1391; GFX10-NEXT: scratch_load_dword v0, off, s1 1392; GFX10-NEXT: s_endpgm 1393; 1394; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 1395; GFX9-PAL: ; %bb.0: ; %bb 1396; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1397; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1398; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1399; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1400; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1401; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1402; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1403; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1404; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1405; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1406; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1407; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1408; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1409; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1410; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1411; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1412; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1413; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1414; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 1415; GFX9-PAL-NEXT: s_endpgm 1416; 1417; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel: 1418; GFX10-PAL: ; %bb.0: ; %bb 1419; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 1420; GFX10-PAL-NEXT: s_mov_b32 s4, s0 1421; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1422; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1423; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1424; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 1425; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 1426; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1427; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1428; GFX10-PAL-NEXT: 
s_load_dword s0, s[0:1], 0x24
; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0
; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1
; GFX10-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; amdgpu_ps variant of the large-offset scalar-index test. %idx is an inreg
; argument. A volatile load touches the 4096 x i32 %padding alloca that is
; declared ahead of %i, and the checks expect both accesses to %i to be
; addressed from 0x4004: a volatile store of 15 to %i[%idx], then a volatile
; load from %i[%idx & 15].
define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_large_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT: s_add_u32 s0, 0x4004, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_and_b32 s0, s2, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_add_u32 s0, 0x4004, s0
; GFX9-NEXT: scratch_load_dword v0, off, s0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-NEXT: s_and_b32 s0, s2, 15
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_lshl_b32 s1, s2, 2
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_add_u32 s1, 0x4004, s1
; GFX10-NEXT: s_add_u32 s0, 0x4004, s0
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: scratch_load_dword v0, off, s0
; GFX10-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0
; GFX9-PAL-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT: s_mov_b32 s2, s0
; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0
; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1
; GFX10-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel variant whose index comes from llvm.amdgcn.workitem.id.x. It stores 15
; to %i[tid] and loads from %i[31 - tid]. The checks expect the 0x4004 array
; base to be materialized in a VGPR and the constant 31 * 4 to show up as
; offset:124 on the load.
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX9-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT: scratch_load_dword v1, off, off offset:4
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX9-PAL-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT: s_mov_b32 s2, s0
; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4
; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX10-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Callable-function variant with the index passed as a plain i32 argument. The
; checks expect the array base to be computed as s32 + 0x4000 and each access
; address to be formed with v_lshl_add_u32 from that base.
define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v1, off, s32
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: scratch_load_dword v0, v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000
; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v3, v0, v1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2
; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2
; GFX10-NEXT: scratch_load_dword v3, off, s32
; GFX10-NEXT: scratch_store_dword v0, v1, off
; GFX10-NEXT: scratch_load_dword v0, v2, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32
; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo
; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2
; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2
; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel that stores to and loads from element 4000 of a 4096 x i32 alloca,
; i.e. byte offset 16000. The checks expect that constant to be split between
; an SGPR and the instruction's immediate offset: 0x3000 + 3712 for the gfx900
; runs and 0x3800 + 1664 for the gfx1030 runs, with a further 4 added to the
; SGPR in both cases.
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT: s_add_u32 s0, 4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: s_add_u32 s0, 4, s0
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664
; GFX10-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712
; GFX9-PAL-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT: s_mov_b32 s2, s0
; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0
; GFX10-PAL-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664
; GFX10-PAL-NEXT: s_endpgm
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Callable-function variant of the element-4000 access. The checks expect the
; same 0x3000 + 3712 / 0x3800 + 1664 split of the 16000-byte offset, with the
; SGPR part added to s32 instead of to a constant.
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: scratch_store_dword off, v0, s32
; GFX9-NEXT: s_add_u32 s0, s32, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: s_add_u32 s0, s32, s0
; GFX10-NEXT: scratch_store_dword off, v0, s32
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32
; GFX9-PAL-NEXT: s_add_u32 s0, s32, s0
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT: s_add_u32 s0, s32, s0
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; The element index is %sidx (kernel argument) + workitem id + 256. The checks
; expect the constant part, 256 * 4 = 1024 bytes, to appear as offset:1024 on
; both the store and the load, with the variable part computed in a VGPR.
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024
; GFX10-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT: s_mov_b32 s4, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024
; GFX9-PAL-NEXT: s_endpgm
;
; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT: s_mov_b32 s4, s0
; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024
; GFX10-PAL-NEXT: s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}

; Volatile i64 store and load through an addrspace(5) pointer argument with
; align 8. The checks expect one scratch_store_dwordx2 and one
; scratch_load_dwordx2.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_aligned:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

; Same access as store_load_i64_aligned but with align 1. The checks still
; expect a single scratch_store_dwordx2 and a single scratch_load_dwordx2,
; i.e. the access is not split into narrower pieces.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

; Volatile <3 x i32> store and load with align 1. The checks expect one
; scratch_store_dwordx3 and one scratch_load_dwordx3.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: v_mov_b32_e32 v2, 2
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 1
; GFX10-NEXT: v_mov_b32_e32 v2, 2
; GFX10-NEXT: v_mov_b32_e32 v3, 3
; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Volatile <4 x i32> store and load with align 1. The checks expect one
; scratch_store_dwordx4 and one scratch_load_dwordx4.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: v_mov_b32_e32 v2, 2
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: v_mov_b32_e32 v4, 4
; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 1
; GFX10-NEXT: v_mov_b32_e32 v2, 2
; GFX10-NEXT: v_mov_b32_e32 v3, 3
; GFX10-NEXT: v_mov_b32_e32 v4, 4
; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}

declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()