; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s

; Overview: every function in this file builds a two-element vector of 16-bit
; values out of two separately produced scalars, one inserted into lane 0 and
; one into lane 1. The expected assembly pins down which load instruction is
; used for each lane and where the waits between them are placed. The file is
; compiled twice, once with the default scratch access and once with
; -amdgpu-enable-flat-scratch; functions that touch private memory therefore
; carry two sets of expectations, the others share one.
;
; The expectation lines are autogenerated. Do not edit them by hand; rerun
; utils/update_llc_test_checks.py after changing any IR.

; Private memory, constant addresses: lane 0 is loaded from null + one half
; element (byte offset 2) and lane 1 from null. Both configurations expect a
; plain 16-bit load into v0, a wait, and then a d16_hi load into the same v0.
define <2 x half> @chain_hi_to_lo_private() {
; GFX900-LABEL: chain_hi_to_lo_private:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_private:
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s0, 2
; FLATSCR-NEXT: scratch_load_ushort v0, off, s0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
  %load_lo = load half, half addrspace(5)* %gep_lo
  %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0
  %load_hi = load half, half addrspace(5)* %gep_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Private memory, two unrelated pointer arguments: lane 0 comes from %base_lo
; and lane 1 from %base_hi. The expected sequence is the same as above, but
; addressed through v0 and v1 instead of constant offsets.
define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
; GFX900-LABEL: chain_hi_to_lo_private_different_bases:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases:
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_ushort v0, v0, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
bb:
  %load_lo = load half, half addrspace(5)* %base_lo
  %load_hi = load half, half addrspace(5)* %base_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Lane 0 is the result of an fadd rather than a load; only lane 1 is loaded
; (from private memory). The expected output performs the add into v1, loads
; the high half into that same register with a d16_hi load, and then copies
; v1 to the return register v0.
define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
; GFX900-LABEL: chain_hi_to_lo_arithmatic:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_arithmatic:
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: v_add_f16_e32 v1, 1.0, v1
; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
bb:
  %arith_lo = fadd half %in, 1.0
  %load_hi = load half, half addrspace(5)* %base

  %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Same shape as the first private test, but in address space 3 with constant
; addresses. Expected: ds_read_u16 at offset 2 into v0, a wait, then
; ds_read_u16_d16_hi into the same v0.
define <2 x half> @chain_hi_to_lo_group() {
; GCN-LABEL: chain_hi_to_lo_group:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: ds_read_u16 v0, v1 offset:2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_u16_d16_hi v0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
  %load_lo = load half, half addrspace(3)* %gep_lo
  %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0
  %load_hi = load half, half addrspace(3)* %gep_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Address space 3 with two unrelated pointer arguments for the two lanes.
define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
; GCN-LABEL: chain_hi_to_lo_group_different_bases:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_read_u16 v0, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_u16_d16_hi v0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %load_lo = load half, half addrspace(3)* %base_lo
  %load_hi = load half, half addrspace(3)* %base_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Address space 1 with constant addresses. The 64-bit addresses 2 and 0 are
; materialised in register pairs before each load; the loads themselves are
; global_load_ushort followed by global_load_short_d16_hi into the same v0.
define <2 x half> @chain_hi_to_lo_global() {
; GCN-LABEL: chain_hi_to_lo_global:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_ushort v0, v[0:1], off
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
  %load_lo = load half, half addrspace(1)* %gep_lo
  %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0
  %load_hi = load half, half addrspace(1)* %gep_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Address space 1 with two unrelated pointer arguments for the two lanes.
define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
; GCN-LABEL: chain_hi_to_lo_global_different_bases:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_ushort v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %load_lo = load half, half addrspace(1)* %base_lo
  %load_hi = load half, half addrspace(1)* %base_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Default address space (no addrspace qualifier) with constant addresses.
; The expected waits cover both vmcnt and lgkmcnt for the flat loads.
define <2 x half> @chain_hi_to_lo_flat() {
; GCN-LABEL: chain_hi_to_lo_flat:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds half, half* null, i64 1
  %load_lo = load half, half* %gep_lo
  %gep_hi = getelementptr inbounds half, half* null, i64 0
  %load_hi = load half, half* %gep_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Default address space with two unrelated pointer arguments for the two lanes.
define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) {
; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %load_lo = load half, half* %base_lo
  %load_hi = load half, half* %base_hi

  %temp = insertelement <2 x half> undef, half %load_lo, i32 0
  %result = insertelement <2 x half> %temp, half %load_hi, i32 1

  ret <2 x half> %result
}

; Make sure we don't lose any of the private stores.
;
; The kernel copies three i16 values from %in into a three-element private
; array using volatile stores, then reads the array back as two overlapping
; <2 x i16> vectors (elements 0-1 and elements 1-2) and writes them to %out[0]
; and %out[1]. The expected output keeps all three 16-bit scratch stores (at
; offsets 4, 6 and 8) ahead of the scratch loads that read them back.
;
; NOTE(review): %loc.0.sroa_cast1 and %loc.0.sroa_cast2 have no users in this
; function, and attribute group #0 is not defined in the part of the file
; visible here - confirm it is declared elsewhere in the file.
define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
; GFX900-LABEL: vload2_private:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: s_add_u32 s0, s0, s9
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5]
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4
; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
; GFX900-NEXT: s_waitcnt vmcnt(1)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, v3
; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX900-NEXT: s_endpgm
;
; FLATSCR-LABEL: vload2_private:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6
; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: scratch_load_ushort v0, off, vcc_hi offset:4
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: scratch_load_ushort v3, off, vcc_hi offset:6
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(1)
; FLATSCR-NEXT: v_and_b32_e32 v0, 0xffff, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v1, v3
; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, vcc_hi offset:8
; FLATSCR-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; FLATSCR-NEXT: s_endpgm
entry:
  %loc = alloca [3 x i16], align 2, addrspace(5)
  %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
  %tmp = load i16, i16 addrspace(1)* %in, align 2
  %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
  store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
  %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
  %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
  %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
  store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
  %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
  %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
  %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
  store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
  %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
  %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
  store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
  %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
  %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
  %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
  %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
  store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
  %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
  ret void
}

; There is another instruction between the misordered instruction and
; the value dependent load, so a simple operand check is insufficient.
; Address space 3, non-volatile loads from %ptr (lane 1) and %ptr + 1 element
; (lane 0). Lane 1 is inserted first and the whole vector goes through an add
; of 12 before lane 0 is inserted. The expected output loads the high half
; with a d16_hi read, applies the packed op, and only then loads the low half
; with a d16 read into the same register.
define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
; GCN-LABEL: chain_hi_to_lo_group_other_dep:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_read_u16_d16_hi v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; GCN-NEXT: ds_read_u16_d16 v1, v0 offset:2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
  %load_lo = load i16, i16 addrspace(3)* %gep_lo
  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
  %load_hi = load i16, i16 addrspace(3)* %gep_hi
  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}

; The volatile operations aren't put on the same chain
;
; Same IR as the previous function except that both loads are volatile. The
; expected output keeps the low-half load as a plain ds_read_u16 issued first
; and merges it into the result with v_bfi_b32 instead of using a d16 low load.
define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_read_u16 v1, v0 offset:2
; GCN-NEXT: ds_read_u16_d16_hi v0, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0xffff
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GCN-NEXT: v_bfi_b32 v0, v2, v1, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
  %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
  %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}

; Private-memory variant of chain_hi_to_lo_group_other_dep (non-volatile
; loads). Both configurations expect: d16_hi load, packed op, then a d16 load
; of the low half at offset 2 into the same register.
define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
; GFX900-LABEL: chain_hi_to_lo_private_other_dep:
; GFX900: ; %bb.0: ; %bb
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
  %load_lo = load i16, i16 addrspace(5)* %gep_lo
  %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
  %load_hi = load i16, i16 addrspace(5)* %gep_hi
  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}

; Address space 1 variant. Note that, unlike the private variant above, both
; loads here are volatile; the expected output matches the multi-chain shape
; (plain 16-bit load for the low half, merged with v_bfi_b32).
define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
; GCN-LABEL: chain_hi_to_lo_global_other_dep:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2
; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
  %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
  %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
  %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}

; Address space 0 variant, also with volatile loads; same expected shape as
; the address space 1 variant, using flat loads.
define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
; GCN-LABEL: chain_hi_to_lo_flat_other_dep:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2
; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
  %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
  %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
  %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
  %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}

; A store of 123 through %may.alias sits between the high-half load and the
; low-half load. The expected output uses no d16 loads at all: two plain
; ds_read_u16 with the ds_write_b16 between them, combined afterwards with
; v_and_b32 and v_lshl_or_b32.
define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
; GCN-LABEL: chain_hi_to_lo_group_may_alias_store:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, 0x7b
; GCN-NEXT: ds_read_u16 v2, v0
; GCN-NEXT: ds_write_b16 v1, v3
; GCN-NEXT: ds_read_u16 v0, v0 offset:2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
  %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
  %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
  %load_hi = load i16, i16 addrspace(3)* %gep_hi
  store i16 123, i16 addrspace(3)* %may.alias
  %load_lo = load i16, i16 addrspace(3)* %gep_lo

  %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
  %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
  ret <2 x i16> %result
}