; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s

; Each test loads a narrow value and inserts it into element 1 of a
; two-element 16-bit vector. The gfx900 invocation is checked for the
; *_d16_hi load forms; the gfx906 and fiji invocations share a prefix that
; is checked for the plain load forms instead.

; Element 0 is left undef, so only the loaded element is defined.
; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
entry:
  %hi = load i16, i16 addrspace(3)* %in
  %vec = insertelement <2 x i16> undef, i16 %hi, i32 1
  ret <2 x i16> %vec
}

; Element 0 comes from the %reg argument; the vector is returned, and the
; gfx900 checks expect a copy into v0 after the read.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %hi = load i16, i16 addrspace(3)* %in
  %vec.lo = insertelement <2 x i16> undef, i16 %reg, i32 0
  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
  ret <2 x i16> %vec
}

; Show that we get reasonable regalloc without physreg constraints.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  ; The vector is stored through an undef global pointer instead of being
  ; returned.
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Low element is constant zero: the gfx900 checks expect the destination to
; be zeroed with v_mov_b32 before the d16_hi read.
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
  ret <2 x i16> %build
}

; Scalar form of the same bit pattern (zext, then shl by 16). The checks
; expect a plain ds_read_u16 followed by v_lshlrev_b32 on every target.
; FIXME: Remove m0 initialization
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %zext = zext i16 %load to i32
  %shift = shl i32 %zext, 16
  ret i32 %shift
}

; half-typed variant of load_local_hi_v2i16_reglo_vreg.
; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load half, half addrspace(3)* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; i8 load zero-extended to i16 before insertion; the gfx900 checks expect
; the u8 d16_hi read.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; i8 load sign-extended to i16 before insertion; the gfx900 checks expect
; the i8 d16_hi read.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; addrspace(1) ("global" in the test names). Element index -2047 of a
; 2-byte type corresponds to the expected immediate offset:-4094.
; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
  %load = load i16, i16 addrspace(1)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half-typed variant of load_global_hi_v2i16_reglo_vreg.
; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
  %load = load half, half addrspace(1)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Zero-extended i8 load at byte index -4095, matching the expected
; offset:-4095.
; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Sign-extended counterpart of the previous test.
; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Pointers in the default address space ("flat" in the test names). The
; invocations without d16_hi are additionally checked for how the halves
; are merged: a shift plus an sdwa or on fiji, v_lshl_or_b32 on gfx906.
; The label check below is anchored with {{^}} like the other label checks
; in this file, so it matches only at the start of a line.
; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half-typed variant of load_flat_hi_v2i16_reglo_vreg.
; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
entry:
  %load = load half, half* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Zero-extended i8 flat load.
; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Sign-extended i8 flat load.
; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; addrspace(5) ("private" in the test names) through a byval pointer
; argument.
; NOTE(review): element 2045 is byte 4090 of the object while the checks
; expect offset:4094, which suggests the byval object starts 4 bytes into
; the frame - confirm against the calling convention.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half-typed variant of load_private_hi_v2i16_reglo_vreg.
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
  %load = load half, half addrspace(5)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Volatile load from the constant private address 4094 (inttoptr); %in is
; not used by the body. The checks expect offset:4094 with s4.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}}
; GFX900: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half-typed constant-address variant; here %in is a plain (non-byval)
; pointer argument.
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Zero-extended i8 load through a byval pointer.
; NOTE(review): byte index 4091 versus the expected offset:4095 shows the
; same 4-byte difference as the i16 byval test - confirm.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Sign-extended counterpart of the previous test.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Constant-address volatile i8 load, zero-extended.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Constant-address volatile i8 load, sign-extended.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; As the zero-extended constant-address test, but the extended value is
; bitcast to half and inserted into a <2 x half>.
; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %bc.ext = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; addrspace(4) ("constant" in the test names). The gfx900 checks expect the
; same global_load d16_hi form as for addrspace(1).
; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %load = load i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; half-typed variant of load_constant_hi_v2i16_reglo_vreg. The label check
; is anchored with {{^}} and ends with the colon, like the other label
; checks in this file, so it cannot match a directive line or a longer
; symbol name that merely contains this one.
; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Local object gives known offset, so requires converting from offen
; to offset variant.

; NOTE(review): obj0 is 40 bytes and element 2025 of obj1 is byte 4050; the
; expected offset:4094 implies obj1 sits at frame offset 44 - confirm.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; The volatile store keeps obj0 live and is the buffer_store_dword the
  ; checks expect immediately before the load.
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Sign-extended i8 variant: byte index 4051 with expected offset:4095.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Zero-extended i8 variant: byte index 4051 with expected offset:4095.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900: buffer_store_dword
; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_local_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 {
entry:
  ; Both halves come from separate volatile loads of adjacent elements; the
  ; low element is read first and the second read is expected to fill the
  ; high half of the same register.
  %in.hi = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
  %lo = load volatile i16, i16 addrspace(3)* %in
  %hi = load volatile i16, i16 addrspace(3)* %in.hi
  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
  ret <2 x i16> %vec
}

; addrspace(1) version of the split load.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_global_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
entry:
  %in.hi = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
  %lo = load volatile i16, i16 addrspace(1)* %in
  %hi = load volatile i16, i16 addrspace(1)* %in.hi
  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
  ret <2 x i16> %vec
}

; Default address space version of the split load.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_flat_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
entry:
  %in.hi = getelementptr inbounds i16, i16* %in, i64 1
  %lo = load volatile i16, i16* %in
  %hi = load volatile i16, i16* %in.hi
  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
  ret <2 x i16> %vec
}

; addrspace(4) version of the split load; the checks expect global_load
; instructions, as in the addrspace(1) test.
; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_constant_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
entry:
  %in.hi = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
  %lo = load volatile i16, i16 addrspace(4)* %in
  %hi = load volatile i16, i16 addrspace(4)* %in.hi
  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
  ret <2 x i16> %vec
}

; addrspace(5) version through a byval pointer; the checks expect the two
; reads at offset:4 and offset:6 and no register copy before the return.
; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
; GFX900: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
entry:
  %in.hi = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
  %lo = load volatile i16, i16 addrspace(5)* %in
  %hi = load volatile i16, i16 addrspace(5)* %in.hi
  %vec.lo = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec = insertelement <2 x i16> %vec.lo, i16 %hi, i32 1
  ret <2 x i16> %vec
}

; Attribute group shared by every function in this file.
attributes #0 = { nounwind }