; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
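; Test lowering of i16 and i16-vector loads from the constant address space
; (addrspace 4), including zero- and sign-extension to i32 and i64, for GCN
; (SI, VI, and HSA) and R600 (EG) targets.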

; FUNC-LABEL: {{^}}constant_load_i16:
; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
; GCN-HSA: flat_load_ushort

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
entry:
  %ld = load i16, i16 addrspace(4)* %in
  store i16 %ld, i16 addrspace(1)* %out
  ret void
}
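
; SMEM has no sub-dword loads, which is why the lone i16 load above goes
; through 16-bit buffer/flat loads, while the dword-sized and larger vectors
; below are fetched with scalar s_load instructions.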

; FUNC-LABEL: {{^}}constant_load_v2i16:
; GCN: s_load_dword s

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) {
entry:
  %ld = load <2 x i16>, <2 x i16> addrspace(4)* %in
  store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v3i16:
; GCN: s_load_dwordx2 s

; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v4i16:
; GCN: s_load_dwordx2

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) {
entry:
  %ld = load <4 x i16>, <4 x i16> addrspace(4)* %in
  store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v8i16:
; GCN: s_load_dwordx4

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) {
entry:
  %ld = load <8 x i16>, <8 x i16> addrspace(4)* %in
  store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_load_v16i16:
; GCN: s_load_dwordx8

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) {
entry:
  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %in
  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
  ret void
}
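
; Note: with only 2-byte alignment the vector below cannot be fetched with
; scalar s_load instructions (SMEM requires at least dword alignment), so it
; is lowered to VMEM (flat/buffer) loads instead.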

; FUNC-LABEL: {{^}}constant_load_v16i16_align2:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_store_dwordx4
; GCN-HSA: flat_store_dwordx4
define amdgpu_kernel void @constant_load_v16i16_align2(<16 x i16> addrspace(4)* %ptr0) #0 {
entry:
  %ld = load <16 x i16>, <16 x i16> addrspace(4)* %ptr0, align 2
  store <16 x i16> %ld, <16 x i16> addrspace(1)* undef, align 32
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
; GCN-NOHSA: buffer_load_ushort
; GCN-NOHSA: buffer_store_dword

; GCN-HSA: flat_load_ushort
; GCN-HSA: flat_store_dword

; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = zext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_i16_to_i32:
; GCN-NOHSA: buffer_load_sshort
; GCN-NOHSA: buffer_store_dword

; GCN-HSA: flat_load_sshort
; GCN-HSA: flat_store_dword

; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 16
define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %a to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i32:
; GCN-NOHSA: buffer_load_ushort
; GCN-HSA: flat_load_ushort

; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = zext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i32:
; GCN-NOHSA: buffer_load_sshort
; GCN-HSA: flat_load_sshort

; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
; EG: 16
define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = sext <1 x i16> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i32:
; GCN: s_load_dword s
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff{{$}}
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16

; v2i16 is naturally 4 byte aligned
; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
; EG: 16
; EG: 16
define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = zext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i32:
; GCN: s_load_dword s
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v2i16 is naturally 4 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal
; TODO: We should use ASHR instead of LSHR + BFE
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = sext <2 x i16> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v3i16_to_v3i32:
; GCN: s_load_dwordx2

; v3i16 is naturally 8 byte aligned
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; EG: CF_END
; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 16
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  %ext = zext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v3i16_to_v3i32:
; GCN: s_load_dwordx2

; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; v3i16 is naturally 8 byte aligned
; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1
; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
entry:
  %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
  %ext = sext <3 x i16> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i32:
; GCN: s_load_dwordx2
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v4i16 is naturally 8 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}
; EG: VTX_READ_64 [[LD:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: This should use LD, but for some reason there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*\.[XYZW]}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*\.[XYZW]}}, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: AND_INT {{[* ]*}}[[ST]].X, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = zext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i32:
; GCN: s_load_dwordx2
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v4i16 is naturally 8 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
; EG: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: This should use LD, but for some reason there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal
; TODO: We should use ASHR instead of LSHR + BFE
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = sext <4 x i16> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i32:
; GCN: s_load_dwordx4
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v8i16 is naturally 16 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
; TODO: These should use LSHR instead of BFE_UINT
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal
; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal
; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 65535
; EG-DAG: 65535
; EG-DAG: 65535
; EG-DAG: 65535
define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = zext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i32:
; GCN: s_load_dwordx4
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v8i16 is naturally 16 byte aligned
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
; TODO: 4 of these should use ASHR instead of LSHR + BFE_INT
; TODO: This should use DST, but for some reason there are redundant MOVs
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal
; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = sext <8 x i16> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i32:
; GCN: s_load_dwordx8
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32

; v16i16 is naturally 32 byte aligned
; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1
define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i32:
; GCN: s_load_dwordx8
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; v16i16 is naturally 32 byte aligned
; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1
define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = zext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32:
; GCN: s_load_dwordx16
; GCN-DAG: s_ashr_i32
; GCN-DAG: s_sext_i32_i16

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = sext <32 x i16> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i32:
; GCN: s_load_dwordx16
; GCN: s_load_dwordx16

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 64, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
  %ext = zext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i32:

; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 64, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
  %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
  %ext = sext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
  ret void
}
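
; For extension to i64, the 16-bit load provides the low dword; zext
; materializes the high dword as 0 (v_mov_b32), while sext fills it with the
; sign bit (v_ashrrev_i32 by 31).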

; FUNC-LABEL: {{^}}constant_zextload_i16_to_i64:
; GCN-NOHSA-DAG: buffer_load_ushort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_ushort v[[LO:[0-9]+]],
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = zext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_i16_to_i64:
; FIXME: Need to optimize this sequence to avoid an extra bfe:
;  t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
;    t31: i64 = any_extend t28
;  t33: i64 = sign_extend_inreg t31, ValueType:ch:i16

; GCN-NOHSA-SI-DAG: buffer_load_sshort v[[LO:[0-9]+]],
; GCN-HSA-DAG: flat_load_sshort v[[LO:[0-9]+]],
; GCN-NOHSA-VI-DAG: buffer_load_ushort v[[ULO:[0-9]+]],
; GCN-NOHSA-VI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]

; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EG: 31
define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
  %a = load i16, i16 addrspace(4)* %in
  %ext = sext i16 %a to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v1i16_to_v1i64:

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: MOV {{.*}}, 0.0
define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = zext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v1i16_to_v1i64:

; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
; TODO: These could be expanded earlier using ASHR 15
; EG: 31
define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
  %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
  %ext = sext <1 x i16> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = zext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:

; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
  %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
  %ext = sext <2 x i16> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = zext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:

; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
  %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
  %ext = sext <4 x i16> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:

; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
  %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = zext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
  %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
  %ext = sext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = zext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i64:

; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
  %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
  %ext = sext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
  ret void
}

; These are disabled because they trigger undefined register machine verifier
; errors.

; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
;   %ext = zext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
;   ret void
; }

; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
;   %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
;   %ext = sext <64 x i16> %load to <64 x i64>
;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
;   ret void
; }

attributes #0 = { nounwind }