1; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s 2; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s 3; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-GFX9,FUNC %s 4; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s 5; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s 6 7; FUNC-LABEL: {{^}}i8_arg: 8; HSA-GFX9: kernarg_segment_byte_size = 12 9; HSA-GFX9: kernarg_segment_alignment = 4 10 11; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb 12; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c 13; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff 14 15; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 16; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff 17 18 19; EGCM: VTX_READ_8{{.*}} #3 20; EGCM: KC0[2].Y 21define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { 22 %ext = zext i8 %in to i32 23 store i32 %ext, i32 addrspace(1)* %out, align 4 24 ret void 25} 26 27; FUNC-LABEL: {{^}}i8_zext_arg: 28; HSA-GFX9: kernarg_segment_byte_size = 12 29; HSA-GFX9: kernarg_segment_alignment = 4 30; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb 31; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c 32 33; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 34; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff 35 36 37; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, 38; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 39; EG-NEXT: 
8(1.121039e-44), 2(2.802597e-45) 40 41; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, 42; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 43; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 44; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 45define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { 46 %ext = zext i8 %in to i32 47 store i32 %ext, i32 addrspace(1)* %out, align 4 48 ret void 49} 50 51; FUNC-LABEL: {{^}}i8_sext_arg: 52; HSA-GFX9: kernarg_segment_byte_size = 12 53; HSA-GFX9: kernarg_segment_alignment = 4 54; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb 55 56; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c 57 58; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 59; HSA-GFX9: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]] 60; HSA-GFX9: global_store_dword 61 62 63; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, 64; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 65; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 66 67; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, 68; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 69; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 70; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 71define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { 72 %ext = sext i8 %in to i32 73 store i32 %ext, i32 addrspace(1)* %out, align 4 74 ret void 75} 76 77; FUNC-LABEL: {{^}}i16_arg: 78; HSA-GFX9: kernarg_segment_byte_size = 12 79; HSA-GFX9: kernarg_segment_alignment = 4 80 81; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb 82 83; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c 84; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} 85 86; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 87; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} 88; HSA-GFX9: global_store_dword 89 90; EGCM: VTX_READ_16 91; EGCM: KC0[2].Y 92define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { 93 %ext = zext i16 %in to i32 94 store i32 %ext, i32 
addrspace(1)* %out, align 4 95 ret void 96} 97 98; FUNC-LABEL: {{^}}i16_zext_arg: 99; HSA-GFX9: kernarg_segment_byte_size = 12 100; HSA-GFX9: kernarg_segment_alignment = 4 101 102; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb 103; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c 104 105; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 106; HSA-GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} 107; HSA-GFX9: global_store_dword 108 109; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, 110; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 111; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 112 113; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, 114; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 115; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 116; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 117define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { 118 %ext = zext i16 %in to i32 119 store i32 %ext, i32 addrspace(1)* %out, align 4 120 ret void 121} 122 123; FUNC-LABEL: {{^}}i16_sext_arg: 124; HSA-GFX9: kernarg_segment_byte_size = 12 125; HSA-GFX9: kernarg_segment_alignment = 4 126 127; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb 128; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c 129 130 131; HSA-GFX9: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 132; HSA-GFX9: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]] 133; HSA-GFX9: global_store_dword 134 135; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, 136; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 137; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 138 139; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, 140; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 141; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 142; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 143define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { 144 %ext = sext i16 %in to i32 145 store i32 %ext, i32 addrspace(1)* %out, align 4 146 ret void 147} 148 149; FUNC-LABEL: {{^}}i32_arg: 150; HSA-GFX9: 
kernarg_segment_byte_size = 12 151; HSA-GFX9: kernarg_segment_alignment = 4 152 153; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z 154; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb 155; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c 156; HSA-GFX9: s_load_dword s{{[0-9]}}, s[4:5], 0x8 157define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { 158entry: 159 store i32 %in, i32 addrspace(1)* %out, align 4 160 ret void 161} 162 163; FUNC-LABEL: {{^}}f32_arg: 164; HSA-GFX9: kernarg_segment_byte_size = 12 165; HSA-GFX9: kernarg_segment_alignment = 4 166; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z 167; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb 168; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c 169; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 170define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { 171entry: 172 store float %in, float addrspace(1)* %out, align 4 173 ret void 174} 175 176; FUNC-LABEL: {{^}}v2i8_arg: 177; HSA-GFX9: kernarg_segment_byte_size = 12 178; HSA-GFX9: kernarg_segment_alignment = 4 179 180; EGCM: VTX_READ_8 181; EGCM: VTX_READ_8 182 183; GCN: s_load_dword s 184; GCN-NOT: {{buffer|flat|global}}_load_ 185define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { 186entry: 187 store <2 x i8> %in, <2 x i8> addrspace(1)* %out 188 ret void 189} 190 191; FUNC-LABEL: {{^}}v2i16_arg: 192; HSA-GFX9: kernarg_segment_byte_size = 12 193; HSA-GFX9: kernarg_segment_alignment = 4 194 195; EGCM: VTX_READ_16 196; EGCM: VTX_READ_16 197 198; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb 199; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c 200; HSA-GFX9: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 201define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { 202entry: 203 store <2 x i16> %in, <2 x i16> addrspace(1)* %out 204 ret void 205} 206 207; FUNC-LABEL: {{^}}v2i32_arg: 208; HSA-GFX9: kernarg_segment_byte_size = 16 209; HSA-GFX9: 
kernarg_segment_alignment = 4 210 211; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X 212; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W 213; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb 214; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c 215; HSA-GFX9: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 216define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { 217entry: 218 store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 219 ret void 220} 221 222; FUNC-LABEL: {{^}}v2f32_arg: 223; HSA-GFX9: kernarg_segment_byte_size = 16 224; HSA-GFX9: kernarg_segment_alignment = 4 225 226; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X 227; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W 228; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb 229; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c 230; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8 231define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { 232entry: 233 store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 234 ret void 235} 236 237; FUNC-LABEL: {{^}}v3i8_arg: 238; HSA-GFX9: kernarg_segment_byte_size = 12 239; HSA-GFX9: kernarg_segment_alignment = 4 240 241; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 242; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 243; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 244 245; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb 246 247; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c 248; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 249define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { 250entry: 251 store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 252 ret void 253} 254 255; FUNC-LABEL: {{^}}v3i16_arg: 256; HSA-GFX9: kernarg_segment_byte_size = 16 257; HSA-GFX9: kernarg_segment_alignment = 4 258 259; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, 
T{{[0-9]}}.X, 44 260; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 261; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 262 263; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb 264 265; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 266; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c 267define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { 268entry: 269 store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 270 ret void 271} 272 273; FUNC-LABEL: {{^}}v3i32_arg: 274; HSA-GFX9: kernarg_segment_byte_size = 32 275; HSA-GFX9: kernarg_segment_alignment = 4 276; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y 277; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z 278; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W 279; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd 280; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 281; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 282define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { 283entry: 284 store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 285 ret void 286} 287 288; FUNC-LABEL: {{^}}v3f32_arg: 289; HSA-GFX9: kernarg_segment_byte_size = 32 290; HSA-GFX9: kernarg_segment_alignment = 4 291; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y 292; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z 293; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W 294; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd 295; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 296; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 297define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { 298entry: 299 store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 300 ret void 301} 302 303; FUNC-LABEL: {{^}}v4i8_arg: 304; HSA-GFX9: kernarg_segment_byte_size = 12 305; HSA-GFX9: kernarg_segment_alignment 
= 4 306; EGCM: VTX_READ_8 307; EGCM: VTX_READ_8 308; EGCM: VTX_READ_8 309; EGCM: VTX_READ_8 310 311; GCN-DAG: s_load_dwordx2 s 312; GCN-DAG: s_load_dword s 313define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { 314entry: 315 store <4 x i8> %in, <4 x i8> addrspace(1)* %out 316 ret void 317} 318 319; FUNC-LABEL: {{^}}v4i16_arg: 320; HSA-GFX9: kernarg_segment_byte_size = 16 321; HSA-GFX9: kernarg_segment_alignment = 4 322; EGCM: VTX_READ_16 323; EGCM: VTX_READ_16 324; EGCM: VTX_READ_16 325; EGCM: VTX_READ_16 326 327; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb 328; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9 329 330; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 331; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c 332 333 334; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24 335; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c 336 337; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 338; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 339define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { 340entry: 341 store <4 x i16> %in, <4 x i16> addrspace(1)* %out 342 ret void 343} 344 345; FUNC-LABEL: {{^}}v4i32_arg: 346; HSA-GFX9: kernarg_segment_byte_size = 32 347; HSA-GFX9: kernarg_segment_alignment = 4 348; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y 349; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z 350; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W 351; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X 352 353; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd 354; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 355; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 356define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { 357entry: 358 store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 359 ret void 360} 361 362; FUNC-LABEL: 
{{^}}v4f32_arg: 363; HSA-GFX9: kernarg_segment_byte_size = 32 364; HSA-GFX9: kernarg_segment_alignment = 4 365; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y 366; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z 367; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W 368; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X 369; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd 370; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 371; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 372define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { 373entry: 374 store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 375 ret void 376} 377 378; FUNC-LABEL: {{^}}v5i8_arg: 379; HSA-GFX9: kernarg_segment_byte_size = 16 380; HSA-GFX9: kernarg_segment_alignment = 4 381 382; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 383; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 384; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 385 386; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb 387 388; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c 389; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 390define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind { 391entry: 392 store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4 393 ret void 394} 395 396; FUNC-LABEL: {{^}}v5i16_arg: 397; HSA-GFX9: kernarg_segment_byte_size = 32 398; HSA-GFX9: kernarg_segment_alignment = 4 399 400; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58 401; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58 402; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58 403 404; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd 405 406; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 407; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c 408define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* 
nocapture %out, <5 x i16> %in) nounwind { 409entry: 410 store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4 411 ret void 412} 413 414; FUNC-LABEL: {{^}}v5i32_arg: 415; HSA-GFX9: kernarg_segment_byte_size = 64 416; HSA-GFX9: kernarg_segment_alignment = 5 417; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y 418; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z 419; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W 420; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 421; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 422; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 423define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind { 424entry: 425 store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4 426 ret void 427} 428 429; FUNC-LABEL: {{^}}v5f32_arg: 430; HSA-GFX9: kernarg_segment_byte_size = 64 431; HSA-GFX9: kernarg_segment_alignment = 5 432; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y 433; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z 434; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W 435; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 436; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 437; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 438define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind { 439entry: 440 store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4 441 ret void 442} 443 444; FUNC-LABEL: {{^}}v5i64_arg: 445; HSA-GFX9: kernarg_segment_byte_size = 128 446; HSA-GFX9: kernarg_segment_alignment = 6 447; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y 448; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z 449; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W 450; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X 451; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y 452; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z 453; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W 454; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X 455; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y 456; EGCM-DAG: T{{[0-9]\.[XYZW]}}, 
KC0[8].Z 457; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 458; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21 459; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 460; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84 461; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 462; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60 463define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind { 464entry: 465 store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8 466 ret void 467} 468 469; FUNC-LABEL: {{^}}v5f64_arg: 470; HSA-GFX9: kernarg_segment_byte_size = 128 471; HSA-GFX9: kernarg_segment_alignment = 6 472; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y 473; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z 474; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W 475; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X 476; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y 477; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z 478; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W 479; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X 480; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y 481; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z 482; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 483; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21 484; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 485; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84 486; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 487; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60 488define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind { 489entry: 490 store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8 491 ret void 492} 493 494; FIXME: Lots of unpack and re-pack junk on VI 495; FUNC-LABEL: {{^}}v8i8_arg: 496; HSA-GFX9: kernarg_segment_byte_size = 16 497; HSA-GFX9: kernarg_segment_alignment = 4 498; EGCM: VTX_READ_8 499; 
EGCM: VTX_READ_8 500; EGCM: VTX_READ_8 501; EGCM: VTX_READ_8 502; EGCM: VTX_READ_8 503; EGCM: VTX_READ_8 504; EGCM: VTX_READ_8 505; EGCM: VTX_READ_8 506 507; SI-NOT: {{buffer|flat|global}}_load 508; SI: s_load_dwordx2 s 509; SI-NEXT: s_load_dwordx2 s 510; SI-NOT: {{buffer|flat|global}}_load 511 512; VI: s_load_dwordx2 s 513; VI-NEXT: s_load_dwordx2 s 514; VI-NOT: lshl 515; VI-NOT: _or 516; VI-NOT: _sdwa 517define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { 518entry: 519 store <8 x i8> %in, <8 x i8> addrspace(1)* %out 520 ret void 521} 522 523; FUNC-LABEL: {{^}}v8i16_arg: 524; HSA-GFX9: kernarg_segment_byte_size = 32 525; HSA-GFX9: kernarg_segment_alignment = 4 526; EGCM: VTX_READ_16 527; EGCM: VTX_READ_16 528; EGCM: VTX_READ_16 529; EGCM: VTX_READ_16 530; EGCM: VTX_READ_16 531; EGCM: VTX_READ_16 532; EGCM: VTX_READ_16 533; EGCM: VTX_READ_16 534 535; SI: s_load_dwordx4 536; SI-NEXT: s_load_dwordx2 537; SI-NOT: {{buffer|flat|global}}_load 538 539 540; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34 541 542; HSA-GFX9: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 543define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { 544entry: 545 store <8 x i16> %in, <8 x i16> addrspace(1)* %out 546 ret void 547} 548 549; FUNC-LABEL: {{^}}v8i32_arg: 550; HSA-GFX9: kernarg_segment_byte_size = 64 551; HSA-GFX9: kernarg_segment_alignment = 5 552; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y 553; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z 554; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W 555; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X 556; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y 557; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z 558; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W 559; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X 560 561; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 562; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 563; HSA-GFX9: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 564define amdgpu_kernel void 
@v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { 565entry: 566 store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 567 ret void 568} 569 570; FUNC-LABEL: {{^}}v8f32_arg: 571; HSA-GFX9: kernarg_segment_byte_size = 64 572; HSA-GFX9: kernarg_segment_alignment = 5 573; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y 574; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z 575; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W 576; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X 577; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y 578; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z 579; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W 580; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X 581; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 582define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { 583entry: 584 store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 585 ret void 586} 587 588; FIXME: Pack/repack on VI 589 590; FUNC-LABEL: {{^}}v16i8_arg: 591; HSA-GFX9: kernarg_segment_byte_size = 32 592; HSA-GFX9: kernarg_segment_alignment = 4 593; EGCM: VTX_READ_8 594; EGCM: VTX_READ_8 595; EGCM: VTX_READ_8 596; EGCM: VTX_READ_8 597; EGCM: VTX_READ_8 598; EGCM: VTX_READ_8 599; EGCM: VTX_READ_8 600; EGCM: VTX_READ_8 601; EGCM: VTX_READ_8 602; EGCM: VTX_READ_8 603; EGCM: VTX_READ_8 604; EGCM: VTX_READ_8 605; EGCM: VTX_READ_8 606; EGCM: VTX_READ_8 607; EGCM: VTX_READ_8 608; EGCM: VTX_READ_8 609 610; SI: s_load_dwordx4 s 611; SI-NEXT: s_load_dwordx2 s 612; SI-NOT: {{buffer|flat|global}}_load 613 614 615; VI: s_load_dwordx4 s 616; VI-NOT: shr 617; VI-NOT: shl 618; VI-NOT: _sdwa 619; VI-NOT: _or_ 620define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { 621entry: 622 store <16 x i8> %in, <16 x i8> addrspace(1)* %out 623 ret void 624} 625 626; FUNC-LABEL: {{^}}v16i16_arg: 627; HSA-GFX9: kernarg_segment_byte_size = 64 628; HSA-GFX9: kernarg_segment_alignment = 5 629; EGCM: VTX_READ_16 630; EGCM: VTX_READ_16 631; EGCM: 
VTX_READ_16 632; EGCM: VTX_READ_16 633; EGCM: VTX_READ_16 634 635; EGCM: VTX_READ_16 636; EGCM: VTX_READ_16 637; EGCM: VTX_READ_16 638; EGCM: VTX_READ_16 639; EGCM: VTX_READ_16 640; EGCM: VTX_READ_16 641; EGCM: VTX_READ_16 642; EGCM: VTX_READ_16 643; EGCM: VTX_READ_16 644; EGCM: VTX_READ_16 645; EGCM: VTX_READ_16 646 647; SI: s_load_dwordx8 s 648; SI-NEXT: s_load_dwordx2 s 649; SI-NOT: {{buffer|flat|global}}_load 650 651 652; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 653 654; HSA-GFX9: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 655define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { 656entry: 657 store <16 x i16> %in, <16 x i16> addrspace(1)* %out 658 ret void 659} 660 661; FUNC-LABEL: {{^}}v16i32_arg: 662; HSA-GFX9: kernarg_segment_byte_size = 128 663; HSA-GFX9: kernarg_segment_alignment = 6 664; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y 665; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z 666; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W 667; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X 668; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y 669; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z 670; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W 671; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X 672; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y 673; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z 674; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W 675; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X 676; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y 677; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z 678; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W 679; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X 680; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 681; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 682; HSA-GFX9: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 683define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { 684entry: 685 store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 686 ret void 687} 688 689; FUNC-LABEL: 
{{^}}v16f32_arg: 690; HSA-GFX9: kernarg_segment_byte_size = 128 691; HSA-GFX9: kernarg_segment_alignment = 6 692; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y 693; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z 694; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W 695; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X 696; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y 697; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z 698; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W 699; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X 700; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y 701; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z 702; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W 703; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X 704; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y 705; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z 706; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W 707; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X 708; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 709; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 710; HSA-GFX9: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 711define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { 712entry: 713 store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 714 ret void 715} 716 717; FUNC-LABEL: {{^}}kernel_arg_i64: 718; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24 719; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 720 721; MESA-GCN: buffer_store_dwordx2 722define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { 723 store i64 %a, i64 addrspace(1)* %out, align 8 724 ret void 725} 726 727; FUNC-LABEL: {{^}}f64_kernel_arg: 728; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 729; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 730; MESA-GCN: buffer_store_dwordx2 731 732; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 733define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { 734entry: 735 store double %in, double addrspace(1)* 
%out 736 ret void 737} 738 739; XFUNC-LABEL: {{^}}kernel_arg_v1i64: 740; XGCN: s_load_dwordx2 741; XGCN: s_load_dwordx2 742; XGCN: buffer_store_dwordx2 743; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { 744; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 745; ret void 746; } 747 748; FUNC-LABEL: {{^}}i65_arg: 749; HSA-GFX9: kernarg_segment_byte_size = 24 750; HSA-GFX9: kernarg_segment_alignment = 4 751; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 752; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 753define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { 754entry: 755 store i65 %in, i65 addrspace(1)* %out, align 4 756 ret void 757} 758 759; FUNC-LABEL: {{^}}i1_arg: 760; HSA-GFX9: kernarg_segment_byte_size = 12 761; HSA-GFX9: kernarg_segment_alignment = 4 762 763; GCN: s_load_dword s 764; GCN: s_and_b32 765; GCN: {{buffer|flat|global}}_store_byte 766define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { 767 store i1 %x, i1 addrspace(1)* %out, align 1 768 ret void 769} 770 771; FUNC-LABEL: {{^}}i1_arg_zext_i32: 772; HSA-GFX9: kernarg_segment_byte_size = 12 773; HSA-GFX9: kernarg_segment_alignment = 4 774 775; GCN: s_load_dword 776; GCN: {{buffer|flat|global}}_store_dword 777define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { 778 %ext = zext i1 %x to i32 779 store i32 %ext, i32 addrspace(1)* %out, align 4 780 ret void 781} 782 783; FUNC-LABEL: {{^}}i1_arg_zext_i64: 784; HSA-GFX9: kernarg_segment_byte_size = 12 785; HSA-GFX9: kernarg_segment_alignment = 4 786 787; GCN: s_load_dword s 788; GCN: {{buffer|flat|global}}_store_dwordx2 789define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { 790 %ext = zext i1 %x to i64 791 store i64 %ext, i64 addrspace(1)* %out, align 8 792 ret void 793} 794 795; FUNC-LABEL: {{^}}i1_arg_sext_i32: 796; HSA-GFX9: 
kernarg_segment_byte_size = 12 797; HSA-GFX9: kernarg_segment_alignment = 4 798 799; GCN: s_load_dword 800; GCN: {{buffer|flat|global}}_store_dword 801define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { 802 %ext = sext i1 %x to i32 803 store i32 %ext, i32 addrspace(1)* %out, align 4 804 ret void 805} 806 807; FUNC-LABEL: {{^}}i1_arg_sext_i64: 808; HSA-GFX9: kernarg_segment_byte_size = 12 809; HSA-GFX9: kernarg_segment_alignment = 4 810 811; GCN: s_load_dword 812; GCN: s_bfe_i64 813; GCN: {{buffer|flat|global}}_store_dwordx2 814define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { 815 %ext = sext i1 %x to i64 816 store i64 %ext, i64 addrspace(1)* %out, align 8 817 ret void 818} 819 820; FUNC-LABEL: {{^}}empty_struct_arg: 821; HSA-GFX9: kernarg_segment_byte_size = 0 822define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { 823 ret void 824} 825 826; The correct load offsets for these: 827; load 4 from 0, 828; load 8 from 8 829; load 4 from 24 830; load 8 from 32 831 832; With the SelectionDAG argument lowering, the alignments for the 833; struct members are not properly considered, making these wrong. 
834 835; FIXME: Total argument size is computed wrong 836; FUNC-LABEL: {{^}}struct_argument_alignment: 837; HSA-GFX9: kernarg_segment_byte_size = 40 838; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 839; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 840; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 841; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 842define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { 843 %val0 = extractvalue {i32, i64} %arg0, 0 844 %val1 = extractvalue {i32, i64} %arg0, 1 845 %val2 = extractvalue {i32, i64} %arg1, 0 846 %val3 = extractvalue {i32, i64} %arg1, 1 847 store volatile i32 %val0, i32 addrspace(1)* null 848 store volatile i64 %val1, i64 addrspace(1)* null 849 store volatile i32 %val2, i32 addrspace(1)* null 850 store volatile i64 %val3, i64 addrspace(1)* null 851 ret void 852} 853 854; No padding between i8 and next struct, but round up at end to 4 byte 855; multiple. 856; FUNC-LABEL: {{^}}packed_struct_argument_alignment: 857; HSA-GFX9: kernarg_segment_byte_size = 28 858; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 859; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 860; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} 861; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17 862; HSA-GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13 863define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { 864 %val0 = extractvalue <{i32, i64}> %arg0, 0 865 %val1 = extractvalue <{i32, i64}> %arg0, 1 866 %val2 = extractvalue <{i32, i64}> %arg1, 0 867 %val3 = extractvalue <{i32, i64}> %arg1, 1 868 store volatile i32 %val0, i32 addrspace(1)* null 869 store volatile i64 %val1, i64 addrspace(1)* null 870 store volatile i32 %val2, i32 addrspace(1)* null 871 store volatile i64 %val3, i64 addrspace(1)* null 872 ret void 873} 874 
; Two { i32, i64 } structs, each followed by an unnamed i8, and a trailing
; <4 x i32>. The struct fields are expected at 0x0/0x8 and 0x18/0x20, and
; the vector at 0x30 (a 16-byte boundary after the second i8). The 16-byte
; vector ends at 0x40, matching the expected segment size of 64.
; GCN-LABEL: {{^}}struct_argument_alignment_after:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-GFX9: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
define amdgpu_kernel void @struct_argument_alignment_after({ i32, i64 } %arg0, i8, { i32, i64 } %arg2, i8, <4 x i32> %arg4) {
  ; Split both structs into their scalar fields.
  %first.i32 = extractvalue { i32, i64 } %arg0, 0
  %first.i64 = extractvalue { i32, i64 } %arg0, 1
  %second.i32 = extractvalue { i32, i64 } %arg2, 0
  %second.i64 = extractvalue { i32, i64 } %arg2, 1
  ; Volatile stores keep every field and the vector argument live.
  store volatile i32 %first.i32, i32 addrspace(1)* null
  store volatile i64 %first.i64, i64 addrspace(1)* null
  store volatile i32 %second.i32, i32 addrspace(1)* null
  store volatile i64 %second.i64, i64 addrspace(1)* null
  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
  ret void
}

; An i16 followed by a [3 x i32] array. Four scalar dword loads are
; expected at 0x0, 0x4, 0x8 and 0xc - presumably the first one fetches the
; dword holding the i16 and the remaining three the array elements
; (NOTE(review): the checks do not tie loads to arguments; confirm).
; GCN-LABEL: {{^}}array_3xi32:
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  ; Both arguments are stored volatile so neither kernarg load is dropped.
  store volatile i16 %arg0, i16 addrspace(1)* undef
  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
  ret void
}

; FIXME: Why not all scalar loads?
; An i8 followed by a [3 x i16] array. The three array elements are
; expected to be read with 16-bit global loads at byte offsets 2, 4 and 6,
; all using the same zero-initialised address register.
; GCN-LABEL: {{^}}array_3xi16:
; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:2
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:4
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:6
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  ; Both arguments are stored volatile so neither kernarg load is dropped.
  store volatile i8 %arg0, i8 addrspace(1)* undef
  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
  ret void
}

; A [1 x i8] array placed right after an unnamed i8. Its single element is
; expected to be read with a byte load at offset 1.
; NOTE(review): going by the test name, this guards against the load
; offset being rounded down to an aligned address - confirm intent.
; GCN-LABEL: {{^}}small_array_round_down_offset:
; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:1
define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
  %elt = extractvalue [1 x i8] %arg, 0
  store volatile i8 %elt, i8 addrspace(1)* undef
  ret void
}

; byref(i32) argument with an explicit align(256), placed between a global
; pointer and a plain i32. The expected segment size is 264 and an 8-byte
; scalar load is expected at 0x100 - presumably fetching the byref i32 at
; 256 together with %after.offset at 260 (NOTE(review): confirm).
; GCN-LABEL: {{^}}byref_align_constant_i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 264
; HSA-GFX9-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x100{{$}}
define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
  ; Read the value behind the byref pointer, then store it and the
  ; following argument to %out.
  %byref.val = load i32, i32 addrspace(4)* %in.byref
  store volatile i32 %byref.val, i32 addrspace(1)* %out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}

; byref(<16 x i32>) argument without an explicit alignment, placed after a
; global pointer and an unnamed i8. The 16-dword vector is expected at 0x40
; and %after.offset right behind it at 0x80, for an expected segment size
; of 132.
; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 132
; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80
; HSA-GFX9-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}}
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
  ; Read the vector behind the byref pointer.
  %byref.vec = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
  ; Reinterpret %out so the whole vector can be stored through it.
  %out.v16i32 = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
  store volatile <16 x i32> %byref.vec, <16 x i32> addrspace(1)* %out.v16i32, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}
