; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s

; Repeat of some problematic tests in kernel-args.ll, with the IR
; argument lowering pass disabled. Struct padding needs to be
; accounted for, as well as legalization of types changing offsets.

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i65_arg:
; HSA-VI: kernarg_segment_byte_size = 24
; HSA-VI: kernarg_segment_alignment = 4
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
entry:
  store i65 %in, i65 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-VI: kernarg_segment_byte_size = 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
  ret void
}

; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32

; With the SelectionDAG argument lowering, the alignments of the
; struct members are not properly considered, making these wrong.

; FIXME: Total argument size is computed wrong
; FUNC-LABEL: {{^}}struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 40
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; No padding between i8 and the next struct, but the total is rounded
; up at the end to a 4-byte multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 28
; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17
; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}struct_argument_alignment_after:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}array_3xi32:
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  store volatile i16 %arg0, i16 addrspace(1)* undef
  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}array_3xi16:
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  store volatile i8 %arg0, i8 addrspace(1)* undef
  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v2i15_arg:
; GCN: s_load_dword [[DWORD:s[0-9]+]]
; GCN-DAG: s_bfe_u32 [[BFE:s[0-9]+]], [[DWORD]], 0x100010{{$}}
; GCN-DAG: s_and_b32 [[AND:s[0-9]+]], [[DWORD]], 0x7fff{{$}}
define amdgpu_kernel void @v2i15_arg(<2 x i15> addrspace(1)* nocapture %out, <2 x i15> %in) {
entry:
  store <2 x i15> %in, <2 x i15> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v3i15_arg:
; GCN: s_load_dword [[DWORD:s[0-9]+]]
; GCN: s_lshl_b64
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: s_or_b32
define amdgpu_kernel void @v3i15_arg(<3 x i15> addrspace(1)* nocapture %out, <3 x i15> %in) {
entry:
  store <3 x i15> %in, <3 x i15> addrspace(1)* %out, align 4
  ret void
}

; Byref pointers should only be treated as offsets from kernarg
; GCN-LABEL: {{^}}byref_constant_i8_arg:
; GCN: kernarg_segment_byte_size = 12
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) {
  %in = load i8, i8 addrspace(4)* %in.byref
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_constant_i16_arg:
; GCN: kernarg_segment_byte_size = 12
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) {
  %in = load i16, i16 addrspace(4)* %in.byref
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_constant_i32_arg:
; GCN: kernarg_segment_byte_size = 16
; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}}
; GCN: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0xc{{$}}
define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) {
  %in = load i32, i32 addrspace(4)* %in.byref
  store volatile i32 %in, i32 addrspace(1)* %out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_constant_v4i32_arg:
; GCN: kernarg_segment_byte_size = 36
; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10{{$}}
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}}
define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) {
  %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref
  store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)*
  store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_align_constant_i32_arg:
; GCN: kernarg_segment_byte_size = 264
; GCN-DAG: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x100{{$}}
; GCN-DAG: s_load_dword [[AFTER_OFFSET:s[0-9]+]], s[4:5], 0x104{{$}}
; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], [[IN]]
; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], [[AFTER_OFFSET]]
; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s
; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s
define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
  %in = load i32, i32 addrspace(4)* %in.byref
  store volatile i32 %in, i32 addrspace(1)* %out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_natural_align_constant_v16i32_arg:
; GCN: kernarg_segment_byte_size = 132
; GCN-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x80
; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}}
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}

; Also accept byref kernel arguments with other global address spaces.
; GCN-LABEL: {{^}}byref_global_i32_arg:
; GCN: kernarg_segment_byte_size = 12
; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}}
define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) {
  %in = load i32, i32 addrspace(1)* %in.byref
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_flat_i32_arg:
; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) {
  %in = load i32, i32* %in.byref
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_constant_32bit_i32_arg:
; GCN: s_add_i32 s[[PTR_LO:[0-9]+]], s4, 8
; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}}
; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}}
define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) {
  %in = load i32, i32 addrspace(6)* %in.byref
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref %in.byref) {
;   %in = load i32, i32 addrspace(999)* %in.byref
;   store i32 %in, i32 addrspace(1)* %out, align 4
;   ret void
; }

; GCN-LABEL: {{^}}multi_byref_constant_i32_arg:
; GCN: kernarg_segment_byte_size = 20
; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x8
; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0xc
; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x10
define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) {
  %in0 = load i32, i32 addrspace(4)* %in0.byref
  %in1 = load i32, i32 addrspace(4)* %in1.byref
  store volatile i32 %in0, i32 addrspace(1)* %out, align 4
  store volatile i32 %in1, i32 addrspace(1)* %out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}byref_constant_i32_arg_offset0:
; GCN: kernarg_segment_byte_size = 4
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}}
define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) {
  %in = load i32, i32 addrspace(4)* %in.byref
  store i32 %in, i32 addrspace(1)* undef, align 4
  ret void
}