; RUN: llc -march=r600 -mcpu=redwood -disable-promote-alloca-to-vector < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600-VECT -check-prefix=FUNC
; RUN: opt -S -mtriple=r600-unknown-unknown -mcpu=redwood -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=OPT %s
target datalayout = "A5"

declare i32 @llvm.r600.read.tidig.x() nounwind readnone

; FUNC-LABEL: {{^}}mova_same_clause:

; R600: LDS_WRITE
; R600: LDS_WRITE
; R600: LDS_READ
; R600: LDS_READ

; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1

define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
  %3 = load i32, i32 addrspace(5)* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; This test checks that the stack offset is calculated correctly for structs.
; All register loads/stores should be optimized away, so there shouldn't be
; any MOVA instructions.
;
; XXX: This generated code has unnecessary MOVs, we should be able to optimize
; this.

; FUNC-LABEL: {{^}}multiple_structs:
; R600-NOT: MOVA_INT
%struct.point = type { i32, i32 }

define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
entry:
  %a = alloca %struct.point, addrspace(5)
  %b = alloca %struct.point, addrspace(5)
  %a.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
  %a.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 1
  %b.x.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
  %b.y.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %a.x.ptr
  store i32 1, i32 addrspace(5)* %a.y.ptr
  store i32 2, i32 addrspace(5)* %b.x.ptr
  store i32 3, i32 addrspace(5)* %b.y.ptr
  %a.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
  %b.indirect.ptr = getelementptr inbounds %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
  %a.indirect = load i32, i32 addrspace(5)* %a.indirect.ptr
  %b.indirect = load i32, i32 addrspace(5)* %b.indirect.ptr
  %0 = add i32 %a.indirect, %b.indirect
  store i32 %0, i32 addrspace(1)* %out
  ret void
}

; Test direct access of a private array inside a loop. The private array
; loads and stores should be lowered to copies, so there shouldn't be any
; MOVA instructions.

; FUNC-LABEL: {{^}}direct_loop:
; R600-NOT: MOVA_INT

define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
entry:
  %prv_array_const = alloca [2 x i32], addrspace(5)
  %prv_array = alloca [2 x i32], addrspace(5)
  %a = load i32, i32 addrspace(1)* %in
  %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %b = load i32, i32 addrspace(1)* %b_src_ptr
  %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
  store i32 %a, i32 addrspace(5)* %a_dst_ptr
  %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 1
  store i32 %b, i32 addrspace(5)* %b_dst_ptr
  br label %for.body

for.body:
  %inc = phi i32 [0, %entry], [%count, %for.body]
  %x_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
  %x = load i32, i32 addrspace(5)* %x_ptr
  %y_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
  %y = load i32, i32 addrspace(5)* %y_ptr
  %xy = add i32 %x, %y
  store i32 %xy, i32 addrspace(5)* %y_ptr
  %count = add i32 %inc, 1
  %done = icmp eq i32 %count, 4095
  br i1 %done, label %for.end, label %for.body

for.end:
  %value_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
  %value = load i32, i32 addrspace(5)* %value_ptr
  store i32 %value, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}short_array:

; R600-VECT: MOVA_INT
define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %0 = alloca [2 x i16], addrspace(5)
  %1 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 0
  %2 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 1
  store i16 0, i16 addrspace(5)* %1
  store i16 1, i16 addrspace(5)* %2
  %3 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 %index
  %4 = load i16, i16 addrspace(5)* %3
  %5 = sext i16 %4 to i32
  store i32 %5, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}char_array:

; R600-VECT: MOVA_INT
define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %0 = alloca [2 x i8], addrspace(5)
  %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 0
  %2 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 1
  store i8 0, i8 addrspace(5)* %1
  store i8 1, i8 addrspace(5)* %2
  %3 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 %index
  %4 = load i8, i8 addrspace(5)* %3
  %5 = sext i8 %4 to i32
  store i32 %5, i32 addrspace(1)* %out
  ret void
}

; Make sure we don't overwrite workitem information with private memory

; FUNC-LABEL: {{^}}work_item_info:
; R600-NOT: MOV T0.X
; Additional check in case the move ends up in the last slot
; R600-NOT: MOV * TO.X
define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
  %0 = alloca [2 x i32], addrspace(5)
  %1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0
  %2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %1
  store i32 1, i32 addrspace(5)* %2
  %3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in
  %4 = load i32, i32 addrspace(5)* %3
  %5 = call i32 @llvm.r600.read.tidig.x()
  %6 = add i32 %4, %5
  store i32 %6, i32 addrspace(1)* %out
  ret void
}

; Test that two stack objects are not stored in the same register
; The second stack object should be in T3.X
; FUNC-LABEL: {{^}}no_overlap:
; R600_CHECK: MOV
; R600_CHECK: [[CHAN:[XYZW]]]+
; R600-NOT: [[CHAN]]+
define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
  %0 = alloca [3 x i8], align 1, addrspace(5)
  %1 = alloca [2 x i8], align 1, addrspace(5)
  %2 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 0
  %3 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 1
  %4 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 2
  %5 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 0
  %6 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 1
  store i8 0, i8 addrspace(5)* %2
  store i8 1, i8 addrspace(5)* %3
  store i8 2, i8 addrspace(5)* %4
  store i8 1, i8 addrspace(5)* %5
  store i8 0, i8 addrspace(5)* %6
  %7 = getelementptr inbounds [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 %in
  %8 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 %in
  %9 = load i8, i8 addrspace(5)* %7
  %10 = load i8, i8 addrspace(5)* %8
  %11 = add i8 %9, %10
  %12 = sext i8 %11 to i32
  store i32 %12, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i8]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
  %gep1 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
  store i8 0, i8 addrspace(5)* %gep0
  store i8 1, i8 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
  %load = load i8, i8 addrspace(5)* %gep2
  %sext = sext i8 %load to i32
  store i32 %sext, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i32]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
  %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
  %load = load i32, i32 addrspace(5)* %gep2
  store i32 %load, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x i64]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
  %gep1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
  store i64 0, i64 addrspace(5)* %gep0
  store i64 1, i64 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
  %load = load i64, i64 addrspace(5)* %gep2
  store i64 %load, i64 addrspace(1)* %out
  ret void
}

%struct.pair32 = type { i32, i32 }

define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
  %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0, i32 1
  %gep1 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index, i32 0
  %load = load i32, i32 addrspace(5)* %gep2
  store i32 %load, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
  %alloca = alloca [2 x %struct.pair32], addrspace(5)
  %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 0, i32 1
  %gep1 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 1, i32 0
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 %index, i32 0
  %load = load i32, i32 addrspace(5)* %gep2
  store i32 %load, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
  %tmp = alloca [2 x i32], addrspace(5)
  %tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %tmp1
  store i32 1, i32 addrspace(5)* %tmp2
  %cmp = icmp eq i32 %in, 0
  %sel = select i1 %cmp, i32 addrspace(5)* %tmp1, i32 addrspace(5)* %tmp2
  %load = load i32, i32 addrspace(5)* %sel
  store i32 %load, i32 addrspace(1)* %out
  ret void
}

; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it
; finds one, it should stop trying to promote.

; FUNC-LABEL: ptrtoint:
; SI-NOT: ds_write
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %alloca = alloca [16 x i32], addrspace(5)
  %tmp0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
  store i32 5, i32 addrspace(5)* %tmp0
  %tmp1 = ptrtoint [16 x i32] addrspace(5)* %alloca to i32
  %tmp2 = add i32 %tmp1, 5
  %tmp3 = inttoptr i32 %tmp2 to i32 addrspace(5)*
  %tmp4 = getelementptr inbounds i32, i32 addrspace(5)* %tmp3, i32 %b
  %tmp5 = load i32, i32 addrspace(5)* %tmp4
  store i32 %tmp5, i32 addrspace(1)* %out
  ret void
}

; OPT: !0 = !{i32 0, i32 257}
; OPT: !1 = !{i32 0, i32 256}

attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }