1; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
2
3@lds0 = addrspace(3) global [512 x float] undef
4@lds1 = addrspace(3) global [256 x float] undef
5@lds2 = addrspace(3) global [4096 x float] undef
6@lds3 = addrspace(3) global [67 x i8] undef
7
8@dynamic_shared0 = external addrspace(3) global [0 x float]
9@dynamic_shared1 = external addrspace(3) global [0 x double]
10@dynamic_shared2 = external addrspace(3) global [0 x double], align 4
11@dynamic_shared3 = external addrspace(3) global [0 x double], align 16
12
13; CHECK-LABEL: {{^}}dynamic_shared_array_0:
14; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
15define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
16  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
17  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %tid.x
18  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
19  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
20  store float %val0, float addrspace(3)* %arrayidx1, align 4
21  ret void
22}
23
24; CHECK-LABEL: {{^}}dynamic_shared_array_1:
25; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0xc00
26; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
27define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {
28entry:
29  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
30  %idx.0 = add nsw i32 %tid.x, 64
31  %tmp = icmp eq i32 %cond, 0
32  br i1 %tmp, label %if, label %else
33
34if:                                               ; preds = %entry
35  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
36  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
37  br label %endif
38
39else:                                             ; preds = %entry
40  %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
41  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
42  br label %endif
43
44endif:                                            ; preds = %else, %if
45  %val = phi float [ %val0, %if ], [ %val1, %else ]
46  %arrayidx = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
47  store float %val, float addrspace(3)* %arrayidx, align 4
48  ret void
49}
50
51; CHECK-LABEL: {{^}}dynamic_shared_array_2:
52; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x4000
53; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
54define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
55  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
56  %vidx = add i32 %tid.x, %idx
57  %arrayidx0 = getelementptr inbounds [4096 x float], [4096 x float] addrspace(3)* @lds2, i32 0, i32 %vidx
58  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
59  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
60  store float %val0, float addrspace(3)* %arrayidx1, align 4
61  ret void
62}
63
64; The offset to the dynamic shared memory array should be aligned on the type
65; specified.
66; CHECK-LABEL: {{^}}dynamic_shared_array_3:
67; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
68; CHECK: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
69define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
70  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
71  %vidx = add i32 %tid.x, %idx
72  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
73  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
74  %val1 = uitofp i8 %val0 to float
75  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
76  store float %val1, float addrspace(3)* %arrayidx1, align 4
77  ret void
78}
79
80; The offset to the dynamic shared memory array should be aligned on the
81; maximal one.
82; CHECK-LABEL: {{^}}dynamic_shared_array_4:
83; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
84; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
85; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
86define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
87  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
88  %vidx = add i32 %tid.x, %idx
89  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
90  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
91  %val1 = uitofp i8 %val0 to float
92  %val2 = uitofp i8 %val0 to double
93  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
94  store float %val1, float addrspace(3)* %arrayidx1, align 4
95  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared1, i32 0, i32 %tid.x
96  store double %val2, double addrspace(3)* %arrayidx2, align 4
97  ret void
98}
99
100; Honor the explicit alignment from the specified variable.
101; CHECK-LABEL: {{^}}dynamic_shared_array_5:
102; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
103; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
104; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
105define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
106  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
107  %vidx = add i32 %tid.x, %idx
108  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
109  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
110  %val1 = uitofp i8 %val0 to float
111  %val2 = uitofp i8 %val0 to double
112  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
113  store float %val1, float addrspace(3)* %arrayidx1, align 4
114  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared2, i32 0, i32 %tid.x
115  store double %val2, double addrspace(3)* %arrayidx2, align 4
116  ret void
117}
118
119; Honor the explicit alignment from the specified variable.
120; CHECK-LABEL: {{^}}dynamic_shared_array_6:
121; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
122; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 2, [[DYNLDS]]
123; CHECK-DAG: v_lshl_add_u32 {{v[0-9]+}}, {{v[0-9]+}}, 3, [[DYNLDS]]
124define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
125  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
126  %vidx = add i32 %tid.x, %idx
127  %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
128  %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
129  %val1 = uitofp i8 %val0 to float
130  %val2 = uitofp i8 %val0 to double
131  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
132  store float %val1, float addrspace(3)* %arrayidx1, align 4
133  %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared3, i32 0, i32 %tid.x
134  store double %val2, double addrspace(3)* %arrayidx2, align 4
135  ret void
136}
137
138declare i32 @llvm.amdgcn.workitem.id.x()
139