; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s

; Population-count intrinsics exercised by the tests in this file: the scalar
; i32 form and the 2-, 4-, 8- and 16-element i32 vector forms.
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone

; Work-item id intrinsic; the tests use it as a getelementptr index so each
; work item addresses its own element.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; ctpop of an i32 kernel argument. On the amdgcn runs the value is expected to
; be loaded with s_load_dword, counted with the scalar s_bcnt1_i32_b32, and
; copied into a VGPR before buffer_store_dword. The r600 run expects BCNT_INT.
; FUNC-LABEL: {{^}}s_ctpop_i32:
; GCN: s_load_dword [[SVAL:s[0-9]+]],
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[VRESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  store i32 %ctpop, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop of an i32 loaded from %in at an index given by the work-item id.
; Note on the trailing 0 in the v_bcnt_u32_b32 expectation (the original
; author's open question was "Why 0 in register?"): the add_* tests in this
; file expect a folded addend in that operand position, so with no add to
; fold the operand is expected to be the constant 0.
; FUNC-LABEL: {{^}}v_ctpop_i32:
; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  store i32 %ctpop, i32 addrspace(1)* %out, align 4
  ret void
}

; Sum of the ctpops of two volatile loads. The expectations require the first
; v_bcnt_u32_b32 to count the second loaded value with addend 0, and the second
; v_bcnt_u32_b32 to count the first loaded value while taking that intermediate
; result as its addend, i.e. the IR add is folded into the second instruction.
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: buffer_load_dword [[VAL1:v[0-9]+]],
; VI: flat_load_dword [[VAL0:v[0-9]+]],
; VI: flat_load_dword [[VAL1:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
  %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4
  %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4
  %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
  %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
  %add = add i32 %ctpop0, %ctpop1
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a loaded i32 plus the i32 kernel argument %sval. The expectations
; require v_bcnt_u32_b32 on the line directly after s_waitcnt, with an SGPR in
; the addend operand position. No Evergreen expectations are listed for this
; function.
; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
; GCN: s_waitcnt
; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %ctpop, %sval
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a <2 x i32> loaded from memory. The expectations list one
; v_bcnt_u32_b32 (amdgcn runs) or one BCNT_INT (r600 run) per vector element.
; FUNC-LABEL: {{^}}v_ctpop_v2i32:
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
  store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; ctpop of a <4 x i32> loaded from memory. The expectations list one
; v_bcnt_u32_b32 (amdgcn runs) or one BCNT_INT (r600 run) per vector element.
; FUNC-LABEL: {{^}}v_ctpop_v4i32:
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
  store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; ctpop of an <8 x i32> loaded from memory. The expectations list one
; v_bcnt_u32_b32 (amdgcn runs) or one BCNT_INT (r600 run) per vector element.
; FUNC-LABEL: {{^}}v_ctpop_v8i32:
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid
  %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32
  %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
  store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

; ctpop of a <16 x i32> loaded from memory. The expectations list one
; v_bcnt_u32_b32 (amdgcn runs) or one BCNT_INT (r600 run) per vector element.
; Note that the load and store below specify align 32, the same value used by
; the <8 x i32> test, rather than the 64-byte size of the vector.
; FUNC-LABEL: {{^}}v_ctpop_v16i32:
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid
  %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32
  %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
  store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
  ret void
}

; ctpop of a loaded i32 plus the constant 4 (constant on the right-hand side of
; the add). The expectations require a single v_bcnt_u32_b32 whose addend
; operand is the immediate 4, i.e. the add is folded into the instruction.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %ctpop, 4
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; Same as v_ctpop_i32_add_inline_constant but with the add operands swapped
; (constant 4 on the left-hand side). The expected output is identical: one
; v_bcnt_u32_b32 with the immediate 4 as its addend operand.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 4, %ctpop
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a loaded i32 plus the constant 99999 (0x1869f). The expectations
; require the constant to be moved into an SGPR with s_mov_b32 and that SGPR
; to be used as the addend operand of v_bcnt_u32_b32; the load and the s_mov
; may appear in either order. No Evergreen expectations are listed for this
; function.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_mov_b32 [[LIT:s[0-9]+]], 0x1869f
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %ctpop, 99999
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a loaded i32 plus the i32 kernel argument %const (argument on the
; right-hand side of the add). The expectations require the argument to be
; loaded with s_load_dword and used directly as the addend operand of a single
; v_bcnt_u32_b32; the two loads may appear in either order.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %ctpop, %const
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; Same as v_ctpop_i32_add_var but with the add operands swapped (%const on the
; left-hand side). The expected output is identical: one v_bcnt_u32_b32 with
; the s_load_dword result as its addend operand.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %add = add i32 %const, %ctpop
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a loaded i32 added to a second i32 that is itself loaded from
; %constptr at an index given by the work-item id, so both add inputs come
; from vector loads. The expectations require two loads followed by a single
; v_bcnt_u32_b32 that consumes both loaded registers.
; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid
  %const = load i32, i32 addrspace(1)* %gep, align 4
  %add = add i32 %const, %ctpop
  store i32 %add, i32 addrspace(1)* %out, align 4
  ret void
}

; ctpop computed in only one arm of a branch: the 'if' arm counts the bits of
; the kernel argument %ctpop_arg, the 'else' arm loads element 1 of %in, and a
; phi in 'endif' selects the value that is stored. The expectations require
; the argument to be read with s_load_dword at offset 0xd (tahiti run) or 0x34
; (tonga run) and counted with the scalar s_bcnt1_i32_b32.
; NOTE(review): the unnamed [8 x i32] argument presumably exists to separate
; %cond from %ctpop_arg in the kernel argument layout - confirm.
; FUNC-LABEL: {{^}}ctpop_i32_in_br:
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
; GCN: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, [8 x i32], i32 %cond) {
entry:
  %tmp0 = icmp eq i32 %cond, 0
  br i1 %tmp0, label %if, label %else

if:
  %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
  br label %endif

else:
  %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3
  br label %endif

endif:
  %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
  store i32 %tmp5, i32 addrspace(1)* %out
  ret void
}
