; Checks instruction selection for the llvm.cttz intrinsic (mostly with the
; zero-is-undef flag set) under three configurations:
;   * default amdgcn          -> prefixes SI, SI-NOSDWA, FUNC
;   * amdgcn, -mcpu=tonga     -> prefixes SI, SI-SDWA, FUNC
;   * r600, -mcpu=cypress     -> prefixes EG, FUNC
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-NOSDWA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-SDWA  -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s

; Intrinsic declarations used by the kernels in this file. The second (i1)
; operand of llvm.cttz is the zero-is-undef flag.
declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; Scalar case: i32 cttz (zero-is-undef) applied directly to a kernel argument.
; The SI checks expect one s_ff1_i32_b32 on the loaded argument, a copy of its
; result into a VGPR, and a dword store of that VGPR. The EG checks expect
; FFBL_INT to write the register that is stored.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
; SI: s_load_dword [[VAL:s[0-9]+]],
; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; Vector case: each work item loads its own i32 (indexed by workitem.id.x)
; and takes cttz with zero-is-undef. The SI checks expect v_ffbl_b32 on the
; loaded value with the result stored directly; the EG checks expect FFBL_INT
; to write the stored register.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %in.gep, align 4
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; <2 x i32> cttz (zero-is-undef) on a per-work-item loaded vector. The checks
; expect one find-first-bit instruction per element (two v_ffbl_b32 on SI,
; two FFBL_INT on EG) between a dwordx2 load and a dwordx2 store.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
; SI: {{buffer|flat}}_load_dwordx2
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: buffer_store_dwordx2
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
  store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; <4 x i32> cttz (zero-is-undef) on a per-work-item loaded vector. The checks
; expect one find-first-bit instruction per element (four v_ffbl_b32 on SI,
; four FFBL_INT on EG) between a dwordx4 load and a dwordx4 store.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
; SI: {{buffer|flat}}_load_dwordx4
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: v_ffbl_b32_e32
; SI: buffer_store_dwordx4
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
  store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; Scalar i8 cttz (zero-is-undef) on a kernel argument, followed by an
; "input != 0 ? cttz : 32" select. The checks only require that a
; find-first-bit instruction is emitted (s_ff1_i32_b32 / FFBL_INT).
; NOTE(review): the store below writes %cttz, not %ret, so the icmp/select
; pair is dead. The existing checks are written against that form; confirm
; whether storing %ret was intended before changing it.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i8_with_select:
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
; EG: MEM_RAT MSKOR
; EG: FFBL_INT
define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind {
  %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i8 %val, 0
  %ret = select i1 %cttz_ret, i8 %cttz, i8 32
  store i8 %cttz, i8 addrspace(1)* %out, align 4
  ret void
}

; Scalar i16 cttz (zero-is-undef) on a kernel argument, followed by an
; "input != 0 ? cttz : 32" select. The checks only require that a
; find-first-bit instruction is emitted (s_ff1_i32_b32 / FFBL_INT).
; NOTE(review): the store below writes %cttz, not %ret, so the icmp/select
; pair is dead. The existing checks are written against that form; confirm
; whether storing %ret was intended before changing it.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i16_with_select:
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
; EG: MEM_RAT MSKOR
; EG: FFBL_INT
define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
  %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i16 %val, 0
  %ret = select i1 %cttz_ret, i16 %cttz, i16 32
  store i16 %cttz, i16 addrspace(1)* %out, align 4
  ret void
}

; Scalar i32 cttz (zero-is-undef) on a kernel argument, followed by an
; "input != 0 ? cttz : 32" select. The SI check requires s_ff1_i32_b32; the
; EG checks require FFBL_INT to write the stored register.
; NOTE(review): the store below writes %cttz, not %ret, so the icmp/select
; pair is dead. The existing checks are written against that form; confirm
; whether storing %ret was intended before changing it.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32_with_select:
; SI: s_ff1_i32_b32
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
  %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i32 %val, 0
  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}

; Scalar i64 cttz (zero-is-undef) on a kernel argument, followed by an
; "input != 0 ? cttz : 32" select. The SI checks require two s_ff1_i32_b32
; instructions; the EG check only requires the result store.
; NOTE(review): the store below writes %cttz, not %ret, so the icmp/select
; pair is dead. The existing checks are written against that form; confirm
; whether storing %ret was intended before changing it.
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i64_with_select:
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i64 %val, 0
  %ret = select i1 %cttz_ret, i64 %cttz, i64 32
  store i64 %cttz, i64 addrspace(1)* %out, align 4
  ret void
}

; i8 loaded from memory, cttz (zero-is-undef), then
; "input != 0 ? cttz : 32"; the select result is what gets stored. Both amdgcn
; configurations are required to emit v_ffbl_b32; EG is only required to emit
; the masked-OR byte store.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select:
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32
; EG: MEM_RAT MSKOR
define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i8, i8 addrspace(1)* %arrayidx, align 1
  %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i8 %val, 0
  %ret = select i1 %cttz_ret, i8 %cttz, i8 32
  store i8 %ret, i8 addrspace(1)* %out, align 4
  ret void
}

; i16 loaded from memory with align 1, cttz (zero-is-undef), then
; "input != 0 ? cttz : 32"; the select result is what gets stored. Both amdgcn
; configurations are required to emit v_ffbl_b32; EG is only required to emit
; the masked-OR store.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select:
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32
; EG: MEM_RAT MSKOR
define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i16, i16 addrspace(1)* %arrayidx, align 1
  %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i16 %val, 0
  %ret = select i1 %cttz_ret, i16 %cttz, i16 32
  store i16 %ret, i16 addrspace(1)* %out, align 4
  ret void
}

; i32 loaded from memory with align 1, cttz (zero-is-undef), then
; "input != 0 ? cttz : 32"; the select result is what gets stored. The amdgcn
; checks require both v_ffbl_b32 and a v_cmp_ne_u32 against 0 (in either
; order), i.e. the compare for the select is still present.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32_with_select:
; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_cmp_ne_u32_e32 vcc, 0
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i32 %val, 0
  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
  store i32 %ret, i32 addrspace(1)* %out, align 4
  ret void
}

; i64 loaded from memory with align 1, cttz (zero-is-undef), then
; "input != 0 ? cttz : 32"; the select result is what gets stored. The amdgcn
; checks require a run of v_or_b32 instructions whose last two results feed
; two v_ffbl_b32 (one per 32-bit half), followed by a 32-bit compare-eq and a
; 64-bit compare-ne against 0. The tonga run expects some of the ORs in SDWA
; form.
; NOTE(review): the v_or sequence presumably reassembles the value from the
; narrow loads forced by align 1 - confirm against llc output.
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i64_with_select:
; SI-NOSDWA: v_or_b32_e32
; SI-NOSDWA: v_or_b32_e32
; SI-NOSDWA: v_or_b32_e32
; SI-NOSDWA: v_or_b32_e32
; SI-NOSDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_sdwa
; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_sdwa
; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
; SI: v_cmp_eq_u32_e32 vcc, 0
; SI: v_cmp_ne_u64_e32 vcc, 0
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i64, i64 addrspace(1)* %arrayidx, align 1
  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i64 %val, 0
  %ret = select i1 %cttz_ret, i64 %cttz, i64 32
  store i64 %ret, i64 addrspace(1)* %out, align 4
  ret void
}

; i32 cttz with the zero-is-undef flag cleared, combined with
; "input == 0 ? -1 : cttz". The amdgcn checks require the v_ffbl_b32 result
; register itself to be the stored value, i.e. the select is expected to be
; folded into the find-first-bit instruction.
; FUNC-LABEL: {{^}}v_cttz_i32_sel_eq_neg1:
; SI: v_ffbl_b32_e32 [[VAL:v[0-9]+]], v{{[0-9]+}}
; SI: buffer_store_dword [[VAL]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %cttz
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; Same pattern as the eq form but with the compare inverted:
; "input != 0 ? cttz : -1" with the zero-is-undef flag cleared. The amdgcn
; checks require the v_ffbl_b32 result register itself to be the stored
; value, i.e. the select is expected to be folded away.
; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_neg1:
; SI: v_ffbl_b32_e32 [[VAL:v[0-9]+]], v{{[0-9]+}}
; SI: buffer_store_dword [[VAL]],
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; Variant where the compare is on the cttz RESULT against the bit width (32)
; rather than on the input against 0: "cttz != 32 ? cttz : -1", with the
; zero-is-undef flag cleared. The amdgcn checks require an explicit v_cmp and
; v_cndmask after v_ffbl_b32, i.e. this form is expected NOT to be folded.
; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_bitwidth:
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp
; SI: v_cndmask
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %cttz, 32
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, i32 addrspace(1)* %out
  ret void
}

; i8 version of the "input == 0 ? -1 : cttz" pattern with the zero-is-undef
; flag cleared. The amdgcn checks require a byte load followed by v_ffbl_b32;
; the EG checks require the masked-OR store and FFBL_INT.
; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1:
; SI: {{buffer|flat}}_load_ubyte
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32
; EG: MEM_RAT MSKOR
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i8, i8 addrspace(1)* %arrayidx, align 1
  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %cttz
  store i8 %sel, i8 addrspace(1)* %out
  ret void
}

; i16 version of the "input == 0 ? -1 : cttz" pattern with the zero-is-undef
; flag cleared. The i16 is loaded with align 1, and the amdgcn checks require
; a byte load, v_ffbl_b32, and a short store; the EG checks require the
; masked-OR store and FFBL_INT.
; FUNC-LABEL: {{^}}v_cttz_i16_sel_eq_neg1:
; SI: {{buffer|flat}}_load_ubyte
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: buffer_store_short
; EG: MEM_RAT MSKOR
; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %val = load i16, i16 addrspace(1)* %arrayidx, align 1
  %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %cttz
  store i16 %sel, i16 addrspace(1)* %out
  ret void
}