; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s

; This test checks the @llvm.cttz.* intrinsics for vectors.
;
; Every test function in this file loads a vector through %p, calls the
; matching cttz intrinsic on it, and stores the result back through %p.

; Vectors of i8 elements.
declare <1 x i8> @llvm.cttz.v1i8(<1 x i8>, i1)
declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
declare <4 x i8> @llvm.cttz.v4i8(<4 x i8>, i1)
declare <8 x i8> @llvm.cttz.v8i8(<8 x i8>, i1)
declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)

; Vectors of i16 elements.
declare <1 x i16> @llvm.cttz.v1i16(<1 x i16>, i1)
declare <2 x i16> @llvm.cttz.v2i16(<2 x i16>, i1)
declare <4 x i16> @llvm.cttz.v4i16(<4 x i16>, i1)
declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)

; Vectors of i32 elements.
declare <1 x i32> @llvm.cttz.v1i32(<1 x i32>, i1)
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1)
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)

; Vectors of i64 elements.
declare <1 x i64> @llvm.cttz.v1i64(<1 x i64>, i1)
declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)

;------------------------------------------------------------------------------

; cttz on <1 x i8> with the second operand set to i1 false.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type. The label pattern ends in ':' like every other label in
; this file, so it matches the label definition itself.
define void @test_v1i8(<1 x i8>* %p) {
; CHECK-LABEL: test_v1i8:
  %a = load <1 x i8>, <1 x i8>* %p
  %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false)
  store <1 x i8> %tmp, <1 x i8>* %p
  ret void
}

; cttz on <2 x i8> with the second operand set to i1 false.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v2i8(<2 x i8>* %p) {
; CHECK-LABEL: test_v2i8:
  %a = load <2 x i8>, <2 x i8>* %p
  %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false)
  store <2 x i8> %tmp, <2 x i8>* %p
  ret void
}

; cttz on <4 x i8> with the second operand set to i1 false.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v4i8(<4 x i8>* %p) {
; CHECK-LABEL: test_v4i8:
  %a = load <4 x i8>, <4 x i8>* %p
  %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false)
  store <4 x i8> %tmp, <4 x i8>* %p
  ret void
}

; cttz on <8 x i8> (one D register) with the second operand set to i1 false.
; The expected sequence is popcount((x & -x) - 1) per byte lane:
; vneg + vand form x & -x, vsub subtracts the splat of 1, vcnt counts bits.
define void @test_v8i8(<8 x i8>* %p) {
; CHECK-LABEL: test_v8i8:
; CHECK: vldr		[[D1:d[0-9]+]], [r0]
; CHECK: vmov.i8	[[D2:d[0-9]+]], #0x1
; CHECK: vneg.s8	[[D3:d[0-9]+]], [[D1]]
; CHECK: vand		[[D1]], [[D1]], [[D3]]
; CHECK: vsub.i8	[[D1]], [[D1]], [[D2]]
; CHECK: vcnt.8		[[D1]], [[D1]]
; CHECK: vstr		[[D1]], [r0]
  %a = load <8 x i8>, <8 x i8>* %p
  %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false)
  store <8 x i8> %tmp, <8 x i8>* %p
  ret void
}

; cttz on <16 x i8> (one Q register) with the second operand set to i1 false.
; Same popcount((x & -x) - 1) sequence as the 8-lane case, on Q registers.
; The load/store patterns capture the D-register pair, so the Q register name
; (Q1) is first bound at the vneg source operand.
define void @test_v16i8(<16 x i8>* %p) {
; CHECK-LABEL: test_v16i8:
; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
; CHECK: vmov.i8	[[Q2:q[0-9]+]], #0x1
; CHECK: vneg.s8	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
; CHECK: vsub.i8	[[Q1]], [[Q1]], [[Q2]]
; CHECK: vcnt.8		[[Q1]], [[Q1]]
; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
  %a = load <16 x i8>, <16 x i8>* %p
  %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
  store <16 x i8> %tmp, <16 x i8>* %p
  ret void
}

; cttz on <1 x i16> with the second operand set to i1 false.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v1i16(<1 x i16>* %p) {
; CHECK-LABEL: test_v1i16:
  %a = load <1 x i16>, <1 x i16>* %p
  %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false)
  store <1 x i16> %tmp, <1 x i16>* %p
  ret void
}

; cttz on <2 x i16> with the second operand set to i1 false.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v2i16(<2 x i16>* %p) {
; CHECK-LABEL: test_v2i16:
  %a = load <2 x i16>, <2 x i16>* %p
  %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false)
  store <2 x i16> %tmp, <2 x i16>* %p
  ret void
}

; cttz on <4 x i16> (one D register) with the second operand set to i1 false.
; Expected sequence: (x & -x) - 1 on 16-bit lanes, a byte-wise vcnt, then one
; vpaddl.u8 to sum the byte counts of each 16-bit lane.
define void @test_v4i16(<4 x i16>* %p) {
; CHECK-LABEL: test_v4i16:
; CHECK: vldr		[[D1:d[0-9]+]], [r0]
; CHECK: vmov.i16	[[D2:d[0-9]+]], #0x1
; CHECK: vneg.s16	[[D3:d[0-9]+]], [[D1]]
; CHECK: vand		[[D1]], [[D1]], [[D3]]
; CHECK: vsub.i16	[[D1]], [[D1]], [[D2]]
; CHECK: vcnt.8		[[D1]], [[D1]]
; CHECK: vpaddl.u8	[[D1]], [[D1]]
; CHECK: vstr		[[D1]], [r0]
  %a = load <4 x i16>, <4 x i16>* %p
  %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false)
  store <4 x i16> %tmp, <4 x i16>* %p
  ret void
}

; cttz on <8 x i16> (one Q register) with the second operand set to i1 false.
; Expected sequence: (x & -x) - 1 on 16-bit lanes, a byte-wise vcnt, then one
; vpaddl.u8 to sum the byte counts of each 16-bit lane. Q1 is first bound at
; the vneg source operand because the load pattern names the D-register pair.
define void @test_v8i16(<8 x i16>* %p) {
; CHECK-LABEL: test_v8i16:
; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
; CHECK: vmov.i16	[[Q2:q[0-9]+]], #0x1
; CHECK: vneg.s16	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
; CHECK: vsub.i16	[[Q1]], [[Q1]], [[Q2]]
; CHECK: vcnt.8		[[Q1]], [[Q1]]
; CHECK: vpaddl.u8	[[Q1]], [[Q1]]
; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
  %a = load <8 x i16>, <8 x i16>* %p
  %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
  store <8 x i16> %tmp, <8 x i16>* %p
  ret void
}

; cttz on <1 x i32> with the second operand set to i1 false.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v1i32(<1 x i32>* %p) {
; CHECK-LABEL: test_v1i32:
  %a = load <1 x i32>, <1 x i32>* %p
  %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false)
  store <1 x i32> %tmp, <1 x i32>* %p
  ret void
}

; cttz on <2 x i32> (one D register) with the second operand set to i1 false.
; Expected sequence: (x & -x) - 1 on 32-bit lanes, a byte-wise vcnt, then
; vpaddl.u8 and vpaddl.u16 to sum the byte counts of each 32-bit lane.
define void @test_v2i32(<2 x i32>* %p) {
; CHECK-LABEL: test_v2i32:
; CHECK: vldr		[[D1:d[0-9]+]], [r0]
; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x1
; CHECK: vneg.s32	[[D3:d[0-9]+]], [[D1]]
; CHECK: vand		[[D1]], [[D1]], [[D3]]
; CHECK: vsub.i32	[[D1]], [[D1]], [[D2]]
; CHECK: vcnt.8		[[D1]], [[D1]]
; CHECK: vpaddl.u8	[[D1]], [[D1]]
; CHECK: vpaddl.u16	[[D1]], [[D1]]
; CHECK: vstr		[[D1]], [r0]
  %a = load <2 x i32>, <2 x i32>* %p
  %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
  store <2 x i32> %tmp, <2 x i32>* %p
  ret void
}

; cttz on <4 x i32> (one Q register) with the second operand set to i1 false.
; Expected sequence: (x & -x) - 1 on 32-bit lanes, a byte-wise vcnt, then
; vpaddl.u8 and vpaddl.u16 to sum the byte counts of each 32-bit lane. Q1 is
; first bound at the vneg source operand because the load pattern names the
; D-register pair.
define void @test_v4i32(<4 x i32>* %p) {
; CHECK-LABEL: test_v4i32:
; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x1
; CHECK: vneg.s32	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
; CHECK: vsub.i32	[[Q1]], [[Q1]], [[Q2]]
; CHECK: vcnt.8		[[Q1]], [[Q1]]
; CHECK: vpaddl.u8	[[Q1]], [[Q1]]
; CHECK: vpaddl.u16	[[Q1]], [[Q1]]
; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
  %a = load <4 x i32>, <4 x i32>* %p
  %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
  store <4 x i32> %tmp, <4 x i32>* %p
  ret void
}

; cttz on <1 x i64> (one D register) with the second operand set to i1 false.
; Expected sequence: the negation is written as 0 - x (vmov #0x0 + vsub.i64),
; the "- 1" as an add of the all-ones constant (vadd.i64), followed by a
; byte-wise vcnt and three vpaddl steps (u8, u16, u32) to sum the byte counts
; of the 64-bit lane.
define void @test_v1i64(<1 x i64>* %p) {
; CHECK-LABEL: test_v1i64:
; CHECK: vldr		[[D1:d[0-9]+]], [r0]
; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x0
; CHECK: vmov.i64	[[D3:d[0-9]+]], #0xffffffffffffffff
; CHECK: vsub.i64	[[D2]], [[D2]], [[D1]]
; CHECK: vand		[[D1]], [[D1]], [[D2]]
; CHECK: vadd.i64	[[D1]], [[D1]], [[D3]]
; CHECK: vcnt.8		[[D1]], [[D1]]
; CHECK: vpaddl.u8	[[D1]], [[D1]]
; CHECK: vpaddl.u16	[[D1]], [[D1]]
; CHECK: vpaddl.u32	[[D1]], [[D1]]
; CHECK: vstr		[[D1]], [r0]
  %a = load <1 x i64>, <1 x i64>* %p
  %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false)
  store <1 x i64> %tmp, <1 x i64>* %p
  ret void
}

; cttz on <2 x i64> (one Q register) with the second operand set to i1 false.
; Expected sequence: 0 - x (vmov #0x0 + vsub.i64), vand, an add of the
; all-ones constant, a byte-wise vcnt and three vpaddl steps (u8, u16, u32).
; Q1 is first bound at the vsub source operand because the load pattern names
; the D-register pair.
define void @test_v2i64(<2 x i64>* %p) {
; CHECK-LABEL: test_v2i64:
; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x0
; CHECK: vmov.i64	[[Q3:q[0-9]+]], #0xffffffffffffffff
; CHECK: vsub.i64	[[Q2]], [[Q2]], [[Q1:q[0-9]+]]
; CHECK: vand		[[Q1]], [[Q1]], [[Q2]]
; CHECK: vadd.i64	[[Q1]], [[Q1]], [[Q3]]
; CHECK: vcnt.8		[[Q1]], [[Q1]]
; CHECK: vpaddl.u8	[[Q1]], [[Q1]]
; CHECK: vpaddl.u16	[[Q1]], [[Q1]]
; CHECK: vpaddl.u32	[[Q1]], [[Q1]]
; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
  %a = load <2 x i64>, <2 x i64>* %p
  %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
  store <2 x i64> %tmp, <2 x i64>* %p
  ret void
}

;------------------------------------------------------------------------------
; The remaining tests call the same intrinsics with the second operand set to
; i1 true.

; cttz on <1 x i8> with the second operand set to i1 true.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type. The label pattern ends in ':' like every other label in
; this file, so it matches the label definition itself.
define void @test_v1i8_zero_undef(<1 x i8>* %p) {
; CHECK-LABEL: test_v1i8_zero_undef:
  %a = load <1 x i8>, <1 x i8>* %p
  %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true)
  store <1 x i8> %tmp, <1 x i8>* %p
  ret void
}

; cttz on <2 x i8> with the second operand set to i1 true.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v2i8_zero_undef(<2 x i8>* %p) {
; CHECK-LABEL: test_v2i8_zero_undef:
  %a = load <2 x i8>, <2 x i8>* %p
  %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
  store <2 x i8> %tmp, <2 x i8>* %p
  ret void
}

; cttz on <4 x i8> with the second operand set to i1 true.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v4i8_zero_undef(<4 x i8>* %p) {
; CHECK-LABEL: test_v4i8_zero_undef:
  %a = load <4 x i8>, <4 x i8>* %p
  %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
  store <4 x i8> %tmp, <4 x i8>* %p
  ret void
}

; cttz on <8 x i8> (one D register) with the second operand set to i1 true.
; The expected instructions are the same popcount((x & -x) - 1) sequence that
; test_v8i8 expects for i1 false: vneg, vand, vsub of the splat of 1, vcnt.
define void @test_v8i8_zero_undef(<8 x i8>* %p) {
; CHECK-LABEL: test_v8i8_zero_undef:
; CHECK: vldr		[[D1:d[0-9]+]], [r0]
; CHECK: vmov.i8	[[D2:d[0-9]+]], #0x1
; CHECK: vneg.s8	[[D3:d[0-9]+]], [[D1]]
; CHECK: vand		[[D1]], [[D1]], [[D3]]
; CHECK: vsub.i8	[[D1]], [[D1]], [[D2]]
; CHECK: vcnt.8		[[D1]], [[D1]]
; CHECK: vstr		[[D1]], [r0]
  %a = load <8 x i8>, <8 x i8>* %p
  %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
  store <8 x i8> %tmp, <8 x i8>* %p
  ret void
}

; cttz on <16 x i8> (one Q register) with the second operand set to i1 true.
; The expected instructions are the same popcount((x & -x) - 1) sequence that
; test_v16i8 expects for i1 false. Q1 is first bound at the vneg source
; operand because the load pattern names the D-register pair.
define void @test_v16i8_zero_undef(<16 x i8>* %p) {
; CHECK-LABEL: test_v16i8_zero_undef:
; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
; CHECK: vmov.i8	[[Q2:q[0-9]+]], #0x1
; CHECK: vneg.s8	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
; CHECK: vsub.i8	[[Q1]], [[Q1]], [[Q2]]
; CHECK: vcnt.8		[[Q1]], [[Q1]]
; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
  %a = load <16 x i8>, <16 x i8>* %p
  %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
  store <16 x i8> %tmp, <16 x i8>* %p
  ret void
}

; cttz on <1 x i16> with the second operand set to i1 true.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v1i16_zero_undef(<1 x i16>* %p) {
; CHECK-LABEL: test_v1i16_zero_undef:
  %a = load <1 x i16>, <1 x i16>* %p
  %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true)
  store <1 x i16> %tmp, <1 x i16>* %p
  ret void
}

; cttz on <2 x i16> with the second operand set to i1 true.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v2i16_zero_undef(<2 x i16>* %p) {
; CHECK-LABEL: test_v2i16_zero_undef:
  %a = load <2 x i16>, <2 x i16>* %p
  %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
  store <2 x i16> %tmp, <2 x i16>* %p
  ret void
}

; cttz on <4 x i16> (one D register) with the second operand set to i1 true.
; Expected sequence is 15 - clz(x & -x) per 16-bit lane: vneg + vand form
; x & -x, vclz counts leading zeros, and vsub subtracts that count from the
; splat of 0xf.
define void @test_v4i16_zero_undef(<4 x i16>* %p) {
; CHECK-LABEL: test_v4i16_zero_undef:
; CHECK: vldr		[[D1:d[0-9]+]], [r0]
; CHECK: vneg.s16	[[D2:d[0-9]+]], [[D1]]
; CHECK: vand		[[D1]], [[D1]], [[D2]]
; CHECK: vmov.i16	[[D3:d[0-9]+]], #0xf
; CHECK: vclz.i16	[[D1]], [[D1]]
; CHECK: vsub.i16	[[D1]], [[D3]], [[D1]]
; CHECK: vstr		[[D1]], [r0]
  %a = load <4 x i16>, <4 x i16>* %p
  %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
  store <4 x i16> %tmp, <4 x i16>* %p
  ret void
}

; cttz on <8 x i16> (one Q register) with the second operand set to i1 true.
; Expected sequence is 15 - clz(x & -x) per 16-bit lane, on Q registers.
; Q1 is first bound at the vneg source operand because the load pattern names
; the D-register pair.
define void @test_v8i16_zero_undef(<8 x i16>* %p) {
; CHECK-LABEL: test_v8i16_zero_undef:
; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
; CHECK: vneg.s16	[[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
; CHECK: vand		[[Q1]], [[Q1]], [[Q2]]
; CHECK: vmov.i16	[[Q3:q[0-9]+]], #0xf
; CHECK: vclz.i16	[[Q1]], [[Q1]]
; CHECK: vsub.i16	[[Q1]], [[Q3]], [[Q1]]
; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
  %a = load <8 x i16>, <8 x i16>* %p
  %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
  store <8 x i16> %tmp, <8 x i16>* %p
  ret void
}

; cttz on <1 x i32> with the second operand set to i1 true.
; Only the function label is matched; no instruction sequence is pinned for
; this vector type.
define void @test_v1i32_zero_undef(<1 x i32>* %p) {
; CHECK-LABEL: test_v1i32_zero_undef:
  %a = load <1 x i32>, <1 x i32>* %p
  %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true)
  store <1 x i32> %tmp, <1 x i32>* %p
  ret void
}

; cttz on <2 x i32> (one D register) with the second operand set to i1 true.
; Expected sequence is 31 - clz(x & -x) per 32-bit lane: vneg + vand form
; x & -x, vclz counts leading zeros, and vsub subtracts that count from the
; splat of 0x1f.
define void @test_v2i32_zero_undef(<2 x i32>* %p) {
; CHECK-LABEL: test_v2i32_zero_undef:
; CHECK: vldr		[[D1:d[0-9]+]], [r0]
; CHECK: vneg.s32	[[D2:d[0-9]+]], [[D1]]
; CHECK: vand		[[D1]], [[D1]], [[D2]]
; CHECK: vmov.i32	[[D3:d[0-9]+]], #0x1f
; CHECK: vclz.i32	[[D1]], [[D1]]
; CHECK: vsub.i32	[[D1]], [[D3]], [[D1]]
; CHECK: vstr		[[D1]], [r0]
  %a = load <2 x i32>, <2 x i32>* %p
  %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
  store <2 x i32> %tmp, <2 x i32>* %p
  ret void
}

; cttz on <4 x i32> (one Q register) with the second operand set to i1 true.
; Expected sequence is 31 - clz(x & -x) per 32-bit lane, on Q registers.
; Q1 is first bound at the vneg source operand because the load pattern names
; the D-register pair.
define void @test_v4i32_zero_undef(<4 x i32>* %p) {
; CHECK-LABEL: test_v4i32_zero_undef:
; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
; CHECK: vneg.s32	[[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
; CHECK: vand		[[Q1]], [[Q1]], [[Q2]]
; CHECK: vmov.i32	[[Q3:q[0-9]+]], #0x1f
; CHECK: vclz.i32	[[Q1]], [[Q1]]
; CHECK: vsub.i32	[[Q1]], [[Q3]], [[Q1]]
; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
  %a = load <4 x i32>, <4 x i32>* %p
  %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
  store <4 x i32> %tmp, <4 x i32>* %p
  ret void
}

; cttz on <1 x i64> (one D register) with the second operand set to i1 true.
; The expected instructions are the same as test_v1i64 expects for i1 false:
; 0 - x (vmov #0x0 + vsub.i64), vand, an add of the all-ones constant, a
; byte-wise vcnt and three vpaddl steps (u8, u16, u32).
define void @test_v1i64_zero_undef(<1 x i64>* %p) {
; CHECK-LABEL: test_v1i64_zero_undef:
; CHECK: vldr		[[D1:d[0-9]+]], [r0]
; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x0
; CHECK: vmov.i64	[[D3:d[0-9]+]], #0xffffffffffffffff
; CHECK: vsub.i64	[[D2]], [[D2]], [[D1]]
; CHECK: vand		[[D1]], [[D1]], [[D2]]
; CHECK: vadd.i64	[[D1]], [[D1]], [[D3]]
; CHECK: vcnt.8		[[D1]], [[D1]]
; CHECK: vpaddl.u8	[[D1]], [[D1]]
; CHECK: vpaddl.u16	[[D1]], [[D1]]
; CHECK: vpaddl.u32	[[D1]], [[D1]]
; CHECK: vstr		[[D1]], [r0]
  %a = load <1 x i64>, <1 x i64>* %p
  %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true)
  store <1 x i64> %tmp, <1 x i64>* %p
  ret void
}

; cttz on <2 x i64> (one Q register) with the second operand set to i1 true.
; The expected instructions are the same as test_v2i64 expects for i1 false:
; 0 - x (vmov #0x0 + vsub.i64), vand, an add of the all-ones constant, a
; byte-wise vcnt and three vpaddl steps (u8, u16, u32). Q1 is first bound at
; the vsub source operand because the load pattern names the D-register pair.
define void @test_v2i64_zero_undef(<2 x i64>* %p) {
; CHECK-LABEL: test_v2i64_zero_undef:
; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x0
; CHECK: vmov.i64	[[Q3:q[0-9]+]], #0xffffffffffffffff
; CHECK: vsub.i64	[[Q2]], [[Q2]], [[Q1:q[0-9]+]]
; CHECK: vand		[[Q1]], [[Q1]], [[Q2]]
; CHECK: vadd.i64	[[Q1]], [[Q1]], [[Q3]]
; CHECK: vcnt.8		[[Q1]], [[Q1]]
; CHECK: vpaddl.u8	[[Q1]], [[Q1]]
; CHECK: vpaddl.u16	[[Q1]], [[Q1]]
; CHECK: vpaddl.u32	[[Q1]], [[Q1]]
; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
  %a = load <2 x i64>, <2 x i64>* %p
  %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
  store <2 x i64> %tmp, <2 x i64>* %p
  ret void
}
