; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD

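; Each test computes ctlz on a 256-bit vector. The plain variants pass i1 0,
; so a zero input is defined and must yield the element width; the 'u'
; variants pass i1 -1, declaring zero inputs undef, which lets the backend
; drop its zero-input guards.
;
; v4i64: AVX1 scalarizes to BSR, with a CMOV substituting 127 on a zero
; input so that the final XOR with 63 yields 64 (127 ^ 63 = 64 = ctlz(0)).
; AVX2 builds per-nibble counts via a VPSHUFB lookup table and widens them
; step by step; AVX512CD lowers directly to VPLZCNTQ.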
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
; AVX1-NEXT:    bsrq %rax, %rax
; AVX1-NEXT:    movl $127, %ecx
; AVX1-NEXT:    cmoveq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rax
; AVX1-NEXT:    bsrq %rax, %rax
; AVX1-NEXT:    cmoveq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    bsrq %rax, %rax
; AVX1-NEXT:    cmoveq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm3
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    bsrq %rax, %rax
; AVX1-NEXT:    cmoveq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VLCD-LABEL: testv4i64:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT:    retq

  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

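; As above, but zero inputs are undef, so the AVX1 lowering can omit the
; CMOV guard after each BSR.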
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
; AVX1-NEXT:    bsrq %rax, %rax
; AVX1-NEXT:    vmovq %rax, %xmm2
; AVX1-NEXT:    vmovq %xmm1, %rax
; AVX1-NEXT:    bsrq %rax, %rax
; AVX1-NEXT:    vmovq %rax, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    bsrq %rax, %rax
; AVX1-NEXT:    vmovq %rax, %xmm3
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    bsrq %rax, %rax
; AVX1-NEXT:    vmovq %rax, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VLCD-LABEL: testv4i64u:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT:    retq

  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

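; v8i32: AVX1 scalarizes to BSRL, with a CMOV substituting 63 on a zero
; input (63 ^ 31 = 32 = ctlz(0)); the 'u' variant again drops the CMOVs.
; AVX2 extends the nibble-LUT sums up to dword granularity; AVX512CD uses
; VPLZCNTD directly.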
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrd $1, %xmm1, %eax
; AVX1-NEXT:    bsrl %eax, %ecx
; AVX1-NEXT:    movl $63, %eax
; AVX1-NEXT:    cmovel %eax, %ecx
; AVX1-NEXT:    vmovd %xmm1, %edx
; AVX1-NEXT:    bsrl %edx, %edx
; AVX1-NEXT:    cmovel %eax, %edx
; AVX1-NEXT:    vmovd %edx, %xmm2
; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    bsrl %ecx, %ecx
; AVX1-NEXT:    cmovel %eax, %ecx
; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
; AVX1-NEXT:    bsrl %ecx, %ecx
; AVX1-NEXT:    cmovel %eax, %ecx
; AVX1-NEXT:    vpinsrd $3, %ecx, %xmm2, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX1-NEXT:    bsrl %ecx, %ecx
; AVX1-NEXT:    cmovel %eax, %ecx
; AVX1-NEXT:    vmovd %xmm0, %edx
; AVX1-NEXT:    bsrl %edx, %edx
; AVX1-NEXT:    cmovel %eax, %edx
; AVX1-NEXT:    vmovd %edx, %xmm3
; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
; AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX1-NEXT:    bsrl %ecx, %ecx
; AVX1-NEXT:    cmovel %eax, %ecx
; AVX1-NEXT:    vpinsrd $2, %ecx, %xmm3, %xmm3
; AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
; AVX1-NEXT:    bsrl %ecx, %ecx
; AVX1-NEXT:    cmovel %eax, %ecx
; AVX1-NEXT:    vpinsrd $3, %ecx, %xmm3, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VLCD-LABEL: testv8i32:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT:    retq

  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpextrd $1, %xmm1, %eax
; AVX1-NEXT:    bsrl %eax, %eax
; AVX1-NEXT:    vmovd %xmm1, %ecx
; AVX1-NEXT:    bsrl %ecx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm2
; AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $2, %xmm1, %eax
; AVX1-NEXT:    bsrl %eax, %eax
; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
; AVX1-NEXT:    bsrl %eax, %eax
; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
; AVX1-NEXT:    bsrl %eax, %eax
; AVX1-NEXT:    vmovd %xmm0, %ecx
; AVX1-NEXT:    bsrl %ecx, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm3
; AVX1-NEXT:    vpinsrd $1, %eax, %xmm3, %xmm3
; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
; AVX1-NEXT:    bsrl %eax, %eax
; AVX1-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm3
; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
; AVX1-NEXT:    bsrl %eax, %eax
; AVX1-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VLCD-LABEL: testv8i32u:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT:    retq

  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

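; v16i16: AVX1 and AVX2 both use the nibble LUT, then merge byte counts into
; word counts as clz16 = clz8(hi) + (hi == 0 ? clz8(lo) : 0), which is exact
; because clz8(0) = 8. AVX512 zero-extends to dword elements, applies
; VPLZCNTD, narrows back with VPMOVDW, and subtracts the 16 bits of added
; width.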
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: testv16i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %ymm4, %ymm4, %ymm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: testv16i16u:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

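; v32i8: a byte's count comes straight from the nibble LUT:
; clz8 = lut[hi] + (hi == 0 ? lut[lo] : 0), with lut[0] = 4; e.g. for 0x0a,
; lut[0] + lut[0xa] = 4 + 0 = 4. AVX512 zero-extends each 128-bit half to
; dword elements, applies VPLZCNTD, and subtracts 24 from each result.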
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VLCD-LABEL: testv32i8:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vextracti32x4 $1, %ymm0, %xmm1
; AVX512VLCD-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VLCD-NEXT:    vplzcntd %zmm1, %zmm1
; AVX512VLCD-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512VLCD-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLCD-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VLCD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512VLCD-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VLCD-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512VLCD-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv32i8:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512CD-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT:    vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512CD-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512CD-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VLCD-LABEL: testv32i8u:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vextracti32x4 $1, %ymm0, %xmm1
; AVX512VLCD-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VLCD-NEXT:    vplzcntd %zmm1, %zmm1
; AVX512VLCD-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512VLCD-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512VLCD-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VLCD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512VLCD-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VLCD-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512VLCD-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv32i8u:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512CD-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT:    vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512CD-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512CD-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

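; The fold* tests call ctlz on vector constants, so the whole computation
; constant-folds and each function reduces to loading a precomputed vector.
; Out-of-range immediates such as i8 256 appear to be truncated on parsing
; (256 -> 0), which matches the expected leading count of 8.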
define <4 x i64> @foldv4i64() nounwind {
; AVX-LABEL: foldv4i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; AVX-NEXT:    retq
;
; AVX512VLCD-LABEL: foldv4i64:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [55,0,64,56]
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i64:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; AVX-LABEL: foldv4i64u:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; AVX-NEXT:    retq
;
; AVX512VLCD-LABEL: foldv4i64u:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [55,0,64,56]
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i64u:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; AVX-LABEL: foldv8i32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX-NEXT:    retq
;
; AVX512VLCD-LABEL: foldv8i32:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i32:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; AVX-LABEL: foldv8i32u:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX-NEXT:    retq
;
; AVX512VLCD-LABEL: foldv8i32u:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i32u:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; AVX-LABEL: foldv16i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX-NEXT:    retq
;
; AVX512VLCD-LABEL: foldv16i16:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i16:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; AVX-LABEL: foldv16i16u:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX-NEXT:    retq
;
; AVX512VLCD-LABEL: foldv16i16u:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i16u:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; AVX-LABEL: foldv32i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX-NEXT:    retq
;
; AVX512VLCD-LABEL: foldv32i8:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: foldv32i8:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; AVX-LABEL: foldv32i8u:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX-NEXT:    retq
;
; AVX512VLCD-LABEL: foldv32i8u:
; AVX512VLCD:       ## BB#0:
; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: foldv32i8u:
; AVX512CD:       ## BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1)
