1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=NOBW,AVX
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=NOBW,AVX
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=AVX512VLBWDQ
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512vl | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512VLCD
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd | FileCheck %s --check-prefixes=NOBW,AVX512,AVX512CD
12;
13; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
15
; testv2i64: calls @llvm.ctlz.v2i64 on the <2 x i64> argument with the i1
; operand set to 0 and returns the result unchanged. The comment lines inside
; the function are FileCheck assertions, one group per check prefix named on
; the RUN lines; per the NOTE at the top of the file they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them with that script rather
; than editing them by hand.
16define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
17; SSE2-LABEL: testv2i64:
18; SSE2:       # %bb.0:
19; SSE2-NEXT:    movdqa %xmm0, %xmm1
20; SSE2-NEXT:    psrlq $1, %xmm1
21; SSE2-NEXT:    por %xmm0, %xmm1
22; SSE2-NEXT:    movdqa %xmm1, %xmm0
23; SSE2-NEXT:    psrlq $2, %xmm0
24; SSE2-NEXT:    por %xmm1, %xmm0
25; SSE2-NEXT:    movdqa %xmm0, %xmm1
26; SSE2-NEXT:    psrlq $4, %xmm1
27; SSE2-NEXT:    por %xmm0, %xmm1
28; SSE2-NEXT:    movdqa %xmm1, %xmm0
29; SSE2-NEXT:    psrlq $8, %xmm0
30; SSE2-NEXT:    por %xmm1, %xmm0
31; SSE2-NEXT:    movdqa %xmm0, %xmm1
32; SSE2-NEXT:    psrlq $16, %xmm1
33; SSE2-NEXT:    por %xmm0, %xmm1
34; SSE2-NEXT:    movdqa %xmm1, %xmm0
35; SSE2-NEXT:    psrlq $32, %xmm0
36; SSE2-NEXT:    por %xmm1, %xmm0
37; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
38; SSE2-NEXT:    pxor %xmm0, %xmm1
39; SSE2-NEXT:    movdqa %xmm1, %xmm0
40; SSE2-NEXT:    psrlw $1, %xmm0
41; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
42; SSE2-NEXT:    psubb %xmm0, %xmm1
43; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
44; SSE2-NEXT:    movdqa %xmm1, %xmm2
45; SSE2-NEXT:    pand %xmm0, %xmm2
46; SSE2-NEXT:    psrlw $2, %xmm1
47; SSE2-NEXT:    pand %xmm0, %xmm1
48; SSE2-NEXT:    paddb %xmm2, %xmm1
49; SSE2-NEXT:    movdqa %xmm1, %xmm2
50; SSE2-NEXT:    psrlw $4, %xmm2
51; SSE2-NEXT:    paddb %xmm1, %xmm2
52; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
53; SSE2-NEXT:    pxor %xmm0, %xmm0
54; SSE2-NEXT:    psadbw %xmm2, %xmm0
55; SSE2-NEXT:    retq
56;
57; SSE3-LABEL: testv2i64:
58; SSE3:       # %bb.0:
59; SSE3-NEXT:    movdqa %xmm0, %xmm1
60; SSE3-NEXT:    psrlq $1, %xmm1
61; SSE3-NEXT:    por %xmm0, %xmm1
62; SSE3-NEXT:    movdqa %xmm1, %xmm0
63; SSE3-NEXT:    psrlq $2, %xmm0
64; SSE3-NEXT:    por %xmm1, %xmm0
65; SSE3-NEXT:    movdqa %xmm0, %xmm1
66; SSE3-NEXT:    psrlq $4, %xmm1
67; SSE3-NEXT:    por %xmm0, %xmm1
68; SSE3-NEXT:    movdqa %xmm1, %xmm0
69; SSE3-NEXT:    psrlq $8, %xmm0
70; SSE3-NEXT:    por %xmm1, %xmm0
71; SSE3-NEXT:    movdqa %xmm0, %xmm1
72; SSE3-NEXT:    psrlq $16, %xmm1
73; SSE3-NEXT:    por %xmm0, %xmm1
74; SSE3-NEXT:    movdqa %xmm1, %xmm0
75; SSE3-NEXT:    psrlq $32, %xmm0
76; SSE3-NEXT:    por %xmm1, %xmm0
77; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
78; SSE3-NEXT:    pxor %xmm0, %xmm1
79; SSE3-NEXT:    movdqa %xmm1, %xmm0
80; SSE3-NEXT:    psrlw $1, %xmm0
81; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
82; SSE3-NEXT:    psubb %xmm0, %xmm1
83; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
84; SSE3-NEXT:    movdqa %xmm1, %xmm2
85; SSE3-NEXT:    pand %xmm0, %xmm2
86; SSE3-NEXT:    psrlw $2, %xmm1
87; SSE3-NEXT:    pand %xmm0, %xmm1
88; SSE3-NEXT:    paddb %xmm2, %xmm1
89; SSE3-NEXT:    movdqa %xmm1, %xmm2
90; SSE3-NEXT:    psrlw $4, %xmm2
91; SSE3-NEXT:    paddb %xmm1, %xmm2
92; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
93; SSE3-NEXT:    pxor %xmm0, %xmm0
94; SSE3-NEXT:    psadbw %xmm2, %xmm0
95; SSE3-NEXT:    retq
96;
97; SSSE3-LABEL: testv2i64:
98; SSSE3:       # %bb.0:
99; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
100; SSSE3-NEXT:    movdqa %xmm2, %xmm3
101; SSSE3-NEXT:    pshufb %xmm0, %xmm3
102; SSSE3-NEXT:    movdqa %xmm0, %xmm1
103; SSSE3-NEXT:    psrlw $4, %xmm1
104; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
105; SSSE3-NEXT:    pxor %xmm4, %xmm4
106; SSSE3-NEXT:    pshufb %xmm1, %xmm2
107; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
108; SSSE3-NEXT:    pand %xmm3, %xmm1
109; SSSE3-NEXT:    paddb %xmm2, %xmm1
110; SSSE3-NEXT:    movdqa %xmm0, %xmm2
111; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
112; SSSE3-NEXT:    psrlw $8, %xmm2
113; SSSE3-NEXT:    pand %xmm1, %xmm2
114; SSSE3-NEXT:    psrlw $8, %xmm1
115; SSSE3-NEXT:    paddw %xmm2, %xmm1
116; SSSE3-NEXT:    movdqa %xmm0, %xmm2
117; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm2
118; SSSE3-NEXT:    psrld $16, %xmm2
119; SSSE3-NEXT:    pand %xmm1, %xmm2
120; SSSE3-NEXT:    psrld $16, %xmm1
121; SSSE3-NEXT:    paddd %xmm2, %xmm1
122; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
123; SSSE3-NEXT:    psrlq $32, %xmm0
124; SSSE3-NEXT:    pand %xmm1, %xmm0
125; SSSE3-NEXT:    psrlq $32, %xmm1
126; SSSE3-NEXT:    paddq %xmm0, %xmm1
127; SSSE3-NEXT:    movdqa %xmm1, %xmm0
128; SSSE3-NEXT:    retq
129;
130; SSE41-LABEL: testv2i64:
131; SSE41:       # %bb.0:
132; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
133; SSE41-NEXT:    movdqa %xmm2, %xmm3
134; SSE41-NEXT:    pshufb %xmm0, %xmm3
135; SSE41-NEXT:    movdqa %xmm0, %xmm1
136; SSE41-NEXT:    psrlw $4, %xmm1
137; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
138; SSE41-NEXT:    pxor %xmm4, %xmm4
139; SSE41-NEXT:    pshufb %xmm1, %xmm2
140; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
141; SSE41-NEXT:    pand %xmm3, %xmm1
142; SSE41-NEXT:    paddb %xmm2, %xmm1
143; SSE41-NEXT:    movdqa %xmm0, %xmm2
144; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
145; SSE41-NEXT:    psrlw $8, %xmm2
146; SSE41-NEXT:    pand %xmm1, %xmm2
147; SSE41-NEXT:    psrlw $8, %xmm1
148; SSE41-NEXT:    paddw %xmm2, %xmm1
149; SSE41-NEXT:    movdqa %xmm0, %xmm2
150; SSE41-NEXT:    pcmpeqw %xmm4, %xmm2
151; SSE41-NEXT:    psrld $16, %xmm2
152; SSE41-NEXT:    pand %xmm1, %xmm2
153; SSE41-NEXT:    psrld $16, %xmm1
154; SSE41-NEXT:    paddd %xmm2, %xmm1
155; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
156; SSE41-NEXT:    psrlq $32, %xmm0
157; SSE41-NEXT:    pand %xmm1, %xmm0
158; SSE41-NEXT:    psrlq $32, %xmm1
159; SSE41-NEXT:    paddq %xmm0, %xmm1
160; SSE41-NEXT:    movdqa %xmm1, %xmm0
161; SSE41-NEXT:    retq
162;
163; AVX-LABEL: testv2i64:
164; AVX:       # %bb.0:
165; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
166; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
167; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
168; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
169; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
170; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
171; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
172; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
173; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
174; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
175; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
176; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
177; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
178; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
179; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm2
180; AVX-NEXT:    vpsrld $16, %xmm2, %xmm2
181; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
182; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
183; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
184; AVX-NEXT:    vpcmpeqd %xmm4, %xmm0, %xmm0
185; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
186; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
187; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm1
188; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
189; AVX-NEXT:    retq
190;
191; AVX512VLBWDQ-LABEL: testv2i64:
192; AVX512VLBWDQ:       # %bb.0:
193; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
194; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
195; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
196; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
197; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
198; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
199; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
200; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
201; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
202; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
203; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
204; AVX512VLBWDQ-NEXT:    vpand %xmm2, %xmm1, %xmm2
205; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
206; AVX512VLBWDQ-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
207; AVX512VLBWDQ-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm2
208; AVX512VLBWDQ-NEXT:    vpsrld $16, %xmm2, %xmm2
209; AVX512VLBWDQ-NEXT:    vpand %xmm2, %xmm1, %xmm2
210; AVX512VLBWDQ-NEXT:    vpsrld $16, %xmm1, %xmm1
211; AVX512VLBWDQ-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
212; AVX512VLBWDQ-NEXT:    vpcmpeqd %xmm4, %xmm0, %xmm0
213; AVX512VLBWDQ-NEXT:    vpsrlq $32, %xmm0, %xmm0
214; AVX512VLBWDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
215; AVX512VLBWDQ-NEXT:    vpsrlq $32, %xmm1, %xmm1
216; AVX512VLBWDQ-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
217; AVX512VLBWDQ-NEXT:    retq
218;
219; AVX512VLCD-LABEL: testv2i64:
220; AVX512VLCD:       # %bb.0:
221; AVX512VLCD-NEXT:    vplzcntq %xmm0, %xmm0
222; AVX512VLCD-NEXT:    retq
223;
224; AVX512CD-LABEL: testv2i64:
225; AVX512CD:       # %bb.0:
226; AVX512CD-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
227; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
228; AVX512CD-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
229; AVX512CD-NEXT:    vzeroupper
230; AVX512CD-NEXT:    retq
231;
232; X32-SSE-LABEL: testv2i64:
233; X32-SSE:       # %bb.0:
234; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
235; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
236; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
237; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
238; X32-SSE-NEXT:    psrlw $4, %xmm1
239; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
240; X32-SSE-NEXT:    pxor %xmm4, %xmm4
241; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
242; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
243; X32-SSE-NEXT:    pand %xmm3, %xmm1
244; X32-SSE-NEXT:    paddb %xmm2, %xmm1
245; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
246; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
247; X32-SSE-NEXT:    psrlw $8, %xmm2
248; X32-SSE-NEXT:    pand %xmm1, %xmm2
249; X32-SSE-NEXT:    psrlw $8, %xmm1
250; X32-SSE-NEXT:    paddw %xmm2, %xmm1
251; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
252; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm2
253; X32-SSE-NEXT:    psrld $16, %xmm2
254; X32-SSE-NEXT:    pand %xmm1, %xmm2
255; X32-SSE-NEXT:    psrld $16, %xmm1
256; X32-SSE-NEXT:    paddd %xmm2, %xmm1
257; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm0
258; X32-SSE-NEXT:    psrlq $32, %xmm0
259; X32-SSE-NEXT:    pand %xmm1, %xmm0
260; X32-SSE-NEXT:    psrlq $32, %xmm1
261; X32-SSE-NEXT:    paddq %xmm0, %xmm1
262; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
263; X32-SSE-NEXT:    retl
264
  ; The i1 operand is 0 here. NOTE(review): per the LLVM LangRef this operand
  ; is the ctlz "zero input is undefined/poison" flag, so 0 requests a defined
  ; result for zero elements - confirm against the LangRef for the LLVM
  ; version in use.
265  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
266  ret <2 x i64> %out
267}
268
; testv2i64u: calls @llvm.ctlz.v2i64 on the <2 x i64> argument with the i1
; operand set to -1 and returns the result unchanged. The comment lines inside
; the function are FileCheck assertions, one group per check prefix named on
; the RUN lines; per the NOTE at the top of the file they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them with that script rather
; than editing them by hand.
269define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
270; SSE2-LABEL: testv2i64u:
271; SSE2:       # %bb.0:
272; SSE2-NEXT:    movdqa %xmm0, %xmm1
273; SSE2-NEXT:    psrlq $1, %xmm1
274; SSE2-NEXT:    por %xmm0, %xmm1
275; SSE2-NEXT:    movdqa %xmm1, %xmm0
276; SSE2-NEXT:    psrlq $2, %xmm0
277; SSE2-NEXT:    por %xmm1, %xmm0
278; SSE2-NEXT:    movdqa %xmm0, %xmm1
279; SSE2-NEXT:    psrlq $4, %xmm1
280; SSE2-NEXT:    por %xmm0, %xmm1
281; SSE2-NEXT:    movdqa %xmm1, %xmm0
282; SSE2-NEXT:    psrlq $8, %xmm0
283; SSE2-NEXT:    por %xmm1, %xmm0
284; SSE2-NEXT:    movdqa %xmm0, %xmm1
285; SSE2-NEXT:    psrlq $16, %xmm1
286; SSE2-NEXT:    por %xmm0, %xmm1
287; SSE2-NEXT:    movdqa %xmm1, %xmm0
288; SSE2-NEXT:    psrlq $32, %xmm0
289; SSE2-NEXT:    por %xmm1, %xmm0
290; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
291; SSE2-NEXT:    pxor %xmm0, %xmm1
292; SSE2-NEXT:    movdqa %xmm1, %xmm0
293; SSE2-NEXT:    psrlw $1, %xmm0
294; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
295; SSE2-NEXT:    psubb %xmm0, %xmm1
296; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
297; SSE2-NEXT:    movdqa %xmm1, %xmm2
298; SSE2-NEXT:    pand %xmm0, %xmm2
299; SSE2-NEXT:    psrlw $2, %xmm1
300; SSE2-NEXT:    pand %xmm0, %xmm1
301; SSE2-NEXT:    paddb %xmm2, %xmm1
302; SSE2-NEXT:    movdqa %xmm1, %xmm2
303; SSE2-NEXT:    psrlw $4, %xmm2
304; SSE2-NEXT:    paddb %xmm1, %xmm2
305; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
306; SSE2-NEXT:    pxor %xmm0, %xmm0
307; SSE2-NEXT:    psadbw %xmm2, %xmm0
308; SSE2-NEXT:    retq
309;
310; SSE3-LABEL: testv2i64u:
311; SSE3:       # %bb.0:
312; SSE3-NEXT:    movdqa %xmm0, %xmm1
313; SSE3-NEXT:    psrlq $1, %xmm1
314; SSE3-NEXT:    por %xmm0, %xmm1
315; SSE3-NEXT:    movdqa %xmm1, %xmm0
316; SSE3-NEXT:    psrlq $2, %xmm0
317; SSE3-NEXT:    por %xmm1, %xmm0
318; SSE3-NEXT:    movdqa %xmm0, %xmm1
319; SSE3-NEXT:    psrlq $4, %xmm1
320; SSE3-NEXT:    por %xmm0, %xmm1
321; SSE3-NEXT:    movdqa %xmm1, %xmm0
322; SSE3-NEXT:    psrlq $8, %xmm0
323; SSE3-NEXT:    por %xmm1, %xmm0
324; SSE3-NEXT:    movdqa %xmm0, %xmm1
325; SSE3-NEXT:    psrlq $16, %xmm1
326; SSE3-NEXT:    por %xmm0, %xmm1
327; SSE3-NEXT:    movdqa %xmm1, %xmm0
328; SSE3-NEXT:    psrlq $32, %xmm0
329; SSE3-NEXT:    por %xmm1, %xmm0
330; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
331; SSE3-NEXT:    pxor %xmm0, %xmm1
332; SSE3-NEXT:    movdqa %xmm1, %xmm0
333; SSE3-NEXT:    psrlw $1, %xmm0
334; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
335; SSE3-NEXT:    psubb %xmm0, %xmm1
336; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
337; SSE3-NEXT:    movdqa %xmm1, %xmm2
338; SSE3-NEXT:    pand %xmm0, %xmm2
339; SSE3-NEXT:    psrlw $2, %xmm1
340; SSE3-NEXT:    pand %xmm0, %xmm1
341; SSE3-NEXT:    paddb %xmm2, %xmm1
342; SSE3-NEXT:    movdqa %xmm1, %xmm2
343; SSE3-NEXT:    psrlw $4, %xmm2
344; SSE3-NEXT:    paddb %xmm1, %xmm2
345; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
346; SSE3-NEXT:    pxor %xmm0, %xmm0
347; SSE3-NEXT:    psadbw %xmm2, %xmm0
348; SSE3-NEXT:    retq
349;
350; SSSE3-LABEL: testv2i64u:
351; SSSE3:       # %bb.0:
352; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
353; SSSE3-NEXT:    movdqa %xmm2, %xmm3
354; SSSE3-NEXT:    pshufb %xmm0, %xmm3
355; SSSE3-NEXT:    movdqa %xmm0, %xmm1
356; SSSE3-NEXT:    psrlw $4, %xmm1
357; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
358; SSSE3-NEXT:    pxor %xmm4, %xmm4
359; SSSE3-NEXT:    pshufb %xmm1, %xmm2
360; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
361; SSSE3-NEXT:    pand %xmm3, %xmm1
362; SSSE3-NEXT:    paddb %xmm2, %xmm1
363; SSSE3-NEXT:    movdqa %xmm0, %xmm2
364; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
365; SSSE3-NEXT:    psrlw $8, %xmm2
366; SSSE3-NEXT:    pand %xmm1, %xmm2
367; SSSE3-NEXT:    psrlw $8, %xmm1
368; SSSE3-NEXT:    paddw %xmm2, %xmm1
369; SSSE3-NEXT:    movdqa %xmm0, %xmm2
370; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm2
371; SSSE3-NEXT:    psrld $16, %xmm2
372; SSSE3-NEXT:    pand %xmm1, %xmm2
373; SSSE3-NEXT:    psrld $16, %xmm1
374; SSSE3-NEXT:    paddd %xmm2, %xmm1
375; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
376; SSSE3-NEXT:    psrlq $32, %xmm0
377; SSSE3-NEXT:    pand %xmm1, %xmm0
378; SSSE3-NEXT:    psrlq $32, %xmm1
379; SSSE3-NEXT:    paddq %xmm0, %xmm1
380; SSSE3-NEXT:    movdqa %xmm1, %xmm0
381; SSSE3-NEXT:    retq
382;
383; SSE41-LABEL: testv2i64u:
384; SSE41:       # %bb.0:
385; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
386; SSE41-NEXT:    movdqa %xmm2, %xmm3
387; SSE41-NEXT:    pshufb %xmm0, %xmm3
388; SSE41-NEXT:    movdqa %xmm0, %xmm1
389; SSE41-NEXT:    psrlw $4, %xmm1
390; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
391; SSE41-NEXT:    pxor %xmm4, %xmm4
392; SSE41-NEXT:    pshufb %xmm1, %xmm2
393; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
394; SSE41-NEXT:    pand %xmm3, %xmm1
395; SSE41-NEXT:    paddb %xmm2, %xmm1
396; SSE41-NEXT:    movdqa %xmm0, %xmm2
397; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
398; SSE41-NEXT:    psrlw $8, %xmm2
399; SSE41-NEXT:    pand %xmm1, %xmm2
400; SSE41-NEXT:    psrlw $8, %xmm1
401; SSE41-NEXT:    paddw %xmm2, %xmm1
402; SSE41-NEXT:    movdqa %xmm0, %xmm2
403; SSE41-NEXT:    pcmpeqw %xmm4, %xmm2
404; SSE41-NEXT:    psrld $16, %xmm2
405; SSE41-NEXT:    pand %xmm1, %xmm2
406; SSE41-NEXT:    psrld $16, %xmm1
407; SSE41-NEXT:    paddd %xmm2, %xmm1
408; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
409; SSE41-NEXT:    psrlq $32, %xmm0
410; SSE41-NEXT:    pand %xmm1, %xmm0
411; SSE41-NEXT:    psrlq $32, %xmm1
412; SSE41-NEXT:    paddq %xmm0, %xmm1
413; SSE41-NEXT:    movdqa %xmm1, %xmm0
414; SSE41-NEXT:    retq
415;
416; AVX-LABEL: testv2i64u:
417; AVX:       # %bb.0:
418; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
419; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
420; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
421; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
422; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
423; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
424; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
425; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
426; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
427; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
428; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
429; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
430; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
431; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
432; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm2
433; AVX-NEXT:    vpsrld $16, %xmm2, %xmm2
434; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
435; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
436; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
437; AVX-NEXT:    vpcmpeqd %xmm4, %xmm0, %xmm0
438; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
439; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
440; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm1
441; AVX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
442; AVX-NEXT:    retq
443;
444; AVX512VLBWDQ-LABEL: testv2i64u:
445; AVX512VLBWDQ:       # %bb.0:
446; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
447; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
448; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
449; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
450; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
451; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
452; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
453; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
454; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
455; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
456; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
457; AVX512VLBWDQ-NEXT:    vpand %xmm2, %xmm1, %xmm2
458; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
459; AVX512VLBWDQ-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
460; AVX512VLBWDQ-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm2
461; AVX512VLBWDQ-NEXT:    vpsrld $16, %xmm2, %xmm2
462; AVX512VLBWDQ-NEXT:    vpand %xmm2, %xmm1, %xmm2
463; AVX512VLBWDQ-NEXT:    vpsrld $16, %xmm1, %xmm1
464; AVX512VLBWDQ-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
465; AVX512VLBWDQ-NEXT:    vpcmpeqd %xmm4, %xmm0, %xmm0
466; AVX512VLBWDQ-NEXT:    vpsrlq $32, %xmm0, %xmm0
467; AVX512VLBWDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
468; AVX512VLBWDQ-NEXT:    vpsrlq $32, %xmm1, %xmm1
469; AVX512VLBWDQ-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
470; AVX512VLBWDQ-NEXT:    retq
471;
472; AVX512VLCD-LABEL: testv2i64u:
473; AVX512VLCD:       # %bb.0:
474; AVX512VLCD-NEXT:    vplzcntq %xmm0, %xmm0
475; AVX512VLCD-NEXT:    retq
476;
477; AVX512CD-LABEL: testv2i64u:
478; AVX512CD:       # %bb.0:
479; AVX512CD-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
480; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
481; AVX512CD-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
482; AVX512CD-NEXT:    vzeroupper
483; AVX512CD-NEXT:    retq
484;
485; X32-SSE-LABEL: testv2i64u:
486; X32-SSE:       # %bb.0:
487; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
488; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
489; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
490; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
491; X32-SSE-NEXT:    psrlw $4, %xmm1
492; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
493; X32-SSE-NEXT:    pxor %xmm4, %xmm4
494; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
495; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
496; X32-SSE-NEXT:    pand %xmm3, %xmm1
497; X32-SSE-NEXT:    paddb %xmm2, %xmm1
498; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
499; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
500; X32-SSE-NEXT:    psrlw $8, %xmm2
501; X32-SSE-NEXT:    pand %xmm1, %xmm2
502; X32-SSE-NEXT:    psrlw $8, %xmm1
503; X32-SSE-NEXT:    paddw %xmm2, %xmm1
504; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
505; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm2
506; X32-SSE-NEXT:    psrld $16, %xmm2
507; X32-SSE-NEXT:    pand %xmm1, %xmm2
508; X32-SSE-NEXT:    psrld $16, %xmm1
509; X32-SSE-NEXT:    paddd %xmm2, %xmm1
510; X32-SSE-NEXT:    pcmpeqd %xmm4, %xmm0
511; X32-SSE-NEXT:    psrlq $32, %xmm0
512; X32-SSE-NEXT:    pand %xmm1, %xmm0
513; X32-SSE-NEXT:    psrlq $32, %xmm1
514; X32-SSE-NEXT:    paddq %xmm0, %xmm1
515; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
516; X32-SSE-NEXT:    retl
517
  ; The i1 operand is -1 here, i.e. the single bit is set. NOTE(review): per
  ; the LLVM LangRef this operand is the ctlz "zero input is undefined/poison"
  ; flag, so this variant lets the backend assume no element is zero - confirm
  ; against the LangRef for the LLVM version in use.
518  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1)
519  ret <2 x i64> %out
520}
521
; testv4i32: calls @llvm.ctlz.v4i32 on the <4 x i32> argument with the i1
; operand set to 0 and returns the result unchanged. The comment lines inside
; the function are FileCheck assertions, one group per check prefix named on
; the RUN lines; per the NOTE at the top of the file they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them with that script rather
; than editing them by hand.
522define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
523; SSE2-LABEL: testv4i32:
524; SSE2:       # %bb.0:
525; SSE2-NEXT:    movdqa %xmm0, %xmm1
526; SSE2-NEXT:    psrld $1, %xmm1
527; SSE2-NEXT:    por %xmm0, %xmm1
528; SSE2-NEXT:    movdqa %xmm1, %xmm0
529; SSE2-NEXT:    psrld $2, %xmm0
530; SSE2-NEXT:    por %xmm1, %xmm0
531; SSE2-NEXT:    movdqa %xmm0, %xmm1
532; SSE2-NEXT:    psrld $4, %xmm1
533; SSE2-NEXT:    por %xmm0, %xmm1
534; SSE2-NEXT:    movdqa %xmm1, %xmm0
535; SSE2-NEXT:    psrld $8, %xmm0
536; SSE2-NEXT:    por %xmm1, %xmm0
537; SSE2-NEXT:    movdqa %xmm0, %xmm1
538; SSE2-NEXT:    psrld $16, %xmm1
539; SSE2-NEXT:    por %xmm0, %xmm1
540; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
541; SSE2-NEXT:    pxor %xmm1, %xmm2
542; SSE2-NEXT:    movdqa %xmm2, %xmm0
543; SSE2-NEXT:    psrlw $1, %xmm0
544; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
545; SSE2-NEXT:    psubb %xmm0, %xmm2
546; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
547; SSE2-NEXT:    movdqa %xmm2, %xmm1
548; SSE2-NEXT:    pand %xmm0, %xmm1
549; SSE2-NEXT:    psrlw $2, %xmm2
550; SSE2-NEXT:    pand %xmm0, %xmm2
551; SSE2-NEXT:    paddb %xmm1, %xmm2
552; SSE2-NEXT:    movdqa %xmm2, %xmm0
553; SSE2-NEXT:    psrlw $4, %xmm0
554; SSE2-NEXT:    paddb %xmm2, %xmm0
555; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
556; SSE2-NEXT:    pxor %xmm1, %xmm1
557; SSE2-NEXT:    movdqa %xmm0, %xmm2
558; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
559; SSE2-NEXT:    psadbw %xmm1, %xmm2
560; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
561; SSE2-NEXT:    psadbw %xmm1, %xmm0
562; SSE2-NEXT:    packuswb %xmm2, %xmm0
563; SSE2-NEXT:    retq
564;
565; SSE3-LABEL: testv4i32:
566; SSE3:       # %bb.0:
567; SSE3-NEXT:    movdqa %xmm0, %xmm1
568; SSE3-NEXT:    psrld $1, %xmm1
569; SSE3-NEXT:    por %xmm0, %xmm1
570; SSE3-NEXT:    movdqa %xmm1, %xmm0
571; SSE3-NEXT:    psrld $2, %xmm0
572; SSE3-NEXT:    por %xmm1, %xmm0
573; SSE3-NEXT:    movdqa %xmm0, %xmm1
574; SSE3-NEXT:    psrld $4, %xmm1
575; SSE3-NEXT:    por %xmm0, %xmm1
576; SSE3-NEXT:    movdqa %xmm1, %xmm0
577; SSE3-NEXT:    psrld $8, %xmm0
578; SSE3-NEXT:    por %xmm1, %xmm0
579; SSE3-NEXT:    movdqa %xmm0, %xmm1
580; SSE3-NEXT:    psrld $16, %xmm1
581; SSE3-NEXT:    por %xmm0, %xmm1
582; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
583; SSE3-NEXT:    pxor %xmm1, %xmm2
584; SSE3-NEXT:    movdqa %xmm2, %xmm0
585; SSE3-NEXT:    psrlw $1, %xmm0
586; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
587; SSE3-NEXT:    psubb %xmm0, %xmm2
588; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
589; SSE3-NEXT:    movdqa %xmm2, %xmm1
590; SSE3-NEXT:    pand %xmm0, %xmm1
591; SSE3-NEXT:    psrlw $2, %xmm2
592; SSE3-NEXT:    pand %xmm0, %xmm2
593; SSE3-NEXT:    paddb %xmm1, %xmm2
594; SSE3-NEXT:    movdqa %xmm2, %xmm0
595; SSE3-NEXT:    psrlw $4, %xmm0
596; SSE3-NEXT:    paddb %xmm2, %xmm0
597; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
598; SSE3-NEXT:    pxor %xmm1, %xmm1
599; SSE3-NEXT:    movdqa %xmm0, %xmm2
600; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
601; SSE3-NEXT:    psadbw %xmm1, %xmm2
602; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
603; SSE3-NEXT:    psadbw %xmm1, %xmm0
604; SSE3-NEXT:    packuswb %xmm2, %xmm0
605; SSE3-NEXT:    retq
606;
607; SSSE3-LABEL: testv4i32:
608; SSSE3:       # %bb.0:
609; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
610; SSSE3-NEXT:    movdqa %xmm2, %xmm3
611; SSSE3-NEXT:    pshufb %xmm0, %xmm3
612; SSSE3-NEXT:    movdqa %xmm0, %xmm1
613; SSSE3-NEXT:    psrlw $4, %xmm1
614; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
615; SSSE3-NEXT:    pxor %xmm4, %xmm4
616; SSSE3-NEXT:    pshufb %xmm1, %xmm2
617; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
618; SSSE3-NEXT:    pand %xmm3, %xmm1
619; SSSE3-NEXT:    paddb %xmm2, %xmm1
620; SSSE3-NEXT:    movdqa %xmm0, %xmm2
621; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
622; SSSE3-NEXT:    psrlw $8, %xmm2
623; SSSE3-NEXT:    pand %xmm1, %xmm2
624; SSSE3-NEXT:    psrlw $8, %xmm1
625; SSSE3-NEXT:    paddw %xmm2, %xmm1
626; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm0
627; SSSE3-NEXT:    psrld $16, %xmm0
628; SSSE3-NEXT:    pand %xmm1, %xmm0
629; SSSE3-NEXT:    psrld $16, %xmm1
630; SSSE3-NEXT:    paddd %xmm0, %xmm1
631; SSSE3-NEXT:    movdqa %xmm1, %xmm0
632; SSSE3-NEXT:    retq
633;
634; SSE41-LABEL: testv4i32:
635; SSE41:       # %bb.0:
636; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
637; SSE41-NEXT:    movdqa %xmm2, %xmm3
638; SSE41-NEXT:    pshufb %xmm0, %xmm3
639; SSE41-NEXT:    movdqa %xmm0, %xmm1
640; SSE41-NEXT:    psrlw $4, %xmm1
641; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
642; SSE41-NEXT:    pxor %xmm4, %xmm4
643; SSE41-NEXT:    pshufb %xmm1, %xmm2
644; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
645; SSE41-NEXT:    pand %xmm3, %xmm1
646; SSE41-NEXT:    paddb %xmm2, %xmm1
647; SSE41-NEXT:    movdqa %xmm0, %xmm2
648; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
649; SSE41-NEXT:    psrlw $8, %xmm2
650; SSE41-NEXT:    pand %xmm1, %xmm2
651; SSE41-NEXT:    psrlw $8, %xmm1
652; SSE41-NEXT:    paddw %xmm2, %xmm1
653; SSE41-NEXT:    pcmpeqw %xmm4, %xmm0
654; SSE41-NEXT:    psrld $16, %xmm0
655; SSE41-NEXT:    pand %xmm1, %xmm0
656; SSE41-NEXT:    psrld $16, %xmm1
657; SSE41-NEXT:    paddd %xmm0, %xmm1
658; SSE41-NEXT:    movdqa %xmm1, %xmm0
659; SSE41-NEXT:    retq
660;
661; AVX-LABEL: testv4i32:
662; AVX:       # %bb.0:
663; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
664; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
665; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
666; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
667; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
668; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
669; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
670; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
671; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
672; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
673; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
674; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
675; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
676; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
677; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
678; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
679; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
680; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
681; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
682; AVX-NEXT:    retq
683;
684; AVX512VLBWDQ-LABEL: testv4i32:
685; AVX512VLBWDQ:       # %bb.0:
686; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
687; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
688; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
689; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
690; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
691; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
692; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
693; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
694; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
695; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
696; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
697; AVX512VLBWDQ-NEXT:    vpand %xmm2, %xmm1, %xmm2
698; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
699; AVX512VLBWDQ-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
700; AVX512VLBWDQ-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
701; AVX512VLBWDQ-NEXT:    vpsrld $16, %xmm0, %xmm0
702; AVX512VLBWDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
703; AVX512VLBWDQ-NEXT:    vpsrld $16, %xmm1, %xmm1
704; AVX512VLBWDQ-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
705; AVX512VLBWDQ-NEXT:    retq
706;
707; AVX512VLCD-LABEL: testv4i32:
708; AVX512VLCD:       # %bb.0:
709; AVX512VLCD-NEXT:    vplzcntd %xmm0, %xmm0
710; AVX512VLCD-NEXT:    retq
711;
712; AVX512CD-LABEL: testv4i32:
713; AVX512CD:       # %bb.0:
714; AVX512CD-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
715; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
716; AVX512CD-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
717; AVX512CD-NEXT:    vzeroupper
718; AVX512CD-NEXT:    retq
719;
720; X32-SSE-LABEL: testv4i32:
721; X32-SSE:       # %bb.0:
722; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
723; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
724; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
725; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
726; X32-SSE-NEXT:    psrlw $4, %xmm1
727; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
728; X32-SSE-NEXT:    pxor %xmm4, %xmm4
729; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
730; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
731; X32-SSE-NEXT:    pand %xmm3, %xmm1
732; X32-SSE-NEXT:    paddb %xmm2, %xmm1
733; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
734; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
735; X32-SSE-NEXT:    psrlw $8, %xmm2
736; X32-SSE-NEXT:    pand %xmm1, %xmm2
737; X32-SSE-NEXT:    psrlw $8, %xmm1
738; X32-SSE-NEXT:    paddw %xmm2, %xmm1
739; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm0
740; X32-SSE-NEXT:    psrld $16, %xmm0
741; X32-SSE-NEXT:    pand %xmm1, %xmm0
742; X32-SSE-NEXT:    psrld $16, %xmm1
743; X32-SSE-NEXT:    paddd %xmm0, %xmm1
744; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
745; X32-SSE-NEXT:    retl
746
  ; The i1 operand is 0 here. NOTE(review): per the LLVM LangRef this operand
  ; is the ctlz "zero input is undefined/poison" flag, so 0 requests a defined
  ; result for zero elements - confirm against the LangRef for the LLVM
  ; version in use.
747  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0)
748  ret <4 x i32> %out
749}
750
; Test testv4i32u -- lowering of @llvm.ctlz.v4i32 on a <4 x i32> argument with
; the intrinsic's second operand set to i1 -1 (true). Per the LLVM LangRef that
; flag marks a zero input element as producing an undefined/poison result
; (NOTE(review) -- confirm the wording against the LangRef version in use).
; Expected code per run configuration, as recorded by the checks below --
;   * the SSE2 and SSE3 runs expect a psrld/por chain, a pxor against all-ones,
;     and byte-wise arithmetic ending in psadbw/packuswb;
;   * the SSSE3, SSE41, AVX, AVX512VLBWDQ and X32-SSE runs expect a pshufb
;     lookup in the table [4,3,2,2,1,1,1,1,0,...] that is then widened from
;     bytes to words to dwords;
;   * the AVX512VLCD run expects a single vplzcntd on xmm0, and the AVX512CD run
;     expects vplzcntd on zmm0 followed by vzeroupper.
; The assertions are autogenerated and order-sensitive; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
751define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
752; SSE2-LABEL: testv4i32u:
753; SSE2:       # %bb.0:
754; SSE2-NEXT:    movdqa %xmm0, %xmm1
755; SSE2-NEXT:    psrld $1, %xmm1
756; SSE2-NEXT:    por %xmm0, %xmm1
757; SSE2-NEXT:    movdqa %xmm1, %xmm0
758; SSE2-NEXT:    psrld $2, %xmm0
759; SSE2-NEXT:    por %xmm1, %xmm0
760; SSE2-NEXT:    movdqa %xmm0, %xmm1
761; SSE2-NEXT:    psrld $4, %xmm1
762; SSE2-NEXT:    por %xmm0, %xmm1
763; SSE2-NEXT:    movdqa %xmm1, %xmm0
764; SSE2-NEXT:    psrld $8, %xmm0
765; SSE2-NEXT:    por %xmm1, %xmm0
766; SSE2-NEXT:    movdqa %xmm0, %xmm1
767; SSE2-NEXT:    psrld $16, %xmm1
768; SSE2-NEXT:    por %xmm0, %xmm1
769; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
770; SSE2-NEXT:    pxor %xmm1, %xmm2
771; SSE2-NEXT:    movdqa %xmm2, %xmm0
772; SSE2-NEXT:    psrlw $1, %xmm0
773; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
774; SSE2-NEXT:    psubb %xmm0, %xmm2
775; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
776; SSE2-NEXT:    movdqa %xmm2, %xmm1
777; SSE2-NEXT:    pand %xmm0, %xmm1
778; SSE2-NEXT:    psrlw $2, %xmm2
779; SSE2-NEXT:    pand %xmm0, %xmm2
780; SSE2-NEXT:    paddb %xmm1, %xmm2
781; SSE2-NEXT:    movdqa %xmm2, %xmm0
782; SSE2-NEXT:    psrlw $4, %xmm0
783; SSE2-NEXT:    paddb %xmm2, %xmm0
784; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
785; SSE2-NEXT:    pxor %xmm1, %xmm1
786; SSE2-NEXT:    movdqa %xmm0, %xmm2
787; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
788; SSE2-NEXT:    psadbw %xmm1, %xmm2
789; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
790; SSE2-NEXT:    psadbw %xmm1, %xmm0
791; SSE2-NEXT:    packuswb %xmm2, %xmm0
792; SSE2-NEXT:    retq
793;
794; SSE3-LABEL: testv4i32u:
795; SSE3:       # %bb.0:
796; SSE3-NEXT:    movdqa %xmm0, %xmm1
797; SSE3-NEXT:    psrld $1, %xmm1
798; SSE3-NEXT:    por %xmm0, %xmm1
799; SSE3-NEXT:    movdqa %xmm1, %xmm0
800; SSE3-NEXT:    psrld $2, %xmm0
801; SSE3-NEXT:    por %xmm1, %xmm0
802; SSE3-NEXT:    movdqa %xmm0, %xmm1
803; SSE3-NEXT:    psrld $4, %xmm1
804; SSE3-NEXT:    por %xmm0, %xmm1
805; SSE3-NEXT:    movdqa %xmm1, %xmm0
806; SSE3-NEXT:    psrld $8, %xmm0
807; SSE3-NEXT:    por %xmm1, %xmm0
808; SSE3-NEXT:    movdqa %xmm0, %xmm1
809; SSE3-NEXT:    psrld $16, %xmm1
810; SSE3-NEXT:    por %xmm0, %xmm1
811; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
812; SSE3-NEXT:    pxor %xmm1, %xmm2
813; SSE3-NEXT:    movdqa %xmm2, %xmm0
814; SSE3-NEXT:    psrlw $1, %xmm0
815; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
816; SSE3-NEXT:    psubb %xmm0, %xmm2
817; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
818; SSE3-NEXT:    movdqa %xmm2, %xmm1
819; SSE3-NEXT:    pand %xmm0, %xmm1
820; SSE3-NEXT:    psrlw $2, %xmm2
821; SSE3-NEXT:    pand %xmm0, %xmm2
822; SSE3-NEXT:    paddb %xmm1, %xmm2
823; SSE3-NEXT:    movdqa %xmm2, %xmm0
824; SSE3-NEXT:    psrlw $4, %xmm0
825; SSE3-NEXT:    paddb %xmm2, %xmm0
826; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
827; SSE3-NEXT:    pxor %xmm1, %xmm1
828; SSE3-NEXT:    movdqa %xmm0, %xmm2
829; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
830; SSE3-NEXT:    psadbw %xmm1, %xmm2
831; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
832; SSE3-NEXT:    psadbw %xmm1, %xmm0
833; SSE3-NEXT:    packuswb %xmm2, %xmm0
834; SSE3-NEXT:    retq
835;
836; SSSE3-LABEL: testv4i32u:
837; SSSE3:       # %bb.0:
838; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
839; SSSE3-NEXT:    movdqa %xmm2, %xmm3
840; SSSE3-NEXT:    pshufb %xmm0, %xmm3
841; SSSE3-NEXT:    movdqa %xmm0, %xmm1
842; SSSE3-NEXT:    psrlw $4, %xmm1
843; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
844; SSSE3-NEXT:    pxor %xmm4, %xmm4
845; SSSE3-NEXT:    pshufb %xmm1, %xmm2
846; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
847; SSSE3-NEXT:    pand %xmm3, %xmm1
848; SSSE3-NEXT:    paddb %xmm2, %xmm1
849; SSSE3-NEXT:    movdqa %xmm0, %xmm2
850; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm2
851; SSSE3-NEXT:    psrlw $8, %xmm2
852; SSSE3-NEXT:    pand %xmm1, %xmm2
853; SSSE3-NEXT:    psrlw $8, %xmm1
854; SSSE3-NEXT:    paddw %xmm2, %xmm1
855; SSSE3-NEXT:    pcmpeqw %xmm4, %xmm0
856; SSSE3-NEXT:    psrld $16, %xmm0
857; SSSE3-NEXT:    pand %xmm1, %xmm0
858; SSSE3-NEXT:    psrld $16, %xmm1
859; SSSE3-NEXT:    paddd %xmm0, %xmm1
860; SSSE3-NEXT:    movdqa %xmm1, %xmm0
861; SSSE3-NEXT:    retq
862;
863; SSE41-LABEL: testv4i32u:
864; SSE41:       # %bb.0:
865; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
866; SSE41-NEXT:    movdqa %xmm2, %xmm3
867; SSE41-NEXT:    pshufb %xmm0, %xmm3
868; SSE41-NEXT:    movdqa %xmm0, %xmm1
869; SSE41-NEXT:    psrlw $4, %xmm1
870; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
871; SSE41-NEXT:    pxor %xmm4, %xmm4
872; SSE41-NEXT:    pshufb %xmm1, %xmm2
873; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
874; SSE41-NEXT:    pand %xmm3, %xmm1
875; SSE41-NEXT:    paddb %xmm2, %xmm1
876; SSE41-NEXT:    movdqa %xmm0, %xmm2
877; SSE41-NEXT:    pcmpeqb %xmm4, %xmm2
878; SSE41-NEXT:    psrlw $8, %xmm2
879; SSE41-NEXT:    pand %xmm1, %xmm2
880; SSE41-NEXT:    psrlw $8, %xmm1
881; SSE41-NEXT:    paddw %xmm2, %xmm1
882; SSE41-NEXT:    pcmpeqw %xmm4, %xmm0
883; SSE41-NEXT:    psrld $16, %xmm0
884; SSE41-NEXT:    pand %xmm1, %xmm0
885; SSE41-NEXT:    psrld $16, %xmm1
886; SSE41-NEXT:    paddd %xmm0, %xmm1
887; SSE41-NEXT:    movdqa %xmm1, %xmm0
888; SSE41-NEXT:    retq
889;
890; AVX-LABEL: testv4i32u:
891; AVX:       # %bb.0:
892; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
893; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
894; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
895; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
896; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
897; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
898; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
899; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
900; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
901; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
902; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
903; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
904; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
905; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
906; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
907; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
908; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
909; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
910; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
911; AVX-NEXT:    retq
912;
913; AVX512VLBWDQ-LABEL: testv4i32u:
914; AVX512VLBWDQ:       # %bb.0:
915; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
916; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
917; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
918; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
919; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
920; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
921; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
922; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
923; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
924; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
925; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
926; AVX512VLBWDQ-NEXT:    vpand %xmm2, %xmm1, %xmm2
927; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
928; AVX512VLBWDQ-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
929; AVX512VLBWDQ-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
930; AVX512VLBWDQ-NEXT:    vpsrld $16, %xmm0, %xmm0
931; AVX512VLBWDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
932; AVX512VLBWDQ-NEXT:    vpsrld $16, %xmm1, %xmm1
933; AVX512VLBWDQ-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
934; AVX512VLBWDQ-NEXT:    retq
935;
936; AVX512VLCD-LABEL: testv4i32u:
937; AVX512VLCD:       # %bb.0:
938; AVX512VLCD-NEXT:    vplzcntd %xmm0, %xmm0
939; AVX512VLCD-NEXT:    retq
940;
941; AVX512CD-LABEL: testv4i32u:
942; AVX512CD:       # %bb.0:
943; AVX512CD-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
944; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
945; AVX512CD-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
946; AVX512CD-NEXT:    vzeroupper
947; AVX512CD-NEXT:    retq
948;
949; X32-SSE-LABEL: testv4i32u:
950; X32-SSE:       # %bb.0:
951; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
952; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
953; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
954; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
955; X32-SSE-NEXT:    psrlw $4, %xmm1
956; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
957; X32-SSE-NEXT:    pxor %xmm4, %xmm4
958; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
959; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
960; X32-SSE-NEXT:    pand %xmm3, %xmm1
961; X32-SSE-NEXT:    paddb %xmm2, %xmm1
962; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
963; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm2
964; X32-SSE-NEXT:    psrlw $8, %xmm2
965; X32-SSE-NEXT:    pand %xmm1, %xmm2
966; X32-SSE-NEXT:    psrlw $8, %xmm1
967; X32-SSE-NEXT:    paddw %xmm2, %xmm1
968; X32-SSE-NEXT:    pcmpeqw %xmm4, %xmm0
969; X32-SSE-NEXT:    psrld $16, %xmm0
970; X32-SSE-NEXT:    pand %xmm1, %xmm0
971; X32-SSE-NEXT:    psrld $16, %xmm1
972; X32-SSE-NEXT:    paddd %xmm0, %xmm1
973; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
974; X32-SSE-NEXT:    retl
975
976  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1)
977  ret <4 x i32> %out
978}
979
; Test testv8i16 -- lowering of @llvm.ctlz.v8i16 on a <8 x i16> argument with
; the intrinsic's second operand set to i1 0 (false). Per the LLVM LangRef a
; false flag means a zero input element has a defined result
; (NOTE(review) -- confirm the wording against the LangRef version in use).
; Expected code per run configuration, as recorded by the checks below --
;   * the SSE2 and SSE3 runs expect a psrlw/por chain, a pxor against all-ones,
;     and byte-wise arithmetic ending in psllw/paddb/psrlw by 8;
;   * the SSSE3, SSE41, AVX, AVX512VLBWDQ and X32-SSE runs expect a pshufb
;     lookup in the table [4,3,2,2,1,1,1,1,0,...] that is then widened from
;     bytes to words;
;   * the AVX512VLCD and AVX512CD runs expect vpmovzxwd, vplzcntd, vpmovdw and
;     a vpsubw with a constant-pool operand.
; The assertions are autogenerated and order-sensitive; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
980define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
981; SSE2-LABEL: testv8i16:
982; SSE2:       # %bb.0:
983; SSE2-NEXT:    movdqa %xmm0, %xmm1
984; SSE2-NEXT:    psrlw $1, %xmm1
985; SSE2-NEXT:    por %xmm0, %xmm1
986; SSE2-NEXT:    movdqa %xmm1, %xmm0
987; SSE2-NEXT:    psrlw $2, %xmm0
988; SSE2-NEXT:    por %xmm1, %xmm0
989; SSE2-NEXT:    movdqa %xmm0, %xmm1
990; SSE2-NEXT:    psrlw $4, %xmm1
991; SSE2-NEXT:    por %xmm0, %xmm1
992; SSE2-NEXT:    movdqa %xmm1, %xmm0
993; SSE2-NEXT:    psrlw $8, %xmm0
994; SSE2-NEXT:    por %xmm1, %xmm0
995; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
996; SSE2-NEXT:    pxor %xmm0, %xmm1
997; SSE2-NEXT:    movdqa %xmm1, %xmm0
998; SSE2-NEXT:    psrlw $1, %xmm0
999; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1000; SSE2-NEXT:    psubb %xmm0, %xmm1
1001; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1002; SSE2-NEXT:    movdqa %xmm1, %xmm2
1003; SSE2-NEXT:    pand %xmm0, %xmm2
1004; SSE2-NEXT:    psrlw $2, %xmm1
1005; SSE2-NEXT:    pand %xmm0, %xmm1
1006; SSE2-NEXT:    paddb %xmm2, %xmm1
1007; SSE2-NEXT:    movdqa %xmm1, %xmm2
1008; SSE2-NEXT:    psrlw $4, %xmm2
1009; SSE2-NEXT:    paddb %xmm1, %xmm2
1010; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1011; SSE2-NEXT:    movdqa %xmm2, %xmm0
1012; SSE2-NEXT:    psllw $8, %xmm0
1013; SSE2-NEXT:    paddb %xmm2, %xmm0
1014; SSE2-NEXT:    psrlw $8, %xmm0
1015; SSE2-NEXT:    retq
1016;
1017; SSE3-LABEL: testv8i16:
1018; SSE3:       # %bb.0:
1019; SSE3-NEXT:    movdqa %xmm0, %xmm1
1020; SSE3-NEXT:    psrlw $1, %xmm1
1021; SSE3-NEXT:    por %xmm0, %xmm1
1022; SSE3-NEXT:    movdqa %xmm1, %xmm0
1023; SSE3-NEXT:    psrlw $2, %xmm0
1024; SSE3-NEXT:    por %xmm1, %xmm0
1025; SSE3-NEXT:    movdqa %xmm0, %xmm1
1026; SSE3-NEXT:    psrlw $4, %xmm1
1027; SSE3-NEXT:    por %xmm0, %xmm1
1028; SSE3-NEXT:    movdqa %xmm1, %xmm0
1029; SSE3-NEXT:    psrlw $8, %xmm0
1030; SSE3-NEXT:    por %xmm1, %xmm0
1031; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
1032; SSE3-NEXT:    pxor %xmm0, %xmm1
1033; SSE3-NEXT:    movdqa %xmm1, %xmm0
1034; SSE3-NEXT:    psrlw $1, %xmm0
1035; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1036; SSE3-NEXT:    psubb %xmm0, %xmm1
1037; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1038; SSE3-NEXT:    movdqa %xmm1, %xmm2
1039; SSE3-NEXT:    pand %xmm0, %xmm2
1040; SSE3-NEXT:    psrlw $2, %xmm1
1041; SSE3-NEXT:    pand %xmm0, %xmm1
1042; SSE3-NEXT:    paddb %xmm2, %xmm1
1043; SSE3-NEXT:    movdqa %xmm1, %xmm2
1044; SSE3-NEXT:    psrlw $4, %xmm2
1045; SSE3-NEXT:    paddb %xmm1, %xmm2
1046; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
1047; SSE3-NEXT:    movdqa %xmm2, %xmm0
1048; SSE3-NEXT:    psllw $8, %xmm0
1049; SSE3-NEXT:    paddb %xmm2, %xmm0
1050; SSE3-NEXT:    psrlw $8, %xmm0
1051; SSE3-NEXT:    retq
1052;
1053; SSSE3-LABEL: testv8i16:
1054; SSSE3:       # %bb.0:
1055; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1056; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1057; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1058; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1059; SSSE3-NEXT:    psrlw $4, %xmm1
1060; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1061; SSSE3-NEXT:    pxor %xmm4, %xmm4
1062; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1063; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
1064; SSSE3-NEXT:    pand %xmm3, %xmm1
1065; SSSE3-NEXT:    paddb %xmm2, %xmm1
1066; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
1067; SSSE3-NEXT:    psrlw $8, %xmm0
1068; SSSE3-NEXT:    pand %xmm1, %xmm0
1069; SSSE3-NEXT:    psrlw $8, %xmm1
1070; SSSE3-NEXT:    paddw %xmm0, %xmm1
1071; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1072; SSSE3-NEXT:    retq
1073;
1074; SSE41-LABEL: testv8i16:
1075; SSE41:       # %bb.0:
1076; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1077; SSE41-NEXT:    movdqa %xmm2, %xmm3
1078; SSE41-NEXT:    pshufb %xmm0, %xmm3
1079; SSE41-NEXT:    movdqa %xmm0, %xmm1
1080; SSE41-NEXT:    psrlw $4, %xmm1
1081; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
1082; SSE41-NEXT:    pxor %xmm4, %xmm4
1083; SSE41-NEXT:    pshufb %xmm1, %xmm2
1084; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
1085; SSE41-NEXT:    pand %xmm3, %xmm1
1086; SSE41-NEXT:    paddb %xmm2, %xmm1
1087; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
1088; SSE41-NEXT:    psrlw $8, %xmm0
1089; SSE41-NEXT:    pand %xmm1, %xmm0
1090; SSE41-NEXT:    psrlw $8, %xmm1
1091; SSE41-NEXT:    paddw %xmm0, %xmm1
1092; SSE41-NEXT:    movdqa %xmm1, %xmm0
1093; SSE41-NEXT:    retq
1094;
1095; AVX-LABEL: testv8i16:
1096; AVX:       # %bb.0:
1097; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1098; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
1099; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
1100; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1101; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1102; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
1103; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
1104; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1105; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
1106; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
1107; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1108; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
1109; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
1110; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
1111; AVX-NEXT:    retq
1112;
1113; AVX512VLBWDQ-LABEL: testv8i16:
1114; AVX512VLBWDQ:       # %bb.0:
1115; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1116; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
1117; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
1118; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1119; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1120; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
1121; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
1122; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1123; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
1124; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
1125; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm0, %xmm0
1126; AVX512VLBWDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
1127; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
1128; AVX512VLBWDQ-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
1129; AVX512VLBWDQ-NEXT:    retq
1130;
1131; AVX512VLCD-LABEL: testv8i16:
1132; AVX512VLCD:       # %bb.0:
1133; AVX512VLCD-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1134; AVX512VLCD-NEXT:    vplzcntd %ymm0, %ymm0
1135; AVX512VLCD-NEXT:    vpmovdw %ymm0, %xmm0
1136; AVX512VLCD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1137; AVX512VLCD-NEXT:    vzeroupper
1138; AVX512VLCD-NEXT:    retq
1139;
1140; AVX512CD-LABEL: testv8i16:
1141; AVX512CD:       # %bb.0:
1142; AVX512CD-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1143; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
1144; AVX512CD-NEXT:    vpmovdw %zmm0, %ymm0
1145; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1146; AVX512CD-NEXT:    vzeroupper
1147; AVX512CD-NEXT:    retq
1148;
1149; X32-SSE-LABEL: testv8i16:
1150; X32-SSE:       # %bb.0:
1151; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1152; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
1153; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
1154; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1155; X32-SSE-NEXT:    psrlw $4, %xmm1
1156; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
1157; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1158; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
1159; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
1160; X32-SSE-NEXT:    pand %xmm3, %xmm1
1161; X32-SSE-NEXT:    paddb %xmm2, %xmm1
1162; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm0
1163; X32-SSE-NEXT:    psrlw $8, %xmm0
1164; X32-SSE-NEXT:    pand %xmm1, %xmm0
1165; X32-SSE-NEXT:    psrlw $8, %xmm1
1166; X32-SSE-NEXT:    paddw %xmm0, %xmm1
1167; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1168; X32-SSE-NEXT:    retl
1169  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
1170  ret <8 x i16> %out
1171}
1172
; Test testv8i16u -- lowering of @llvm.ctlz.v8i16 on a <8 x i16> argument with
; the intrinsic's second operand set to i1 -1 (true). Per the LLVM LangRef that
; flag marks a zero input element as producing an undefined/poison result
; (NOTE(review) -- confirm the wording against the LangRef version in use).
; Expected code per run configuration, as recorded by the checks below --
;   * the SSE2 and SSE3 runs expect a psrlw/por chain, a pxor against all-ones,
;     and byte-wise arithmetic ending in psllw/paddb/psrlw by 8;
;   * the SSSE3, SSE41, AVX, AVX512VLBWDQ and X32-SSE runs expect a pshufb
;     lookup in the table [4,3,2,2,1,1,1,1,0,...] that is then widened from
;     bytes to words;
;   * the AVX512VLCD and AVX512CD runs expect vpmovzxwd, vplzcntd, vpmovdw and
;     a vpsubw with a constant-pool operand.
; The assertions are autogenerated and order-sensitive; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1173define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
1174; SSE2-LABEL: testv8i16u:
1175; SSE2:       # %bb.0:
1176; SSE2-NEXT:    movdqa %xmm0, %xmm1
1177; SSE2-NEXT:    psrlw $1, %xmm1
1178; SSE2-NEXT:    por %xmm0, %xmm1
1179; SSE2-NEXT:    movdqa %xmm1, %xmm0
1180; SSE2-NEXT:    psrlw $2, %xmm0
1181; SSE2-NEXT:    por %xmm1, %xmm0
1182; SSE2-NEXT:    movdqa %xmm0, %xmm1
1183; SSE2-NEXT:    psrlw $4, %xmm1
1184; SSE2-NEXT:    por %xmm0, %xmm1
1185; SSE2-NEXT:    movdqa %xmm1, %xmm0
1186; SSE2-NEXT:    psrlw $8, %xmm0
1187; SSE2-NEXT:    por %xmm1, %xmm0
1188; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
1189; SSE2-NEXT:    pxor %xmm0, %xmm1
1190; SSE2-NEXT:    movdqa %xmm1, %xmm0
1191; SSE2-NEXT:    psrlw $1, %xmm0
1192; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1193; SSE2-NEXT:    psubb %xmm0, %xmm1
1194; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1195; SSE2-NEXT:    movdqa %xmm1, %xmm2
1196; SSE2-NEXT:    pand %xmm0, %xmm2
1197; SSE2-NEXT:    psrlw $2, %xmm1
1198; SSE2-NEXT:    pand %xmm0, %xmm1
1199; SSE2-NEXT:    paddb %xmm2, %xmm1
1200; SSE2-NEXT:    movdqa %xmm1, %xmm2
1201; SSE2-NEXT:    psrlw $4, %xmm2
1202; SSE2-NEXT:    paddb %xmm1, %xmm2
1203; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1204; SSE2-NEXT:    movdqa %xmm2, %xmm0
1205; SSE2-NEXT:    psllw $8, %xmm0
1206; SSE2-NEXT:    paddb %xmm2, %xmm0
1207; SSE2-NEXT:    psrlw $8, %xmm0
1208; SSE2-NEXT:    retq
1209;
1210; SSE3-LABEL: testv8i16u:
1211; SSE3:       # %bb.0:
1212; SSE3-NEXT:    movdqa %xmm0, %xmm1
1213; SSE3-NEXT:    psrlw $1, %xmm1
1214; SSE3-NEXT:    por %xmm0, %xmm1
1215; SSE3-NEXT:    movdqa %xmm1, %xmm0
1216; SSE3-NEXT:    psrlw $2, %xmm0
1217; SSE3-NEXT:    por %xmm1, %xmm0
1218; SSE3-NEXT:    movdqa %xmm0, %xmm1
1219; SSE3-NEXT:    psrlw $4, %xmm1
1220; SSE3-NEXT:    por %xmm0, %xmm1
1221; SSE3-NEXT:    movdqa %xmm1, %xmm0
1222; SSE3-NEXT:    psrlw $8, %xmm0
1223; SSE3-NEXT:    por %xmm1, %xmm0
1224; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
1225; SSE3-NEXT:    pxor %xmm0, %xmm1
1226; SSE3-NEXT:    movdqa %xmm1, %xmm0
1227; SSE3-NEXT:    psrlw $1, %xmm0
1228; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1229; SSE3-NEXT:    psubb %xmm0, %xmm1
1230; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1231; SSE3-NEXT:    movdqa %xmm1, %xmm2
1232; SSE3-NEXT:    pand %xmm0, %xmm2
1233; SSE3-NEXT:    psrlw $2, %xmm1
1234; SSE3-NEXT:    pand %xmm0, %xmm1
1235; SSE3-NEXT:    paddb %xmm2, %xmm1
1236; SSE3-NEXT:    movdqa %xmm1, %xmm2
1237; SSE3-NEXT:    psrlw $4, %xmm2
1238; SSE3-NEXT:    paddb %xmm1, %xmm2
1239; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
1240; SSE3-NEXT:    movdqa %xmm2, %xmm0
1241; SSE3-NEXT:    psllw $8, %xmm0
1242; SSE3-NEXT:    paddb %xmm2, %xmm0
1243; SSE3-NEXT:    psrlw $8, %xmm0
1244; SSE3-NEXT:    retq
1245;
1246; SSSE3-LABEL: testv8i16u:
1247; SSSE3:       # %bb.0:
1248; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1249; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1250; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1251; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1252; SSSE3-NEXT:    psrlw $4, %xmm1
1253; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1254; SSSE3-NEXT:    pxor %xmm4, %xmm4
1255; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1256; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm1
1257; SSSE3-NEXT:    pand %xmm3, %xmm1
1258; SSSE3-NEXT:    paddb %xmm2, %xmm1
1259; SSSE3-NEXT:    pcmpeqb %xmm4, %xmm0
1260; SSSE3-NEXT:    psrlw $8, %xmm0
1261; SSSE3-NEXT:    pand %xmm1, %xmm0
1262; SSSE3-NEXT:    psrlw $8, %xmm1
1263; SSSE3-NEXT:    paddw %xmm0, %xmm1
1264; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1265; SSSE3-NEXT:    retq
1266;
1267; SSE41-LABEL: testv8i16u:
1268; SSE41:       # %bb.0:
1269; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1270; SSE41-NEXT:    movdqa %xmm2, %xmm3
1271; SSE41-NEXT:    pshufb %xmm0, %xmm3
1272; SSE41-NEXT:    movdqa %xmm0, %xmm1
1273; SSE41-NEXT:    psrlw $4, %xmm1
1274; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
1275; SSE41-NEXT:    pxor %xmm4, %xmm4
1276; SSE41-NEXT:    pshufb %xmm1, %xmm2
1277; SSE41-NEXT:    pcmpeqb %xmm4, %xmm1
1278; SSE41-NEXT:    pand %xmm3, %xmm1
1279; SSE41-NEXT:    paddb %xmm2, %xmm1
1280; SSE41-NEXT:    pcmpeqb %xmm4, %xmm0
1281; SSE41-NEXT:    psrlw $8, %xmm0
1282; SSE41-NEXT:    pand %xmm1, %xmm0
1283; SSE41-NEXT:    psrlw $8, %xmm1
1284; SSE41-NEXT:    paddw %xmm0, %xmm1
1285; SSE41-NEXT:    movdqa %xmm1, %xmm0
1286; SSE41-NEXT:    retq
1287;
1288; AVX-LABEL: testv8i16u:
1289; AVX:       # %bb.0:
1290; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1291; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
1292; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
1293; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1294; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1295; AVX-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
1296; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
1297; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1298; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
1299; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
1300; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1301; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
1302; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
1303; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
1304; AVX-NEXT:    retq
1305;
1306; AVX512VLBWDQ-LABEL: testv8i16u:
1307; AVX512VLBWDQ:       # %bb.0:
1308; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1309; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
1310; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm3
1311; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1312; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1313; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm3, %xmm5
1314; AVX512VLBWDQ-NEXT:    vpand %xmm5, %xmm2, %xmm2
1315; AVX512VLBWDQ-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1316; AVX512VLBWDQ-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
1317; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
1318; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm0, %xmm0
1319; AVX512VLBWDQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
1320; AVX512VLBWDQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
1321; AVX512VLBWDQ-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
1322; AVX512VLBWDQ-NEXT:    retq
1323;
1324; AVX512VLCD-LABEL: testv8i16u:
1325; AVX512VLCD:       # %bb.0:
1326; AVX512VLCD-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1327; AVX512VLCD-NEXT:    vplzcntd %ymm0, %ymm0
1328; AVX512VLCD-NEXT:    vpmovdw %ymm0, %xmm0
1329; AVX512VLCD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1330; AVX512VLCD-NEXT:    vzeroupper
1331; AVX512VLCD-NEXT:    retq
1332;
1333; AVX512CD-LABEL: testv8i16u:
1334; AVX512CD:       # %bb.0:
1335; AVX512CD-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1336; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
1337; AVX512CD-NEXT:    vpmovdw %zmm0, %ymm0
1338; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1339; AVX512CD-NEXT:    vzeroupper
1340; AVX512CD-NEXT:    retq
1341;
1342; X32-SSE-LABEL: testv8i16u:
1343; X32-SSE:       # %bb.0:
1344; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1345; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
1346; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
1347; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1348; X32-SSE-NEXT:    psrlw $4, %xmm1
1349; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
1350; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1351; X32-SSE-NEXT:    pshufb %xmm1, %xmm2
1352; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm1
1353; X32-SSE-NEXT:    pand %xmm3, %xmm1
1354; X32-SSE-NEXT:    paddb %xmm2, %xmm1
1355; X32-SSE-NEXT:    pcmpeqb %xmm4, %xmm0
1356; X32-SSE-NEXT:    psrlw $8, %xmm0
1357; X32-SSE-NEXT:    pand %xmm1, %xmm0
1358; X32-SSE-NEXT:    psrlw $8, %xmm1
1359; X32-SSE-NEXT:    paddw %xmm0, %xmm1
1360; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1361; X32-SSE-NEXT:    retl
1362  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
1363  ret <8 x i16> %out
1364}
1365
; Test testv16i8 -- lowering of @llvm.ctlz.v16i8 on a <16 x i8> argument with
; the intrinsic's second operand set to i1 0 (false). Per the LLVM LangRef a
; false flag means a zero input element has a defined result
; (NOTE(review) -- confirm the wording against the LangRef version in use).
; Expected code per run configuration, as recorded by the checks below --
;   * the SSE2 and SSE3 runs expect a psrlw/pand/por chain, a pxor against
;     all-ones, and byte-wise arithmetic ending in a pand with the splat-15
;     mask held in xmm2;
;   * the SSSE3, SSE41, AVX, AVX512VLBWDQ and X32-SSE runs expect a pshufb
;     lookup in the table [4,3,2,2,1,1,1,1,0,...] on both nibbles of each byte;
;   * the runs sharing the AVX512 prefix expect vpmovzxbd to zmm0, vplzcntd,
;     vpmovdb and a vpsubb with a constant-pool operand.
; The assertions are autogenerated and order-sensitive; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1366define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
1367; SSE2-LABEL: testv16i8:
1368; SSE2:       # %bb.0:
1369; SSE2-NEXT:    movdqa %xmm0, %xmm1
1370; SSE2-NEXT:    psrlw $1, %xmm1
1371; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1372; SSE2-NEXT:    por %xmm0, %xmm1
1373; SSE2-NEXT:    movdqa %xmm1, %xmm0
1374; SSE2-NEXT:    psrlw $2, %xmm0
1375; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1376; SSE2-NEXT:    por %xmm1, %xmm0
1377; SSE2-NEXT:    movdqa %xmm0, %xmm1
1378; SSE2-NEXT:    psrlw $4, %xmm1
1379; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1380; SSE2-NEXT:    pand %xmm2, %xmm1
1381; SSE2-NEXT:    por %xmm0, %xmm1
1382; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
1383; SSE2-NEXT:    pxor %xmm1, %xmm3
1384; SSE2-NEXT:    movdqa %xmm3, %xmm0
1385; SSE2-NEXT:    psrlw $1, %xmm0
1386; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1387; SSE2-NEXT:    psubb %xmm0, %xmm3
1388; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1389; SSE2-NEXT:    movdqa %xmm3, %xmm1
1390; SSE2-NEXT:    pand %xmm0, %xmm1
1391; SSE2-NEXT:    psrlw $2, %xmm3
1392; SSE2-NEXT:    pand %xmm0, %xmm3
1393; SSE2-NEXT:    paddb %xmm1, %xmm3
1394; SSE2-NEXT:    movdqa %xmm3, %xmm0
1395; SSE2-NEXT:    psrlw $4, %xmm0
1396; SSE2-NEXT:    paddb %xmm3, %xmm0
1397; SSE2-NEXT:    pand %xmm2, %xmm0
1398; SSE2-NEXT:    retq
1399;
1400; SSE3-LABEL: testv16i8:
1401; SSE3:       # %bb.0:
1402; SSE3-NEXT:    movdqa %xmm0, %xmm1
1403; SSE3-NEXT:    psrlw $1, %xmm1
1404; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1405; SSE3-NEXT:    por %xmm0, %xmm1
1406; SSE3-NEXT:    movdqa %xmm1, %xmm0
1407; SSE3-NEXT:    psrlw $2, %xmm0
1408; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1409; SSE3-NEXT:    por %xmm1, %xmm0
1410; SSE3-NEXT:    movdqa %xmm0, %xmm1
1411; SSE3-NEXT:    psrlw $4, %xmm1
1412; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1413; SSE3-NEXT:    pand %xmm2, %xmm1
1414; SSE3-NEXT:    por %xmm0, %xmm1
1415; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
1416; SSE3-NEXT:    pxor %xmm1, %xmm3
1417; SSE3-NEXT:    movdqa %xmm3, %xmm0
1418; SSE3-NEXT:    psrlw $1, %xmm0
1419; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1420; SSE3-NEXT:    psubb %xmm0, %xmm3
1421; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1422; SSE3-NEXT:    movdqa %xmm3, %xmm1
1423; SSE3-NEXT:    pand %xmm0, %xmm1
1424; SSE3-NEXT:    psrlw $2, %xmm3
1425; SSE3-NEXT:    pand %xmm0, %xmm3
1426; SSE3-NEXT:    paddb %xmm1, %xmm3
1427; SSE3-NEXT:    movdqa %xmm3, %xmm0
1428; SSE3-NEXT:    psrlw $4, %xmm0
1429; SSE3-NEXT:    paddb %xmm3, %xmm0
1430; SSE3-NEXT:    pand %xmm2, %xmm0
1431; SSE3-NEXT:    retq
1432;
1433; SSSE3-LABEL: testv16i8:
1434; SSSE3:       # %bb.0:
1435; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1436; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1437; SSSE3-NEXT:    pshufb %xmm0, %xmm2
1438; SSSE3-NEXT:    psrlw $4, %xmm0
1439; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1440; SSSE3-NEXT:    pxor %xmm3, %xmm3
1441; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm3
1442; SSSE3-NEXT:    pand %xmm2, %xmm3
1443; SSSE3-NEXT:    pshufb %xmm0, %xmm1
1444; SSSE3-NEXT:    paddb %xmm3, %xmm1
1445; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1446; SSSE3-NEXT:    retq
1447;
1448; SSE41-LABEL: testv16i8:
1449; SSE41:       # %bb.0:
1450; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1451; SSE41-NEXT:    movdqa %xmm1, %xmm2
1452; SSE41-NEXT:    pshufb %xmm0, %xmm2
1453; SSE41-NEXT:    psrlw $4, %xmm0
1454; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
1455; SSE41-NEXT:    pxor %xmm3, %xmm3
1456; SSE41-NEXT:    pcmpeqb %xmm0, %xmm3
1457; SSE41-NEXT:    pand %xmm2, %xmm3
1458; SSE41-NEXT:    pshufb %xmm0, %xmm1
1459; SSE41-NEXT:    paddb %xmm3, %xmm1
1460; SSE41-NEXT:    movdqa %xmm1, %xmm0
1461; SSE41-NEXT:    retq
1462;
1463; AVX-LABEL: testv16i8:
1464; AVX:       # %bb.0:
1465; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1466; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
1467; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1468; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1469; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1470; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
1471; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
1472; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
1473; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
1474; AVX-NEXT:    retq
1475;
1476; AVX512VLBWDQ-LABEL: testv16i8:
1477; AVX512VLBWDQ:       # %bb.0:
1478; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1479; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
1480; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
1481; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1482; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1483; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
1484; AVX512VLBWDQ-NEXT:    vpand %xmm3, %xmm2, %xmm2
1485; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
1486; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
1487; AVX512VLBWDQ-NEXT:    retq
1488;
1489; AVX512-LABEL: testv16i8:
1490; AVX512:       # %bb.0:
1491; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1492; AVX512-NEXT:    vplzcntd %zmm0, %zmm0
1493; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1494; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1495; AVX512-NEXT:    vzeroupper
1496; AVX512-NEXT:    retq
1497;
1498; X32-SSE-LABEL: testv16i8:
1499; X32-SSE:       # %bb.0:
1500; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1501; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
1502; X32-SSE-NEXT:    pshufb %xmm0, %xmm2
1503; X32-SSE-NEXT:    psrlw $4, %xmm0
1504; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1505; X32-SSE-NEXT:    pxor %xmm3, %xmm3
1506; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm3
1507; X32-SSE-NEXT:    pand %xmm2, %xmm3
1508; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
1509; X32-SSE-NEXT:    paddb %xmm3, %xmm1
1510; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1511; X32-SSE-NEXT:    retl
1512  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
1513  ret <16 x i8> %out
1514}
1515
1516define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
1517; SSE2-LABEL: testv16i8u:
1518; SSE2:       # %bb.0:
1519; SSE2-NEXT:    movdqa %xmm0, %xmm1
1520; SSE2-NEXT:    psrlw $1, %xmm1
1521; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1522; SSE2-NEXT:    por %xmm0, %xmm1
1523; SSE2-NEXT:    movdqa %xmm1, %xmm0
1524; SSE2-NEXT:    psrlw $2, %xmm0
1525; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1526; SSE2-NEXT:    por %xmm1, %xmm0
1527; SSE2-NEXT:    movdqa %xmm0, %xmm1
1528; SSE2-NEXT:    psrlw $4, %xmm1
1529; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1530; SSE2-NEXT:    pand %xmm2, %xmm1
1531; SSE2-NEXT:    por %xmm0, %xmm1
1532; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
1533; SSE2-NEXT:    pxor %xmm1, %xmm3
1534; SSE2-NEXT:    movdqa %xmm3, %xmm0
1535; SSE2-NEXT:    psrlw $1, %xmm0
1536; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1537; SSE2-NEXT:    psubb %xmm0, %xmm3
1538; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1539; SSE2-NEXT:    movdqa %xmm3, %xmm1
1540; SSE2-NEXT:    pand %xmm0, %xmm1
1541; SSE2-NEXT:    psrlw $2, %xmm3
1542; SSE2-NEXT:    pand %xmm0, %xmm3
1543; SSE2-NEXT:    paddb %xmm1, %xmm3
1544; SSE2-NEXT:    movdqa %xmm3, %xmm0
1545; SSE2-NEXT:    psrlw $4, %xmm0
1546; SSE2-NEXT:    paddb %xmm3, %xmm0
1547; SSE2-NEXT:    pand %xmm2, %xmm0
1548; SSE2-NEXT:    retq
1549;
1550; SSE3-LABEL: testv16i8u:
1551; SSE3:       # %bb.0:
1552; SSE3-NEXT:    movdqa %xmm0, %xmm1
1553; SSE3-NEXT:    psrlw $1, %xmm1
1554; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1555; SSE3-NEXT:    por %xmm0, %xmm1
1556; SSE3-NEXT:    movdqa %xmm1, %xmm0
1557; SSE3-NEXT:    psrlw $2, %xmm0
1558; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1559; SSE3-NEXT:    por %xmm1, %xmm0
1560; SSE3-NEXT:    movdqa %xmm0, %xmm1
1561; SSE3-NEXT:    psrlw $4, %xmm1
1562; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1563; SSE3-NEXT:    pand %xmm2, %xmm1
1564; SSE3-NEXT:    por %xmm0, %xmm1
1565; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
1566; SSE3-NEXT:    pxor %xmm1, %xmm3
1567; SSE3-NEXT:    movdqa %xmm3, %xmm0
1568; SSE3-NEXT:    psrlw $1, %xmm0
1569; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1570; SSE3-NEXT:    psubb %xmm0, %xmm3
1571; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1572; SSE3-NEXT:    movdqa %xmm3, %xmm1
1573; SSE3-NEXT:    pand %xmm0, %xmm1
1574; SSE3-NEXT:    psrlw $2, %xmm3
1575; SSE3-NEXT:    pand %xmm0, %xmm3
1576; SSE3-NEXT:    paddb %xmm1, %xmm3
1577; SSE3-NEXT:    movdqa %xmm3, %xmm0
1578; SSE3-NEXT:    psrlw $4, %xmm0
1579; SSE3-NEXT:    paddb %xmm3, %xmm0
1580; SSE3-NEXT:    pand %xmm2, %xmm0
1581; SSE3-NEXT:    retq
1582;
1583; SSSE3-LABEL: testv16i8u:
1584; SSSE3:       # %bb.0:
1585; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1586; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1587; SSSE3-NEXT:    pshufb %xmm0, %xmm2
1588; SSSE3-NEXT:    psrlw $4, %xmm0
1589; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1590; SSSE3-NEXT:    pxor %xmm3, %xmm3
1591; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm3
1592; SSSE3-NEXT:    pand %xmm2, %xmm3
1593; SSSE3-NEXT:    pshufb %xmm0, %xmm1
1594; SSSE3-NEXT:    paddb %xmm3, %xmm1
1595; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1596; SSSE3-NEXT:    retq
1597;
1598; SSE41-LABEL: testv16i8u:
1599; SSE41:       # %bb.0:
1600; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1601; SSE41-NEXT:    movdqa %xmm1, %xmm2
1602; SSE41-NEXT:    pshufb %xmm0, %xmm2
1603; SSE41-NEXT:    psrlw $4, %xmm0
1604; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
1605; SSE41-NEXT:    pxor %xmm3, %xmm3
1606; SSE41-NEXT:    pcmpeqb %xmm0, %xmm3
1607; SSE41-NEXT:    pand %xmm2, %xmm3
1608; SSE41-NEXT:    pshufb %xmm0, %xmm1
1609; SSE41-NEXT:    paddb %xmm3, %xmm1
1610; SSE41-NEXT:    movdqa %xmm1, %xmm0
1611; SSE41-NEXT:    retq
1612;
1613; AVX-LABEL: testv16i8u:
1614; AVX:       # %bb.0:
1615; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1616; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
1617; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1618; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1619; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1620; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
1621; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
1622; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
1623; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
1624; AVX-NEXT:    retq
1625;
1626; AVX512VLBWDQ-LABEL: testv16i8u:
1627; AVX512VLBWDQ:       # %bb.0:
1628; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1629; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
1630; AVX512VLBWDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
1631; AVX512VLBWDQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1632; AVX512VLBWDQ-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1633; AVX512VLBWDQ-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
1634; AVX512VLBWDQ-NEXT:    vpand %xmm3, %xmm2, %xmm2
1635; AVX512VLBWDQ-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
1636; AVX512VLBWDQ-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
1637; AVX512VLBWDQ-NEXT:    retq
1638;
1639; AVX512-LABEL: testv16i8u:
1640; AVX512:       # %bb.0:
1641; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1642; AVX512-NEXT:    vplzcntd %zmm0, %zmm0
1643; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1644; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1645; AVX512-NEXT:    vzeroupper
1646; AVX512-NEXT:    retq
1647;
1648; X32-SSE-LABEL: testv16i8u:
1649; X32-SSE:       # %bb.0:
1650; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1651; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
1652; X32-SSE-NEXT:    pshufb %xmm0, %xmm2
1653; X32-SSE-NEXT:    psrlw $4, %xmm0
1654; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1655; X32-SSE-NEXT:    pxor %xmm3, %xmm3
1656; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm3
1657; X32-SSE-NEXT:    pand %xmm2, %xmm3
1658; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
1659; X32-SSE-NEXT:    paddb %xmm3, %xmm1
1660; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1661; X32-SSE-NEXT:    retl
1662  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
1663  ret <16 x i8> %out
1664}
1665
; Constant-folding test: calls llvm.ctlz.v2i64 on the constant vector
; <256, -1> with the second intrinsic argument set to i1 0. Every check
; prefix below expects nothing but a constant load of [55,0,0,0] followed by
; the return, i.e. no count-leading-zeros code is emitted.
; 55 is the leading-zero count of 256 in 64 bits and 0 is that of all-ones.
; Presumably the <2 x i64> constant is printed as four 32-bit elements
; (low/high half of each lane) -- TODO confirm against the asm printer.
1666define <2 x i64> @foldv2i64() nounwind {
1667; SSE-LABEL: foldv2i64:
1668; SSE:       # %bb.0:
1669; SSE-NEXT:    movaps {{.*#+}} xmm0 = [55,0,0,0]
1670; SSE-NEXT:    retq
1671;
1672; NOBW-LABEL: foldv2i64:
1673; NOBW:       # %bb.0:
1674; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0,0,0]
1675; NOBW-NEXT:    retq
1676;
1677; AVX512VLBWDQ-LABEL: foldv2i64:
1678; AVX512VLBWDQ:       # %bb.0:
1679; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0,0,0]
1680; AVX512VLBWDQ-NEXT:    retq
1681;
1682; X32-SSE-LABEL: foldv2i64:
1683; X32-SSE:       # %bb.0:
1684; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [55,0,0,0]
1685; X32-SSE-NEXT:    retl
1686  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
1687  ret <2 x i64> %out
1688}
1689
; Same constant input as foldv2i64 (<256, -1>), but the second intrinsic
; argument is i1 -1 (true as an i1) instead of i1 0. Presumably this is the
; "zero input is undefined/poison" flag of llvm.ctlz -- confirm against the
; LangRef. Neither lane is zero, and every check prefix expects the same
; folded constant [55,0,0,0] followed by the return.
1690define <2 x i64> @foldv2i64u() nounwind {
1691; SSE-LABEL: foldv2i64u:
1692; SSE:       # %bb.0:
1693; SSE-NEXT:    movaps {{.*#+}} xmm0 = [55,0,0,0]
1694; SSE-NEXT:    retq
1695;
1696; NOBW-LABEL: foldv2i64u:
1697; NOBW:       # %bb.0:
1698; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0,0,0]
1699; NOBW-NEXT:    retq
1700;
1701; AVX512VLBWDQ-LABEL: foldv2i64u:
1702; AVX512VLBWDQ:       # %bb.0:
1703; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [55,0,0,0]
1704; AVX512VLBWDQ-NEXT:    retq
1705;
1706; X32-SSE-LABEL: foldv2i64u:
1707; X32-SSE:       # %bb.0:
1708; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [55,0,0,0]
1709; X32-SSE-NEXT:    retl
1710  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
1711  ret <2 x i64> %out
1712}
1713
; Constant-folding test: calls llvm.ctlz.v4i32 on <256, -1, 0, 255> with the
; second intrinsic argument set to i1 0. Every check prefix expects only a
; constant load of [23,0,32,24] followed by the return.
; Per lane: 256 -> 23, all-ones -> 0, 0 -> 32 (the full bit width),
; 255 -> 24.
1714define <4 x i32> @foldv4i32() nounwind {
1715; SSE-LABEL: foldv4i32:
1716; SSE:       # %bb.0:
1717; SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
1718; SSE-NEXT:    retq
1719;
1720; NOBW-LABEL: foldv4i32:
1721; NOBW:       # %bb.0:
1722; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
1723; NOBW-NEXT:    retq
1724;
1725; AVX512VLBWDQ-LABEL: foldv4i32:
1726; AVX512VLBWDQ:       # %bb.0:
1727; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
1728; AVX512VLBWDQ-NEXT:    retq
1729;
1730; X32-SSE-LABEL: foldv4i32:
1731; X32-SSE:       # %bb.0:
1732; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
1733; X32-SSE-NEXT:    retl
1734  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
1735  ret <4 x i32> %out
1736}
1737
; Same constant input as foldv4i32 (<256, -1, 0, 255>), but the second
; intrinsic argument is i1 -1 (true as an i1). Every check prefix expects the
; same folded constant [23,0,32,24] followed by the return.
; NOTE(review): lane 2 of the input is zero while the flag is set. If the
; flag means "zero input is undefined/poison" (confirm against the LangRef),
; the expected 32 for that lane records what the folder currently produces
; rather than a value the intrinsic guarantees.
1738define <4 x i32> @foldv4i32u() nounwind {
1739; SSE-LABEL: foldv4i32u:
1740; SSE:       # %bb.0:
1741; SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
1742; SSE-NEXT:    retq
1743;
1744; NOBW-LABEL: foldv4i32u:
1745; NOBW:       # %bb.0:
1746; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
1747; NOBW-NEXT:    retq
1748;
1749; AVX512VLBWDQ-LABEL: foldv4i32u:
1750; AVX512VLBWDQ:       # %bb.0:
1751; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
1752; AVX512VLBWDQ-NEXT:    retq
1753;
1754; X32-SSE-LABEL: foldv4i32u:
1755; X32-SSE:       # %bb.0:
1756; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
1757; X32-SSE-NEXT:    retl
1758  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
1759  ret <4 x i32> %out
1760}
1761
; Constant-folding test: calls llvm.ctlz.v8i16 on
; <256, -1, 0, 255, -65536, 7, 24, 88> with the second intrinsic argument set
; to i1 0. Every check prefix expects only a constant load of
; [7,0,16,8,16,13,11,9] followed by the return.
; NOTE(review): the literal -65536 does not fit in i16. The expected 16 for
; lane 4 is the same value as the explicit zero in lane 2, which is
; consistent with the literal being truncated to 0 -- confirm that the IR
; parser in use still accepts out-of-range literals.
1762define <8 x i16> @foldv8i16() nounwind {
1763; SSE-LABEL: foldv8i16:
1764; SSE:       # %bb.0:
1765; SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1766; SSE-NEXT:    retq
1767;
1768; NOBW-LABEL: foldv8i16:
1769; NOBW:       # %bb.0:
1770; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1771; NOBW-NEXT:    retq
1772;
1773; AVX512VLBWDQ-LABEL: foldv8i16:
1774; AVX512VLBWDQ:       # %bb.0:
1775; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1776; AVX512VLBWDQ-NEXT:    retq
1777;
1778; X32-SSE-LABEL: foldv8i16:
1779; X32-SSE:       # %bb.0:
1780; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1781; X32-SSE-NEXT:    retl
1782  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
1783  ret <8 x i16> %out
1784}
1785
; Same constant input as foldv8i16, but the second intrinsic argument is
; i1 -1 (true as an i1). Every check prefix expects the same folded constant
; [7,0,16,8,16,13,11,9] followed by the return.
; NOTE(review): the literal -65536 does not fit in i16; the expected 16 for
; lane 4 is consistent with it being truncated to 0 -- confirm the parser
; still accepts it.
; NOTE(review): lane 2 is zero (and lane 4 presumably is too) while the flag
; is set. If the flag means "zero input is undefined/poison" (confirm against
; the LangRef), the expected 16 for those lanes records current folding
; behaviour rather than a guaranteed value.
1786define <8 x i16> @foldv8i16u() nounwind {
1787; SSE-LABEL: foldv8i16u:
1788; SSE:       # %bb.0:
1789; SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1790; SSE-NEXT:    retq
1791;
1792; NOBW-LABEL: foldv8i16u:
1793; NOBW:       # %bb.0:
1794; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1795; NOBW-NEXT:    retq
1796;
1797; AVX512VLBWDQ-LABEL: foldv8i16u:
1798; AVX512VLBWDQ:       # %bb.0:
1799; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1800; AVX512VLBWDQ-NEXT:    retq
1801;
1802; X32-SSE-LABEL: foldv8i16u:
1803; X32-SSE:       # %bb.0:
1804; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1805; X32-SSE-NEXT:    retl
1806  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
1807  ret <8 x i16> %out
1808}
1809
; Constant-folding test: calls llvm.ctlz.v16i8 on a 16-lane constant vector
; with the second intrinsic argument set to i1 0. Every check prefix expects
; only a constant load of [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] followed by the
; return.
; NOTE(review): the literals 256 and -65536 do not fit in i8. The expected 8
; for lanes 0 and 4 is the same value as the explicit zero in lane 2, which
; is consistent with those literals being truncated to 0. Likewise 255 and
; 254 yield 0, the same as -1 and -2. Confirm that the IR parser in use still
; accepts out-of-range literals.
1810define <16 x i8> @foldv16i8() nounwind {
1811; SSE-LABEL: foldv16i8:
1812; SSE:       # %bb.0:
1813; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1814; SSE-NEXT:    retq
1815;
1816; NOBW-LABEL: foldv16i8:
1817; NOBW:       # %bb.0:
1818; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1819; NOBW-NEXT:    retq
1820;
1821; AVX512VLBWDQ-LABEL: foldv16i8:
1822; AVX512VLBWDQ:       # %bb.0:
1823; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1824; AVX512VLBWDQ-NEXT:    retq
1825;
1826; X32-SSE-LABEL: foldv16i8:
1827; X32-SSE:       # %bb.0:
1828; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1829; X32-SSE-NEXT:    retl
1830  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
1831  ret <16 x i8> %out
1832}
1833
; Same constant input as foldv16i8, but the second intrinsic argument is
; i1 -1 (true as an i1). Every check prefix expects the same folded constant
; [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] followed by the return.
; NOTE(review): the literals 256 and -65536 do not fit in i8; the expected 8
; for lanes 0 and 4 is consistent with them being truncated to 0 -- confirm
; the parser still accepts them.
; NOTE(review): lane 2 is zero (and lanes 0 and 4 presumably are too) while
; the flag is set. If the flag means "zero input is undefined/poison"
; (confirm against the LangRef), the expected 8 for those lanes records
; current folding behaviour rather than a guaranteed value.
1834define <16 x i8> @foldv16i8u() nounwind {
1835; SSE-LABEL: foldv16i8u:
1836; SSE:       # %bb.0:
1837; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1838; SSE-NEXT:    retq
1839;
1840; NOBW-LABEL: foldv16i8u:
1841; NOBW:       # %bb.0:
1842; NOBW-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1843; NOBW-NEXT:    retq
1844;
1845; AVX512VLBWDQ-LABEL: foldv16i8u:
1846; AVX512VLBWDQ:       # %bb.0:
1847; AVX512VLBWDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1848; AVX512VLBWDQ-NEXT:    retq
1849;
1850; X32-SSE-LABEL: foldv16i8u:
1851; X32-SSE:       # %bb.0:
1852; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1853; X32-SSE-NEXT:    retl
1854  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
1855  ret <16 x i8> %out
1856}
1857
; Declarations of the four 128-bit vector overloads of the llvm.ctlz
; intrinsic called by the functions above. Each takes the input vector and
; an i1 flag; the tests pass either i1 0 or i1 -1 for that flag.
1858declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
1859declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
1860declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
1861declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1)
1862