1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
10;
11; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
13
; testv2i64 exercises @llvm.cttz.v2i64 with the i1 flag operand set to 0.
; Every checked sequence below is scalarized per element around bsf, followed by
; a cmov that substitutes a fallback count when bsf reports a zero source: the
; 64-bit configurations load $64, and the 32-bit configuration builds the i64
; result from two bsfl halves, adding $32 to the count taken from the high half.
14define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
15; SSE2-LABEL: testv2i64:
16; SSE2:       # BB#0:
17; SSE2-NEXT:    movd %xmm0, %rax
18; SSE2-NEXT:    bsfq %rax, %rax
19; SSE2-NEXT:    movl $64, %ecx
20; SSE2-NEXT:    cmoveq %rcx, %rax
21; SSE2-NEXT:    movd %rax, %xmm1
22; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
23; SSE2-NEXT:    movd %xmm0, %rax
24; SSE2-NEXT:    bsfq %rax, %rax
25; SSE2-NEXT:    cmoveq %rcx, %rax
26; SSE2-NEXT:    movd %rax, %xmm0
27; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
28; SSE2-NEXT:    movdqa %xmm1, %xmm0
29; SSE2-NEXT:    retq
30;
31; SSE3-LABEL: testv2i64:
32; SSE3:       # BB#0:
33; SSE3-NEXT:    movd %xmm0, %rax
34; SSE3-NEXT:    bsfq %rax, %rax
35; SSE3-NEXT:    movl $64, %ecx
36; SSE3-NEXT:    cmoveq %rcx, %rax
37; SSE3-NEXT:    movd %rax, %xmm1
38; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
39; SSE3-NEXT:    movd %xmm0, %rax
40; SSE3-NEXT:    bsfq %rax, %rax
41; SSE3-NEXT:    cmoveq %rcx, %rax
42; SSE3-NEXT:    movd %rax, %xmm0
43; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
44; SSE3-NEXT:    movdqa %xmm1, %xmm0
45; SSE3-NEXT:    retq
46;
47; SSSE3-LABEL: testv2i64:
48; SSSE3:       # BB#0:
49; SSSE3-NEXT:    movd %xmm0, %rax
50; SSSE3-NEXT:    bsfq %rax, %rax
51; SSSE3-NEXT:    movl $64, %ecx
52; SSSE3-NEXT:    cmoveq %rcx, %rax
53; SSSE3-NEXT:    movd %rax, %xmm1
54; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
55; SSSE3-NEXT:    movd %xmm0, %rax
56; SSSE3-NEXT:    bsfq %rax, %rax
57; SSSE3-NEXT:    cmoveq %rcx, %rax
58; SSSE3-NEXT:    movd %rax, %xmm0
59; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
60; SSSE3-NEXT:    movdqa %xmm1, %xmm0
61; SSSE3-NEXT:    retq
62;
63; SSE41-LABEL: testv2i64:
64; SSE41:       # BB#0:
65; SSE41-NEXT:    pextrq $1, %xmm0, %rax
66; SSE41-NEXT:    bsfq %rax, %rax
67; SSE41-NEXT:    movl $64, %ecx
68; SSE41-NEXT:    cmoveq %rcx, %rax
69; SSE41-NEXT:    movd %rax, %xmm1
70; SSE41-NEXT:    movd %xmm0, %rax
71; SSE41-NEXT:    bsfq %rax, %rax
72; SSE41-NEXT:    cmoveq %rcx, %rax
73; SSE41-NEXT:    movd %rax, %xmm0
74; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
75; SSE41-NEXT:    retq
76;
77; AVX-LABEL: testv2i64:
78; AVX:       # BB#0:
79; AVX-NEXT:    vpextrq $1, %xmm0, %rax
80; AVX-NEXT:    bsfq %rax, %rax
81; AVX-NEXT:    movl $64, %ecx
82; AVX-NEXT:    cmoveq %rcx, %rax
83; AVX-NEXT:    vmovq %rax, %xmm1
84; AVX-NEXT:    vmovq %xmm0, %rax
85; AVX-NEXT:    bsfq %rax, %rax
86; AVX-NEXT:    cmoveq %rcx, %rax
87; AVX-NEXT:    vmovq %rax, %xmm0
88; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
89; AVX-NEXT:    retq
90;
91; X32-SSE-LABEL: testv2i64:
92; X32-SSE:       # BB#0:
93; X32-SSE-NEXT:    pushl %esi
94; X32-SSE-NEXT:    pextrd $3, %xmm0, %eax
95; X32-SSE-NEXT:    bsfl %eax, %eax
96; X32-SSE-NEXT:    movl $32, %ecx
97; X32-SSE-NEXT:    cmovel %ecx, %eax
98; X32-SSE-NEXT:    addl $32, %eax
99; X32-SSE-NEXT:    pextrd $2, %xmm0, %edx
100; X32-SSE-NEXT:    bsfl %edx, %esi
101; X32-SSE-NEXT:    testl %edx, %edx
102; X32-SSE-NEXT:    cmovel %eax, %esi
103; X32-SSE-NEXT:    movd %esi, %xmm1
104; X32-SSE-NEXT:    pextrd $1, %xmm0, %eax
105; X32-SSE-NEXT:    bsfl %eax, %eax
106; X32-SSE-NEXT:    cmovel %ecx, %eax
107; X32-SSE-NEXT:    addl $32, %eax
108; X32-SSE-NEXT:    movd %xmm0, %ecx
109; X32-SSE-NEXT:    bsfl %ecx, %edx
110; X32-SSE-NEXT:    testl %ecx, %ecx
111; X32-SSE-NEXT:    cmovel %eax, %edx
112; X32-SSE-NEXT:    movd %edx, %xmm0
113; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
114; X32-SSE-NEXT:    popl %esi
115; X32-SSE-NEXT:    retl
116  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
117  ret <2 x i64> %out
118}
119
; testv2i64u exercises @llvm.cttz.v2i64 with the i1 flag operand set to -1.
; The bsf-based sequences checked here load no fallback constant for a zero
; element (the 32-bit sequence only uses cmovne to choose between the low-half
; and high-half counts). The two AVX-512 CD configurations avoid bsf entirely:
; they compute x & -x, apply vplzcntq, and subtract that result from 63.
120define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
121; SSE2-LABEL: testv2i64u:
122; SSE2:       # BB#0:
123; SSE2-NEXT:    movd %xmm0, %rax
124; SSE2-NEXT:    bsfq %rax, %rax
125; SSE2-NEXT:    movd %rax, %xmm1
126; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
127; SSE2-NEXT:    movd %xmm0, %rax
128; SSE2-NEXT:    bsfq %rax, %rax
129; SSE2-NEXT:    movd %rax, %xmm0
130; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
131; SSE2-NEXT:    movdqa %xmm1, %xmm0
132; SSE2-NEXT:    retq
133;
134; SSE3-LABEL: testv2i64u:
135; SSE3:       # BB#0:
136; SSE3-NEXT:    movd %xmm0, %rax
137; SSE3-NEXT:    bsfq %rax, %rax
138; SSE3-NEXT:    movd %rax, %xmm1
139; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
140; SSE3-NEXT:    movd %xmm0, %rax
141; SSE3-NEXT:    bsfq %rax, %rax
142; SSE3-NEXT:    movd %rax, %xmm0
143; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
144; SSE3-NEXT:    movdqa %xmm1, %xmm0
145; SSE3-NEXT:    retq
146;
147; SSSE3-LABEL: testv2i64u:
148; SSSE3:       # BB#0:
149; SSSE3-NEXT:    movd %xmm0, %rax
150; SSSE3-NEXT:    bsfq %rax, %rax
151; SSSE3-NEXT:    movd %rax, %xmm1
152; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
153; SSSE3-NEXT:    movd %xmm0, %rax
154; SSSE3-NEXT:    bsfq %rax, %rax
155; SSSE3-NEXT:    movd %rax, %xmm0
156; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
157; SSSE3-NEXT:    movdqa %xmm1, %xmm0
158; SSSE3-NEXT:    retq
159;
160; SSE41-LABEL: testv2i64u:
161; SSE41:       # BB#0:
162; SSE41-NEXT:    pextrq $1, %xmm0, %rax
163; SSE41-NEXT:    bsfq %rax, %rax
164; SSE41-NEXT:    movd %rax, %xmm1
165; SSE41-NEXT:    movd %xmm0, %rax
166; SSE41-NEXT:    bsfq %rax, %rax
167; SSE41-NEXT:    movd %rax, %xmm0
168; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
169; SSE41-NEXT:    retq
170;
171; AVX1-LABEL: testv2i64u:
172; AVX1:       # BB#0:
173; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
174; AVX1-NEXT:    bsfq %rax, %rax
175; AVX1-NEXT:    vmovq %rax, %xmm1
176; AVX1-NEXT:    vmovq %xmm0, %rax
177; AVX1-NEXT:    bsfq %rax, %rax
178; AVX1-NEXT:    vmovq %rax, %xmm0
179; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
180; AVX1-NEXT:    retq
181;
182; AVX2-LABEL: testv2i64u:
183; AVX2:       # BB#0:
184; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
185; AVX2-NEXT:    bsfq %rax, %rax
186; AVX2-NEXT:    vmovq %rax, %xmm1
187; AVX2-NEXT:    vmovq %xmm0, %rax
188; AVX2-NEXT:    bsfq %rax, %rax
189; AVX2-NEXT:    vmovq %rax, %xmm0
190; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
191; AVX2-NEXT:    retq
192;
193; AVX512CDVL-LABEL: testv2i64u:
194; AVX512CDVL:       # BB#0:
195; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
196; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
197; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
198; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
199; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [63,63]
200; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
201; AVX512CDVL-NEXT:    retq
202;
203; AVX512CD-LABEL: testv2i64u:
204; AVX512CD:       # BB#0:
205; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
206; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
207; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
208; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
209; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
210; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
211; AVX512CD-NEXT:    retq
212;
213; X32-SSE-LABEL: testv2i64u:
214; X32-SSE:       # BB#0:
215; X32-SSE-NEXT:    pextrd $2, %xmm0, %eax
216; X32-SSE-NEXT:    bsfl %eax, %ecx
217; X32-SSE-NEXT:    pextrd $3, %xmm0, %edx
218; X32-SSE-NEXT:    bsfl %edx, %edx
219; X32-SSE-NEXT:    addl $32, %edx
220; X32-SSE-NEXT:    testl %eax, %eax
221; X32-SSE-NEXT:    cmovnel %ecx, %edx
222; X32-SSE-NEXT:    movd %edx, %xmm1
223; X32-SSE-NEXT:    movd %xmm0, %eax
224; X32-SSE-NEXT:    bsfl %eax, %ecx
225; X32-SSE-NEXT:    pextrd $1, %xmm0, %edx
226; X32-SSE-NEXT:    bsfl %edx, %edx
227; X32-SSE-NEXT:    addl $32, %edx
228; X32-SSE-NEXT:    testl %eax, %eax
229; X32-SSE-NEXT:    cmovnel %ecx, %edx
230; X32-SSE-NEXT:    movd %edx, %xmm0
231; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
232; X32-SSE-NEXT:    retl
233  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
234  ret <2 x i64> %out
235}
236
; testv4i32 exercises @llvm.cttz.v4i32 with the i1 flag operand set to 0.
; None of the checked sequences use bsf; every configuration stays in vector
; registers. Each one starts by computing x & -x and subtracting a constant-pool
; operand (matched by regex, so its value is not pinned here), and ends with
; psadbw against a zeroed register followed by packuswb. The SSSE3-and-later
; sequences use pshufb with the [0,1,1,2,...,4] table on each nibble, while the
; SSE2/SSE3 sequences use shift/mask/add steps instead.
237define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
238; SSE2-LABEL: testv4i32:
239; SSE2:       # BB#0:
240; SSE2-NEXT:    pxor %xmm1, %xmm1
241; SSE2-NEXT:    pxor %xmm2, %xmm2
242; SSE2-NEXT:    psubd %xmm0, %xmm2
243; SSE2-NEXT:    pand %xmm0, %xmm2
244; SSE2-NEXT:    psubd {{.*}}(%rip), %xmm2
245; SSE2-NEXT:    movdqa %xmm2, %xmm0
246; SSE2-NEXT:    psrld $1, %xmm0
247; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
248; SSE2-NEXT:    psubd %xmm0, %xmm2
249; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
250; SSE2-NEXT:    movdqa %xmm2, %xmm3
251; SSE2-NEXT:    pand %xmm0, %xmm3
252; SSE2-NEXT:    psrld $2, %xmm2
253; SSE2-NEXT:    pand %xmm0, %xmm2
254; SSE2-NEXT:    paddd %xmm3, %xmm2
255; SSE2-NEXT:    movdqa %xmm2, %xmm0
256; SSE2-NEXT:    psrld $4, %xmm0
257; SSE2-NEXT:    paddd %xmm2, %xmm0
258; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
259; SSE2-NEXT:    movdqa %xmm0, %xmm2
260; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
261; SSE2-NEXT:    psadbw %xmm1, %xmm2
262; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
263; SSE2-NEXT:    psadbw %xmm1, %xmm0
264; SSE2-NEXT:    packuswb %xmm2, %xmm0
265; SSE2-NEXT:    retq
266;
267; SSE3-LABEL: testv4i32:
268; SSE3:       # BB#0:
269; SSE3-NEXT:    pxor %xmm1, %xmm1
270; SSE3-NEXT:    pxor %xmm2, %xmm2
271; SSE3-NEXT:    psubd %xmm0, %xmm2
272; SSE3-NEXT:    pand %xmm0, %xmm2
273; SSE3-NEXT:    psubd {{.*}}(%rip), %xmm2
274; SSE3-NEXT:    movdqa %xmm2, %xmm0
275; SSE3-NEXT:    psrld $1, %xmm0
276; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
277; SSE3-NEXT:    psubd %xmm0, %xmm2
278; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
279; SSE3-NEXT:    movdqa %xmm2, %xmm3
280; SSE3-NEXT:    pand %xmm0, %xmm3
281; SSE3-NEXT:    psrld $2, %xmm2
282; SSE3-NEXT:    pand %xmm0, %xmm2
283; SSE3-NEXT:    paddd %xmm3, %xmm2
284; SSE3-NEXT:    movdqa %xmm2, %xmm0
285; SSE3-NEXT:    psrld $4, %xmm0
286; SSE3-NEXT:    paddd %xmm2, %xmm0
287; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
288; SSE3-NEXT:    movdqa %xmm0, %xmm2
289; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
290; SSE3-NEXT:    psadbw %xmm1, %xmm2
291; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
292; SSE3-NEXT:    psadbw %xmm1, %xmm0
293; SSE3-NEXT:    packuswb %xmm2, %xmm0
294; SSE3-NEXT:    retq
295;
296; SSSE3-LABEL: testv4i32:
297; SSSE3:       # BB#0:
298; SSSE3-NEXT:    pxor %xmm1, %xmm1
299; SSSE3-NEXT:    pxor %xmm2, %xmm2
300; SSSE3-NEXT:    psubd %xmm0, %xmm2
301; SSSE3-NEXT:    pand %xmm0, %xmm2
302; SSSE3-NEXT:    psubd {{.*}}(%rip), %xmm2
303; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
304; SSSE3-NEXT:    movdqa %xmm2, %xmm4
305; SSSE3-NEXT:    pand %xmm3, %xmm4
306; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
307; SSSE3-NEXT:    movdqa %xmm0, %xmm5
308; SSSE3-NEXT:    pshufb %xmm4, %xmm5
309; SSSE3-NEXT:    psrlw $4, %xmm2
310; SSSE3-NEXT:    pand %xmm3, %xmm2
311; SSSE3-NEXT:    pshufb %xmm2, %xmm0
312; SSSE3-NEXT:    paddb %xmm5, %xmm0
313; SSSE3-NEXT:    movdqa %xmm0, %xmm2
314; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
315; SSSE3-NEXT:    psadbw %xmm1, %xmm2
316; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
317; SSSE3-NEXT:    psadbw %xmm1, %xmm0
318; SSSE3-NEXT:    packuswb %xmm2, %xmm0
319; SSSE3-NEXT:    retq
320;
321; SSE41-LABEL: testv4i32:
322; SSE41:       # BB#0:
323; SSE41-NEXT:    pxor %xmm1, %xmm1
324; SSE41-NEXT:    pxor %xmm2, %xmm2
325; SSE41-NEXT:    psubd %xmm0, %xmm2
326; SSE41-NEXT:    pand %xmm0, %xmm2
327; SSE41-NEXT:    psubd {{.*}}(%rip), %xmm2
328; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
329; SSE41-NEXT:    movdqa %xmm2, %xmm4
330; SSE41-NEXT:    pand %xmm3, %xmm4
331; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
332; SSE41-NEXT:    movdqa %xmm0, %xmm5
333; SSE41-NEXT:    pshufb %xmm4, %xmm5
334; SSE41-NEXT:    psrlw $4, %xmm2
335; SSE41-NEXT:    pand %xmm3, %xmm2
336; SSE41-NEXT:    pshufb %xmm2, %xmm0
337; SSE41-NEXT:    paddb %xmm5, %xmm0
338; SSE41-NEXT:    movdqa %xmm0, %xmm2
339; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
340; SSE41-NEXT:    psadbw %xmm1, %xmm2
341; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
342; SSE41-NEXT:    psadbw %xmm1, %xmm0
343; SSE41-NEXT:    packuswb %xmm2, %xmm0
344; SSE41-NEXT:    retq
345;
346; AVX1-LABEL: testv4i32:
347; AVX1:       # BB#0:
348; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
349; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
350; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
351; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
352; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
353; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
354; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
355; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
356; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
357; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
358; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
359; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
360; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
361; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
362; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
363; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
364; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
365; AVX1-NEXT:    retq
366;
367; AVX2-LABEL: testv4i32:
368; AVX2:       # BB#0:
369; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
370; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
371; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
372; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
373; AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
374; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
375; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
376; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
377; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
378; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
379; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
380; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
381; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
382; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
383; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
384; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
385; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
386; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
387; AVX2-NEXT:    retq
388;
389; AVX512CDVL-LABEL: testv4i32:
390; AVX512CDVL:       # BB#0:
391; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
392; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
393; AVX512CDVL-NEXT:    vpandd %xmm2, %xmm0, %xmm0
394; AVX512CDVL-NEXT:    vpsubd {{.*}}(%rip){1to4}, %xmm0, %xmm0
395; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
396; AVX512CDVL-NEXT:    vpandq %xmm2, %xmm0, %xmm3
397; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
398; AVX512CDVL-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
399; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
400; AVX512CDVL-NEXT:    vpandq %xmm2, %xmm0, %xmm0
401; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
402; AVX512CDVL-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
403; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
404; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
405; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
406; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
407; AVX512CDVL-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
408; AVX512CDVL-NEXT:    retq
409;
410; AVX512CD-LABEL: testv4i32:
411; AVX512CD:       # BB#0:
412; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
413; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
414; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm0
415; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
416; AVX512CD-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
417; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
418; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm3
419; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
420; AVX512CD-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
421; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
422; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm0
423; AVX512CD-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
424; AVX512CD-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
425; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
426; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
427; AVX512CD-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
428; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
429; AVX512CD-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
430; AVX512CD-NEXT:    retq
431;
432; X32-SSE-LABEL: testv4i32:
433; X32-SSE:       # BB#0:
434; X32-SSE-NEXT:    pxor %xmm1, %xmm1
435; X32-SSE-NEXT:    pxor %xmm2, %xmm2
436; X32-SSE-NEXT:    psubd %xmm0, %xmm2
437; X32-SSE-NEXT:    pand %xmm0, %xmm2
438; X32-SSE-NEXT:    psubd {{\.LCPI.*}}, %xmm2
439; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
440; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
441; X32-SSE-NEXT:    pand %xmm3, %xmm4
442; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
443; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
444; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
445; X32-SSE-NEXT:    psrlw $4, %xmm2
446; X32-SSE-NEXT:    pand %xmm3, %xmm2
447; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
448; X32-SSE-NEXT:    paddb %xmm5, %xmm0
449; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
450; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
451; X32-SSE-NEXT:    psadbw %xmm1, %xmm2
452; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
453; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
454; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
455; X32-SSE-NEXT:    retl
456  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
457  ret <4 x i32> %out
458}
459
; testv4i32u exercises @llvm.cttz.v4i32 with the i1 flag operand set to -1.
; The SSE, AVX1, AVX2 and 32-bit configurations check a vector sequence built
; from x & -x, a constant-pool subtract, and a psadbw/packuswb reduction. The two
; AVX-512 CD configurations check a much shorter sequence: x & -x, vplzcntd, then
; a subtract of that result from a broadcast constant (loaded through a regex
; match, so its value is not pinned by these checks).
460define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
461; SSE2-LABEL: testv4i32u:
462; SSE2:       # BB#0:
463; SSE2-NEXT:    pxor %xmm1, %xmm1
464; SSE2-NEXT:    pxor %xmm2, %xmm2
465; SSE2-NEXT:    psubd %xmm0, %xmm2
466; SSE2-NEXT:    pand %xmm0, %xmm2
467; SSE2-NEXT:    psubd {{.*}}(%rip), %xmm2
468; SSE2-NEXT:    movdqa %xmm2, %xmm0
469; SSE2-NEXT:    psrld $1, %xmm0
470; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
471; SSE2-NEXT:    psubd %xmm0, %xmm2
472; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
473; SSE2-NEXT:    movdqa %xmm2, %xmm3
474; SSE2-NEXT:    pand %xmm0, %xmm3
475; SSE2-NEXT:    psrld $2, %xmm2
476; SSE2-NEXT:    pand %xmm0, %xmm2
477; SSE2-NEXT:    paddd %xmm3, %xmm2
478; SSE2-NEXT:    movdqa %xmm2, %xmm0
479; SSE2-NEXT:    psrld $4, %xmm0
480; SSE2-NEXT:    paddd %xmm2, %xmm0
481; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
482; SSE2-NEXT:    movdqa %xmm0, %xmm2
483; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
484; SSE2-NEXT:    psadbw %xmm1, %xmm2
485; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
486; SSE2-NEXT:    psadbw %xmm1, %xmm0
487; SSE2-NEXT:    packuswb %xmm2, %xmm0
488; SSE2-NEXT:    retq
489;
490; SSE3-LABEL: testv4i32u:
491; SSE3:       # BB#0:
492; SSE3-NEXT:    pxor %xmm1, %xmm1
493; SSE3-NEXT:    pxor %xmm2, %xmm2
494; SSE3-NEXT:    psubd %xmm0, %xmm2
495; SSE3-NEXT:    pand %xmm0, %xmm2
496; SSE3-NEXT:    psubd {{.*}}(%rip), %xmm2
497; SSE3-NEXT:    movdqa %xmm2, %xmm0
498; SSE3-NEXT:    psrld $1, %xmm0
499; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
500; SSE3-NEXT:    psubd %xmm0, %xmm2
501; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
502; SSE3-NEXT:    movdqa %xmm2, %xmm3
503; SSE3-NEXT:    pand %xmm0, %xmm3
504; SSE3-NEXT:    psrld $2, %xmm2
505; SSE3-NEXT:    pand %xmm0, %xmm2
506; SSE3-NEXT:    paddd %xmm3, %xmm2
507; SSE3-NEXT:    movdqa %xmm2, %xmm0
508; SSE3-NEXT:    psrld $4, %xmm0
509; SSE3-NEXT:    paddd %xmm2, %xmm0
510; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
511; SSE3-NEXT:    movdqa %xmm0, %xmm2
512; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
513; SSE3-NEXT:    psadbw %xmm1, %xmm2
514; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
515; SSE3-NEXT:    psadbw %xmm1, %xmm0
516; SSE3-NEXT:    packuswb %xmm2, %xmm0
517; SSE3-NEXT:    retq
518;
519; SSSE3-LABEL: testv4i32u:
520; SSSE3:       # BB#0:
521; SSSE3-NEXT:    pxor %xmm1, %xmm1
522; SSSE3-NEXT:    pxor %xmm2, %xmm2
523; SSSE3-NEXT:    psubd %xmm0, %xmm2
524; SSSE3-NEXT:    pand %xmm0, %xmm2
525; SSSE3-NEXT:    psubd {{.*}}(%rip), %xmm2
526; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
527; SSSE3-NEXT:    movdqa %xmm2, %xmm4
528; SSSE3-NEXT:    pand %xmm3, %xmm4
529; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
530; SSSE3-NEXT:    movdqa %xmm0, %xmm5
531; SSSE3-NEXT:    pshufb %xmm4, %xmm5
532; SSSE3-NEXT:    psrlw $4, %xmm2
533; SSSE3-NEXT:    pand %xmm3, %xmm2
534; SSSE3-NEXT:    pshufb %xmm2, %xmm0
535; SSSE3-NEXT:    paddb %xmm5, %xmm0
536; SSSE3-NEXT:    movdqa %xmm0, %xmm2
537; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
538; SSSE3-NEXT:    psadbw %xmm1, %xmm2
539; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
540; SSSE3-NEXT:    psadbw %xmm1, %xmm0
541; SSSE3-NEXT:    packuswb %xmm2, %xmm0
542; SSSE3-NEXT:    retq
543;
544; SSE41-LABEL: testv4i32u:
545; SSE41:       # BB#0:
546; SSE41-NEXT:    pxor %xmm1, %xmm1
547; SSE41-NEXT:    pxor %xmm2, %xmm2
548; SSE41-NEXT:    psubd %xmm0, %xmm2
549; SSE41-NEXT:    pand %xmm0, %xmm2
550; SSE41-NEXT:    psubd {{.*}}(%rip), %xmm2
551; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
552; SSE41-NEXT:    movdqa %xmm2, %xmm4
553; SSE41-NEXT:    pand %xmm3, %xmm4
554; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
555; SSE41-NEXT:    movdqa %xmm0, %xmm5
556; SSE41-NEXT:    pshufb %xmm4, %xmm5
557; SSE41-NEXT:    psrlw $4, %xmm2
558; SSE41-NEXT:    pand %xmm3, %xmm2
559; SSE41-NEXT:    pshufb %xmm2, %xmm0
560; SSE41-NEXT:    paddb %xmm5, %xmm0
561; SSE41-NEXT:    movdqa %xmm0, %xmm2
562; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
563; SSE41-NEXT:    psadbw %xmm1, %xmm2
564; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
565; SSE41-NEXT:    psadbw %xmm1, %xmm0
566; SSE41-NEXT:    packuswb %xmm2, %xmm0
567; SSE41-NEXT:    retq
568;
569; AVX1-LABEL: testv4i32u:
570; AVX1:       # BB#0:
571; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
572; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
573; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
574; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
575; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
576; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
577; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
578; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
579; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
580; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
581; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
582; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
583; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
584; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
585; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
586; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
587; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
588; AVX1-NEXT:    retq
589;
590; AVX2-LABEL: testv4i32u:
591; AVX2:       # BB#0:
592; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
593; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
594; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
595; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
596; AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
597; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
598; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
599; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
600; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
601; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
602; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
603; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
604; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
605; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
606; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
607; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
608; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
609; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
610; AVX2-NEXT:    retq
611;
612; AVX512CDVL-LABEL: testv4i32u:
613; AVX512CDVL:       # BB#0:
614; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
615; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
616; AVX512CDVL-NEXT:    vpandd %xmm1, %xmm0, %xmm0
617; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
618; AVX512CDVL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
619; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
620; AVX512CDVL-NEXT:    retq
621;
622; AVX512CD-LABEL: testv4i32u:
623; AVX512CD:       # BB#0:
624; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
625; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
626; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
627; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
628; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
629; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
630; AVX512CD-NEXT:    retq
631;
632; X32-SSE-LABEL: testv4i32u:
633; X32-SSE:       # BB#0:
634; X32-SSE-NEXT:    pxor %xmm1, %xmm1
635; X32-SSE-NEXT:    pxor %xmm2, %xmm2
636; X32-SSE-NEXT:    psubd %xmm0, %xmm2
637; X32-SSE-NEXT:    pand %xmm0, %xmm2
638; X32-SSE-NEXT:    psubd {{\.LCPI.*}}, %xmm2
639; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
640; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
641; X32-SSE-NEXT:    pand %xmm3, %xmm4
642; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
643; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
644; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
645; X32-SSE-NEXT:    psrlw $4, %xmm2
646; X32-SSE-NEXT:    pand %xmm3, %xmm2
647; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
648; X32-SSE-NEXT:    paddb %xmm5, %xmm0
649; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
650; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
651; X32-SSE-NEXT:    psadbw %xmm1, %xmm2
652; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
653; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
654; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
655; X32-SSE-NEXT:    retl
656  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
657  ret <4 x i32> %out
658}
659
660define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
661; SSE2-LABEL: testv8i16:
662; SSE2:       # BB#0:
663; SSE2-NEXT:    pxor %xmm1, %xmm1
664; SSE2-NEXT:    psubw %xmm0, %xmm1
665; SSE2-NEXT:    pand %xmm0, %xmm1
666; SSE2-NEXT:    psubw {{.*}}(%rip), %xmm1
667; SSE2-NEXT:    movdqa %xmm1, %xmm0
668; SSE2-NEXT:    psrlw $1, %xmm0
669; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
670; SSE2-NEXT:    psubw %xmm0, %xmm1
671; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
672; SSE2-NEXT:    movdqa %xmm1, %xmm2
673; SSE2-NEXT:    pand %xmm0, %xmm2
674; SSE2-NEXT:    psrlw $2, %xmm1
675; SSE2-NEXT:    pand %xmm0, %xmm1
676; SSE2-NEXT:    paddw %xmm2, %xmm1
677; SSE2-NEXT:    movdqa %xmm1, %xmm2
678; SSE2-NEXT:    psrlw $4, %xmm2
679; SSE2-NEXT:    paddw %xmm1, %xmm2
680; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
681; SSE2-NEXT:    movdqa %xmm2, %xmm0
682; SSE2-NEXT:    psllw $8, %xmm0
683; SSE2-NEXT:    paddb %xmm2, %xmm0
684; SSE2-NEXT:    psrlw $8, %xmm0
685; SSE2-NEXT:    retq
686;
687; SSE3-LABEL: testv8i16:
688; SSE3:       # BB#0:
689; SSE3-NEXT:    pxor %xmm1, %xmm1
690; SSE3-NEXT:    psubw %xmm0, %xmm1
691; SSE3-NEXT:    pand %xmm0, %xmm1
692; SSE3-NEXT:    psubw {{.*}}(%rip), %xmm1
693; SSE3-NEXT:    movdqa %xmm1, %xmm0
694; SSE3-NEXT:    psrlw $1, %xmm0
695; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
696; SSE3-NEXT:    psubw %xmm0, %xmm1
697; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
698; SSE3-NEXT:    movdqa %xmm1, %xmm2
699; SSE3-NEXT:    pand %xmm0, %xmm2
700; SSE3-NEXT:    psrlw $2, %xmm1
701; SSE3-NEXT:    pand %xmm0, %xmm1
702; SSE3-NEXT:    paddw %xmm2, %xmm1
703; SSE3-NEXT:    movdqa %xmm1, %xmm2
704; SSE3-NEXT:    psrlw $4, %xmm2
705; SSE3-NEXT:    paddw %xmm1, %xmm2
706; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
707; SSE3-NEXT:    movdqa %xmm2, %xmm0
708; SSE3-NEXT:    psllw $8, %xmm0
709; SSE3-NEXT:    paddb %xmm2, %xmm0
710; SSE3-NEXT:    psrlw $8, %xmm0
711; SSE3-NEXT:    retq
712;
713; SSSE3-LABEL: testv8i16:
714; SSSE3:       # BB#0:
715; SSSE3-NEXT:    pxor %xmm1, %xmm1
716; SSSE3-NEXT:    psubw %xmm0, %xmm1
717; SSSE3-NEXT:    pand %xmm0, %xmm1
718; SSSE3-NEXT:    psubw {{.*}}(%rip), %xmm1
719; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
720; SSSE3-NEXT:    movdqa %xmm1, %xmm2
721; SSSE3-NEXT:    pand %xmm0, %xmm2
722; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
723; SSSE3-NEXT:    movdqa %xmm3, %xmm4
724; SSSE3-NEXT:    pshufb %xmm2, %xmm4
725; SSSE3-NEXT:    psrlw $4, %xmm1
726; SSSE3-NEXT:    pand %xmm0, %xmm1
727; SSSE3-NEXT:    pshufb %xmm1, %xmm3
728; SSSE3-NEXT:    paddb %xmm4, %xmm3
729; SSSE3-NEXT:    movdqa %xmm3, %xmm0
730; SSSE3-NEXT:    psllw $8, %xmm0
731; SSSE3-NEXT:    paddb %xmm3, %xmm0
732; SSSE3-NEXT:    psrlw $8, %xmm0
733; SSSE3-NEXT:    retq
734;
735; SSE41-LABEL: testv8i16:
736; SSE41:       # BB#0:
737; SSE41-NEXT:    pxor %xmm1, %xmm1
738; SSE41-NEXT:    psubw %xmm0, %xmm1
739; SSE41-NEXT:    pand %xmm0, %xmm1
740; SSE41-NEXT:    psubw {{.*}}(%rip), %xmm1
741; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
742; SSE41-NEXT:    movdqa %xmm1, %xmm2
743; SSE41-NEXT:    pand %xmm0, %xmm2
744; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
745; SSE41-NEXT:    movdqa %xmm3, %xmm4
746; SSE41-NEXT:    pshufb %xmm2, %xmm4
747; SSE41-NEXT:    psrlw $4, %xmm1
748; SSE41-NEXT:    pand %xmm0, %xmm1
749; SSE41-NEXT:    pshufb %xmm1, %xmm3
750; SSE41-NEXT:    paddb %xmm4, %xmm3
751; SSE41-NEXT:    movdqa %xmm3, %xmm0
752; SSE41-NEXT:    psllw $8, %xmm0
753; SSE41-NEXT:    paddb %xmm3, %xmm0
754; SSE41-NEXT:    psrlw $8, %xmm0
755; SSE41-NEXT:    retq
756;
757; AVX1-LABEL: testv8i16:
758; AVX1:       # BB#0:
759; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
760; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
761; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
762; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
763; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
764; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
765; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
766; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
767; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
768; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
769; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
770; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
771; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
772; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
773; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
774; AVX1-NEXT:    retq
775;
776; AVX2-LABEL: testv8i16:
777; AVX2:       # BB#0:
778; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
779; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
780; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
781; AVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
782; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
783; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
784; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
785; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
786; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
787; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
788; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
789; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
790; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
791; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
792; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
793; AVX2-NEXT:    retq
794;
795; AVX512CDVL-LABEL: testv8i16:
796; AVX512CDVL:       # BB#0:
797; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
798; AVX512CDVL-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
799; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
800; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
801; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
802; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm2
803; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
804; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
805; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
806; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
807; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
808; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
809; AVX512CDVL-NEXT:    vpsllw $8, %xmm0, %xmm1
810; AVX512CDVL-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
811; AVX512CDVL-NEXT:    vpsrlw $8, %xmm0, %xmm0
812; AVX512CDVL-NEXT:    retq
813;
814; AVX512CD-LABEL: testv8i16:
815; AVX512CD:       # BB#0:
816; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
817; AVX512CD-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
818; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
819; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
820; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
821; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
822; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
823; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
824; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
825; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
826; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
827; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
828; AVX512CD-NEXT:    vpsllw $8, %xmm0, %xmm1
829; AVX512CD-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
830; AVX512CD-NEXT:    vpsrlw $8, %xmm0, %xmm0
831; AVX512CD-NEXT:    retq
832;
833; X32-SSE-LABEL: testv8i16:
834; X32-SSE:       # BB#0:
835; X32-SSE-NEXT:    pxor %xmm1, %xmm1
836; X32-SSE-NEXT:    psubw %xmm0, %xmm1
837; X32-SSE-NEXT:    pand %xmm0, %xmm1
838; X32-SSE-NEXT:    psubw {{\.LCPI.*}}, %xmm1
839; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
840; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
841; X32-SSE-NEXT:    pand %xmm0, %xmm2
842; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
843; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
844; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
845; X32-SSE-NEXT:    psrlw $4, %xmm1
846; X32-SSE-NEXT:    pand %xmm0, %xmm1
847; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
848; X32-SSE-NEXT:    paddb %xmm4, %xmm3
849; X32-SSE-NEXT:    movdqa %xmm3, %xmm0
850; X32-SSE-NEXT:    psllw $8, %xmm0
851; X32-SSE-NEXT:    paddb %xmm3, %xmm0
852; X32-SSE-NEXT:    psrlw $8, %xmm0
853; X32-SSE-NEXT:    retl
854  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0)
855  ret <8 x i16> %out
856}
857
; testv8i16u - llvm.cttz on a non-constant <8 x i16> argument with the second
; intrinsic operand written as i1 -1 (the zero-is-undefined/poison flag
; described for llvm.cttz in the LLVM LangRef, here set).
;
; Shape of the expected code, as pinned by the assertions below:
;   * every configuration starts with 0 - x (pxor + psubw), ANDs that with x,
;     then subtracts a constant-pool operand whose value is hidden behind a
;     regex (presumably a splat of 1, giving (x & -x) - 1 -- TODO confirm);
;   * the SSE2 and SSE3 configurations then count set bits with shift/mask
;     arithmetic (the 13107 splat is 0x3333 in each 16-bit lane);
;   * the SSSE3 and later configurations, including the 32-bit one, instead
;     look up both nibbles of every byte with pshufb in the 16-entry table
;     [0,1,1,2,...,4] and add the two lookups;
;   * every configuration finishes with psllw $8 / paddb / psrlw $8, which adds
;     the two per-byte counts of each 16-bit lane and leaves the sum in the
;     low byte of that lane.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with that script instead of editing them by hand.
858define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
859; SSE2-LABEL: testv8i16u:
860; SSE2:       # BB#0:
861; SSE2-NEXT:    pxor %xmm1, %xmm1
862; SSE2-NEXT:    psubw %xmm0, %xmm1
863; SSE2-NEXT:    pand %xmm0, %xmm1
864; SSE2-NEXT:    psubw {{.*}}(%rip), %xmm1
865; SSE2-NEXT:    movdqa %xmm1, %xmm0
866; SSE2-NEXT:    psrlw $1, %xmm0
867; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
868; SSE2-NEXT:    psubw %xmm0, %xmm1
869; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
870; SSE2-NEXT:    movdqa %xmm1, %xmm2
871; SSE2-NEXT:    pand %xmm0, %xmm2
872; SSE2-NEXT:    psrlw $2, %xmm1
873; SSE2-NEXT:    pand %xmm0, %xmm1
874; SSE2-NEXT:    paddw %xmm2, %xmm1
875; SSE2-NEXT:    movdqa %xmm1, %xmm2
876; SSE2-NEXT:    psrlw $4, %xmm2
877; SSE2-NEXT:    paddw %xmm1, %xmm2
878; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
879; SSE2-NEXT:    movdqa %xmm2, %xmm0
880; SSE2-NEXT:    psllw $8, %xmm0
881; SSE2-NEXT:    paddb %xmm2, %xmm0
882; SSE2-NEXT:    psrlw $8, %xmm0
883; SSE2-NEXT:    retq
884;
885; SSE3-LABEL: testv8i16u:
886; SSE3:       # BB#0:
887; SSE3-NEXT:    pxor %xmm1, %xmm1
888; SSE3-NEXT:    psubw %xmm0, %xmm1
889; SSE3-NEXT:    pand %xmm0, %xmm1
890; SSE3-NEXT:    psubw {{.*}}(%rip), %xmm1
891; SSE3-NEXT:    movdqa %xmm1, %xmm0
892; SSE3-NEXT:    psrlw $1, %xmm0
893; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
894; SSE3-NEXT:    psubw %xmm0, %xmm1
895; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
896; SSE3-NEXT:    movdqa %xmm1, %xmm2
897; SSE3-NEXT:    pand %xmm0, %xmm2
898; SSE3-NEXT:    psrlw $2, %xmm1
899; SSE3-NEXT:    pand %xmm0, %xmm1
900; SSE3-NEXT:    paddw %xmm2, %xmm1
901; SSE3-NEXT:    movdqa %xmm1, %xmm2
902; SSE3-NEXT:    psrlw $4, %xmm2
903; SSE3-NEXT:    paddw %xmm1, %xmm2
904; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
905; SSE3-NEXT:    movdqa %xmm2, %xmm0
906; SSE3-NEXT:    psllw $8, %xmm0
907; SSE3-NEXT:    paddb %xmm2, %xmm0
908; SSE3-NEXT:    psrlw $8, %xmm0
909; SSE3-NEXT:    retq
910;
911; SSSE3-LABEL: testv8i16u:
912; SSSE3:       # BB#0:
913; SSSE3-NEXT:    pxor %xmm1, %xmm1
914; SSSE3-NEXT:    psubw %xmm0, %xmm1
915; SSSE3-NEXT:    pand %xmm0, %xmm1
916; SSSE3-NEXT:    psubw {{.*}}(%rip), %xmm1
917; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
918; SSSE3-NEXT:    movdqa %xmm1, %xmm2
919; SSSE3-NEXT:    pand %xmm0, %xmm2
920; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
921; SSSE3-NEXT:    movdqa %xmm3, %xmm4
922; SSSE3-NEXT:    pshufb %xmm2, %xmm4
923; SSSE3-NEXT:    psrlw $4, %xmm1
924; SSSE3-NEXT:    pand %xmm0, %xmm1
925; SSSE3-NEXT:    pshufb %xmm1, %xmm3
926; SSSE3-NEXT:    paddb %xmm4, %xmm3
927; SSSE3-NEXT:    movdqa %xmm3, %xmm0
928; SSSE3-NEXT:    psllw $8, %xmm0
929; SSSE3-NEXT:    paddb %xmm3, %xmm0
930; SSSE3-NEXT:    psrlw $8, %xmm0
931; SSSE3-NEXT:    retq
932;
933; SSE41-LABEL: testv8i16u:
934; SSE41:       # BB#0:
935; SSE41-NEXT:    pxor %xmm1, %xmm1
936; SSE41-NEXT:    psubw %xmm0, %xmm1
937; SSE41-NEXT:    pand %xmm0, %xmm1
938; SSE41-NEXT:    psubw {{.*}}(%rip), %xmm1
939; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
940; SSE41-NEXT:    movdqa %xmm1, %xmm2
941; SSE41-NEXT:    pand %xmm0, %xmm2
942; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
943; SSE41-NEXT:    movdqa %xmm3, %xmm4
944; SSE41-NEXT:    pshufb %xmm2, %xmm4
945; SSE41-NEXT:    psrlw $4, %xmm1
946; SSE41-NEXT:    pand %xmm0, %xmm1
947; SSE41-NEXT:    pshufb %xmm1, %xmm3
948; SSE41-NEXT:    paddb %xmm4, %xmm3
949; SSE41-NEXT:    movdqa %xmm3, %xmm0
950; SSE41-NEXT:    psllw $8, %xmm0
951; SSE41-NEXT:    paddb %xmm3, %xmm0
952; SSE41-NEXT:    psrlw $8, %xmm0
953; SSE41-NEXT:    retq
954;
955; AVX1-LABEL: testv8i16u:
956; AVX1:       # BB#0:
957; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
958; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
959; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
960; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
961; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
962; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
963; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
964; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
965; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
966; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
967; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
968; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
969; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
970; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
971; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
972; AVX1-NEXT:    retq
973;
974; AVX2-LABEL: testv8i16u:
975; AVX2:       # BB#0:
976; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
977; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
978; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
979; AVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
980; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
981; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
982; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
983; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
984; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
985; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
986; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
987; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
988; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
989; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
990; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
991; AVX2-NEXT:    retq
992;
993; AVX512CDVL-LABEL: testv8i16u:
994; AVX512CDVL:       # BB#0:
995; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
996; AVX512CDVL-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
997; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
998; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
999; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1000; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm2
1001; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1002; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1003; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
1004; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
1005; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1006; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1007; AVX512CDVL-NEXT:    vpsllw $8, %xmm0, %xmm1
1008; AVX512CDVL-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
1009; AVX512CDVL-NEXT:    vpsrlw $8, %xmm0, %xmm0
1010; AVX512CDVL-NEXT:    retq
1011;
1012; AVX512CD-LABEL: testv8i16u:
1013; AVX512CD:       # BB#0:
1014; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1015; AVX512CD-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
1016; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
1017; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1018; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1019; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
1020; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1021; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1022; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
1023; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
1024; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1025; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1026; AVX512CD-NEXT:    vpsllw $8, %xmm0, %xmm1
1027; AVX512CD-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
1028; AVX512CD-NEXT:    vpsrlw $8, %xmm0, %xmm0
1029; AVX512CD-NEXT:    retq
1030;
1031; X32-SSE-LABEL: testv8i16u:
1032; X32-SSE:       # BB#0:
1033; X32-SSE-NEXT:    pxor %xmm1, %xmm1
1034; X32-SSE-NEXT:    psubw %xmm0, %xmm1
1035; X32-SSE-NEXT:    pand %xmm0, %xmm1
1036; X32-SSE-NEXT:    psubw {{\.LCPI.*}}, %xmm1
1037; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1038; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
1039; X32-SSE-NEXT:    pand %xmm0, %xmm2
1040; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1041; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
1042; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
1043; X32-SSE-NEXT:    psrlw $4, %xmm1
1044; X32-SSE-NEXT:    pand %xmm0, %xmm1
1045; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
1046; X32-SSE-NEXT:    paddb %xmm4, %xmm3
1047; X32-SSE-NEXT:    movdqa %xmm3, %xmm0
1048; X32-SSE-NEXT:    psllw $8, %xmm0
1049; X32-SSE-NEXT:    paddb %xmm3, %xmm0
1050; X32-SSE-NEXT:    psrlw $8, %xmm0
1051; X32-SSE-NEXT:    retl
1052  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1)
1053  ret <8 x i16> %out
1054}
1055
; testv16i8 - llvm.cttz on a non-constant <16 x i8> argument with the second
; intrinsic operand written as i1 0 (the zero-is-undefined/poison flag
; described for llvm.cttz in the LLVM LangRef, here clear).
;
; Shape of the expected code, as pinned by the assertions below:
;   * every configuration starts with 0 - x (pxor + psubb), ANDs that with x,
;     then subtracts a constant-pool operand whose value is hidden behind a
;     regex (presumably a splat of 1, giving (x & -x) - 1 -- TODO confirm);
;   * the SSE2 and SSE3 configurations then count set bits per byte with
;     shift/mask arithmetic (the 51 splat is 0x33 in each byte);
;   * the SSSE3 and later configurations, including the 32-bit one, instead
;     look up both nibbles of every byte with pshufb in the 16-entry table
;     [0,1,1,2,...,4] and add the two lookups with paddb.
; Lanes are single bytes here, so no cross-byte summing step follows the count.
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with that script instead of editing them by hand.
1056define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
1057; SSE2-LABEL: testv16i8:
1058; SSE2:       # BB#0:
1059; SSE2-NEXT:    pxor %xmm1, %xmm1
1060; SSE2-NEXT:    psubb %xmm0, %xmm1
1061; SSE2-NEXT:    pand %xmm0, %xmm1
1062; SSE2-NEXT:    psubb {{.*}}(%rip), %xmm1
1063; SSE2-NEXT:    movdqa %xmm1, %xmm0
1064; SSE2-NEXT:    psrlw $1, %xmm0
1065; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1066; SSE2-NEXT:    psubb %xmm0, %xmm1
1067; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1068; SSE2-NEXT:    movdqa %xmm1, %xmm2
1069; SSE2-NEXT:    pand %xmm0, %xmm2
1070; SSE2-NEXT:    psrlw $2, %xmm1
1071; SSE2-NEXT:    pand %xmm0, %xmm1
1072; SSE2-NEXT:    paddb %xmm2, %xmm1
1073; SSE2-NEXT:    movdqa %xmm1, %xmm0
1074; SSE2-NEXT:    psrlw $4, %xmm0
1075; SSE2-NEXT:    paddb %xmm1, %xmm0
1076; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1077; SSE2-NEXT:    retq
1078;
1079; SSE3-LABEL: testv16i8:
1080; SSE3:       # BB#0:
1081; SSE3-NEXT:    pxor %xmm1, %xmm1
1082; SSE3-NEXT:    psubb %xmm0, %xmm1
1083; SSE3-NEXT:    pand %xmm0, %xmm1
1084; SSE3-NEXT:    psubb {{.*}}(%rip), %xmm1
1085; SSE3-NEXT:    movdqa %xmm1, %xmm0
1086; SSE3-NEXT:    psrlw $1, %xmm0
1087; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1088; SSE3-NEXT:    psubb %xmm0, %xmm1
1089; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1090; SSE3-NEXT:    movdqa %xmm1, %xmm2
1091; SSE3-NEXT:    pand %xmm0, %xmm2
1092; SSE3-NEXT:    psrlw $2, %xmm1
1093; SSE3-NEXT:    pand %xmm0, %xmm1
1094; SSE3-NEXT:    paddb %xmm2, %xmm1
1095; SSE3-NEXT:    movdqa %xmm1, %xmm0
1096; SSE3-NEXT:    psrlw $4, %xmm0
1097; SSE3-NEXT:    paddb %xmm1, %xmm0
1098; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1099; SSE3-NEXT:    retq
1100;
1101; SSSE3-LABEL: testv16i8:
1102; SSSE3:       # BB#0:
1103; SSSE3-NEXT:    pxor %xmm1, %xmm1
1104; SSSE3-NEXT:    psubb %xmm0, %xmm1
1105; SSSE3-NEXT:    pand %xmm0, %xmm1
1106; SSSE3-NEXT:    psubb {{.*}}(%rip), %xmm1
1107; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1108; SSSE3-NEXT:    movdqa %xmm1, %xmm3
1109; SSSE3-NEXT:    pand %xmm2, %xmm3
1110; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1111; SSSE3-NEXT:    movdqa %xmm0, %xmm4
1112; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1113; SSSE3-NEXT:    psrlw $4, %xmm1
1114; SSSE3-NEXT:    pand %xmm2, %xmm1
1115; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1116; SSSE3-NEXT:    paddb %xmm4, %xmm0
1117; SSSE3-NEXT:    retq
1118;
1119; SSE41-LABEL: testv16i8:
1120; SSE41:       # BB#0:
1121; SSE41-NEXT:    pxor %xmm1, %xmm1
1122; SSE41-NEXT:    psubb %xmm0, %xmm1
1123; SSE41-NEXT:    pand %xmm0, %xmm1
1124; SSE41-NEXT:    psubb {{.*}}(%rip), %xmm1
1125; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1126; SSE41-NEXT:    movdqa %xmm1, %xmm3
1127; SSE41-NEXT:    pand %xmm2, %xmm3
1128; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1129; SSE41-NEXT:    movdqa %xmm0, %xmm4
1130; SSE41-NEXT:    pshufb %xmm3, %xmm4
1131; SSE41-NEXT:    psrlw $4, %xmm1
1132; SSE41-NEXT:    pand %xmm2, %xmm1
1133; SSE41-NEXT:    pshufb %xmm1, %xmm0
1134; SSE41-NEXT:    paddb %xmm4, %xmm0
1135; SSE41-NEXT:    retq
1136;
1137; AVX1-LABEL: testv16i8:
1138; AVX1:       # BB#0:
1139; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1140; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
1141; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
1142; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1143; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1144; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
1145; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1146; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1147; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1148; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
1149; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1150; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1151; AVX1-NEXT:    retq
1152;
1153; AVX2-LABEL: testv16i8:
1154; AVX2:       # BB#0:
1155; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1156; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
1157; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
1158; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1159; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1160; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
1161; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1162; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1163; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
1164; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
1165; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1166; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1167; AVX2-NEXT:    retq
1168;
1169; AVX512CDVL-LABEL: testv16i8:
1170; AVX512CDVL:       # BB#0:
1171; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
1172; AVX512CDVL-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
1173; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
1174; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1175; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1176; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm2
1177; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1178; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1179; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
1180; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
1181; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1182; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1183; AVX512CDVL-NEXT:    retq
1184;
1185; AVX512CD-LABEL: testv16i8:
1186; AVX512CD:       # BB#0:
1187; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1188; AVX512CD-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
1189; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
1190; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1191; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1192; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
1193; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1194; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1195; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
1196; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
1197; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1198; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1199; AVX512CD-NEXT:    retq
1200;
1201; X32-SSE-LABEL: testv16i8:
1202; X32-SSE:       # BB#0:
1203; X32-SSE-NEXT:    pxor %xmm1, %xmm1
1204; X32-SSE-NEXT:    psubb %xmm0, %xmm1
1205; X32-SSE-NEXT:    pand %xmm0, %xmm1
1206; X32-SSE-NEXT:    psubb {{\.LCPI.*}}, %xmm1
1207; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1208; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1209; X32-SSE-NEXT:    pand %xmm2, %xmm3
1210; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1211; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
1212; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
1213; X32-SSE-NEXT:    psrlw $4, %xmm1
1214; X32-SSE-NEXT:    pand %xmm2, %xmm1
1215; X32-SSE-NEXT:    pshufb %xmm1, %xmm0
1216; X32-SSE-NEXT:    paddb %xmm4, %xmm0
1217; X32-SSE-NEXT:    retl
1218  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
1219  ret <16 x i8> %out
1220}
1221
; testv16i8u - llvm.cttz on a non-constant <16 x i8> argument with the second
; intrinsic operand written as i1 -1 (the zero-is-undefined/poison flag
; described for llvm.cttz in the LLVM LangRef, here set).
;
; The expected instruction sequences below are, per configuration, the same as
; the ones recorded for @testv16i8 (which passes i1 0): 0 - x, AND with x,
; subtract a constant-pool operand (value hidden behind a regex, presumably a
; splat of 1 -- TODO confirm), then a per-byte bit count done either with
; shift/mask arithmetic (SSE2 and SSE3) or with a pshufb nibble lookup in the
; table [0,1,1,2,...,4] (SSSE3 and later, including the 32-bit configuration).
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with that script instead of editing them by hand.
1222define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
1223; SSE2-LABEL: testv16i8u:
1224; SSE2:       # BB#0:
1225; SSE2-NEXT:    pxor %xmm1, %xmm1
1226; SSE2-NEXT:    psubb %xmm0, %xmm1
1227; SSE2-NEXT:    pand %xmm0, %xmm1
1228; SSE2-NEXT:    psubb {{.*}}(%rip), %xmm1
1229; SSE2-NEXT:    movdqa %xmm1, %xmm0
1230; SSE2-NEXT:    psrlw $1, %xmm0
1231; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1232; SSE2-NEXT:    psubb %xmm0, %xmm1
1233; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1234; SSE2-NEXT:    movdqa %xmm1, %xmm2
1235; SSE2-NEXT:    pand %xmm0, %xmm2
1236; SSE2-NEXT:    psrlw $2, %xmm1
1237; SSE2-NEXT:    pand %xmm0, %xmm1
1238; SSE2-NEXT:    paddb %xmm2, %xmm1
1239; SSE2-NEXT:    movdqa %xmm1, %xmm0
1240; SSE2-NEXT:    psrlw $4, %xmm0
1241; SSE2-NEXT:    paddb %xmm1, %xmm0
1242; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1243; SSE2-NEXT:    retq
1244;
1245; SSE3-LABEL: testv16i8u:
1246; SSE3:       # BB#0:
1247; SSE3-NEXT:    pxor %xmm1, %xmm1
1248; SSE3-NEXT:    psubb %xmm0, %xmm1
1249; SSE3-NEXT:    pand %xmm0, %xmm1
1250; SSE3-NEXT:    psubb {{.*}}(%rip), %xmm1
1251; SSE3-NEXT:    movdqa %xmm1, %xmm0
1252; SSE3-NEXT:    psrlw $1, %xmm0
1253; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1254; SSE3-NEXT:    psubb %xmm0, %xmm1
1255; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1256; SSE3-NEXT:    movdqa %xmm1, %xmm2
1257; SSE3-NEXT:    pand %xmm0, %xmm2
1258; SSE3-NEXT:    psrlw $2, %xmm1
1259; SSE3-NEXT:    pand %xmm0, %xmm1
1260; SSE3-NEXT:    paddb %xmm2, %xmm1
1261; SSE3-NEXT:    movdqa %xmm1, %xmm0
1262; SSE3-NEXT:    psrlw $4, %xmm0
1263; SSE3-NEXT:    paddb %xmm1, %xmm0
1264; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
1265; SSE3-NEXT:    retq
1266;
1267; SSSE3-LABEL: testv16i8u:
1268; SSSE3:       # BB#0:
1269; SSSE3-NEXT:    pxor %xmm1, %xmm1
1270; SSSE3-NEXT:    psubb %xmm0, %xmm1
1271; SSSE3-NEXT:    pand %xmm0, %xmm1
1272; SSSE3-NEXT:    psubb {{.*}}(%rip), %xmm1
1273; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1274; SSSE3-NEXT:    movdqa %xmm1, %xmm3
1275; SSSE3-NEXT:    pand %xmm2, %xmm3
1276; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1277; SSSE3-NEXT:    movdqa %xmm0, %xmm4
1278; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1279; SSSE3-NEXT:    psrlw $4, %xmm1
1280; SSSE3-NEXT:    pand %xmm2, %xmm1
1281; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1282; SSSE3-NEXT:    paddb %xmm4, %xmm0
1283; SSSE3-NEXT:    retq
1284;
1285; SSE41-LABEL: testv16i8u:
1286; SSE41:       # BB#0:
1287; SSE41-NEXT:    pxor %xmm1, %xmm1
1288; SSE41-NEXT:    psubb %xmm0, %xmm1
1289; SSE41-NEXT:    pand %xmm0, %xmm1
1290; SSE41-NEXT:    psubb {{.*}}(%rip), %xmm1
1291; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1292; SSE41-NEXT:    movdqa %xmm1, %xmm3
1293; SSE41-NEXT:    pand %xmm2, %xmm3
1294; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1295; SSE41-NEXT:    movdqa %xmm0, %xmm4
1296; SSE41-NEXT:    pshufb %xmm3, %xmm4
1297; SSE41-NEXT:    psrlw $4, %xmm1
1298; SSE41-NEXT:    pand %xmm2, %xmm1
1299; SSE41-NEXT:    pshufb %xmm1, %xmm0
1300; SSE41-NEXT:    paddb %xmm4, %xmm0
1301; SSE41-NEXT:    retq
1302;
1303; AVX1-LABEL: testv16i8u:
1304; AVX1:       # BB#0:
1305; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1306; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
1307; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
1308; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1309; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1310; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
1311; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1312; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1313; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1314; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
1315; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1316; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1317; AVX1-NEXT:    retq
1318;
1319; AVX2-LABEL: testv16i8u:
1320; AVX2:       # BB#0:
1321; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1322; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
1323; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
1324; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1325; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1326; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
1327; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1328; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1329; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
1330; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
1331; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1332; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1333; AVX2-NEXT:    retq
1334;
1335; AVX512CDVL-LABEL: testv16i8u:
1336; AVX512CDVL:       # BB#0:
1337; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
1338; AVX512CDVL-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
1339; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
1340; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1341; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1342; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm2
1343; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1344; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1345; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
1346; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
1347; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1348; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1349; AVX512CDVL-NEXT:    retq
1350;
1351; AVX512CD-LABEL: testv16i8u:
1352; AVX512CD:       # BB#0:
1353; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1354; AVX512CD-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
1355; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
1356; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1357; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1358; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
1359; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1360; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1361; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
1362; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
1363; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1364; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1365; AVX512CD-NEXT:    retq
1366;
1367; X32-SSE-LABEL: testv16i8u:
1368; X32-SSE:       # BB#0:
1369; X32-SSE-NEXT:    pxor %xmm1, %xmm1
1370; X32-SSE-NEXT:    psubb %xmm0, %xmm1
1371; X32-SSE-NEXT:    pand %xmm0, %xmm1
1372; X32-SSE-NEXT:    psubb {{\.LCPI.*}}, %xmm1
1373; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1374; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1375; X32-SSE-NEXT:    pand %xmm2, %xmm3
1376; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1377; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
1378; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
1379; X32-SSE-NEXT:    psrlw $4, %xmm1
1380; X32-SSE-NEXT:    pand %xmm2, %xmm1
1381; X32-SSE-NEXT:    pshufb %xmm1, %xmm0
1382; X32-SSE-NEXT:    paddb %xmm4, %xmm0
1383; X32-SSE-NEXT:    retl
1384  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
1385  ret <16 x i8> %out
1386}
1387
; foldv2i64 - llvm.cttz applied to the constant vector <i64 256, i64 -1> with
; the second intrinsic operand written as i1 0.
;
; The assertions expect no counting code at all: the immediate 8 (the number of
; trailing zero bits in 256) is materialised in a general-purpose register and
; moved into xmm0 (movd/vmovq from %rax on the 64-bit configurations, movd from
; %eax on the 32-bit one). No instruction writes a value for the second lane,
; whose expected count for the all-ones input -1 is 0.
1388define <2 x i64> @foldv2i64() nounwind {
1389; SSE-LABEL: foldv2i64:
1390; SSE:       # BB#0:
1391; SSE-NEXT:    movl $8, %eax
1392; SSE-NEXT:    movd %rax, %xmm0
1393; SSE-NEXT:    retq
1394;
1395; AVX-LABEL: foldv2i64:
1396; AVX:       # BB#0:
1397; AVX-NEXT:    movl $8, %eax
1398; AVX-NEXT:    vmovq %rax, %xmm0
1399; AVX-NEXT:    retq
1400;
1401; X32-SSE-LABEL: foldv2i64:
1402; X32-SSE:       # BB#0:
1403; X32-SSE-NEXT:    movl $8, %eax
1404; X32-SSE-NEXT:    movd %eax, %xmm0
1405; X32-SSE-NEXT:    retl
1406  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
1407  ret <2 x i64> %out
1408}
1409
; foldv2i64u - llvm.cttz applied to the constant vector <i64 256, i64 -1> with
; the second intrinsic operand written as i1 -1 (flag set). Neither input lane
; is zero.
;
; The assertions expect the call to be folded away: the immediate 8 (the number
; of trailing zero bits in 256) is materialised in a general-purpose register
; and moved into xmm0 (movd/vmovq from %rax on the 64-bit configurations, movd
; from %eax on the 32-bit one). No counting instructions are expected.
1410define <2 x i64> @foldv2i64u() nounwind {
1411; SSE-LABEL: foldv2i64u:
1412; SSE:       # BB#0:
1413; SSE-NEXT:    movl $8, %eax
1414; SSE-NEXT:    movd %rax, %xmm0
1415; SSE-NEXT:    retq
1416;
1417; AVX-LABEL: foldv2i64u:
1418; AVX:       # BB#0:
1419; AVX-NEXT:    movl $8, %eax
1420; AVX-NEXT:    vmovq %rax, %xmm0
1421; AVX-NEXT:    retq
1422;
1423; X32-SSE-LABEL: foldv2i64u:
1424; X32-SSE:       # BB#0:
1425; X32-SSE-NEXT:    movl $8, %eax
1426; X32-SSE-NEXT:    movd %eax, %xmm0
1427; X32-SSE-NEXT:    retl
1428  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
1429  ret <2 x i64> %out
1430}
1431
; foldv4i32 - llvm.cttz applied to the constant vector <256, -1, 0, 255> of i32
; with the second intrinsic operand written as i1 0.
;
; Every configuration is expected to load the already-computed result
; [8,0,32,0] with a single vector move and return; no counting instructions
; appear. The zero input in lane 2 maps to 32, the bit width of the lane.
1432define <4 x i32> @foldv4i32() nounwind {
1433; SSE-LABEL: foldv4i32:
1434; SSE:       # BB#0:
1435; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
1436; SSE-NEXT:    retq
1437;
1438; AVX1-LABEL: foldv4i32:
1439; AVX1:       # BB#0:
1440; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1441; AVX1-NEXT:    retq
1442;
1443; AVX2-LABEL: foldv4i32:
1444; AVX2:       # BB#0:
1445; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1446; AVX2-NEXT:    retq
1447;
1448; AVX512CDVL-LABEL: foldv4i32:
1449; AVX512CDVL:       # BB#0:
1450; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
1451; AVX512CDVL-NEXT:    retq
1452;
1453; AVX512CD-LABEL: foldv4i32:
1454; AVX512CD:       # BB#0:
1455; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1456; AVX512CD-NEXT:    retq
1457;
1458; X32-SSE-LABEL: foldv4i32:
1459; X32-SSE:       # BB#0:
1460; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
1461; X32-SSE-NEXT:    retl
1462  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
1463  ret <4 x i32> %out
1464}
1465
; foldv4i32u - llvm.cttz applied to the constant vector <256, -1, 0, 255> of
; i32 with the second intrinsic operand written as i1 -1 (flag set).
;
; Every configuration is expected to load the already-computed result
; [8,0,32,0] with a single vector move and return. Lane 2 of the input is zero;
; the recorded output pins that lane to 32 even though the flag is set.
; NOTE(review): with the flag set the LangRef does not define the result for a
; zero lane, so the 32 in lane 2 records what llc emitted when the assertions
; were generated rather than a guaranteed value -- confirm before relying on it.
1466define <4 x i32> @foldv4i32u() nounwind {
1467; SSE-LABEL: foldv4i32u:
1468; SSE:       # BB#0:
1469; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
1470; SSE-NEXT:    retq
1471;
1472; AVX1-LABEL: foldv4i32u:
1473; AVX1:       # BB#0:
1474; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1475; AVX1-NEXT:    retq
1476;
1477; AVX2-LABEL: foldv4i32u:
1478; AVX2:       # BB#0:
1479; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1480; AVX2-NEXT:    retq
1481;
1482; AVX512CDVL-LABEL: foldv4i32u:
1483; AVX512CDVL:       # BB#0:
1484; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
1485; AVX512CDVL-NEXT:    retq
1486;
1487; AVX512CD-LABEL: foldv4i32u:
1488; AVX512CD:       # BB#0:
1489; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1490; AVX512CD-NEXT:    retq
1491;
1492; X32-SSE-LABEL: foldv4i32u:
1493; X32-SSE:       # BB#0:
1494; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
1495; X32-SSE-NEXT:    retl
1496  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
1497  ret <4 x i32> %out
1498}
1499
; foldv8i16 - llvm.cttz applied to the constant vector
; <256, -1, 0, 255, -65536, 7, 24, 88> of i16 with the second intrinsic operand
; written as i1 0.
;
; Every configuration is expected to load the already-computed result
; [8,0,16,0,16,0,3,3] with a single vector move and return; no counting
; instructions appear. The zero input in lane 2 maps to 16, the bit width of
; the lane.
; NOTE(review): the literal i16 -65536 in lane 4 does not fit in 16 bits. Its
; expected count is 16, the same as for the explicit zero in lane 2, so it is
; evidently treated as 0 -- presumably the IR parser wraps the oversized
; literal; confirm that relying on this is intended.
1500define <8 x i16> @foldv8i16() nounwind {
1501; SSE-LABEL: foldv8i16:
1502; SSE:       # BB#0:
1503; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1504; SSE-NEXT:    retq
1505;
1506; AVX1-LABEL: foldv8i16:
1507; AVX1:       # BB#0:
1508; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1509; AVX1-NEXT:    retq
1510;
1511; AVX2-LABEL: foldv8i16:
1512; AVX2:       # BB#0:
1513; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1514; AVX2-NEXT:    retq
1515;
1516; AVX512CDVL-LABEL: foldv8i16:
1517; AVX512CDVL:       # BB#0:
1518; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1519; AVX512CDVL-NEXT:    retq
1520;
1521; AVX512CD-LABEL: foldv8i16:
1522; AVX512CD:       # BB#0:
1523; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1524; AVX512CD-NEXT:    retq
1525;
1526; X32-SSE-LABEL: foldv8i16:
1527; X32-SSE:       # BB#0:
1528; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1529; X32-SSE-NEXT:    retl
1530  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
1531  ret <8 x i16> %out
1532}
1533
; foldv8i16u - llvm.cttz applied to the constant vector
; <256, -1, 0, 255, -65536, 7, 24, 88> of i16 with the second intrinsic operand
; written as i1 -1 (flag set).
;
; Every configuration is expected to load the already-computed result
; [8,0,16,0,16,0,3,3] with a single vector move and return; no counting
; instructions appear.
; NOTE(review): lane 2 of the input is zero, and with the flag set the LangRef
; does not define the result for a zero lane, so the 16 recorded there is what
; llc emitted when the assertions were generated rather than a guaranteed
; value -- confirm before relying on it.
; NOTE(review): the literal i16 -65536 in lane 4 does not fit in 16 bits. Its
; expected count is 16, the same as for the explicit zero in lane 2, so it is
; evidently treated as 0 -- presumably the IR parser wraps the oversized
; literal; confirm that relying on this is intended.
1534define <8 x i16> @foldv8i16u() nounwind {
1535; SSE-LABEL: foldv8i16u:
1536; SSE:       # BB#0:
1537; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1538; SSE-NEXT:    retq
1539;
1540; AVX1-LABEL: foldv8i16u:
1541; AVX1:       # BB#0:
1542; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1543; AVX1-NEXT:    retq
1544;
1545; AVX2-LABEL: foldv8i16u:
1546; AVX2:       # BB#0:
1547; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1548; AVX2-NEXT:    retq
1549;
1550; AVX512CDVL-LABEL: foldv8i16u:
1551; AVX512CDVL:       # BB#0:
1552; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1553; AVX512CDVL-NEXT:    retq
1554;
1555; AVX512CD-LABEL: foldv8i16u:
1556; AVX512CD:       # BB#0:
1557; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1558; AVX512CD-NEXT:    retq
1559;
1560; X32-SSE-LABEL: foldv8i16u:
1561; X32-SSE:       # BB#0:
1562; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1563; X32-SSE-NEXT:    retl
1564  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
1565  ret <8 x i16> %out
1566}
1567
; Constant-folding test for llvm.cttz on an all-constant <16 x i8> operand with
; the second (i1) argument set to 0. Every check prefix in this function
; expects the call to vanish, leaving a single move of the constant vector
; [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] followed by a return.
; NOTE(review): the literals i8 256 and i8 -65536 do not fit in 8 bits; their
; expected lanes (8) equal the lane produced by the explicit i8 0, so they
; presumably wrap to zero when the IR is parsed -- confirm against the parser
; in use.
1568define <16 x i8> @foldv16i8() nounwind {
1569; SSE-LABEL: foldv16i8:
1570; SSE:       # BB#0:
1571; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1572; SSE-NEXT:    retq
1573;
1574; AVX1-LABEL: foldv16i8:
1575; AVX1:       # BB#0:
1576; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1577; AVX1-NEXT:    retq
1578;
1579; AVX2-LABEL: foldv16i8:
1580; AVX2:       # BB#0:
1581; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1582; AVX2-NEXT:    retq
1583;
1584; AVX512CDVL-LABEL: foldv16i8:
1585; AVX512CDVL:       # BB#0:
1586; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1587; AVX512CDVL-NEXT:    retq
1588;
1589; AVX512CD-LABEL: foldv16i8:
1590; AVX512CD:       # BB#0:
1591; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1592; AVX512CD-NEXT:    retq
1593;
1594; X32-SSE-LABEL: foldv16i8:
1595; X32-SSE:       # BB#0:
1596; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1597; X32-SSE-NEXT:    retl
1598  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
1599  ret <16 x i8> %out
1600}
1601
; Constant-folding test for llvm.cttz on the same all-constant <16 x i8>
; operand used by foldv16i8, but with the second (i1) argument set to -1 (the
; all-ones, i.e. true, value of i1). The LLVM LangRef describes that argument
; as the flag that makes a zero input produce an undefined result.
; Every check prefix in this function still expects a single move of the
; constant vector [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] followed by a return, so
; the lanes whose input is zero are checked as 8 here as well.
; NOTE(review): i8 256 and i8 -65536 are out of range for i8 and presumably
; wrap to zero when parsed (their expected lanes match the explicit i8 0) --
; confirm.
1602define <16 x i8> @foldv16i8u() nounwind {
1603; SSE-LABEL: foldv16i8u:
1604; SSE:       # BB#0:
1605; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1606; SSE-NEXT:    retq
1607;
1608; AVX1-LABEL: foldv16i8u:
1609; AVX1:       # BB#0:
1610; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1611; AVX1-NEXT:    retq
1612;
1613; AVX2-LABEL: foldv16i8u:
1614; AVX2:       # BB#0:
1615; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1616; AVX2-NEXT:    retq
1617;
1618; AVX512CDVL-LABEL: foldv16i8u:
1619; AVX512CDVL:       # BB#0:
1620; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1621; AVX512CDVL-NEXT:    retq
1622;
1623; AVX512CD-LABEL: foldv16i8u:
1624; AVX512CD:       # BB#0:
1625; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1626; AVX512CD-NEXT:    retq
1627;
1628; X32-SSE-LABEL: foldv16i8u:
1629; X32-SSE:       # BB#0:
1630; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1631; X32-SSE-NEXT:    retl
1632  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
1633  ret <16 x i8> %out
1634}
1635
; Declarations of the llvm.cttz intrinsic, one overload per 128-bit integer
; vector type (<2 x i64>, <4 x i32>, <8 x i16>, <16 x i8>). Each takes the
; vector operand plus an i1 flag and returns a vector of the same type.
1636declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
1637declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
1638declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
1639declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)
1640