1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512CDVL
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefixes=AVX,AVX512CD
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512VPOPCNTDQ
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=AVX512VPOPCNTDQVL
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=BITALG_NOVLX
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=BITALG
14;
15; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
16; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
17
18define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
19; SSE2-LABEL: testv2i64:
20; SSE2:       # %bb.0:
21; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
22; SSE2-NEXT:    paddq %xmm0, %xmm1
23; SSE2-NEXT:    pandn %xmm1, %xmm0
24; SSE2-NEXT:    movdqa %xmm0, %xmm1
25; SSE2-NEXT:    psrlw $1, %xmm1
26; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
27; SSE2-NEXT:    psubb %xmm1, %xmm0
28; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
29; SSE2-NEXT:    movdqa %xmm0, %xmm2
30; SSE2-NEXT:    pand %xmm1, %xmm2
31; SSE2-NEXT:    psrlw $2, %xmm0
32; SSE2-NEXT:    pand %xmm1, %xmm0
33; SSE2-NEXT:    paddb %xmm2, %xmm0
34; SSE2-NEXT:    movdqa %xmm0, %xmm1
35; SSE2-NEXT:    psrlw $4, %xmm1
36; SSE2-NEXT:    paddb %xmm0, %xmm1
37; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
38; SSE2-NEXT:    pxor %xmm0, %xmm0
39; SSE2-NEXT:    psadbw %xmm0, %xmm1
40; SSE2-NEXT:    movdqa %xmm1, %xmm0
41; SSE2-NEXT:    retq
42;
43; SSE3-LABEL: testv2i64:
44; SSE3:       # %bb.0:
45; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
46; SSE3-NEXT:    paddq %xmm0, %xmm1
47; SSE3-NEXT:    pandn %xmm1, %xmm0
48; SSE3-NEXT:    movdqa %xmm0, %xmm1
49; SSE3-NEXT:    psrlw $1, %xmm1
50; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
51; SSE3-NEXT:    psubb %xmm1, %xmm0
52; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
53; SSE3-NEXT:    movdqa %xmm0, %xmm2
54; SSE3-NEXT:    pand %xmm1, %xmm2
55; SSE3-NEXT:    psrlw $2, %xmm0
56; SSE3-NEXT:    pand %xmm1, %xmm0
57; SSE3-NEXT:    paddb %xmm2, %xmm0
58; SSE3-NEXT:    movdqa %xmm0, %xmm1
59; SSE3-NEXT:    psrlw $4, %xmm1
60; SSE3-NEXT:    paddb %xmm0, %xmm1
61; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
62; SSE3-NEXT:    pxor %xmm0, %xmm0
63; SSE3-NEXT:    psadbw %xmm0, %xmm1
64; SSE3-NEXT:    movdqa %xmm1, %xmm0
65; SSE3-NEXT:    retq
66;
67; SSSE3-LABEL: testv2i64:
68; SSSE3:       # %bb.0:
69; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
70; SSSE3-NEXT:    paddq %xmm0, %xmm1
71; SSSE3-NEXT:    pandn %xmm1, %xmm0
72; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
73; SSSE3-NEXT:    movdqa %xmm0, %xmm2
74; SSSE3-NEXT:    pand %xmm1, %xmm2
75; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
76; SSSE3-NEXT:    movdqa %xmm3, %xmm4
77; SSSE3-NEXT:    pshufb %xmm2, %xmm4
78; SSSE3-NEXT:    psrlw $4, %xmm0
79; SSSE3-NEXT:    pand %xmm1, %xmm0
80; SSSE3-NEXT:    pshufb %xmm0, %xmm3
81; SSSE3-NEXT:    paddb %xmm4, %xmm3
82; SSSE3-NEXT:    pxor %xmm0, %xmm0
83; SSSE3-NEXT:    psadbw %xmm3, %xmm0
84; SSSE3-NEXT:    retq
85;
86; SSE41-LABEL: testv2i64:
87; SSE41:       # %bb.0:
88; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
89; SSE41-NEXT:    paddq %xmm0, %xmm1
90; SSE41-NEXT:    pandn %xmm1, %xmm0
91; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
92; SSE41-NEXT:    movdqa %xmm0, %xmm2
93; SSE41-NEXT:    pand %xmm1, %xmm2
94; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
95; SSE41-NEXT:    movdqa %xmm3, %xmm4
96; SSE41-NEXT:    pshufb %xmm2, %xmm4
97; SSE41-NEXT:    psrlw $4, %xmm0
98; SSE41-NEXT:    pand %xmm1, %xmm0
99; SSE41-NEXT:    pshufb %xmm0, %xmm3
100; SSE41-NEXT:    paddb %xmm4, %xmm3
101; SSE41-NEXT:    pxor %xmm0, %xmm0
102; SSE41-NEXT:    psadbw %xmm3, %xmm0
103; SSE41-NEXT:    retq
104;
105; AVX1-LABEL: testv2i64:
106; AVX1:       # %bb.0:
107; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
108; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
109; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
110; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
111; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
112; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
113; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
114; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
115; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
116; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
117; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
118; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
119; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
120; AVX1-NEXT:    retq
121;
122; AVX2-LABEL: testv2i64:
123; AVX2:       # %bb.0:
124; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
125; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
126; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
127; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
128; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
129; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
130; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
131; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
132; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
133; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
134; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
135; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
136; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
137; AVX2-NEXT:    retq
138;
139; AVX512CDVL-LABEL: testv2i64:
140; AVX512CDVL:       # %bb.0:
141; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
142; AVX512CDVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
143; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
144; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
145; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
146; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
147; AVX512CDVL-NEXT:    retq
148;
149; AVX512CD-LABEL: testv2i64:
150; AVX512CD:       # %bb.0:
151; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
152; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
153; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
154; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
155; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
156; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
157; AVX512CD-NEXT:    vzeroupper
158; AVX512CD-NEXT:    retq
159;
160; AVX512VPOPCNTDQ-LABEL: testv2i64:
161; AVX512VPOPCNTDQ:       # %bb.0:
162; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
163; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
164; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
165; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
166; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
167; AVX512VPOPCNTDQ-NEXT:    vzeroupper
168; AVX512VPOPCNTDQ-NEXT:    retq
169;
170; AVX512VPOPCNTDQVL-LABEL: testv2i64:
171; AVX512VPOPCNTDQVL:       # %bb.0:
172; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
173; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
174; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
175; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
176; AVX512VPOPCNTDQVL-NEXT:    retq
177;
178; BITALG_NOVLX-LABEL: testv2i64:
179; BITALG_NOVLX:       # %bb.0:
180; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
181; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
182; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
183; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
184; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
185; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
186; BITALG_NOVLX-NEXT:    vzeroupper
187; BITALG_NOVLX-NEXT:    retq
188;
189; BITALG-LABEL: testv2i64:
190; BITALG:       # %bb.0:
191; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
192; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
193; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
194; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
195; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
196; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
197; BITALG-NEXT:    retq
198;
199; X32-SSE-LABEL: testv2i64:
200; X32-SSE:       # %bb.0:
201; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
202; X32-SSE-NEXT:    paddq %xmm0, %xmm1
203; X32-SSE-NEXT:    pandn %xmm1, %xmm0
204; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
205; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
206; X32-SSE-NEXT:    pand %xmm1, %xmm2
207; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
208; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
209; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
210; X32-SSE-NEXT:    psrlw $4, %xmm0
211; X32-SSE-NEXT:    pand %xmm1, %xmm0
212; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
213; X32-SSE-NEXT:    paddb %xmm4, %xmm3
214; X32-SSE-NEXT:    pxor %xmm0, %xmm0
215; X32-SSE-NEXT:    psadbw %xmm3, %xmm0
216; X32-SSE-NEXT:    retl
217  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
218  ret <2 x i64> %out
219}
220
221define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
222; SSE2-LABEL: testv2i64u:
223; SSE2:       # %bb.0:
224; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
225; SSE2-NEXT:    paddq %xmm0, %xmm1
226; SSE2-NEXT:    pandn %xmm1, %xmm0
227; SSE2-NEXT:    movdqa %xmm0, %xmm1
228; SSE2-NEXT:    psrlw $1, %xmm1
229; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
230; SSE2-NEXT:    psubb %xmm1, %xmm0
231; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
232; SSE2-NEXT:    movdqa %xmm0, %xmm2
233; SSE2-NEXT:    pand %xmm1, %xmm2
234; SSE2-NEXT:    psrlw $2, %xmm0
235; SSE2-NEXT:    pand %xmm1, %xmm0
236; SSE2-NEXT:    paddb %xmm2, %xmm0
237; SSE2-NEXT:    movdqa %xmm0, %xmm1
238; SSE2-NEXT:    psrlw $4, %xmm1
239; SSE2-NEXT:    paddb %xmm0, %xmm1
240; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
241; SSE2-NEXT:    pxor %xmm0, %xmm0
242; SSE2-NEXT:    psadbw %xmm0, %xmm1
243; SSE2-NEXT:    movdqa %xmm1, %xmm0
244; SSE2-NEXT:    retq
245;
246; SSE3-LABEL: testv2i64u:
247; SSE3:       # %bb.0:
248; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
249; SSE3-NEXT:    paddq %xmm0, %xmm1
250; SSE3-NEXT:    pandn %xmm1, %xmm0
251; SSE3-NEXT:    movdqa %xmm0, %xmm1
252; SSE3-NEXT:    psrlw $1, %xmm1
253; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
254; SSE3-NEXT:    psubb %xmm1, %xmm0
255; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
256; SSE3-NEXT:    movdqa %xmm0, %xmm2
257; SSE3-NEXT:    pand %xmm1, %xmm2
258; SSE3-NEXT:    psrlw $2, %xmm0
259; SSE3-NEXT:    pand %xmm1, %xmm0
260; SSE3-NEXT:    paddb %xmm2, %xmm0
261; SSE3-NEXT:    movdqa %xmm0, %xmm1
262; SSE3-NEXT:    psrlw $4, %xmm1
263; SSE3-NEXT:    paddb %xmm0, %xmm1
264; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
265; SSE3-NEXT:    pxor %xmm0, %xmm0
266; SSE3-NEXT:    psadbw %xmm0, %xmm1
267; SSE3-NEXT:    movdqa %xmm1, %xmm0
268; SSE3-NEXT:    retq
269;
270; SSSE3-LABEL: testv2i64u:
271; SSSE3:       # %bb.0:
272; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
273; SSSE3-NEXT:    paddq %xmm0, %xmm1
274; SSSE3-NEXT:    pandn %xmm1, %xmm0
275; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
276; SSSE3-NEXT:    movdqa %xmm0, %xmm2
277; SSSE3-NEXT:    pand %xmm1, %xmm2
278; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
279; SSSE3-NEXT:    movdqa %xmm3, %xmm4
280; SSSE3-NEXT:    pshufb %xmm2, %xmm4
281; SSSE3-NEXT:    psrlw $4, %xmm0
282; SSSE3-NEXT:    pand %xmm1, %xmm0
283; SSSE3-NEXT:    pshufb %xmm0, %xmm3
284; SSSE3-NEXT:    paddb %xmm4, %xmm3
285; SSSE3-NEXT:    pxor %xmm0, %xmm0
286; SSSE3-NEXT:    psadbw %xmm3, %xmm0
287; SSSE3-NEXT:    retq
288;
289; SSE41-LABEL: testv2i64u:
290; SSE41:       # %bb.0:
291; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
292; SSE41-NEXT:    paddq %xmm0, %xmm1
293; SSE41-NEXT:    pandn %xmm1, %xmm0
294; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
295; SSE41-NEXT:    movdqa %xmm0, %xmm2
296; SSE41-NEXT:    pand %xmm1, %xmm2
297; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
298; SSE41-NEXT:    movdqa %xmm3, %xmm4
299; SSE41-NEXT:    pshufb %xmm2, %xmm4
300; SSE41-NEXT:    psrlw $4, %xmm0
301; SSE41-NEXT:    pand %xmm1, %xmm0
302; SSE41-NEXT:    pshufb %xmm0, %xmm3
303; SSE41-NEXT:    paddb %xmm4, %xmm3
304; SSE41-NEXT:    pxor %xmm0, %xmm0
305; SSE41-NEXT:    psadbw %xmm3, %xmm0
306; SSE41-NEXT:    retq
307;
308; AVX1-LABEL: testv2i64u:
309; AVX1:       # %bb.0:
310; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
311; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
312; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
313; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
314; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
315; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
316; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
317; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
318; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
319; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
320; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
321; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
322; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
323; AVX1-NEXT:    retq
324;
325; AVX2-LABEL: testv2i64u:
326; AVX2:       # %bb.0:
327; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
328; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
329; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
330; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
331; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
332; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
333; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
334; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
335; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
336; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
337; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
338; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
339; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
340; AVX2-NEXT:    retq
341;
342; AVX512CDVL-LABEL: testv2i64u:
343; AVX512CDVL:       # %bb.0:
344; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
345; AVX512CDVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
346; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
347; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
348; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
349; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
350; AVX512CDVL-NEXT:    retq
351;
352; AVX512CD-LABEL: testv2i64u:
353; AVX512CD:       # %bb.0:
354; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
355; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
356; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
357; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
358; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
359; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
360; AVX512CD-NEXT:    vzeroupper
361; AVX512CD-NEXT:    retq
362;
363; AVX512VPOPCNTDQ-LABEL: testv2i64u:
364; AVX512VPOPCNTDQ:       # %bb.0:
365; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
366; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
367; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
368; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
369; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
370; AVX512VPOPCNTDQ-NEXT:    vzeroupper
371; AVX512VPOPCNTDQ-NEXT:    retq
372;
373; AVX512VPOPCNTDQVL-LABEL: testv2i64u:
374; AVX512VPOPCNTDQVL:       # %bb.0:
375; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
376; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
377; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
378; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
379; AVX512VPOPCNTDQVL-NEXT:    retq
380;
381; BITALG_NOVLX-LABEL: testv2i64u:
382; BITALG_NOVLX:       # %bb.0:
383; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
384; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
385; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
386; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
387; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
388; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
389; BITALG_NOVLX-NEXT:    vzeroupper
390; BITALG_NOVLX-NEXT:    retq
391;
392; BITALG-LABEL: testv2i64u:
393; BITALG:       # %bb.0:
394; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
395; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
396; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
397; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
398; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
399; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
400; BITALG-NEXT:    retq
401;
402; X32-SSE-LABEL: testv2i64u:
403; X32-SSE:       # %bb.0:
404; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
405; X32-SSE-NEXT:    paddq %xmm0, %xmm1
406; X32-SSE-NEXT:    pandn %xmm1, %xmm0
407; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
408; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
409; X32-SSE-NEXT:    pand %xmm1, %xmm2
410; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
411; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
412; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
413; X32-SSE-NEXT:    psrlw $4, %xmm0
414; X32-SSE-NEXT:    pand %xmm1, %xmm0
415; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
416; X32-SSE-NEXT:    paddb %xmm4, %xmm3
417; X32-SSE-NEXT:    pxor %xmm0, %xmm0
418; X32-SSE-NEXT:    psadbw %xmm3, %xmm0
419; X32-SSE-NEXT:    retl
420  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
421  ret <2 x i64> %out
422}
423
424define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
425; SSE2-LABEL: testv4i32:
426; SSE2:       # %bb.0:
427; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
428; SSE2-NEXT:    paddd %xmm0, %xmm1
429; SSE2-NEXT:    pandn %xmm1, %xmm0
430; SSE2-NEXT:    movdqa %xmm0, %xmm1
431; SSE2-NEXT:    psrlw $1, %xmm1
432; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
433; SSE2-NEXT:    psubb %xmm1, %xmm0
434; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
435; SSE2-NEXT:    movdqa %xmm0, %xmm2
436; SSE2-NEXT:    pand %xmm1, %xmm2
437; SSE2-NEXT:    psrlw $2, %xmm0
438; SSE2-NEXT:    pand %xmm1, %xmm0
439; SSE2-NEXT:    paddb %xmm2, %xmm0
440; SSE2-NEXT:    movdqa %xmm0, %xmm1
441; SSE2-NEXT:    psrlw $4, %xmm1
442; SSE2-NEXT:    paddb %xmm0, %xmm1
443; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
444; SSE2-NEXT:    pxor %xmm0, %xmm0
445; SSE2-NEXT:    movdqa %xmm1, %xmm2
446; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
447; SSE2-NEXT:    psadbw %xmm0, %xmm2
448; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
449; SSE2-NEXT:    psadbw %xmm0, %xmm1
450; SSE2-NEXT:    packuswb %xmm2, %xmm1
451; SSE2-NEXT:    movdqa %xmm1, %xmm0
452; SSE2-NEXT:    retq
453;
454; SSE3-LABEL: testv4i32:
455; SSE3:       # %bb.0:
456; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
457; SSE3-NEXT:    paddd %xmm0, %xmm1
458; SSE3-NEXT:    pandn %xmm1, %xmm0
459; SSE3-NEXT:    movdqa %xmm0, %xmm1
460; SSE3-NEXT:    psrlw $1, %xmm1
461; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
462; SSE3-NEXT:    psubb %xmm1, %xmm0
463; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
464; SSE3-NEXT:    movdqa %xmm0, %xmm2
465; SSE3-NEXT:    pand %xmm1, %xmm2
466; SSE3-NEXT:    psrlw $2, %xmm0
467; SSE3-NEXT:    pand %xmm1, %xmm0
468; SSE3-NEXT:    paddb %xmm2, %xmm0
469; SSE3-NEXT:    movdqa %xmm0, %xmm1
470; SSE3-NEXT:    psrlw $4, %xmm1
471; SSE3-NEXT:    paddb %xmm0, %xmm1
472; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
473; SSE3-NEXT:    pxor %xmm0, %xmm0
474; SSE3-NEXT:    movdqa %xmm1, %xmm2
475; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
476; SSE3-NEXT:    psadbw %xmm0, %xmm2
477; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
478; SSE3-NEXT:    psadbw %xmm0, %xmm1
479; SSE3-NEXT:    packuswb %xmm2, %xmm1
480; SSE3-NEXT:    movdqa %xmm1, %xmm0
481; SSE3-NEXT:    retq
482;
483; SSSE3-LABEL: testv4i32:
484; SSSE3:       # %bb.0:
485; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
486; SSSE3-NEXT:    paddd %xmm0, %xmm1
487; SSSE3-NEXT:    pandn %xmm1, %xmm0
488; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
489; SSSE3-NEXT:    movdqa %xmm0, %xmm3
490; SSSE3-NEXT:    pand %xmm2, %xmm3
491; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
492; SSSE3-NEXT:    movdqa %xmm1, %xmm4
493; SSSE3-NEXT:    pshufb %xmm3, %xmm4
494; SSSE3-NEXT:    psrlw $4, %xmm0
495; SSSE3-NEXT:    pand %xmm2, %xmm0
496; SSSE3-NEXT:    pshufb %xmm0, %xmm1
497; SSSE3-NEXT:    paddb %xmm4, %xmm1
498; SSSE3-NEXT:    pxor %xmm0, %xmm0
499; SSSE3-NEXT:    movdqa %xmm1, %xmm2
500; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
501; SSSE3-NEXT:    psadbw %xmm0, %xmm2
502; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
503; SSSE3-NEXT:    psadbw %xmm0, %xmm1
504; SSSE3-NEXT:    packuswb %xmm2, %xmm1
505; SSSE3-NEXT:    movdqa %xmm1, %xmm0
506; SSSE3-NEXT:    retq
507;
508; SSE41-LABEL: testv4i32:
509; SSE41:       # %bb.0:
510; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
511; SSE41-NEXT:    paddd %xmm0, %xmm1
512; SSE41-NEXT:    pandn %xmm1, %xmm0
513; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
514; SSE41-NEXT:    movdqa %xmm0, %xmm2
515; SSE41-NEXT:    pand %xmm1, %xmm2
516; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
517; SSE41-NEXT:    movdqa %xmm3, %xmm4
518; SSE41-NEXT:    pshufb %xmm2, %xmm4
519; SSE41-NEXT:    psrlw $4, %xmm0
520; SSE41-NEXT:    pand %xmm1, %xmm0
521; SSE41-NEXT:    pshufb %xmm0, %xmm3
522; SSE41-NEXT:    paddb %xmm4, %xmm3
523; SSE41-NEXT:    pxor %xmm1, %xmm1
524; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
525; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
526; SSE41-NEXT:    psadbw %xmm1, %xmm3
527; SSE41-NEXT:    psadbw %xmm1, %xmm0
528; SSE41-NEXT:    packuswb %xmm3, %xmm0
529; SSE41-NEXT:    retq
530;
531; AVX1-LABEL: testv4i32:
532; AVX1:       # %bb.0:
533; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
534; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
535; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
536; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
537; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
538; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
539; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
540; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
541; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
542; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
543; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
544; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
545; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
546; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
547; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
548; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
549; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
550; AVX1-NEXT:    retq
551;
552; AVX2-LABEL: testv4i32:
553; AVX2:       # %bb.0:
554; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
555; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
556; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
557; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
558; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
559; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
560; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
561; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
562; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
563; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
564; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
565; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
566; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
567; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
568; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
569; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
570; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
571; AVX2-NEXT:    retq
572;
573; AVX512CDVL-LABEL: testv4i32:
574; AVX512CDVL:       # %bb.0:
575; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
576; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
577; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
578; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
579; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
580; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
581; AVX512CDVL-NEXT:    retq
582;
583; AVX512CD-LABEL: testv4i32:
584; AVX512CD:       # %bb.0:
585; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
586; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
587; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
588; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
589; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
590; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
591; AVX512CD-NEXT:    vzeroupper
592; AVX512CD-NEXT:    retq
593;
594; AVX512VPOPCNTDQ-LABEL: testv4i32:
595; AVX512VPOPCNTDQ:       # %bb.0:
596; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
597; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
598; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
599; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
600; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
601; AVX512VPOPCNTDQ-NEXT:    vzeroupper
602; AVX512VPOPCNTDQ-NEXT:    retq
603;
604; AVX512VPOPCNTDQVL-LABEL: testv4i32:
605; AVX512VPOPCNTDQVL:       # %bb.0:
606; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
607; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
608; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
609; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
610; AVX512VPOPCNTDQVL-NEXT:    retq
611;
612; BITALG_NOVLX-LABEL: testv4i32:
613; BITALG_NOVLX:       # %bb.0:
614; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
615; BITALG_NOVLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
616; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
617; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
618; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
619; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
620; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
621; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
622; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
623; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
624; BITALG_NOVLX-NEXT:    vzeroupper
625; BITALG_NOVLX-NEXT:    retq
626;
627; BITALG-LABEL: testv4i32:
628; BITALG:       # %bb.0:
629; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
630; BITALG-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
631; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
632; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
633; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
634; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
635; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
636; BITALG-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
637; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
638; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
639; BITALG-NEXT:    retq
640;
641; X32-SSE-LABEL: testv4i32:
642; X32-SSE:       # %bb.0:
643; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
644; X32-SSE-NEXT:    paddd %xmm0, %xmm1
645; X32-SSE-NEXT:    pandn %xmm1, %xmm0
646; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
647; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
648; X32-SSE-NEXT:    pand %xmm1, %xmm2
649; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
650; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
651; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
652; X32-SSE-NEXT:    psrlw $4, %xmm0
653; X32-SSE-NEXT:    pand %xmm1, %xmm0
654; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
655; X32-SSE-NEXT:    paddb %xmm4, %xmm3
656; X32-SSE-NEXT:    pxor %xmm1, %xmm1
657; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
658; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
659; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
660; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
661; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
662; X32-SSE-NEXT:    retl
663  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
664  ret <4 x i32> %out
665}
666
667define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
668; SSE2-LABEL: testv4i32u:
669; SSE2:       # %bb.0:
670; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
671; SSE2-NEXT:    paddd %xmm0, %xmm1
672; SSE2-NEXT:    pandn %xmm1, %xmm0
673; SSE2-NEXT:    movdqa %xmm0, %xmm1
674; SSE2-NEXT:    psrlw $1, %xmm1
675; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
676; SSE2-NEXT:    psubb %xmm1, %xmm0
677; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
678; SSE2-NEXT:    movdqa %xmm0, %xmm2
679; SSE2-NEXT:    pand %xmm1, %xmm2
680; SSE2-NEXT:    psrlw $2, %xmm0
681; SSE2-NEXT:    pand %xmm1, %xmm0
682; SSE2-NEXT:    paddb %xmm2, %xmm0
683; SSE2-NEXT:    movdqa %xmm0, %xmm1
684; SSE2-NEXT:    psrlw $4, %xmm1
685; SSE2-NEXT:    paddb %xmm0, %xmm1
686; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
687; SSE2-NEXT:    pxor %xmm0, %xmm0
688; SSE2-NEXT:    movdqa %xmm1, %xmm2
689; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
690; SSE2-NEXT:    psadbw %xmm0, %xmm2
691; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
692; SSE2-NEXT:    psadbw %xmm0, %xmm1
693; SSE2-NEXT:    packuswb %xmm2, %xmm1
694; SSE2-NEXT:    movdqa %xmm1, %xmm0
695; SSE2-NEXT:    retq
696;
697; SSE3-LABEL: testv4i32u:
698; SSE3:       # %bb.0:
699; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
700; SSE3-NEXT:    paddd %xmm0, %xmm1
701; SSE3-NEXT:    pandn %xmm1, %xmm0
702; SSE3-NEXT:    movdqa %xmm0, %xmm1
703; SSE3-NEXT:    psrlw $1, %xmm1
704; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
705; SSE3-NEXT:    psubb %xmm1, %xmm0
706; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
707; SSE3-NEXT:    movdqa %xmm0, %xmm2
708; SSE3-NEXT:    pand %xmm1, %xmm2
709; SSE3-NEXT:    psrlw $2, %xmm0
710; SSE3-NEXT:    pand %xmm1, %xmm0
711; SSE3-NEXT:    paddb %xmm2, %xmm0
712; SSE3-NEXT:    movdqa %xmm0, %xmm1
713; SSE3-NEXT:    psrlw $4, %xmm1
714; SSE3-NEXT:    paddb %xmm0, %xmm1
715; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
716; SSE3-NEXT:    pxor %xmm0, %xmm0
717; SSE3-NEXT:    movdqa %xmm1, %xmm2
718; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
719; SSE3-NEXT:    psadbw %xmm0, %xmm2
720; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
721; SSE3-NEXT:    psadbw %xmm0, %xmm1
722; SSE3-NEXT:    packuswb %xmm2, %xmm1
723; SSE3-NEXT:    movdqa %xmm1, %xmm0
724; SSE3-NEXT:    retq
725;
726; SSSE3-LABEL: testv4i32u:
727; SSSE3:       # %bb.0:
728; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
729; SSSE3-NEXT:    paddd %xmm0, %xmm1
730; SSSE3-NEXT:    pandn %xmm1, %xmm0
731; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
732; SSSE3-NEXT:    movdqa %xmm0, %xmm3
733; SSSE3-NEXT:    pand %xmm2, %xmm3
734; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
735; SSSE3-NEXT:    movdqa %xmm1, %xmm4
736; SSSE3-NEXT:    pshufb %xmm3, %xmm4
737; SSSE3-NEXT:    psrlw $4, %xmm0
738; SSSE3-NEXT:    pand %xmm2, %xmm0
739; SSSE3-NEXT:    pshufb %xmm0, %xmm1
740; SSSE3-NEXT:    paddb %xmm4, %xmm1
741; SSSE3-NEXT:    pxor %xmm0, %xmm0
742; SSSE3-NEXT:    movdqa %xmm1, %xmm2
743; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
744; SSSE3-NEXT:    psadbw %xmm0, %xmm2
745; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
746; SSSE3-NEXT:    psadbw %xmm0, %xmm1
747; SSSE3-NEXT:    packuswb %xmm2, %xmm1
748; SSSE3-NEXT:    movdqa %xmm1, %xmm0
749; SSSE3-NEXT:    retq
750;
751; SSE41-LABEL: testv4i32u:
752; SSE41:       # %bb.0:
753; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
754; SSE41-NEXT:    paddd %xmm0, %xmm1
755; SSE41-NEXT:    pandn %xmm1, %xmm0
756; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
757; SSE41-NEXT:    movdqa %xmm0, %xmm2
758; SSE41-NEXT:    pand %xmm1, %xmm2
759; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
760; SSE41-NEXT:    movdqa %xmm3, %xmm4
761; SSE41-NEXT:    pshufb %xmm2, %xmm4
762; SSE41-NEXT:    psrlw $4, %xmm0
763; SSE41-NEXT:    pand %xmm1, %xmm0
764; SSE41-NEXT:    pshufb %xmm0, %xmm3
765; SSE41-NEXT:    paddb %xmm4, %xmm3
766; SSE41-NEXT:    pxor %xmm1, %xmm1
767; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
768; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
769; SSE41-NEXT:    psadbw %xmm1, %xmm3
770; SSE41-NEXT:    psadbw %xmm1, %xmm0
771; SSE41-NEXT:    packuswb %xmm3, %xmm0
772; SSE41-NEXT:    retq
773;
774; AVX1-LABEL: testv4i32u:
775; AVX1:       # %bb.0:
776; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
777; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
778; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
779; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
780; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
781; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
782; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
783; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
784; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
785; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
786; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
787; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
788; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
789; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
790; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
791; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
792; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
793; AVX1-NEXT:    retq
794;
795; AVX2-LABEL: testv4i32u:
796; AVX2:       # %bb.0:
797; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
798; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
799; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
800; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
801; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
802; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
803; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
804; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
805; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
806; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
807; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
808; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
809; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
810; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
811; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
812; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
813; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
814; AVX2-NEXT:    retq
815;
816; AVX512CDVL-LABEL: testv4i32u:
817; AVX512CDVL:       # %bb.0:
818; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
819; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
820; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
821; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
822; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
823; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
824; AVX512CDVL-NEXT:    retq
825;
826; AVX512CD-LABEL: testv4i32u:
827; AVX512CD:       # %bb.0:
828; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
829; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
830; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
831; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
832; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
833; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
834; AVX512CD-NEXT:    vzeroupper
835; AVX512CD-NEXT:    retq
836;
837; AVX512VPOPCNTDQ-LABEL: testv4i32u:
838; AVX512VPOPCNTDQ:       # %bb.0:
839; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
840; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
841; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
842; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
843; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
844; AVX512VPOPCNTDQ-NEXT:    vzeroupper
845; AVX512VPOPCNTDQ-NEXT:    retq
846;
847; AVX512VPOPCNTDQVL-LABEL: testv4i32u:
848; AVX512VPOPCNTDQVL:       # %bb.0:
849; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
850; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
851; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
852; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
853; AVX512VPOPCNTDQVL-NEXT:    retq
854;
855; BITALG_NOVLX-LABEL: testv4i32u:
856; BITALG_NOVLX:       # %bb.0:
857; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
858; BITALG_NOVLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
859; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
860; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
861; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
862; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
863; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
864; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
865; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
866; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
867; BITALG_NOVLX-NEXT:    vzeroupper
868; BITALG_NOVLX-NEXT:    retq
869;
870; BITALG-LABEL: testv4i32u:
871; BITALG:       # %bb.0:
872; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
873; BITALG-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
874; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
875; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
876; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
877; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
878; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
879; BITALG-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
880; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
881; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
882; BITALG-NEXT:    retq
883;
884; X32-SSE-LABEL: testv4i32u:
885; X32-SSE:       # %bb.0:
886; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
887; X32-SSE-NEXT:    paddd %xmm0, %xmm1
888; X32-SSE-NEXT:    pandn %xmm1, %xmm0
889; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
890; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
891; X32-SSE-NEXT:    pand %xmm1, %xmm2
892; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
893; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
894; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
895; X32-SSE-NEXT:    psrlw $4, %xmm0
896; X32-SSE-NEXT:    pand %xmm1, %xmm0
897; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
898; X32-SSE-NEXT:    paddb %xmm4, %xmm3
899; X32-SSE-NEXT:    pxor %xmm1, %xmm1
900; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
901; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
902; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
903; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
904; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
905; X32-SSE-NEXT:    retl
906  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
907  ret <4 x i32> %out
908}
909
910define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
911; SSE2-LABEL: testv8i16:
912; SSE2:       # %bb.0:
913; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
914; SSE2-NEXT:    paddw %xmm0, %xmm1
915; SSE2-NEXT:    pandn %xmm1, %xmm0
916; SSE2-NEXT:    movdqa %xmm0, %xmm1
917; SSE2-NEXT:    psrlw $1, %xmm1
918; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
919; SSE2-NEXT:    psubb %xmm1, %xmm0
920; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
921; SSE2-NEXT:    movdqa %xmm0, %xmm2
922; SSE2-NEXT:    pand %xmm1, %xmm2
923; SSE2-NEXT:    psrlw $2, %xmm0
924; SSE2-NEXT:    pand %xmm1, %xmm0
925; SSE2-NEXT:    paddb %xmm2, %xmm0
926; SSE2-NEXT:    movdqa %xmm0, %xmm1
927; SSE2-NEXT:    psrlw $4, %xmm1
928; SSE2-NEXT:    paddb %xmm0, %xmm1
929; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
930; SSE2-NEXT:    movdqa %xmm1, %xmm0
931; SSE2-NEXT:    psllw $8, %xmm0
932; SSE2-NEXT:    paddb %xmm1, %xmm0
933; SSE2-NEXT:    psrlw $8, %xmm0
934; SSE2-NEXT:    retq
935;
936; SSE3-LABEL: testv8i16:
937; SSE3:       # %bb.0:
938; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
939; SSE3-NEXT:    paddw %xmm0, %xmm1
940; SSE3-NEXT:    pandn %xmm1, %xmm0
941; SSE3-NEXT:    movdqa %xmm0, %xmm1
942; SSE3-NEXT:    psrlw $1, %xmm1
943; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
944; SSE3-NEXT:    psubb %xmm1, %xmm0
945; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
946; SSE3-NEXT:    movdqa %xmm0, %xmm2
947; SSE3-NEXT:    pand %xmm1, %xmm2
948; SSE3-NEXT:    psrlw $2, %xmm0
949; SSE3-NEXT:    pand %xmm1, %xmm0
950; SSE3-NEXT:    paddb %xmm2, %xmm0
951; SSE3-NEXT:    movdqa %xmm0, %xmm1
952; SSE3-NEXT:    psrlw $4, %xmm1
953; SSE3-NEXT:    paddb %xmm0, %xmm1
954; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
955; SSE3-NEXT:    movdqa %xmm1, %xmm0
956; SSE3-NEXT:    psllw $8, %xmm0
957; SSE3-NEXT:    paddb %xmm1, %xmm0
958; SSE3-NEXT:    psrlw $8, %xmm0
959; SSE3-NEXT:    retq
960;
961; SSSE3-LABEL: testv8i16:
962; SSSE3:       # %bb.0:
963; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
964; SSSE3-NEXT:    paddw %xmm0, %xmm1
965; SSSE3-NEXT:    pandn %xmm1, %xmm0
966; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
967; SSSE3-NEXT:    movdqa %xmm0, %xmm2
968; SSSE3-NEXT:    pand %xmm1, %xmm2
969; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
970; SSSE3-NEXT:    movdqa %xmm3, %xmm4
971; SSSE3-NEXT:    pshufb %xmm2, %xmm4
972; SSSE3-NEXT:    psrlw $4, %xmm0
973; SSSE3-NEXT:    pand %xmm1, %xmm0
974; SSSE3-NEXT:    pshufb %xmm0, %xmm3
975; SSSE3-NEXT:    paddb %xmm4, %xmm3
976; SSSE3-NEXT:    movdqa %xmm3, %xmm0
977; SSSE3-NEXT:    psllw $8, %xmm0
978; SSSE3-NEXT:    paddb %xmm3, %xmm0
979; SSSE3-NEXT:    psrlw $8, %xmm0
980; SSSE3-NEXT:    retq
981;
982; SSE41-LABEL: testv8i16:
983; SSE41:       # %bb.0:
984; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
985; SSE41-NEXT:    paddw %xmm0, %xmm1
986; SSE41-NEXT:    pandn %xmm1, %xmm0
987; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
988; SSE41-NEXT:    movdqa %xmm0, %xmm2
989; SSE41-NEXT:    pand %xmm1, %xmm2
990; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
991; SSE41-NEXT:    movdqa %xmm3, %xmm4
992; SSE41-NEXT:    pshufb %xmm2, %xmm4
993; SSE41-NEXT:    psrlw $4, %xmm0
994; SSE41-NEXT:    pand %xmm1, %xmm0
995; SSE41-NEXT:    pshufb %xmm0, %xmm3
996; SSE41-NEXT:    paddb %xmm4, %xmm3
997; SSE41-NEXT:    movdqa %xmm3, %xmm0
998; SSE41-NEXT:    psllw $8, %xmm0
999; SSE41-NEXT:    paddb %xmm3, %xmm0
1000; SSE41-NEXT:    psrlw $8, %xmm0
1001; SSE41-NEXT:    retq
1002;
1003; AVX-LABEL: testv8i16:
1004; AVX:       # %bb.0:
1005; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1006; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1007; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1008; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1009; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
1010; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1011; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1012; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1013; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
1014; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1015; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1016; AVX-NEXT:    vpsllw $8, %xmm0, %xmm1
1017; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
1018; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1019; AVX-NEXT:    retq
1020;
1021; AVX512VPOPCNTDQ-LABEL: testv8i16:
1022; AVX512VPOPCNTDQ:       # %bb.0:
1023; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1024; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1025; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1026; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1027; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
1028; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
1029; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1030; AVX512VPOPCNTDQ-NEXT:    vzeroupper
1031; AVX512VPOPCNTDQ-NEXT:    retq
1032;
1033; AVX512VPOPCNTDQVL-LABEL: testv8i16:
1034; AVX512VPOPCNTDQVL:       # %bb.0:
1035; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1036; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1037; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1038; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1039; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
1040; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
1041; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
1042; AVX512VPOPCNTDQVL-NEXT:    retq
1043;
1044; BITALG_NOVLX-LABEL: testv8i16:
1045; BITALG_NOVLX:       # %bb.0:
1046; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1047; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1048; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1049; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
1050; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1051; BITALG_NOVLX-NEXT:    vzeroupper
1052; BITALG_NOVLX-NEXT:    retq
1053;
1054; BITALG-LABEL: testv8i16:
1055; BITALG:       # %bb.0:
1056; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1057; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1058; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1059; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
1060; BITALG-NEXT:    retq
1061;
1062; X32-SSE-LABEL: testv8i16:
1063; X32-SSE:       # %bb.0:
1064; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
1065; X32-SSE-NEXT:    paddw %xmm0, %xmm1
1066; X32-SSE-NEXT:    pandn %xmm1, %xmm0
1067; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1068; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
1069; X32-SSE-NEXT:    pand %xmm1, %xmm2
1070; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1071; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
1072; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
1073; X32-SSE-NEXT:    psrlw $4, %xmm0
1074; X32-SSE-NEXT:    pand %xmm1, %xmm0
1075; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
1076; X32-SSE-NEXT:    paddb %xmm4, %xmm3
1077; X32-SSE-NEXT:    movdqa %xmm3, %xmm0
1078; X32-SSE-NEXT:    psllw $8, %xmm0
1079; X32-SSE-NEXT:    paddb %xmm3, %xmm0
1080; X32-SSE-NEXT:    psrlw $8, %xmm0
1081; X32-SSE-NEXT:    retl
1082  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0)
1083  ret <8 x i16> %out
1084}
1085
1086define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
1087; SSE2-LABEL: testv8i16u:
1088; SSE2:       # %bb.0:
1089; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
1090; SSE2-NEXT:    paddw %xmm0, %xmm1
1091; SSE2-NEXT:    pandn %xmm1, %xmm0
1092; SSE2-NEXT:    movdqa %xmm0, %xmm1
1093; SSE2-NEXT:    psrlw $1, %xmm1
1094; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1095; SSE2-NEXT:    psubb %xmm1, %xmm0
1096; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1097; SSE2-NEXT:    movdqa %xmm0, %xmm2
1098; SSE2-NEXT:    pand %xmm1, %xmm2
1099; SSE2-NEXT:    psrlw $2, %xmm0
1100; SSE2-NEXT:    pand %xmm1, %xmm0
1101; SSE2-NEXT:    paddb %xmm2, %xmm0
1102; SSE2-NEXT:    movdqa %xmm0, %xmm1
1103; SSE2-NEXT:    psrlw $4, %xmm1
1104; SSE2-NEXT:    paddb %xmm0, %xmm1
1105; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1106; SSE2-NEXT:    movdqa %xmm1, %xmm0
1107; SSE2-NEXT:    psllw $8, %xmm0
1108; SSE2-NEXT:    paddb %xmm1, %xmm0
1109; SSE2-NEXT:    psrlw $8, %xmm0
1110; SSE2-NEXT:    retq
1111;
1112; SSE3-LABEL: testv8i16u:
1113; SSE3:       # %bb.0:
1114; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
1115; SSE3-NEXT:    paddw %xmm0, %xmm1
1116; SSE3-NEXT:    pandn %xmm1, %xmm0
1117; SSE3-NEXT:    movdqa %xmm0, %xmm1
1118; SSE3-NEXT:    psrlw $1, %xmm1
1119; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1120; SSE3-NEXT:    psubb %xmm1, %xmm0
1121; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1122; SSE3-NEXT:    movdqa %xmm0, %xmm2
1123; SSE3-NEXT:    pand %xmm1, %xmm2
1124; SSE3-NEXT:    psrlw $2, %xmm0
1125; SSE3-NEXT:    pand %xmm1, %xmm0
1126; SSE3-NEXT:    paddb %xmm2, %xmm0
1127; SSE3-NEXT:    movdqa %xmm0, %xmm1
1128; SSE3-NEXT:    psrlw $4, %xmm1
1129; SSE3-NEXT:    paddb %xmm0, %xmm1
1130; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1131; SSE3-NEXT:    movdqa %xmm1, %xmm0
1132; SSE3-NEXT:    psllw $8, %xmm0
1133; SSE3-NEXT:    paddb %xmm1, %xmm0
1134; SSE3-NEXT:    psrlw $8, %xmm0
1135; SSE3-NEXT:    retq
1136;
1137; SSSE3-LABEL: testv8i16u:
1138; SSSE3:       # %bb.0:
1139; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
1140; SSSE3-NEXT:    paddw %xmm0, %xmm1
1141; SSSE3-NEXT:    pandn %xmm1, %xmm0
1142; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1143; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1144; SSSE3-NEXT:    pand %xmm1, %xmm2
1145; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1146; SSSE3-NEXT:    movdqa %xmm3, %xmm4
1147; SSSE3-NEXT:    pshufb %xmm2, %xmm4
1148; SSSE3-NEXT:    psrlw $4, %xmm0
1149; SSSE3-NEXT:    pand %xmm1, %xmm0
1150; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1151; SSSE3-NEXT:    paddb %xmm4, %xmm3
1152; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1153; SSSE3-NEXT:    psllw $8, %xmm0
1154; SSSE3-NEXT:    paddb %xmm3, %xmm0
1155; SSSE3-NEXT:    psrlw $8, %xmm0
1156; SSSE3-NEXT:    retq
1157;
1158; SSE41-LABEL: testv8i16u:
1159; SSE41:       # %bb.0:
1160; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
1161; SSE41-NEXT:    paddw %xmm0, %xmm1
1162; SSE41-NEXT:    pandn %xmm1, %xmm0
1163; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1164; SSE41-NEXT:    movdqa %xmm0, %xmm2
1165; SSE41-NEXT:    pand %xmm1, %xmm2
1166; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1167; SSE41-NEXT:    movdqa %xmm3, %xmm4
1168; SSE41-NEXT:    pshufb %xmm2, %xmm4
1169; SSE41-NEXT:    psrlw $4, %xmm0
1170; SSE41-NEXT:    pand %xmm1, %xmm0
1171; SSE41-NEXT:    pshufb %xmm0, %xmm3
1172; SSE41-NEXT:    paddb %xmm4, %xmm3
1173; SSE41-NEXT:    movdqa %xmm3, %xmm0
1174; SSE41-NEXT:    psllw $8, %xmm0
1175; SSE41-NEXT:    paddb %xmm3, %xmm0
1176; SSE41-NEXT:    psrlw $8, %xmm0
1177; SSE41-NEXT:    retq
1178;
1179; AVX-LABEL: testv8i16u:
1180; AVX:       # %bb.0:
1181; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1182; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1183; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1184; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1185; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
1186; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1187; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1188; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1189; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
1190; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1191; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1192; AVX-NEXT:    vpsllw $8, %xmm0, %xmm1
1193; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
1194; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1195; AVX-NEXT:    retq
1196;
1197; AVX512VPOPCNTDQ-LABEL: testv8i16u:
1198; AVX512VPOPCNTDQ:       # %bb.0:
1199; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1200; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1201; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1202; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1203; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
1204; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
1205; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1206; AVX512VPOPCNTDQ-NEXT:    vzeroupper
1207; AVX512VPOPCNTDQ-NEXT:    retq
1208;
1209; AVX512VPOPCNTDQVL-LABEL: testv8i16u:
1210; AVX512VPOPCNTDQVL:       # %bb.0:
1211; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1212; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1213; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1214; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1215; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
1216; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
1217; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
1218; AVX512VPOPCNTDQVL-NEXT:    retq
1219;
1220; BITALG_NOVLX-LABEL: testv8i16u:
1221; BITALG_NOVLX:       # %bb.0:
1222; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1223; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1224; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1225; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
1226; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1227; BITALG_NOVLX-NEXT:    vzeroupper
1228; BITALG_NOVLX-NEXT:    retq
1229;
1230; BITALG-LABEL: testv8i16u:
1231; BITALG:       # %bb.0:
1232; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1233; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
1234; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1235; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
1236; BITALG-NEXT:    retq
1237;
1238; X32-SSE-LABEL: testv8i16u:
1239; X32-SSE:       # %bb.0:
1240; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
1241; X32-SSE-NEXT:    paddw %xmm0, %xmm1
1242; X32-SSE-NEXT:    pandn %xmm1, %xmm0
1243; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1244; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
1245; X32-SSE-NEXT:    pand %xmm1, %xmm2
1246; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1247; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
1248; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
1249; X32-SSE-NEXT:    psrlw $4, %xmm0
1250; X32-SSE-NEXT:    pand %xmm1, %xmm0
1251; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
1252; X32-SSE-NEXT:    paddb %xmm4, %xmm3
1253; X32-SSE-NEXT:    movdqa %xmm3, %xmm0
1254; X32-SSE-NEXT:    psllw $8, %xmm0
1255; X32-SSE-NEXT:    paddb %xmm3, %xmm0
1256; X32-SSE-NEXT:    psrlw $8, %xmm0
1257; X32-SSE-NEXT:    retl
1258  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1)
1259  ret <8 x i16> %out
1260}
1261
1262define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
1263; SSE2-LABEL: testv16i8:
1264; SSE2:       # %bb.0:
1265; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
1266; SSE2-NEXT:    paddb %xmm0, %xmm1
1267; SSE2-NEXT:    pandn %xmm1, %xmm0
1268; SSE2-NEXT:    movdqa %xmm0, %xmm1
1269; SSE2-NEXT:    psrlw $1, %xmm1
1270; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1271; SSE2-NEXT:    psubb %xmm1, %xmm0
1272; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1273; SSE2-NEXT:    movdqa %xmm0, %xmm2
1274; SSE2-NEXT:    pand %xmm1, %xmm2
1275; SSE2-NEXT:    psrlw $2, %xmm0
1276; SSE2-NEXT:    pand %xmm1, %xmm0
1277; SSE2-NEXT:    paddb %xmm2, %xmm0
1278; SSE2-NEXT:    movdqa %xmm0, %xmm1
1279; SSE2-NEXT:    psrlw $4, %xmm1
1280; SSE2-NEXT:    paddb %xmm0, %xmm1
1281; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1282; SSE2-NEXT:    movdqa %xmm1, %xmm0
1283; SSE2-NEXT:    retq
1284;
1285; SSE3-LABEL: testv16i8:
1286; SSE3:       # %bb.0:
1287; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
1288; SSE3-NEXT:    paddb %xmm0, %xmm1
1289; SSE3-NEXT:    pandn %xmm1, %xmm0
1290; SSE3-NEXT:    movdqa %xmm0, %xmm1
1291; SSE3-NEXT:    psrlw $1, %xmm1
1292; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1293; SSE3-NEXT:    psubb %xmm1, %xmm0
1294; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1295; SSE3-NEXT:    movdqa %xmm0, %xmm2
1296; SSE3-NEXT:    pand %xmm1, %xmm2
1297; SSE3-NEXT:    psrlw $2, %xmm0
1298; SSE3-NEXT:    pand %xmm1, %xmm0
1299; SSE3-NEXT:    paddb %xmm2, %xmm0
1300; SSE3-NEXT:    movdqa %xmm0, %xmm1
1301; SSE3-NEXT:    psrlw $4, %xmm1
1302; SSE3-NEXT:    paddb %xmm0, %xmm1
1303; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1304; SSE3-NEXT:    movdqa %xmm1, %xmm0
1305; SSE3-NEXT:    retq
1306;
1307; SSSE3-LABEL: testv16i8:
1308; SSSE3:       # %bb.0:
1309; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
1310; SSSE3-NEXT:    paddb %xmm0, %xmm1
1311; SSSE3-NEXT:    pandn %xmm1, %xmm0
1312; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1313; SSSE3-NEXT:    movdqa %xmm0, %xmm3
1314; SSSE3-NEXT:    pand %xmm2, %xmm3
1315; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1316; SSSE3-NEXT:    movdqa %xmm1, %xmm4
1317; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1318; SSSE3-NEXT:    psrlw $4, %xmm0
1319; SSSE3-NEXT:    pand %xmm2, %xmm0
1320; SSSE3-NEXT:    pshufb %xmm0, %xmm1
1321; SSSE3-NEXT:    paddb %xmm4, %xmm1
1322; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1323; SSSE3-NEXT:    retq
1324;
1325; SSE41-LABEL: testv16i8:
1326; SSE41:       # %bb.0:
1327; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
1328; SSE41-NEXT:    paddb %xmm0, %xmm1
1329; SSE41-NEXT:    pandn %xmm1, %xmm0
1330; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1331; SSE41-NEXT:    movdqa %xmm0, %xmm3
1332; SSE41-NEXT:    pand %xmm2, %xmm3
1333; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1334; SSE41-NEXT:    movdqa %xmm1, %xmm4
1335; SSE41-NEXT:    pshufb %xmm3, %xmm4
1336; SSE41-NEXT:    psrlw $4, %xmm0
1337; SSE41-NEXT:    pand %xmm2, %xmm0
1338; SSE41-NEXT:    pshufb %xmm0, %xmm1
1339; SSE41-NEXT:    paddb %xmm4, %xmm1
1340; SSE41-NEXT:    movdqa %xmm1, %xmm0
1341; SSE41-NEXT:    retq
1342;
1343; AVX-LABEL: testv16i8:
1344; AVX:       # %bb.0:
1345; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1346; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1347; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1348; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1349; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
1350; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1351; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1352; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1353; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
1354; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1355; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1356; AVX-NEXT:    retq
1357;
1358; AVX512VPOPCNTDQ-LABEL: testv16i8:
1359; AVX512VPOPCNTDQ:       # %bb.0:
1360; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1361; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1362; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1363; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1364; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
1365; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
1366; AVX512VPOPCNTDQ-NEXT:    vzeroupper
1367; AVX512VPOPCNTDQ-NEXT:    retq
1368;
1369; AVX512VPOPCNTDQVL-LABEL: testv16i8:
1370; AVX512VPOPCNTDQVL:       # %bb.0:
1371; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1372; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1373; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1374; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1375; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
1376; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
1377; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
1378; AVX512VPOPCNTDQVL-NEXT:    retq
1379;
1380; BITALG_NOVLX-LABEL: testv16i8:
1381; BITALG_NOVLX:       # %bb.0:
1382; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1383; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1384; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1385; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
1386; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1387; BITALG_NOVLX-NEXT:    vzeroupper
1388; BITALG_NOVLX-NEXT:    retq
1389;
1390; BITALG-LABEL: testv16i8:
1391; BITALG:       # %bb.0:
1392; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1393; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1394; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1395; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
1396; BITALG-NEXT:    retq
1397;
1398; X32-SSE-LABEL: testv16i8:
1399; X32-SSE:       # %bb.0:
1400; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
1401; X32-SSE-NEXT:    paddb %xmm0, %xmm1
1402; X32-SSE-NEXT:    pandn %xmm1, %xmm0
1403; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1404; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
1405; X32-SSE-NEXT:    pand %xmm2, %xmm3
1406; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1407; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
1408; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
1409; X32-SSE-NEXT:    psrlw $4, %xmm0
1410; X32-SSE-NEXT:    pand %xmm2, %xmm0
1411; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
1412; X32-SSE-NEXT:    paddb %xmm4, %xmm1
1413; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1414; X32-SSE-NEXT:    retl
1415  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
1416  ret <16 x i8> %out
1417}
1418
1419define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
1420; SSE2-LABEL: testv16i8u:
1421; SSE2:       # %bb.0:
1422; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
1423; SSE2-NEXT:    paddb %xmm0, %xmm1
1424; SSE2-NEXT:    pandn %xmm1, %xmm0
1425; SSE2-NEXT:    movdqa %xmm0, %xmm1
1426; SSE2-NEXT:    psrlw $1, %xmm1
1427; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1428; SSE2-NEXT:    psubb %xmm1, %xmm0
1429; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1430; SSE2-NEXT:    movdqa %xmm0, %xmm2
1431; SSE2-NEXT:    pand %xmm1, %xmm2
1432; SSE2-NEXT:    psrlw $2, %xmm0
1433; SSE2-NEXT:    pand %xmm1, %xmm0
1434; SSE2-NEXT:    paddb %xmm2, %xmm0
1435; SSE2-NEXT:    movdqa %xmm0, %xmm1
1436; SSE2-NEXT:    psrlw $4, %xmm1
1437; SSE2-NEXT:    paddb %xmm0, %xmm1
1438; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1439; SSE2-NEXT:    movdqa %xmm1, %xmm0
1440; SSE2-NEXT:    retq
1441;
1442; SSE3-LABEL: testv16i8u:
1443; SSE3:       # %bb.0:
1444; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
1445; SSE3-NEXT:    paddb %xmm0, %xmm1
1446; SSE3-NEXT:    pandn %xmm1, %xmm0
1447; SSE3-NEXT:    movdqa %xmm0, %xmm1
1448; SSE3-NEXT:    psrlw $1, %xmm1
1449; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1450; SSE3-NEXT:    psubb %xmm1, %xmm0
1451; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1452; SSE3-NEXT:    movdqa %xmm0, %xmm2
1453; SSE3-NEXT:    pand %xmm1, %xmm2
1454; SSE3-NEXT:    psrlw $2, %xmm0
1455; SSE3-NEXT:    pand %xmm1, %xmm0
1456; SSE3-NEXT:    paddb %xmm2, %xmm0
1457; SSE3-NEXT:    movdqa %xmm0, %xmm1
1458; SSE3-NEXT:    psrlw $4, %xmm1
1459; SSE3-NEXT:    paddb %xmm0, %xmm1
1460; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
1461; SSE3-NEXT:    movdqa %xmm1, %xmm0
1462; SSE3-NEXT:    retq
1463;
1464; SSSE3-LABEL: testv16i8u:
1465; SSSE3:       # %bb.0:
1466; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
1467; SSSE3-NEXT:    paddb %xmm0, %xmm1
1468; SSSE3-NEXT:    pandn %xmm1, %xmm0
1469; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1470; SSSE3-NEXT:    movdqa %xmm0, %xmm3
1471; SSSE3-NEXT:    pand %xmm2, %xmm3
1472; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1473; SSSE3-NEXT:    movdqa %xmm1, %xmm4
1474; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1475; SSSE3-NEXT:    psrlw $4, %xmm0
1476; SSSE3-NEXT:    pand %xmm2, %xmm0
1477; SSSE3-NEXT:    pshufb %xmm0, %xmm1
1478; SSSE3-NEXT:    paddb %xmm4, %xmm1
1479; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1480; SSSE3-NEXT:    retq
1481;
1482; SSE41-LABEL: testv16i8u:
1483; SSE41:       # %bb.0:
1484; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
1485; SSE41-NEXT:    paddb %xmm0, %xmm1
1486; SSE41-NEXT:    pandn %xmm1, %xmm0
1487; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1488; SSE41-NEXT:    movdqa %xmm0, %xmm3
1489; SSE41-NEXT:    pand %xmm2, %xmm3
1490; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1491; SSE41-NEXT:    movdqa %xmm1, %xmm4
1492; SSE41-NEXT:    pshufb %xmm3, %xmm4
1493; SSE41-NEXT:    psrlw $4, %xmm0
1494; SSE41-NEXT:    pand %xmm2, %xmm0
1495; SSE41-NEXT:    pshufb %xmm0, %xmm1
1496; SSE41-NEXT:    paddb %xmm4, %xmm1
1497; SSE41-NEXT:    movdqa %xmm1, %xmm0
1498; SSE41-NEXT:    retq
1499;
1500; AVX-LABEL: testv16i8u:
1501; AVX:       # %bb.0:
1502; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1503; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1504; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1505; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1506; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
1507; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1508; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1509; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1510; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
1511; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1512; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1513; AVX-NEXT:    retq
1514;
1515; AVX512VPOPCNTDQ-LABEL: testv16i8u:
1516; AVX512VPOPCNTDQ:       # %bb.0:
1517; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1518; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1519; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1520; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1521; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
1522; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
1523; AVX512VPOPCNTDQ-NEXT:    vzeroupper
1524; AVX512VPOPCNTDQ-NEXT:    retq
1525;
1526; AVX512VPOPCNTDQVL-LABEL: testv16i8u:
1527; AVX512VPOPCNTDQVL:       # %bb.0:
1528; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1529; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1530; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1531; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1532; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
1533; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
1534; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
1535; AVX512VPOPCNTDQVL-NEXT:    retq
1536;
1537; BITALG_NOVLX-LABEL: testv16i8u:
1538; BITALG_NOVLX:       # %bb.0:
1539; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1540; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1541; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1542; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
1543; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1544; BITALG_NOVLX-NEXT:    vzeroupper
1545; BITALG_NOVLX-NEXT:    retq
1546;
1547; BITALG-LABEL: testv16i8u:
1548; BITALG:       # %bb.0:
1549; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1550; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
1551; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
1552; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
1553; BITALG-NEXT:    retq
1554;
1555; X32-SSE-LABEL: testv16i8u:
1556; X32-SSE:       # %bb.0:
1557; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
1558; X32-SSE-NEXT:    paddb %xmm0, %xmm1
1559; X32-SSE-NEXT:    pandn %xmm1, %xmm0
1560; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1561; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
1562; X32-SSE-NEXT:    pand %xmm2, %xmm3
1563; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
1564; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
1565; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
1566; X32-SSE-NEXT:    psrlw $4, %xmm0
1567; X32-SSE-NEXT:    pand %xmm2, %xmm0
1568; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
1569; X32-SSE-NEXT:    paddb %xmm4, %xmm1
1570; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1571; X32-SSE-NEXT:    retl
1572  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
1573  ret <16 x i8> %out
1574}
1575
1576define <2 x i64> @foldv2i64() nounwind {
1577; SSE-LABEL: foldv2i64:
1578; SSE:       # %bb.0:
1579; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
1580; SSE-NEXT:    retq
1581;
1582; AVX-LABEL: foldv2i64:
1583; AVX:       # %bb.0:
1584; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1585; AVX-NEXT:    retq
1586;
1587; AVX512VPOPCNTDQ-LABEL: foldv2i64:
1588; AVX512VPOPCNTDQ:       # %bb.0:
1589; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1590; AVX512VPOPCNTDQ-NEXT:    retq
1591;
1592; AVX512VPOPCNTDQVL-LABEL: foldv2i64:
1593; AVX512VPOPCNTDQVL:       # %bb.0:
1594; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1595; AVX512VPOPCNTDQVL-NEXT:    retq
1596;
1597; BITALG_NOVLX-LABEL: foldv2i64:
1598; BITALG_NOVLX:       # %bb.0:
1599; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1600; BITALG_NOVLX-NEXT:    retq
1601;
1602; BITALG-LABEL: foldv2i64:
1603; BITALG:       # %bb.0:
1604; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1605; BITALG-NEXT:    retq
1606;
1607; X32-SSE-LABEL: foldv2i64:
1608; X32-SSE:       # %bb.0:
1609; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
1610; X32-SSE-NEXT:    retl
1611  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
1612  ret <2 x i64> %out
1613}
1614
1615define <2 x i64> @foldv2i64u() nounwind {
1616; SSE-LABEL: foldv2i64u:
1617; SSE:       # %bb.0:
1618; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
1619; SSE-NEXT:    retq
1620;
1621; AVX-LABEL: foldv2i64u:
1622; AVX:       # %bb.0:
1623; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1624; AVX-NEXT:    retq
1625;
1626; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
1627; AVX512VPOPCNTDQ:       # %bb.0:
1628; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1629; AVX512VPOPCNTDQ-NEXT:    retq
1630;
1631; AVX512VPOPCNTDQVL-LABEL: foldv2i64u:
1632; AVX512VPOPCNTDQVL:       # %bb.0:
1633; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1634; AVX512VPOPCNTDQVL-NEXT:    retq
1635;
1636; BITALG_NOVLX-LABEL: foldv2i64u:
1637; BITALG_NOVLX:       # %bb.0:
1638; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1639; BITALG_NOVLX-NEXT:    retq
1640;
1641; BITALG-LABEL: foldv2i64u:
1642; BITALG:       # %bb.0:
1643; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,0,0]
1644; BITALG-NEXT:    retq
1645;
1646; X32-SSE-LABEL: foldv2i64u:
1647; X32-SSE:       # %bb.0:
1648; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,0,0]
1649; X32-SSE-NEXT:    retl
1650  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
1651  ret <2 x i64> %out
1652}
1653
1654define <4 x i32> @foldv4i32() nounwind {
1655; SSE-LABEL: foldv4i32:
1656; SSE:       # %bb.0:
1657; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
1658; SSE-NEXT:    retq
1659;
1660; AVX-LABEL: foldv4i32:
1661; AVX:       # %bb.0:
1662; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1663; AVX-NEXT:    retq
1664;
1665; AVX512VPOPCNTDQ-LABEL: foldv4i32:
1666; AVX512VPOPCNTDQ:       # %bb.0:
1667; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1668; AVX512VPOPCNTDQ-NEXT:    retq
1669;
1670; AVX512VPOPCNTDQVL-LABEL: foldv4i32:
1671; AVX512VPOPCNTDQVL:       # %bb.0:
1672; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1673; AVX512VPOPCNTDQVL-NEXT:    retq
1674;
1675; BITALG_NOVLX-LABEL: foldv4i32:
1676; BITALG_NOVLX:       # %bb.0:
1677; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1678; BITALG_NOVLX-NEXT:    retq
1679;
1680; BITALG-LABEL: foldv4i32:
1681; BITALG:       # %bb.0:
1682; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1683; BITALG-NEXT:    retq
1684;
1685; X32-SSE-LABEL: foldv4i32:
1686; X32-SSE:       # %bb.0:
1687; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
1688; X32-SSE-NEXT:    retl
1689  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
1690  ret <4 x i32> %out
1691}
1692
1693define <4 x i32> @foldv4i32u() nounwind {
1694; SSE-LABEL: foldv4i32u:
1695; SSE:       # %bb.0:
1696; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
1697; SSE-NEXT:    retq
1698;
1699; AVX-LABEL: foldv4i32u:
1700; AVX:       # %bb.0:
1701; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1702; AVX-NEXT:    retq
1703;
1704; AVX512VPOPCNTDQ-LABEL: foldv4i32u:
1705; AVX512VPOPCNTDQ:       # %bb.0:
1706; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1707; AVX512VPOPCNTDQ-NEXT:    retq
1708;
1709; AVX512VPOPCNTDQVL-LABEL: foldv4i32u:
1710; AVX512VPOPCNTDQVL:       # %bb.0:
1711; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1712; AVX512VPOPCNTDQVL-NEXT:    retq
1713;
1714; BITALG_NOVLX-LABEL: foldv4i32u:
1715; BITALG_NOVLX:       # %bb.0:
1716; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1717; BITALG_NOVLX-NEXT:    retq
1718;
1719; BITALG-LABEL: foldv4i32u:
1720; BITALG:       # %bb.0:
1721; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
1722; BITALG-NEXT:    retq
1723;
1724; X32-SSE-LABEL: foldv4i32u:
1725; X32-SSE:       # %bb.0:
1726; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
1727; X32-SSE-NEXT:    retl
1728  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
1729  ret <4 x i32> %out
1730}
1731
1732define <8 x i16> @foldv8i16() nounwind {
1733; SSE-LABEL: foldv8i16:
1734; SSE:       # %bb.0:
1735; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1736; SSE-NEXT:    retq
1737;
1738; AVX-LABEL: foldv8i16:
1739; AVX:       # %bb.0:
1740; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1741; AVX-NEXT:    retq
1742;
1743; AVX512VPOPCNTDQ-LABEL: foldv8i16:
1744; AVX512VPOPCNTDQ:       # %bb.0:
1745; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1746; AVX512VPOPCNTDQ-NEXT:    retq
1747;
1748; AVX512VPOPCNTDQVL-LABEL: foldv8i16:
1749; AVX512VPOPCNTDQVL:       # %bb.0:
1750; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1751; AVX512VPOPCNTDQVL-NEXT:    retq
1752;
1753; BITALG_NOVLX-LABEL: foldv8i16:
1754; BITALG_NOVLX:       # %bb.0:
1755; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1756; BITALG_NOVLX-NEXT:    retq
1757;
1758; BITALG-LABEL: foldv8i16:
1759; BITALG:       # %bb.0:
1760; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1761; BITALG-NEXT:    retq
1762;
1763; X32-SSE-LABEL: foldv8i16:
1764; X32-SSE:       # %bb.0:
1765; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1766; X32-SSE-NEXT:    retl
1767  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
1768  ret <8 x i16> %out
1769}
1770
1771define <8 x i16> @foldv8i16u() nounwind {
1772; SSE-LABEL: foldv8i16u:
1773; SSE:       # %bb.0:
1774; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1775; SSE-NEXT:    retq
1776;
1777; AVX-LABEL: foldv8i16u:
1778; AVX:       # %bb.0:
1779; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1780; AVX-NEXT:    retq
1781;
1782; AVX512VPOPCNTDQ-LABEL: foldv8i16u:
1783; AVX512VPOPCNTDQ:       # %bb.0:
1784; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1785; AVX512VPOPCNTDQ-NEXT:    retq
1786;
1787; AVX512VPOPCNTDQVL-LABEL: foldv8i16u:
1788; AVX512VPOPCNTDQVL:       # %bb.0:
1789; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1790; AVX512VPOPCNTDQVL-NEXT:    retq
1791;
1792; BITALG_NOVLX-LABEL: foldv8i16u:
1793; BITALG_NOVLX:       # %bb.0:
1794; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1795; BITALG_NOVLX-NEXT:    retq
1796;
1797; BITALG-LABEL: foldv8i16u:
1798; BITALG:       # %bb.0:
1799; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1800; BITALG-NEXT:    retq
1801;
1802; X32-SSE-LABEL: foldv8i16u:
1803; X32-SSE:       # %bb.0:
1804; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
1805; X32-SSE-NEXT:    retl
1806  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
1807  ret <8 x i16> %out
1808}
1809
1810define <16 x i8> @foldv16i8() nounwind {
1811; SSE-LABEL: foldv16i8:
1812; SSE:       # %bb.0:
1813; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1814; SSE-NEXT:    retq
1815;
1816; AVX-LABEL: foldv16i8:
1817; AVX:       # %bb.0:
1818; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1819; AVX-NEXT:    retq
1820;
1821; AVX512VPOPCNTDQ-LABEL: foldv16i8:
1822; AVX512VPOPCNTDQ:       # %bb.0:
1823; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1824; AVX512VPOPCNTDQ-NEXT:    retq
1825;
1826; AVX512VPOPCNTDQVL-LABEL: foldv16i8:
1827; AVX512VPOPCNTDQVL:       # %bb.0:
1828; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1829; AVX512VPOPCNTDQVL-NEXT:    retq
1830;
1831; BITALG_NOVLX-LABEL: foldv16i8:
1832; BITALG_NOVLX:       # %bb.0:
1833; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1834; BITALG_NOVLX-NEXT:    retq
1835;
1836; BITALG-LABEL: foldv16i8:
1837; BITALG:       # %bb.0:
1838; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1839; BITALG-NEXT:    retq
1840;
1841; X32-SSE-LABEL: foldv16i8:
1842; X32-SSE:       # %bb.0:
1843; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1844; X32-SSE-NEXT:    retl
1845  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
1846  ret <16 x i8> %out
1847}
1848
1849define <16 x i8> @foldv16i8u() nounwind {
1850; SSE-LABEL: foldv16i8u:
1851; SSE:       # %bb.0:
1852; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1853; SSE-NEXT:    retq
1854;
1855; AVX-LABEL: foldv16i8u:
1856; AVX:       # %bb.0:
1857; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1858; AVX-NEXT:    retq
1859;
1860; AVX512VPOPCNTDQ-LABEL: foldv16i8u:
1861; AVX512VPOPCNTDQ:       # %bb.0:
1862; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1863; AVX512VPOPCNTDQ-NEXT:    retq
1864;
1865; AVX512VPOPCNTDQVL-LABEL: foldv16i8u:
1866; AVX512VPOPCNTDQVL:       # %bb.0:
1867; AVX512VPOPCNTDQVL-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1868; AVX512VPOPCNTDQVL-NEXT:    retq
1869;
1870; BITALG_NOVLX-LABEL: foldv16i8u:
1871; BITALG_NOVLX:       # %bb.0:
1872; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1873; BITALG_NOVLX-NEXT:    retq
1874;
1875; BITALG-LABEL: foldv16i8u:
1876; BITALG:       # %bb.0:
1877; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1878; BITALG-NEXT:    retq
1879;
1880; X32-SSE-LABEL: foldv16i8u:
1881; X32-SSE:       # %bb.0:
1882; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
1883; X32-SSE-NEXT:    retl
1884  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
1885  ret <16 x i8> %out
1886}
1887
1888declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
1889declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
1890declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
1891declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)
1892