1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
8; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
9; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
10;
11; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
13
; testv2i64 - count leading zeros of each element of a <2 x i64> vector, with
; the second llvm.ctlz argument set to i1 0.
; The check lines below give the expected llc output for each RUN configuration.
; In the SSE and AVX runs the lowering is scalar per element - bsrq, then a
; cmoveq that substitutes 127, then xorq with 63 (127 xor 63 is 64).
; Both AVX512 runs use vplzcntq; the run without avx512vl operates on zmm0.
; The 32-bit run builds each 64-bit count from two 32-bit bsrl results, adding
; 32 to the count taken from the low half.
14define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
15; SSE2-LABEL: testv2i64:
16; SSE2:       # BB#0:
17; SSE2-NEXT:    movd %xmm0, %rax
18; SSE2-NEXT:    bsrq %rax, %rax
19; SSE2-NEXT:    movl $127, %ecx
20; SSE2-NEXT:    cmoveq %rcx, %rax
21; SSE2-NEXT:    xorq $63, %rax
22; SSE2-NEXT:    movd %rax, %xmm1
23; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
24; SSE2-NEXT:    movd %xmm0, %rax
25; SSE2-NEXT:    bsrq %rax, %rax
26; SSE2-NEXT:    cmoveq %rcx, %rax
27; SSE2-NEXT:    xorq $63, %rax
28; SSE2-NEXT:    movd %rax, %xmm0
29; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
30; SSE2-NEXT:    movdqa %xmm1, %xmm0
31; SSE2-NEXT:    retq
32;
33; SSE3-LABEL: testv2i64:
34; SSE3:       # BB#0:
35; SSE3-NEXT:    movd %xmm0, %rax
36; SSE3-NEXT:    bsrq %rax, %rax
37; SSE3-NEXT:    movl $127, %ecx
38; SSE3-NEXT:    cmoveq %rcx, %rax
39; SSE3-NEXT:    xorq $63, %rax
40; SSE3-NEXT:    movd %rax, %xmm1
41; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
42; SSE3-NEXT:    movd %xmm0, %rax
43; SSE3-NEXT:    bsrq %rax, %rax
44; SSE3-NEXT:    cmoveq %rcx, %rax
45; SSE3-NEXT:    xorq $63, %rax
46; SSE3-NEXT:    movd %rax, %xmm0
47; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
48; SSE3-NEXT:    movdqa %xmm1, %xmm0
49; SSE3-NEXT:    retq
50;
51; SSSE3-LABEL: testv2i64:
52; SSSE3:       # BB#0:
53; SSSE3-NEXT:    movd %xmm0, %rax
54; SSSE3-NEXT:    bsrq %rax, %rax
55; SSSE3-NEXT:    movl $127, %ecx
56; SSSE3-NEXT:    cmoveq %rcx, %rax
57; SSSE3-NEXT:    xorq $63, %rax
58; SSSE3-NEXT:    movd %rax, %xmm1
59; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
60; SSSE3-NEXT:    movd %xmm0, %rax
61; SSSE3-NEXT:    bsrq %rax, %rax
62; SSSE3-NEXT:    cmoveq %rcx, %rax
63; SSSE3-NEXT:    xorq $63, %rax
64; SSSE3-NEXT:    movd %rax, %xmm0
65; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
66; SSSE3-NEXT:    movdqa %xmm1, %xmm0
67; SSSE3-NEXT:    retq
68;
69; SSE41-LABEL: testv2i64:
70; SSE41:       # BB#0:
71; SSE41-NEXT:    pextrq $1, %xmm0, %rax
72; SSE41-NEXT:    bsrq %rax, %rax
73; SSE41-NEXT:    movl $127, %ecx
74; SSE41-NEXT:    cmoveq %rcx, %rax
75; SSE41-NEXT:    xorq $63, %rax
76; SSE41-NEXT:    movd %rax, %xmm1
77; SSE41-NEXT:    movd %xmm0, %rax
78; SSE41-NEXT:    bsrq %rax, %rax
79; SSE41-NEXT:    cmoveq %rcx, %rax
80; SSE41-NEXT:    xorq $63, %rax
81; SSE41-NEXT:    movd %rax, %xmm0
82; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
83; SSE41-NEXT:    retq
84;
85; AVX-LABEL: testv2i64:
86; AVX:       # BB#0:
87; AVX-NEXT:    vpextrq $1, %xmm0, %rax
88; AVX-NEXT:    bsrq %rax, %rax
89; AVX-NEXT:    movl $127, %ecx
90; AVX-NEXT:    cmoveq %rcx, %rax
91; AVX-NEXT:    xorq $63, %rax
92; AVX-NEXT:    vmovq %rax, %xmm1
93; AVX-NEXT:    vmovq %xmm0, %rax
94; AVX-NEXT:    bsrq %rax, %rax
95; AVX-NEXT:    cmoveq %rcx, %rax
96; AVX-NEXT:    xorq $63, %rax
97; AVX-NEXT:    vmovq %rax, %xmm0
98; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
99; AVX-NEXT:    retq
100;
101; AVX512VLCD-LABEL: testv2i64:
102; AVX512VLCD:       ## BB#0:
103; AVX512VLCD-NEXT:    vplzcntq %xmm0, %xmm0
104; AVX512VLCD-NEXT:    retq
105;
106; AVX512CD-LABEL: testv2i64:
107; AVX512CD:       ## BB#0:
108; AVX512CD-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
109; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
110; AVX512CD-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
111; AVX512CD-NEXT:    retq
112;
113; X32-SSE-LABEL: testv2i64:
114; X32-SSE:       # BB#0:
115; X32-SSE-NEXT:    pushl %esi
116; X32-SSE-NEXT:    pextrd $2, %xmm0, %eax
117; X32-SSE-NEXT:    bsrl %eax, %eax
118; X32-SSE-NEXT:    movl $63, %ecx
119; X32-SSE-NEXT:    cmovel %ecx, %eax
120; X32-SSE-NEXT:    xorl $31, %eax
121; X32-SSE-NEXT:    addl $32, %eax
122; X32-SSE-NEXT:    pextrd $3, %xmm0, %edx
123; X32-SSE-NEXT:    bsrl %edx, %esi
124; X32-SSE-NEXT:    xorl $31, %esi
125; X32-SSE-NEXT:    testl %edx, %edx
126; X32-SSE-NEXT:    cmovel %eax, %esi
127; X32-SSE-NEXT:    movd %esi, %xmm1
128; X32-SSE-NEXT:    movd %xmm0, %eax
129; X32-SSE-NEXT:    bsrl %eax, %eax
130; X32-SSE-NEXT:    cmovel %ecx, %eax
131; X32-SSE-NEXT:    xorl $31, %eax
132; X32-SSE-NEXT:    addl $32, %eax
133; X32-SSE-NEXT:    pextrd $1, %xmm0, %ecx
134; X32-SSE-NEXT:    bsrl %ecx, %edx
135; X32-SSE-NEXT:    xorl $31, %edx
136; X32-SSE-NEXT:    testl %ecx, %ecx
137; X32-SSE-NEXT:    cmovel %eax, %edx
138; X32-SSE-NEXT:    movd %edx, %xmm0
139; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
140; X32-SSE-NEXT:    popl %esi
141; X32-SSE-NEXT:    retl
142
; The trailing i1 0 is the is_zero_undef flag of llvm.ctlz (see the LLVM
; Language Reference) - false here, so a zero element has a defined result.
143  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
144  ret <2 x i64> %out
145}
146
; testv2i64u - count leading zeros of each element of a <2 x i64> vector, with
; the second llvm.ctlz argument set to i1 -1 (true).
; Compared with testv2i64, the expected 64-bit scalar sequences contain no
; cmoveq - each element is just bsrq followed by xorq with 63.
; Both AVX512 runs still expect a single vplzcntq.
; In the 32-bit run a cmovnel picks the high-half count when the high half is
; nonzero, otherwise the low-half count plus 32.
147define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
148; SSE2-LABEL: testv2i64u:
149; SSE2:       # BB#0:
150; SSE2-NEXT:    movd %xmm0, %rax
151; SSE2-NEXT:    bsrq %rax, %rax
152; SSE2-NEXT:    xorq $63, %rax
153; SSE2-NEXT:    movd %rax, %xmm1
154; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
155; SSE2-NEXT:    movd %xmm0, %rax
156; SSE2-NEXT:    bsrq %rax, %rax
157; SSE2-NEXT:    xorq $63, %rax
158; SSE2-NEXT:    movd %rax, %xmm0
159; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
160; SSE2-NEXT:    movdqa %xmm1, %xmm0
161; SSE2-NEXT:    retq
162;
163; SSE3-LABEL: testv2i64u:
164; SSE3:       # BB#0:
165; SSE3-NEXT:    movd %xmm0, %rax
166; SSE3-NEXT:    bsrq %rax, %rax
167; SSE3-NEXT:    xorq $63, %rax
168; SSE3-NEXT:    movd %rax, %xmm1
169; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
170; SSE3-NEXT:    movd %xmm0, %rax
171; SSE3-NEXT:    bsrq %rax, %rax
172; SSE3-NEXT:    xorq $63, %rax
173; SSE3-NEXT:    movd %rax, %xmm0
174; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
175; SSE3-NEXT:    movdqa %xmm1, %xmm0
176; SSE3-NEXT:    retq
177;
178; SSSE3-LABEL: testv2i64u:
179; SSSE3:       # BB#0:
180; SSSE3-NEXT:    movd %xmm0, %rax
181; SSSE3-NEXT:    bsrq %rax, %rax
182; SSSE3-NEXT:    xorq $63, %rax
183; SSSE3-NEXT:    movd %rax, %xmm1
184; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
185; SSSE3-NEXT:    movd %xmm0, %rax
186; SSSE3-NEXT:    bsrq %rax, %rax
187; SSSE3-NEXT:    xorq $63, %rax
188; SSSE3-NEXT:    movd %rax, %xmm0
189; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
190; SSSE3-NEXT:    movdqa %xmm1, %xmm0
191; SSSE3-NEXT:    retq
192;
193; SSE41-LABEL: testv2i64u:
194; SSE41:       # BB#0:
195; SSE41-NEXT:    pextrq $1, %xmm0, %rax
196; SSE41-NEXT:    bsrq %rax, %rax
197; SSE41-NEXT:    xorq $63, %rax
198; SSE41-NEXT:    movd %rax, %xmm1
199; SSE41-NEXT:    movd %xmm0, %rax
200; SSE41-NEXT:    bsrq %rax, %rax
201; SSE41-NEXT:    xorq $63, %rax
202; SSE41-NEXT:    movd %rax, %xmm0
203; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
204; SSE41-NEXT:    retq
205;
206; AVX-LABEL: testv2i64u:
207; AVX:       # BB#0:
208; AVX-NEXT:    vpextrq $1, %xmm0, %rax
209; AVX-NEXT:    bsrq %rax, %rax
210; AVX-NEXT:    xorq $63, %rax
211; AVX-NEXT:    vmovq %rax, %xmm1
212; AVX-NEXT:    vmovq %xmm0, %rax
213; AVX-NEXT:    bsrq %rax, %rax
214; AVX-NEXT:    xorq $63, %rax
215; AVX-NEXT:    vmovq %rax, %xmm0
216; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
217; AVX-NEXT:    retq
218;
219; AVX512VLCD-LABEL: testv2i64u:
220; AVX512VLCD:       ## BB#0:
221; AVX512VLCD-NEXT:    vplzcntq %xmm0, %xmm0
222; AVX512VLCD-NEXT:    retq
223;
224; AVX512CD-LABEL: testv2i64u:
225; AVX512CD:       ## BB#0:
226; AVX512CD-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
227; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
228; AVX512CD-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
229; AVX512CD-NEXT:    retq
230;
231; X32-SSE-LABEL: testv2i64u:
232; X32-SSE:       # BB#0:
233; X32-SSE-NEXT:    pextrd $3, %xmm0, %eax
234; X32-SSE-NEXT:    bsrl %eax, %ecx
235; X32-SSE-NEXT:    xorl $31, %ecx
236; X32-SSE-NEXT:    pextrd $2, %xmm0, %edx
237; X32-SSE-NEXT:    bsrl %edx, %edx
238; X32-SSE-NEXT:    xorl $31, %edx
239; X32-SSE-NEXT:    addl $32, %edx
240; X32-SSE-NEXT:    testl %eax, %eax
241; X32-SSE-NEXT:    cmovnel %ecx, %edx
242; X32-SSE-NEXT:    movd %edx, %xmm1
243; X32-SSE-NEXT:    pextrd $1, %xmm0, %eax
244; X32-SSE-NEXT:    bsrl %eax, %ecx
245; X32-SSE-NEXT:    xorl $31, %ecx
246; X32-SSE-NEXT:    movd %xmm0, %edx
247; X32-SSE-NEXT:    bsrl %edx, %edx
248; X32-SSE-NEXT:    xorl $31, %edx
249; X32-SSE-NEXT:    addl $32, %edx
250; X32-SSE-NEXT:    testl %eax, %eax
251; X32-SSE-NEXT:    cmovnel %ecx, %edx
252; X32-SSE-NEXT:    movd %edx, %xmm0
253; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
254; X32-SSE-NEXT:    retl
255
; The trailing i1 -1 is the is_zero_undef flag of llvm.ctlz (see the LLVM
; Language Reference) - true here, so the result for a zero element is undefined.
256  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1)
257  ret <2 x i64> %out
258}
259
; testv4i32 - count leading zeros of each element of a <4 x i32> vector, with
; the second llvm.ctlz argument set to i1 0.
; In the SSE and AVX runs, and in the 32-bit run, the lowering is scalar per
; element - bsrl, then a cmovel that substitutes 63, then xorl with 31
; (63 xor 31 is 32).
; The SSE2, SSE3 and SSSE3 runs extract lanes with pshufd/movd and rebuild the
; vector with punpckldq; the SSE4.1, AVX and 32-bit runs use pextrd/pinsrd.
; Both AVX512 runs use vplzcntd; the run without avx512vl operates on zmm0.
260define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
261; SSE2-LABEL: testv4i32:
262; SSE2:       # BB#0:
263; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
264; SSE2-NEXT:    movd %xmm1, %eax
265; SSE2-NEXT:    bsrl %eax, %eax
266; SSE2-NEXT:    movl $63, %ecx
267; SSE2-NEXT:    cmovel %ecx, %eax
268; SSE2-NEXT:    xorl $31, %eax
269; SSE2-NEXT:    movd %eax, %xmm1
270; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
271; SSE2-NEXT:    movd %xmm2, %eax
272; SSE2-NEXT:    bsrl %eax, %eax
273; SSE2-NEXT:    cmovel %ecx, %eax
274; SSE2-NEXT:    xorl $31, %eax
275; SSE2-NEXT:    movd %eax, %xmm2
276; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
277; SSE2-NEXT:    movd %xmm0, %eax
278; SSE2-NEXT:    bsrl %eax, %eax
279; SSE2-NEXT:    cmovel %ecx, %eax
280; SSE2-NEXT:    xorl $31, %eax
281; SSE2-NEXT:    movd %eax, %xmm1
282; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
283; SSE2-NEXT:    movd %xmm0, %eax
284; SSE2-NEXT:    bsrl %eax, %eax
285; SSE2-NEXT:    cmovel %ecx, %eax
286; SSE2-NEXT:    xorl $31, %eax
287; SSE2-NEXT:    movd %eax, %xmm0
288; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
289; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
290; SSE2-NEXT:    movdqa %xmm1, %xmm0
291; SSE2-NEXT:    retq
292;
293; SSE3-LABEL: testv4i32:
294; SSE3:       # BB#0:
295; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
296; SSE3-NEXT:    movd %xmm1, %eax
297; SSE3-NEXT:    bsrl %eax, %eax
298; SSE3-NEXT:    movl $63, %ecx
299; SSE3-NEXT:    cmovel %ecx, %eax
300; SSE3-NEXT:    xorl $31, %eax
301; SSE3-NEXT:    movd %eax, %xmm1
302; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
303; SSE3-NEXT:    movd %xmm2, %eax
304; SSE3-NEXT:    bsrl %eax, %eax
305; SSE3-NEXT:    cmovel %ecx, %eax
306; SSE3-NEXT:    xorl $31, %eax
307; SSE3-NEXT:    movd %eax, %xmm2
308; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
309; SSE3-NEXT:    movd %xmm0, %eax
310; SSE3-NEXT:    bsrl %eax, %eax
311; SSE3-NEXT:    cmovel %ecx, %eax
312; SSE3-NEXT:    xorl $31, %eax
313; SSE3-NEXT:    movd %eax, %xmm1
314; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
315; SSE3-NEXT:    movd %xmm0, %eax
316; SSE3-NEXT:    bsrl %eax, %eax
317; SSE3-NEXT:    cmovel %ecx, %eax
318; SSE3-NEXT:    xorl $31, %eax
319; SSE3-NEXT:    movd %eax, %xmm0
320; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
321; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
322; SSE3-NEXT:    movdqa %xmm1, %xmm0
323; SSE3-NEXT:    retq
324;
325; SSSE3-LABEL: testv4i32:
326; SSSE3:       # BB#0:
327; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
328; SSSE3-NEXT:    movd %xmm1, %eax
329; SSSE3-NEXT:    bsrl %eax, %eax
330; SSSE3-NEXT:    movl $63, %ecx
331; SSSE3-NEXT:    cmovel %ecx, %eax
332; SSSE3-NEXT:    xorl $31, %eax
333; SSSE3-NEXT:    movd %eax, %xmm1
334; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
335; SSSE3-NEXT:    movd %xmm2, %eax
336; SSSE3-NEXT:    bsrl %eax, %eax
337; SSSE3-NEXT:    cmovel %ecx, %eax
338; SSSE3-NEXT:    xorl $31, %eax
339; SSSE3-NEXT:    movd %eax, %xmm2
340; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
341; SSSE3-NEXT:    movd %xmm0, %eax
342; SSSE3-NEXT:    bsrl %eax, %eax
343; SSSE3-NEXT:    cmovel %ecx, %eax
344; SSSE3-NEXT:    xorl $31, %eax
345; SSSE3-NEXT:    movd %eax, %xmm1
346; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
347; SSSE3-NEXT:    movd %xmm0, %eax
348; SSSE3-NEXT:    bsrl %eax, %eax
349; SSSE3-NEXT:    cmovel %ecx, %eax
350; SSSE3-NEXT:    xorl $31, %eax
351; SSSE3-NEXT:    movd %eax, %xmm0
352; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
353; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
354; SSSE3-NEXT:    movdqa %xmm1, %xmm0
355; SSSE3-NEXT:    retq
356;
357; SSE41-LABEL: testv4i32:
358; SSE41:       # BB#0:
359; SSE41-NEXT:    pextrd $1, %xmm0, %eax
360; SSE41-NEXT:    bsrl %eax, %eax
361; SSE41-NEXT:    movl $63, %ecx
362; SSE41-NEXT:    cmovel %ecx, %eax
363; SSE41-NEXT:    xorl $31, %eax
364; SSE41-NEXT:    movd %xmm0, %edx
365; SSE41-NEXT:    bsrl %edx, %edx
366; SSE41-NEXT:    cmovel %ecx, %edx
367; SSE41-NEXT:    xorl $31, %edx
368; SSE41-NEXT:    movd %edx, %xmm1
369; SSE41-NEXT:    pinsrd $1, %eax, %xmm1
370; SSE41-NEXT:    pextrd $2, %xmm0, %eax
371; SSE41-NEXT:    bsrl %eax, %eax
372; SSE41-NEXT:    cmovel %ecx, %eax
373; SSE41-NEXT:    xorl $31, %eax
374; SSE41-NEXT:    pinsrd $2, %eax, %xmm1
375; SSE41-NEXT:    pextrd $3, %xmm0, %eax
376; SSE41-NEXT:    bsrl %eax, %eax
377; SSE41-NEXT:    cmovel %ecx, %eax
378; SSE41-NEXT:    xorl $31, %eax
379; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
380; SSE41-NEXT:    movdqa %xmm1, %xmm0
381; SSE41-NEXT:    retq
382;
383; AVX-LABEL: testv4i32:
384; AVX:       # BB#0:
385; AVX-NEXT:    vpextrd $1, %xmm0, %eax
386; AVX-NEXT:    bsrl %eax, %eax
387; AVX-NEXT:    movl $63, %ecx
388; AVX-NEXT:    cmovel %ecx, %eax
389; AVX-NEXT:    xorl $31, %eax
390; AVX-NEXT:    vmovd %xmm0, %edx
391; AVX-NEXT:    bsrl %edx, %edx
392; AVX-NEXT:    cmovel %ecx, %edx
393; AVX-NEXT:    xorl $31, %edx
394; AVX-NEXT:    vmovd %edx, %xmm1
395; AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
396; AVX-NEXT:    vpextrd $2, %xmm0, %eax
397; AVX-NEXT:    bsrl %eax, %eax
398; AVX-NEXT:    cmovel %ecx, %eax
399; AVX-NEXT:    xorl $31, %eax
400; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
401; AVX-NEXT:    vpextrd $3, %xmm0, %eax
402; AVX-NEXT:    bsrl %eax, %eax
403; AVX-NEXT:    cmovel %ecx, %eax
404; AVX-NEXT:    xorl $31, %eax
405; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
406; AVX-NEXT:    retq
407;
408; AVX512VLCD-LABEL: testv4i32:
409; AVX512VLCD:       ## BB#0:
410; AVX512VLCD-NEXT:    vplzcntd %xmm0, %xmm0
411; AVX512VLCD-NEXT:    retq
412;
413; AVX512CD-LABEL: testv4i32:
414; AVX512CD:       ## BB#0:
415; AVX512CD-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
416; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
417; AVX512CD-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
418; AVX512CD-NEXT:    retq
419;
420; X32-SSE-LABEL: testv4i32:
421; X32-SSE:       # BB#0:
422; X32-SSE-NEXT:    pextrd $1, %xmm0, %eax
423; X32-SSE-NEXT:    bsrl %eax, %eax
424; X32-SSE-NEXT:    movl $63, %ecx
425; X32-SSE-NEXT:    cmovel %ecx, %eax
426; X32-SSE-NEXT:    xorl $31, %eax
427; X32-SSE-NEXT:    movd %xmm0, %edx
428; X32-SSE-NEXT:    bsrl %edx, %edx
429; X32-SSE-NEXT:    cmovel %ecx, %edx
430; X32-SSE-NEXT:    xorl $31, %edx
431; X32-SSE-NEXT:    movd %edx, %xmm1
432; X32-SSE-NEXT:    pinsrd $1, %eax, %xmm1
433; X32-SSE-NEXT:    pextrd $2, %xmm0, %eax
434; X32-SSE-NEXT:    bsrl %eax, %eax
435; X32-SSE-NEXT:    cmovel %ecx, %eax
436; X32-SSE-NEXT:    xorl $31, %eax
437; X32-SSE-NEXT:    pinsrd $2, %eax, %xmm1
438; X32-SSE-NEXT:    pextrd $3, %xmm0, %eax
439; X32-SSE-NEXT:    bsrl %eax, %eax
440; X32-SSE-NEXT:    cmovel %ecx, %eax
441; X32-SSE-NEXT:    xorl $31, %eax
442; X32-SSE-NEXT:    pinsrd $3, %eax, %xmm1
443; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
444; X32-SSE-NEXT:    retl
445
; The trailing i1 0 is the is_zero_undef flag of llvm.ctlz (see the LLVM
; Language Reference) - false here, so a zero element has a defined result.
446  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0)
447  ret <4 x i32> %out
448}
449
; testv4i32u - count leading zeros of each element of a <4 x i32> vector, with
; the second llvm.ctlz argument set to i1 -1 (true).
; Compared with testv4i32, the expected scalar sequences contain no cmovel -
; each element is just bsrl followed by xorl with 31.
; Lane extraction and reassembly are unchanged (pshufd/movd/punpckldq in the
; SSE2, SSE3 and SSSE3 runs, pextrd/pinsrd in the SSE4.1, AVX and 32-bit runs).
; Both AVX512 runs still expect a single vplzcntd.
450define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
451; SSE2-LABEL: testv4i32u:
452; SSE2:       # BB#0:
453; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
454; SSE2-NEXT:    movd %xmm1, %eax
455; SSE2-NEXT:    bsrl %eax, %eax
456; SSE2-NEXT:    xorl $31, %eax
457; SSE2-NEXT:    movd %eax, %xmm1
458; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
459; SSE2-NEXT:    movd %xmm2, %eax
460; SSE2-NEXT:    bsrl %eax, %eax
461; SSE2-NEXT:    xorl $31, %eax
462; SSE2-NEXT:    movd %eax, %xmm2
463; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
464; SSE2-NEXT:    movd %xmm0, %eax
465; SSE2-NEXT:    bsrl %eax, %eax
466; SSE2-NEXT:    xorl $31, %eax
467; SSE2-NEXT:    movd %eax, %xmm1
468; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
469; SSE2-NEXT:    movd %xmm0, %eax
470; SSE2-NEXT:    bsrl %eax, %eax
471; SSE2-NEXT:    xorl $31, %eax
472; SSE2-NEXT:    movd %eax, %xmm0
473; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
474; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
475; SSE2-NEXT:    movdqa %xmm1, %xmm0
476; SSE2-NEXT:    retq
477;
478; SSE3-LABEL: testv4i32u:
479; SSE3:       # BB#0:
480; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
481; SSE3-NEXT:    movd %xmm1, %eax
482; SSE3-NEXT:    bsrl %eax, %eax
483; SSE3-NEXT:    xorl $31, %eax
484; SSE3-NEXT:    movd %eax, %xmm1
485; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
486; SSE3-NEXT:    movd %xmm2, %eax
487; SSE3-NEXT:    bsrl %eax, %eax
488; SSE3-NEXT:    xorl $31, %eax
489; SSE3-NEXT:    movd %eax, %xmm2
490; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
491; SSE3-NEXT:    movd %xmm0, %eax
492; SSE3-NEXT:    bsrl %eax, %eax
493; SSE3-NEXT:    xorl $31, %eax
494; SSE3-NEXT:    movd %eax, %xmm1
495; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
496; SSE3-NEXT:    movd %xmm0, %eax
497; SSE3-NEXT:    bsrl %eax, %eax
498; SSE3-NEXT:    xorl $31, %eax
499; SSE3-NEXT:    movd %eax, %xmm0
500; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
501; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
502; SSE3-NEXT:    movdqa %xmm1, %xmm0
503; SSE3-NEXT:    retq
504;
505; SSSE3-LABEL: testv4i32u:
506; SSSE3:       # BB#0:
507; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
508; SSSE3-NEXT:    movd %xmm1, %eax
509; SSSE3-NEXT:    bsrl %eax, %eax
510; SSSE3-NEXT:    xorl $31, %eax
511; SSSE3-NEXT:    movd %eax, %xmm1
512; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
513; SSSE3-NEXT:    movd %xmm2, %eax
514; SSSE3-NEXT:    bsrl %eax, %eax
515; SSSE3-NEXT:    xorl $31, %eax
516; SSSE3-NEXT:    movd %eax, %xmm2
517; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
518; SSSE3-NEXT:    movd %xmm0, %eax
519; SSSE3-NEXT:    bsrl %eax, %eax
520; SSSE3-NEXT:    xorl $31, %eax
521; SSSE3-NEXT:    movd %eax, %xmm1
522; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
523; SSSE3-NEXT:    movd %xmm0, %eax
524; SSSE3-NEXT:    bsrl %eax, %eax
525; SSSE3-NEXT:    xorl $31, %eax
526; SSSE3-NEXT:    movd %eax, %xmm0
527; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
528; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
529; SSSE3-NEXT:    movdqa %xmm1, %xmm0
530; SSSE3-NEXT:    retq
531;
532; SSE41-LABEL: testv4i32u:
533; SSE41:       # BB#0:
534; SSE41-NEXT:    pextrd $1, %xmm0, %eax
535; SSE41-NEXT:    bsrl %eax, %eax
536; SSE41-NEXT:    xorl $31, %eax
537; SSE41-NEXT:    movd %xmm0, %ecx
538; SSE41-NEXT:    bsrl %ecx, %ecx
539; SSE41-NEXT:    xorl $31, %ecx
540; SSE41-NEXT:    movd %ecx, %xmm1
541; SSE41-NEXT:    pinsrd $1, %eax, %xmm1
542; SSE41-NEXT:    pextrd $2, %xmm0, %eax
543; SSE41-NEXT:    bsrl %eax, %eax
544; SSE41-NEXT:    xorl $31, %eax
545; SSE41-NEXT:    pinsrd $2, %eax, %xmm1
546; SSE41-NEXT:    pextrd $3, %xmm0, %eax
547; SSE41-NEXT:    bsrl %eax, %eax
548; SSE41-NEXT:    xorl $31, %eax
549; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
550; SSE41-NEXT:    movdqa %xmm1, %xmm0
551; SSE41-NEXT:    retq
552;
553; AVX-LABEL: testv4i32u:
554; AVX:       # BB#0:
555; AVX-NEXT:    vpextrd $1, %xmm0, %eax
556; AVX-NEXT:    bsrl %eax, %eax
557; AVX-NEXT:    xorl $31, %eax
558; AVX-NEXT:    vmovd %xmm0, %ecx
559; AVX-NEXT:    bsrl %ecx, %ecx
560; AVX-NEXT:    xorl $31, %ecx
561; AVX-NEXT:    vmovd %ecx, %xmm1
562; AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
563; AVX-NEXT:    vpextrd $2, %xmm0, %eax
564; AVX-NEXT:    bsrl %eax, %eax
565; AVX-NEXT:    xorl $31, %eax
566; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
567; AVX-NEXT:    vpextrd $3, %xmm0, %eax
568; AVX-NEXT:    bsrl %eax, %eax
569; AVX-NEXT:    xorl $31, %eax
570; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
571; AVX-NEXT:    retq
572;
573; AVX512VLCD-LABEL: testv4i32u:
574; AVX512VLCD:       ## BB#0:
575; AVX512VLCD-NEXT:    vplzcntd %xmm0, %xmm0
576; AVX512VLCD-NEXT:    retq
577;
578; AVX512CD-LABEL: testv4i32u:
579; AVX512CD:       ## BB#0:
580; AVX512CD-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
581; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
582; AVX512CD-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
583; AVX512CD-NEXT:    retq
584;
585; X32-SSE-LABEL: testv4i32u:
586; X32-SSE:       # BB#0:
587; X32-SSE-NEXT:    pextrd $1, %xmm0, %eax
588; X32-SSE-NEXT:    bsrl %eax, %eax
589; X32-SSE-NEXT:    xorl $31, %eax
590; X32-SSE-NEXT:    movd %xmm0, %ecx
591; X32-SSE-NEXT:    bsrl %ecx, %ecx
592; X32-SSE-NEXT:    xorl $31, %ecx
593; X32-SSE-NEXT:    movd %ecx, %xmm1
594; X32-SSE-NEXT:    pinsrd $1, %eax, %xmm1
595; X32-SSE-NEXT:    pextrd $2, %xmm0, %eax
596; X32-SSE-NEXT:    bsrl %eax, %eax
597; X32-SSE-NEXT:    xorl $31, %eax
598; X32-SSE-NEXT:    pinsrd $2, %eax, %xmm1
599; X32-SSE-NEXT:    pextrd $3, %xmm0, %eax
600; X32-SSE-NEXT:    bsrl %eax, %eax
601; X32-SSE-NEXT:    xorl $31, %eax
602; X32-SSE-NEXT:    pinsrd $3, %eax, %xmm1
603; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
604; X32-SSE-NEXT:    retl
605
; The trailing i1 -1 is the is_zero_undef flag of llvm.ctlz (see the LLVM
; Language Reference) - true here, so the result for a zero element is undefined.
606  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1)
607  ret <4 x i32> %out
608}
609
; testv8i16 - count leading zeros of each element of an <8 x i16> vector, with
; the second llvm.ctlz argument set to i1 0.
; The SSE2 and SSE3 runs expect scalar code per element - pextrw (movd for
; lane 0), bsrw, a cmovew that substitutes 31, then xorl with 15 (31 xor 15
; is 16) - and rebuild the vector with a tree of punpcklwd.
; The SSSE3, SSE4.1, AVX and 32-bit runs expect a vector sequence - pshufb
; lookups into the table [4,3,2,2,1,1,1,1,0,...] for the low and high nibble
; of every byte, combined with pcmpeqb/pand/paddb and then merged per 16-bit
; lane with psrlw $8 and paddw.
; Both AVX512 runs zero-extend to 32-bit lanes (vpmovzxwd), use vplzcntd,
; truncate with vpmovdw and subtract a RIP-relative constant with vpsubw.
; NOTE(review): that constant is hidden by the {{.*}} pattern; presumably 16
; per lane to undo the widening - confirm against the llc output.
610define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
611; SSE2-LABEL: testv8i16:
612; SSE2:       # BB#0:
613; SSE2-NEXT:    pextrw $7, %xmm0, %eax
614; SSE2-NEXT:    bsrw %ax, %cx
615; SSE2-NEXT:    movw $31, %ax
616; SSE2-NEXT:    cmovew %ax, %cx
617; SSE2-NEXT:    xorl $15, %ecx
618; SSE2-NEXT:    movd %ecx, %xmm1
619; SSE2-NEXT:    pextrw $3, %xmm0, %ecx
620; SSE2-NEXT:    bsrw %cx, %cx
621; SSE2-NEXT:    cmovew %ax, %cx
622; SSE2-NEXT:    xorl $15, %ecx
623; SSE2-NEXT:    movd %ecx, %xmm2
624; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
625; SSE2-NEXT:    pextrw $5, %xmm0, %ecx
626; SSE2-NEXT:    bsrw %cx, %cx
627; SSE2-NEXT:    cmovew %ax, %cx
628; SSE2-NEXT:    xorl $15, %ecx
629; SSE2-NEXT:    movd %ecx, %xmm3
630; SSE2-NEXT:    pextrw $1, %xmm0, %ecx
631; SSE2-NEXT:    bsrw %cx, %cx
632; SSE2-NEXT:    cmovew %ax, %cx
633; SSE2-NEXT:    xorl $15, %ecx
634; SSE2-NEXT:    movd %ecx, %xmm1
635; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
636; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
637; SSE2-NEXT:    pextrw $6, %xmm0, %ecx
638; SSE2-NEXT:    bsrw %cx, %cx
639; SSE2-NEXT:    cmovew %ax, %cx
640; SSE2-NEXT:    xorl $15, %ecx
641; SSE2-NEXT:    movd %ecx, %xmm2
642; SSE2-NEXT:    pextrw $2, %xmm0, %ecx
643; SSE2-NEXT:    bsrw %cx, %cx
644; SSE2-NEXT:    cmovew %ax, %cx
645; SSE2-NEXT:    xorl $15, %ecx
646; SSE2-NEXT:    movd %ecx, %xmm3
647; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
648; SSE2-NEXT:    pextrw $4, %xmm0, %ecx
649; SSE2-NEXT:    bsrw %cx, %cx
650; SSE2-NEXT:    cmovew %ax, %cx
651; SSE2-NEXT:    xorl $15, %ecx
652; SSE2-NEXT:    movd %ecx, %xmm2
653; SSE2-NEXT:    movd %xmm0, %ecx
654; SSE2-NEXT:    bsrw %cx, %cx
655; SSE2-NEXT:    cmovew %ax, %cx
656; SSE2-NEXT:    xorl $15, %ecx
657; SSE2-NEXT:    movd %ecx, %xmm0
658; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
659; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
660; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
661; SSE2-NEXT:    retq
662;
663; SSE3-LABEL: testv8i16:
664; SSE3:       # BB#0:
665; SSE3-NEXT:    pextrw $7, %xmm0, %eax
666; SSE3-NEXT:    bsrw %ax, %cx
667; SSE3-NEXT:    movw $31, %ax
668; SSE3-NEXT:    cmovew %ax, %cx
669; SSE3-NEXT:    xorl $15, %ecx
670; SSE3-NEXT:    movd %ecx, %xmm1
671; SSE3-NEXT:    pextrw $3, %xmm0, %ecx
672; SSE3-NEXT:    bsrw %cx, %cx
673; SSE3-NEXT:    cmovew %ax, %cx
674; SSE3-NEXT:    xorl $15, %ecx
675; SSE3-NEXT:    movd %ecx, %xmm2
676; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
677; SSE3-NEXT:    pextrw $5, %xmm0, %ecx
678; SSE3-NEXT:    bsrw %cx, %cx
679; SSE3-NEXT:    cmovew %ax, %cx
680; SSE3-NEXT:    xorl $15, %ecx
681; SSE3-NEXT:    movd %ecx, %xmm3
682; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
683; SSE3-NEXT:    bsrw %cx, %cx
684; SSE3-NEXT:    cmovew %ax, %cx
685; SSE3-NEXT:    xorl $15, %ecx
686; SSE3-NEXT:    movd %ecx, %xmm1
687; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
688; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
689; SSE3-NEXT:    pextrw $6, %xmm0, %ecx
690; SSE3-NEXT:    bsrw %cx, %cx
691; SSE3-NEXT:    cmovew %ax, %cx
692; SSE3-NEXT:    xorl $15, %ecx
693; SSE3-NEXT:    movd %ecx, %xmm2
694; SSE3-NEXT:    pextrw $2, %xmm0, %ecx
695; SSE3-NEXT:    bsrw %cx, %cx
696; SSE3-NEXT:    cmovew %ax, %cx
697; SSE3-NEXT:    xorl $15, %ecx
698; SSE3-NEXT:    movd %ecx, %xmm3
699; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
700; SSE3-NEXT:    pextrw $4, %xmm0, %ecx
701; SSE3-NEXT:    bsrw %cx, %cx
702; SSE3-NEXT:    cmovew %ax, %cx
703; SSE3-NEXT:    xorl $15, %ecx
704; SSE3-NEXT:    movd %ecx, %xmm2
705; SSE3-NEXT:    movd %xmm0, %ecx
706; SSE3-NEXT:    bsrw %cx, %cx
707; SSE3-NEXT:    cmovew %ax, %cx
708; SSE3-NEXT:    xorl $15, %ecx
709; SSE3-NEXT:    movd %ecx, %xmm0
710; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
711; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
712; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
713; SSE3-NEXT:    retq
714;
715; SSSE3-LABEL: testv8i16:
716; SSSE3:       # BB#0:
717; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
718; SSSE3-NEXT:    movdqa %xmm0, %xmm1
719; SSSE3-NEXT:    pand %xmm2, %xmm1
720; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
721; SSSE3-NEXT:    movdqa %xmm3, %xmm4
722; SSSE3-NEXT:    pshufb %xmm1, %xmm4
723; SSSE3-NEXT:    movdqa %xmm0, %xmm1
724; SSSE3-NEXT:    psrlw $4, %xmm1
725; SSSE3-NEXT:    pand %xmm2, %xmm1
726; SSSE3-NEXT:    pxor %xmm2, %xmm2
727; SSSE3-NEXT:    pshufb %xmm1, %xmm3
728; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
729; SSSE3-NEXT:    pand %xmm4, %xmm1
730; SSSE3-NEXT:    paddb %xmm3, %xmm1
731; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
732; SSSE3-NEXT:    psrlw $8, %xmm0
733; SSSE3-NEXT:    pand %xmm1, %xmm0
734; SSSE3-NEXT:    psrlw $8, %xmm1
735; SSSE3-NEXT:    paddw %xmm0, %xmm1
736; SSSE3-NEXT:    movdqa %xmm1, %xmm0
737; SSSE3-NEXT:    retq
738;
739; SSE41-LABEL: testv8i16:
740; SSE41:       # BB#0:
741; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
742; SSE41-NEXT:    movdqa %xmm0, %xmm1
743; SSE41-NEXT:    pand %xmm2, %xmm1
744; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
745; SSE41-NEXT:    movdqa %xmm3, %xmm4
746; SSE41-NEXT:    pshufb %xmm1, %xmm4
747; SSE41-NEXT:    movdqa %xmm0, %xmm1
748; SSE41-NEXT:    psrlw $4, %xmm1
749; SSE41-NEXT:    pand %xmm2, %xmm1
750; SSE41-NEXT:    pxor %xmm2, %xmm2
751; SSE41-NEXT:    pshufb %xmm1, %xmm3
752; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
753; SSE41-NEXT:    pand %xmm4, %xmm1
754; SSE41-NEXT:    paddb %xmm3, %xmm1
755; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
756; SSE41-NEXT:    psrlw $8, %xmm0
757; SSE41-NEXT:    pand %xmm1, %xmm0
758; SSE41-NEXT:    psrlw $8, %xmm1
759; SSE41-NEXT:    paddw %xmm0, %xmm1
760; SSE41-NEXT:    movdqa %xmm1, %xmm0
761; SSE41-NEXT:    retq
762;
763; AVX-LABEL: testv8i16:
764; AVX:       # BB#0:
765; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
766; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
767; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
768; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
769; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
770; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
771; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
772; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
773; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
774; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
775; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
776; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
777; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
778; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
779; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
780; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
781; AVX-NEXT:    retq
782;
783; AVX512VLCD-LABEL: testv8i16:
784; AVX512VLCD:       ## BB#0:
785; AVX512VLCD-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
786; AVX512VLCD-NEXT:    vplzcntd %ymm0, %ymm0
787; AVX512VLCD-NEXT:    vpmovdw %ymm0, %xmm0
788; AVX512VLCD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
789; AVX512VLCD-NEXT:    retq
790;
791; AVX512CD-LABEL: testv8i16:
792; AVX512CD:       ## BB#0:
793; AVX512CD-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
794; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
795; AVX512CD-NEXT:    vpmovdw %zmm0, %ymm0
796; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
797; AVX512CD-NEXT:    retq
798;
799; X32-SSE-LABEL: testv8i16:
800; X32-SSE:       # BB#0:
801; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
802; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
803; X32-SSE-NEXT:    pand %xmm2, %xmm1
804; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
805; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
806; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
807; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
808; X32-SSE-NEXT:    psrlw $4, %xmm1
809; X32-SSE-NEXT:    pand %xmm2, %xmm1
810; X32-SSE-NEXT:    pxor %xmm2, %xmm2
811; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
812; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
813; X32-SSE-NEXT:    pand %xmm4, %xmm1
814; X32-SSE-NEXT:    paddb %xmm3, %xmm1
815; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
816; X32-SSE-NEXT:    psrlw $8, %xmm0
817; X32-SSE-NEXT:    pand %xmm1, %xmm0
818; X32-SSE-NEXT:    psrlw $8, %xmm1
819; X32-SSE-NEXT:    paddw %xmm0, %xmm1
820; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
821; X32-SSE-NEXT:    retl
; The trailing i1 0 is the is_zero_undef flag of llvm.ctlz (see the LLVM
; Language Reference) - false here, so a zero element has a defined result.
822  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
823  ret <8 x i16> %out
824}
825
; testv8i16u -- count leading zeros of each lane of a <8 x i16> vector via
; @llvm.ctlz.v8i16 with the second operand set to i1 -1 (true), i.e. the
; variant where the result for a zero lane does not have to be defined.
;
; The expected assembly below is autogenerated (see the NOTE on the first line
; of the file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand.
;
; What the expectations show for each run configuration:
;  - the SSE2 and SSE3 runs scalarize: every lane is extracted (pextrw / movd),
;    processed with bsrw + xorl $15, and the results are re-interleaved with
;    punpcklwd. No cmov zero-input fixup is emitted.
;  - the SSSE3, SSE4.1, AVX and 32-bit i686 runs use pshufb lookups into the
;    nibble table [4,3,2,2,1,1,1,1,0,...] to get per-byte counts, then combine
;    the two bytes of each word (pcmpeqb / psrlw $8 / pand / paddw).
;  - the two AVX512 runs zero-extend the lanes to 32 bits (vpmovzxwd), use
;    vplzcntd, truncate with vpmovdw and subtract a RIP-relative constant
;    (its value is not visible here; presumably 16 per lane to undo the
;    widening -- confirm against the generated constant pool).
826define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
827; SSE2-LABEL: testv8i16u:
828; SSE2:       # BB#0:
829; SSE2-NEXT:    pextrw $7, %xmm0, %eax
830; SSE2-NEXT:    bsrw %ax, %ax
831; SSE2-NEXT:    xorl $15, %eax
832; SSE2-NEXT:    movd %eax, %xmm1
833; SSE2-NEXT:    pextrw $3, %xmm0, %eax
834; SSE2-NEXT:    bsrw %ax, %ax
835; SSE2-NEXT:    xorl $15, %eax
836; SSE2-NEXT:    movd %eax, %xmm2
837; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
838; SSE2-NEXT:    pextrw $5, %xmm0, %eax
839; SSE2-NEXT:    bsrw %ax, %ax
840; SSE2-NEXT:    xorl $15, %eax
841; SSE2-NEXT:    movd %eax, %xmm3
842; SSE2-NEXT:    pextrw $1, %xmm0, %eax
843; SSE2-NEXT:    bsrw %ax, %ax
844; SSE2-NEXT:    xorl $15, %eax
845; SSE2-NEXT:    movd %eax, %xmm1
846; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
847; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
848; SSE2-NEXT:    pextrw $6, %xmm0, %eax
849; SSE2-NEXT:    bsrw %ax, %ax
850; SSE2-NEXT:    xorl $15, %eax
851; SSE2-NEXT:    movd %eax, %xmm2
852; SSE2-NEXT:    pextrw $2, %xmm0, %eax
853; SSE2-NEXT:    bsrw %ax, %ax
854; SSE2-NEXT:    xorl $15, %eax
855; SSE2-NEXT:    movd %eax, %xmm3
856; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
857; SSE2-NEXT:    pextrw $4, %xmm0, %eax
858; SSE2-NEXT:    bsrw %ax, %ax
859; SSE2-NEXT:    xorl $15, %eax
860; SSE2-NEXT:    movd %eax, %xmm2
861; SSE2-NEXT:    movd %xmm0, %eax
862; SSE2-NEXT:    bsrw %ax, %ax
863; SSE2-NEXT:    xorl $15, %eax
864; SSE2-NEXT:    movd %eax, %xmm0
865; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
866; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
867; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
868; SSE2-NEXT:    retq
869;
870; SSE3-LABEL: testv8i16u:
871; SSE3:       # BB#0:
872; SSE3-NEXT:    pextrw $7, %xmm0, %eax
873; SSE3-NEXT:    bsrw %ax, %ax
874; SSE3-NEXT:    xorl $15, %eax
875; SSE3-NEXT:    movd %eax, %xmm1
876; SSE3-NEXT:    pextrw $3, %xmm0, %eax
877; SSE3-NEXT:    bsrw %ax, %ax
878; SSE3-NEXT:    xorl $15, %eax
879; SSE3-NEXT:    movd %eax, %xmm2
880; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
881; SSE3-NEXT:    pextrw $5, %xmm0, %eax
882; SSE3-NEXT:    bsrw %ax, %ax
883; SSE3-NEXT:    xorl $15, %eax
884; SSE3-NEXT:    movd %eax, %xmm3
885; SSE3-NEXT:    pextrw $1, %xmm0, %eax
886; SSE3-NEXT:    bsrw %ax, %ax
887; SSE3-NEXT:    xorl $15, %eax
888; SSE3-NEXT:    movd %eax, %xmm1
889; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
890; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
891; SSE3-NEXT:    pextrw $6, %xmm0, %eax
892; SSE3-NEXT:    bsrw %ax, %ax
893; SSE3-NEXT:    xorl $15, %eax
894; SSE3-NEXT:    movd %eax, %xmm2
895; SSE3-NEXT:    pextrw $2, %xmm0, %eax
896; SSE3-NEXT:    bsrw %ax, %ax
897; SSE3-NEXT:    xorl $15, %eax
898; SSE3-NEXT:    movd %eax, %xmm3
899; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
900; SSE3-NEXT:    pextrw $4, %xmm0, %eax
901; SSE3-NEXT:    bsrw %ax, %ax
902; SSE3-NEXT:    xorl $15, %eax
903; SSE3-NEXT:    movd %eax, %xmm2
904; SSE3-NEXT:    movd %xmm0, %eax
905; SSE3-NEXT:    bsrw %ax, %ax
906; SSE3-NEXT:    xorl $15, %eax
907; SSE3-NEXT:    movd %eax, %xmm0
908; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
909; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
910; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
911; SSE3-NEXT:    retq
912;
913; SSSE3-LABEL: testv8i16u:
914; SSSE3:       # BB#0:
915; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
916; SSSE3-NEXT:    movdqa %xmm0, %xmm1
917; SSSE3-NEXT:    pand %xmm2, %xmm1
918; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
919; SSSE3-NEXT:    movdqa %xmm3, %xmm4
920; SSSE3-NEXT:    pshufb %xmm1, %xmm4
921; SSSE3-NEXT:    movdqa %xmm0, %xmm1
922; SSSE3-NEXT:    psrlw $4, %xmm1
923; SSSE3-NEXT:    pand %xmm2, %xmm1
924; SSSE3-NEXT:    pxor %xmm2, %xmm2
925; SSSE3-NEXT:    pshufb %xmm1, %xmm3
926; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm1
927; SSSE3-NEXT:    pand %xmm4, %xmm1
928; SSSE3-NEXT:    paddb %xmm3, %xmm1
929; SSSE3-NEXT:    pcmpeqb %xmm2, %xmm0
930; SSSE3-NEXT:    psrlw $8, %xmm0
931; SSSE3-NEXT:    pand %xmm1, %xmm0
932; SSSE3-NEXT:    psrlw $8, %xmm1
933; SSSE3-NEXT:    paddw %xmm0, %xmm1
934; SSSE3-NEXT:    movdqa %xmm1, %xmm0
935; SSSE3-NEXT:    retq
936;
937; SSE41-LABEL: testv8i16u:
938; SSE41:       # BB#0:
939; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
940; SSE41-NEXT:    movdqa %xmm0, %xmm1
941; SSE41-NEXT:    pand %xmm2, %xmm1
942; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
943; SSE41-NEXT:    movdqa %xmm3, %xmm4
944; SSE41-NEXT:    pshufb %xmm1, %xmm4
945; SSE41-NEXT:    movdqa %xmm0, %xmm1
946; SSE41-NEXT:    psrlw $4, %xmm1
947; SSE41-NEXT:    pand %xmm2, %xmm1
948; SSE41-NEXT:    pxor %xmm2, %xmm2
949; SSE41-NEXT:    pshufb %xmm1, %xmm3
950; SSE41-NEXT:    pcmpeqb %xmm2, %xmm1
951; SSE41-NEXT:    pand %xmm4, %xmm1
952; SSE41-NEXT:    paddb %xmm3, %xmm1
953; SSE41-NEXT:    pcmpeqb %xmm2, %xmm0
954; SSE41-NEXT:    psrlw $8, %xmm0
955; SSE41-NEXT:    pand %xmm1, %xmm0
956; SSE41-NEXT:    psrlw $8, %xmm1
957; SSE41-NEXT:    paddw %xmm0, %xmm1
958; SSE41-NEXT:    movdqa %xmm1, %xmm0
959; SSE41-NEXT:    retq
960;
961; AVX-LABEL: testv8i16u:
962; AVX:       # BB#0:
963; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
964; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
965; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
966; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
967; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
968; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
969; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
970; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
971; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
972; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
973; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
974; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm0
975; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
976; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
977; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
978; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
979; AVX-NEXT:    retq
980;
981; AVX512VLCD-LABEL: testv8i16u:
982; AVX512VLCD:       ## BB#0:
983; AVX512VLCD-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
984; AVX512VLCD-NEXT:    vplzcntd %ymm0, %ymm0
985; AVX512VLCD-NEXT:    vpmovdw %ymm0, %xmm0
986; AVX512VLCD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
987; AVX512VLCD-NEXT:    retq
988;
989; AVX512CD-LABEL: testv8i16u:
990; AVX512CD:       ## BB#0:
991; AVX512CD-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
992; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
993; AVX512CD-NEXT:    vpmovdw %zmm0, %ymm0
994; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
995; AVX512CD-NEXT:    retq
996;
997; X32-SSE-LABEL: testv8i16u:
998; X32-SSE:       # BB#0:
999; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1000; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1001; X32-SSE-NEXT:    pand %xmm2, %xmm1
1002; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1003; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
1004; X32-SSE-NEXT:    pshufb %xmm1, %xmm4
1005; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1006; X32-SSE-NEXT:    psrlw $4, %xmm1
1007; X32-SSE-NEXT:    pand %xmm2, %xmm1
1008; X32-SSE-NEXT:    pxor %xmm2, %xmm2
1009; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
1010; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm1
1011; X32-SSE-NEXT:    pand %xmm4, %xmm1
1012; X32-SSE-NEXT:    paddb %xmm3, %xmm1
1013; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
1014; X32-SSE-NEXT:    psrlw $8, %xmm0
1015; X32-SSE-NEXT:    pand %xmm1, %xmm0
1016; X32-SSE-NEXT:    psrlw $8, %xmm1
1017; X32-SSE-NEXT:    paddw %xmm0, %xmm1
1018; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1019; X32-SSE-NEXT:    retl
1020  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
1021  ret <8 x i16> %out
1022}
1023
; testv16i8 -- count leading zeros of each lane of a <16 x i8> vector via
; @llvm.ctlz.v16i8 with the second operand set to i1 0 (false), i.e. the
; variant that must produce a defined result for a zero lane.
;
; The expected assembly below is autogenerated (see the NOTE on the first line
; of the file); regenerate it with utils/update_llc_test_checks.py rather than
; editing it by hand.
;
; What the expectations show for each run configuration:
;  - the SSE2 and SSE3 runs spill the vector to the stack and handle every byte
;    with movzbl + bsrl. A cmovel substitutes the constant 15 when bsrl reports
;    a zero source, so the following xorl $7 yields 8 for a zero byte; the
;    scalar results are merged back with a tree of punpcklbw.
;  - the SSSE3, SSE4.1, AVX and 32-bit i686 runs use two pshufb lookups into
;    the nibble table [4,3,2,2,1,1,1,1,0,...] (low and high nibble) and add the
;    low-nibble count only where the high nibble compares equal to zero.
;  - both AVX512 runs share one expectation: zero-extend to 32-bit lanes
;    (vpmovzxbd), vplzcntd, truncate with vpmovdb and subtract a RIP-relative
;    constant (its value is not visible here; presumably 24 per lane to undo
;    the widening -- confirm against the generated constant pool).
1024define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
1025; SSE2-LABEL: testv16i8:
1026; SSE2:       # BB#0:
1027; SSE2-NEXT:    pushq %rbp
1028; SSE2-NEXT:    pushq %rbx
1029; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1030; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1031; SSE2-NEXT:    bsrl %eax, %ecx
1032; SSE2-NEXT:    movl $15, %eax
1033; SSE2-NEXT:    cmovel %eax, %ecx
1034; SSE2-NEXT:    xorl $7, %ecx
1035; SSE2-NEXT:    movd %ecx, %xmm0
1036; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
1037; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
1038; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
1039; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
1040; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
1041; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
1042; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
1043; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1044; SSE2-NEXT:    bsrl %ecx, %ecx
1045; SSE2-NEXT:    cmovel %eax, %ecx
1046; SSE2-NEXT:    xorl $7, %ecx
1047; SSE2-NEXT:    movd %ecx, %xmm1
1048; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1049; SSE2-NEXT:    bsrl %edx, %ecx
1050; SSE2-NEXT:    cmovel %eax, %ecx
1051; SSE2-NEXT:    xorl $7, %ecx
1052; SSE2-NEXT:    movd %ecx, %xmm2
1053; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
1054; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1055; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
1056; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
1057; SSE2-NEXT:    bsrl %ebp, %ebp
1058; SSE2-NEXT:    cmovel %eax, %ebp
1059; SSE2-NEXT:    xorl $7, %ebp
1060; SSE2-NEXT:    movd %ebp, %xmm0
1061; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1062; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1063; SSE2-NEXT:    bsrl %edi, %edi
1064; SSE2-NEXT:    cmovel %eax, %edi
1065; SSE2-NEXT:    xorl $7, %edi
1066; SSE2-NEXT:    movd %edi, %xmm1
1067; SSE2-NEXT:    bsrl %ecx, %ecx
1068; SSE2-NEXT:    cmovel %eax, %ecx
1069; SSE2-NEXT:    xorl $7, %ecx
1070; SSE2-NEXT:    movd %ecx, %xmm2
1071; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1072; SSE2-NEXT:    bsrl %esi, %ecx
1073; SSE2-NEXT:    cmovel %eax, %ecx
1074; SSE2-NEXT:    xorl $7, %ecx
1075; SSE2-NEXT:    movd %ecx, %xmm3
1076; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
1077; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1078; SSE2-NEXT:    bsrl %ecx, %ecx
1079; SSE2-NEXT:    cmovel %eax, %ecx
1080; SSE2-NEXT:    xorl $7, %ecx
1081; SSE2-NEXT:    movd %ecx, %xmm1
1082; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1083; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1084; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1085; SSE2-NEXT:    bsrl %ebx, %ecx
1086; SSE2-NEXT:    cmovel %eax, %ecx
1087; SSE2-NEXT:    xorl $7, %ecx
1088; SSE2-NEXT:    movd %ecx, %xmm0
1089; SSE2-NEXT:    bsrl %edx, %ecx
1090; SSE2-NEXT:    cmovel %eax, %ecx
1091; SSE2-NEXT:    xorl $7, %ecx
1092; SSE2-NEXT:    movd %ecx, %xmm3
1093; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1094; SSE2-NEXT:    bsrl %r11d, %ecx
1095; SSE2-NEXT:    cmovel %eax, %ecx
1096; SSE2-NEXT:    xorl $7, %ecx
1097; SSE2-NEXT:    movd %ecx, %xmm0
1098; SSE2-NEXT:    bsrl %esi, %ecx
1099; SSE2-NEXT:    cmovel %eax, %ecx
1100; SSE2-NEXT:    xorl $7, %ecx
1101; SSE2-NEXT:    movd %ecx, %xmm2
1102; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1103; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
1104; SSE2-NEXT:    bsrl %r9d, %ecx
1105; SSE2-NEXT:    cmovel %eax, %ecx
1106; SSE2-NEXT:    xorl $7, %ecx
1107; SSE2-NEXT:    movd %ecx, %xmm0
1108; SSE2-NEXT:    bsrl %r10d, %ecx
1109; SSE2-NEXT:    cmovel %eax, %ecx
1110; SSE2-NEXT:    xorl $7, %ecx
1111; SSE2-NEXT:    movd %ecx, %xmm3
1112; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1113; SSE2-NEXT:    bsrl %r8d, %ecx
1114; SSE2-NEXT:    cmovel %eax, %ecx
1115; SSE2-NEXT:    xorl $7, %ecx
1116; SSE2-NEXT:    movd %ecx, %xmm4
1117; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1118; SSE2-NEXT:    bsrl %ecx, %ecx
1119; SSE2-NEXT:    cmovel %eax, %ecx
1120; SSE2-NEXT:    xorl $7, %ecx
1121; SSE2-NEXT:    movd %ecx, %xmm0
1122; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1123; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1124; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1125; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1126; SSE2-NEXT:    popq %rbx
1127; SSE2-NEXT:    popq %rbp
1128; SSE2-NEXT:    retq
1129;
1130; SSE3-LABEL: testv16i8:
1131; SSE3:       # BB#0:
1132; SSE3-NEXT:    pushq %rbp
1133; SSE3-NEXT:    pushq %rbx
1134; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1135; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1136; SSE3-NEXT:    bsrl %eax, %ecx
1137; SSE3-NEXT:    movl $15, %eax
1138; SSE3-NEXT:    cmovel %eax, %ecx
1139; SSE3-NEXT:    xorl $7, %ecx
1140; SSE3-NEXT:    movd %ecx, %xmm0
1141; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
1142; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
1143; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
1144; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
1145; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
1146; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
1147; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
1148; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1149; SSE3-NEXT:    bsrl %ecx, %ecx
1150; SSE3-NEXT:    cmovel %eax, %ecx
1151; SSE3-NEXT:    xorl $7, %ecx
1152; SSE3-NEXT:    movd %ecx, %xmm1
1153; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1154; SSE3-NEXT:    bsrl %edx, %ecx
1155; SSE3-NEXT:    cmovel %eax, %ecx
1156; SSE3-NEXT:    xorl $7, %ecx
1157; SSE3-NEXT:    movd %ecx, %xmm2
1158; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
1159; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1160; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
1161; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
1162; SSE3-NEXT:    bsrl %ebp, %ebp
1163; SSE3-NEXT:    cmovel %eax, %ebp
1164; SSE3-NEXT:    xorl $7, %ebp
1165; SSE3-NEXT:    movd %ebp, %xmm0
1166; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1167; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1168; SSE3-NEXT:    bsrl %edi, %edi
1169; SSE3-NEXT:    cmovel %eax, %edi
1170; SSE3-NEXT:    xorl $7, %edi
1171; SSE3-NEXT:    movd %edi, %xmm1
1172; SSE3-NEXT:    bsrl %ecx, %ecx
1173; SSE3-NEXT:    cmovel %eax, %ecx
1174; SSE3-NEXT:    xorl $7, %ecx
1175; SSE3-NEXT:    movd %ecx, %xmm2
1176; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1177; SSE3-NEXT:    bsrl %esi, %ecx
1178; SSE3-NEXT:    cmovel %eax, %ecx
1179; SSE3-NEXT:    xorl $7, %ecx
1180; SSE3-NEXT:    movd %ecx, %xmm3
1181; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
1182; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1183; SSE3-NEXT:    bsrl %ecx, %ecx
1184; SSE3-NEXT:    cmovel %eax, %ecx
1185; SSE3-NEXT:    xorl $7, %ecx
1186; SSE3-NEXT:    movd %ecx, %xmm1
1187; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1188; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1189; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1190; SSE3-NEXT:    bsrl %ebx, %ecx
1191; SSE3-NEXT:    cmovel %eax, %ecx
1192; SSE3-NEXT:    xorl $7, %ecx
1193; SSE3-NEXT:    movd %ecx, %xmm0
1194; SSE3-NEXT:    bsrl %edx, %ecx
1195; SSE3-NEXT:    cmovel %eax, %ecx
1196; SSE3-NEXT:    xorl $7, %ecx
1197; SSE3-NEXT:    movd %ecx, %xmm3
1198; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1199; SSE3-NEXT:    bsrl %r11d, %ecx
1200; SSE3-NEXT:    cmovel %eax, %ecx
1201; SSE3-NEXT:    xorl $7, %ecx
1202; SSE3-NEXT:    movd %ecx, %xmm0
1203; SSE3-NEXT:    bsrl %esi, %ecx
1204; SSE3-NEXT:    cmovel %eax, %ecx
1205; SSE3-NEXT:    xorl $7, %ecx
1206; SSE3-NEXT:    movd %ecx, %xmm2
1207; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1208; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
1209; SSE3-NEXT:    bsrl %r9d, %ecx
1210; SSE3-NEXT:    cmovel %eax, %ecx
1211; SSE3-NEXT:    xorl $7, %ecx
1212; SSE3-NEXT:    movd %ecx, %xmm0
1213; SSE3-NEXT:    bsrl %r10d, %ecx
1214; SSE3-NEXT:    cmovel %eax, %ecx
1215; SSE3-NEXT:    xorl $7, %ecx
1216; SSE3-NEXT:    movd %ecx, %xmm3
1217; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1218; SSE3-NEXT:    bsrl %r8d, %ecx
1219; SSE3-NEXT:    cmovel %eax, %ecx
1220; SSE3-NEXT:    xorl $7, %ecx
1221; SSE3-NEXT:    movd %ecx, %xmm4
1222; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1223; SSE3-NEXT:    bsrl %ecx, %ecx
1224; SSE3-NEXT:    cmovel %eax, %ecx
1225; SSE3-NEXT:    xorl $7, %ecx
1226; SSE3-NEXT:    movd %ecx, %xmm0
1227; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1228; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1229; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1230; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1231; SSE3-NEXT:    popq %rbx
1232; SSE3-NEXT:    popq %rbp
1233; SSE3-NEXT:    retq
1234;
1235; SSSE3-LABEL: testv16i8:
1236; SSSE3:       # BB#0:
1237; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1238; SSSE3-NEXT:    movdqa %xmm0, %xmm3
1239; SSSE3-NEXT:    pand %xmm2, %xmm3
1240; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1241; SSSE3-NEXT:    movdqa %xmm1, %xmm4
1242; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1243; SSSE3-NEXT:    psrlw $4, %xmm0
1244; SSSE3-NEXT:    pand %xmm2, %xmm0
1245; SSSE3-NEXT:    pxor %xmm2, %xmm2
1246; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
1247; SSSE3-NEXT:    pand %xmm4, %xmm2
1248; SSSE3-NEXT:    pshufb %xmm0, %xmm1
1249; SSSE3-NEXT:    paddb %xmm2, %xmm1
1250; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1251; SSSE3-NEXT:    retq
1252;
1253; SSE41-LABEL: testv16i8:
1254; SSE41:       # BB#0:
1255; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1256; SSE41-NEXT:    movdqa %xmm0, %xmm3
1257; SSE41-NEXT:    pand %xmm2, %xmm3
1258; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1259; SSE41-NEXT:    movdqa %xmm1, %xmm4
1260; SSE41-NEXT:    pshufb %xmm3, %xmm4
1261; SSE41-NEXT:    psrlw $4, %xmm0
1262; SSE41-NEXT:    pand %xmm2, %xmm0
1263; SSE41-NEXT:    pxor %xmm2, %xmm2
1264; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
1265; SSE41-NEXT:    pand %xmm4, %xmm2
1266; SSE41-NEXT:    pshufb %xmm0, %xmm1
1267; SSE41-NEXT:    paddb %xmm2, %xmm1
1268; SSE41-NEXT:    movdqa %xmm1, %xmm0
1269; SSE41-NEXT:    retq
1270;
1271; AVX-LABEL: testv16i8:
1272; AVX:       # BB#0:
1273; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1274; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
1275; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1276; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1277; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1278; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
1279; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1280; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
1281; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
1282; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1283; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
1284; AVX-NEXT:    retq
1285;
1286; AVX512-LABEL: testv16i8:
1287; AVX512:       ## BB#0:
1288; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1289; AVX512-NEXT:    vplzcntd %zmm0, %zmm0
1290; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1291; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1292; AVX512-NEXT:    retq
1293;
1294; X32-SSE-LABEL: testv16i8:
1295; X32-SSE:       # BB#0:
1296; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1297; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
1298; X32-SSE-NEXT:    pand %xmm2, %xmm3
1299; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1300; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
1301; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
1302; X32-SSE-NEXT:    psrlw $4, %xmm0
1303; X32-SSE-NEXT:    pand %xmm2, %xmm0
1304; X32-SSE-NEXT:    pxor %xmm2, %xmm2
1305; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
1306; X32-SSE-NEXT:    pand %xmm4, %xmm2
1307; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
1308; X32-SSE-NEXT:    paddb %xmm2, %xmm1
1309; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1310; X32-SSE-NEXT:    retl
1311  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
1312  ret <16 x i8> %out
1313}
1314
1315define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
1316; SSE2-LABEL: testv16i8u:
1317; SSE2:       # BB#0:
1318; SSE2-NEXT:    pushq %rbx
1319; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1320; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1321; SSE2-NEXT:    bsrl %eax, %eax
1322; SSE2-NEXT:    xorl $7, %eax
1323; SSE2-NEXT:    movd %eax, %xmm0
1324; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
1325; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
1326; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
1327; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1328; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
1329; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1330; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
1331; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
1332; SSE2-NEXT:    bsrl %esi, %esi
1333; SSE2-NEXT:    xorl $7, %esi
1334; SSE2-NEXT:    movd %esi, %xmm1
1335; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1336; SSE2-NEXT:    bsrl %eax, %eax
1337; SSE2-NEXT:    xorl $7, %eax
1338; SSE2-NEXT:    movd %eax, %xmm0
1339; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1340; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
1341; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
1342; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
1343; SSE2-NEXT:    bsrl %ebx, %ebx
1344; SSE2-NEXT:    xorl $7, %ebx
1345; SSE2-NEXT:    movd %ebx, %xmm2
1346; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1347; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1348; SSE2-NEXT:    bsrl %edx, %edx
1349; SSE2-NEXT:    xorl $7, %edx
1350; SSE2-NEXT:    movd %edx, %xmm0
1351; SSE2-NEXT:    bsrl %esi, %edx
1352; SSE2-NEXT:    xorl $7, %edx
1353; SSE2-NEXT:    movd %edx, %xmm3
1354; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1355; SSE2-NEXT:    bsrl %ecx, %ecx
1356; SSE2-NEXT:    xorl $7, %ecx
1357; SSE2-NEXT:    movd %ecx, %xmm0
1358; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1359; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
1360; SSE2-NEXT:    bsrl %edx, %edx
1361; SSE2-NEXT:    xorl $7, %edx
1362; SSE2-NEXT:    movd %edx, %xmm1
1363; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1364; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1365; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1366; SSE2-NEXT:    bsrl %edi, %edx
1367; SSE2-NEXT:    xorl $7, %edx
1368; SSE2-NEXT:    movd %edx, %xmm0
1369; SSE2-NEXT:    bsrl %eax, %eax
1370; SSE2-NEXT:    xorl $7, %eax
1371; SSE2-NEXT:    movd %eax, %xmm2
1372; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1373; SSE2-NEXT:    bsrl %r10d, %eax
1374; SSE2-NEXT:    xorl $7, %eax
1375; SSE2-NEXT:    movd %eax, %xmm0
1376; SSE2-NEXT:    bsrl %ecx, %eax
1377; SSE2-NEXT:    xorl $7, %eax
1378; SSE2-NEXT:    movd %eax, %xmm3
1379; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1380; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1381; SSE2-NEXT:    bsrl %r9d, %eax
1382; SSE2-NEXT:    xorl $7, %eax
1383; SSE2-NEXT:    movd %eax, %xmm0
1384; SSE2-NEXT:    bsrl %r11d, %eax
1385; SSE2-NEXT:    xorl $7, %eax
1386; SSE2-NEXT:    movd %eax, %xmm2
1387; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1388; SSE2-NEXT:    bsrl %r8d, %eax
1389; SSE2-NEXT:    xorl $7, %eax
1390; SSE2-NEXT:    movd %eax, %xmm4
1391; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1392; SSE2-NEXT:    bsrl %eax, %eax
1393; SSE2-NEXT:    xorl $7, %eax
1394; SSE2-NEXT:    movd %eax, %xmm0
1395; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1396; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1397; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1398; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1399; SSE2-NEXT:    popq %rbx
1400; SSE2-NEXT:    retq
1401;
1402; SSE3-LABEL: testv16i8u:
1403; SSE3:       # BB#0:
1404; SSE3-NEXT:    pushq %rbx
1405; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1406; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1407; SSE3-NEXT:    bsrl %eax, %eax
1408; SSE3-NEXT:    xorl $7, %eax
1409; SSE3-NEXT:    movd %eax, %xmm0
1410; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
1411; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
1412; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
1413; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1414; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
1415; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1416; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
1417; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
1418; SSE3-NEXT:    bsrl %esi, %esi
1419; SSE3-NEXT:    xorl $7, %esi
1420; SSE3-NEXT:    movd %esi, %xmm1
1421; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1422; SSE3-NEXT:    bsrl %eax, %eax
1423; SSE3-NEXT:    xorl $7, %eax
1424; SSE3-NEXT:    movd %eax, %xmm0
1425; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1426; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
1427; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
1428; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
1429; SSE3-NEXT:    bsrl %ebx, %ebx
1430; SSE3-NEXT:    xorl $7, %ebx
1431; SSE3-NEXT:    movd %ebx, %xmm2
1432; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1433; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1434; SSE3-NEXT:    bsrl %edx, %edx
1435; SSE3-NEXT:    xorl $7, %edx
1436; SSE3-NEXT:    movd %edx, %xmm0
1437; SSE3-NEXT:    bsrl %esi, %edx
1438; SSE3-NEXT:    xorl $7, %edx
1439; SSE3-NEXT:    movd %edx, %xmm3
1440; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1441; SSE3-NEXT:    bsrl %ecx, %ecx
1442; SSE3-NEXT:    xorl $7, %ecx
1443; SSE3-NEXT:    movd %ecx, %xmm0
1444; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
1445; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
1446; SSE3-NEXT:    bsrl %edx, %edx
1447; SSE3-NEXT:    xorl $7, %edx
1448; SSE3-NEXT:    movd %edx, %xmm1
1449; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1450; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1451; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1452; SSE3-NEXT:    bsrl %edi, %edx
1453; SSE3-NEXT:    xorl $7, %edx
1454; SSE3-NEXT:    movd %edx, %xmm0
1455; SSE3-NEXT:    bsrl %eax, %eax
1456; SSE3-NEXT:    xorl $7, %eax
1457; SSE3-NEXT:    movd %eax, %xmm2
1458; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1459; SSE3-NEXT:    bsrl %r10d, %eax
1460; SSE3-NEXT:    xorl $7, %eax
1461; SSE3-NEXT:    movd %eax, %xmm0
1462; SSE3-NEXT:    bsrl %ecx, %eax
1463; SSE3-NEXT:    xorl $7, %eax
1464; SSE3-NEXT:    movd %eax, %xmm3
1465; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1466; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1467; SSE3-NEXT:    bsrl %r9d, %eax
1468; SSE3-NEXT:    xorl $7, %eax
1469; SSE3-NEXT:    movd %eax, %xmm0
1470; SSE3-NEXT:    bsrl %r11d, %eax
1471; SSE3-NEXT:    xorl $7, %eax
1472; SSE3-NEXT:    movd %eax, %xmm2
1473; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1474; SSE3-NEXT:    bsrl %r8d, %eax
1475; SSE3-NEXT:    xorl $7, %eax
1476; SSE3-NEXT:    movd %eax, %xmm4
1477; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
1478; SSE3-NEXT:    bsrl %eax, %eax
1479; SSE3-NEXT:    xorl $7, %eax
1480; SSE3-NEXT:    movd %eax, %xmm0
1481; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1482; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1483; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1484; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1485; SSE3-NEXT:    popq %rbx
1486; SSE3-NEXT:    retq
1487;
1488; SSSE3-LABEL: testv16i8u:
1489; SSSE3:       # BB#0:
1490; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1491; SSSE3-NEXT:    movdqa %xmm0, %xmm3
1492; SSSE3-NEXT:    pand %xmm2, %xmm3
1493; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1494; SSSE3-NEXT:    movdqa %xmm1, %xmm4
1495; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1496; SSSE3-NEXT:    psrlw $4, %xmm0
1497; SSSE3-NEXT:    pand %xmm2, %xmm0
1498; SSSE3-NEXT:    pxor %xmm2, %xmm2
1499; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
1500; SSSE3-NEXT:    pand %xmm4, %xmm2
1501; SSSE3-NEXT:    pshufb %xmm0, %xmm1
1502; SSSE3-NEXT:    paddb %xmm2, %xmm1
1503; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1504; SSSE3-NEXT:    retq
1505;
1506; SSE41-LABEL: testv16i8u:
1507; SSE41:       # BB#0:
1508; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1509; SSE41-NEXT:    movdqa %xmm0, %xmm3
1510; SSE41-NEXT:    pand %xmm2, %xmm3
1511; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1512; SSE41-NEXT:    movdqa %xmm1, %xmm4
1513; SSE41-NEXT:    pshufb %xmm3, %xmm4
1514; SSE41-NEXT:    psrlw $4, %xmm0
1515; SSE41-NEXT:    pand %xmm2, %xmm0
1516; SSE41-NEXT:    pxor %xmm2, %xmm2
1517; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
1518; SSE41-NEXT:    pand %xmm4, %xmm2
1519; SSE41-NEXT:    pshufb %xmm0, %xmm1
1520; SSE41-NEXT:    paddb %xmm2, %xmm1
1521; SSE41-NEXT:    movdqa %xmm1, %xmm0
1522; SSE41-NEXT:    retq
1523;
1524; AVX-LABEL: testv16i8u:
1525; AVX:       # BB#0:
1526; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1527; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
1528; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1529; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1530; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1531; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
1532; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1533; AVX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1
1534; AVX-NEXT:    vpand %xmm1, %xmm2, %xmm1
1535; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
1536; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
1537; AVX-NEXT:    retq
1538;
1539; AVX512-LABEL: testv16i8u:
1540; AVX512:       ## BB#0:
1541; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1542; AVX512-NEXT:    vplzcntd %zmm0, %zmm0
1543; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1544; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1545; AVX512-NEXT:    retq
1546;
1547; X32-SSE-LABEL: testv16i8u:
1548; X32-SSE:       # BB#0:
1549; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1550; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
1551; X32-SSE-NEXT:    pand %xmm2, %xmm3
1552; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1553; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
1554; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
1555; X32-SSE-NEXT:    psrlw $4, %xmm0
1556; X32-SSE-NEXT:    pand %xmm2, %xmm0
1557; X32-SSE-NEXT:    pxor %xmm2, %xmm2
1558; X32-SSE-NEXT:    pcmpeqb %xmm0, %xmm2
1559; X32-SSE-NEXT:    pand %xmm4, %xmm2
1560; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
1561; X32-SSE-NEXT:    paddb %xmm2, %xmm1
1562; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1563; X32-SSE-NEXT:    retl
1564  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
1565  ret <16 x i8> %out
1566}
1567
; Constant-folding test: llvm.ctlz.v2i64 applied to the constant vector
; <256, -1> with the second (i1) operand set to 0. Over 64 bits,
; ctlz(256) = 55 and ctlz(-1) = 0. Every checked sequence below consists
; solely of moving the immediate 55 into xmm0 and returning.
1568define <2 x i64> @foldv2i64() nounwind {
1569; SSE-LABEL: foldv2i64:
1570; SSE:       # BB#0:
1571; SSE-NEXT:    movl $55, %eax
1572; SSE-NEXT:    movd %rax, %xmm0
1573; SSE-NEXT:    retq
1574;
1575; AVX-LABEL: foldv2i64:
1576; AVX:       # BB#0:
1577; AVX-NEXT:    movl $55, %eax
1578; AVX-NEXT:    vmovq %rax, %xmm0
1579; AVX-NEXT:    retq
1580;
1581; AVX512-LABEL: foldv2i64:
1582; AVX512:       ## BB#0:
1583; AVX512-NEXT:    movl $55, %eax
1584; AVX512-NEXT:    vmovq %rax, %xmm0
1585; AVX512-NEXT:    retq
1586;
1587; X32-SSE-LABEL: foldv2i64:
1588; X32-SSE:       # BB#0:
1589; X32-SSE-NEXT:    movl $55, %eax
1590; X32-SSE-NEXT:    movd %eax, %xmm0
1591; X32-SSE-NEXT:    retl
1592  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
1593  ret <2 x i64> %out
1594}
1595
; Same constant input as foldv2i64 (<256, -1>), but with the second (i1)
; operand of llvm.ctlz set to -1 (true). Neither lane of the input is zero,
; and the checked output is identical to foldv2i64: the immediate 55 is
; moved into xmm0 and the function returns.
1596define <2 x i64> @foldv2i64u() nounwind {
1597; SSE-LABEL: foldv2i64u:
1598; SSE:       # BB#0:
1599; SSE-NEXT:    movl $55, %eax
1600; SSE-NEXT:    movd %rax, %xmm0
1601; SSE-NEXT:    retq
1602;
1603; AVX-LABEL: foldv2i64u:
1604; AVX:       # BB#0:
1605; AVX-NEXT:    movl $55, %eax
1606; AVX-NEXT:    vmovq %rax, %xmm0
1607; AVX-NEXT:    retq
1608;
1609; AVX512-LABEL: foldv2i64u:
1610; AVX512:       ## BB#0:
1611; AVX512-NEXT:    movl $55, %eax
1612; AVX512-NEXT:    vmovq %rax, %xmm0
1613; AVX512-NEXT:    retq
1614;
1615; X32-SSE-LABEL: foldv2i64u:
1616; X32-SSE:       # BB#0:
1617; X32-SSE-NEXT:    movl $55, %eax
1618; X32-SSE-NEXT:    movd %eax, %xmm0
1619; X32-SSE-NEXT:    retl
1620  %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
1621  ret <2 x i64> %out
1622}
1623
; Constant-folding test: llvm.ctlz.v4i32 applied to <256, -1, 0, 255> with
; the second (i1) operand set to 0. The checks expect the result to be
; materialized as the constant vector [23,0,32,24]; the zero lane (index 2)
; yields the full bit width, 32. Only the move instruction differs between
; run configurations (movaps / vmovaps / vmovdqa32).
1624define <4 x i32> @foldv4i32() nounwind {
1625; SSE-LABEL: foldv4i32:
1626; SSE:       # BB#0:
1627; SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
1628; SSE-NEXT:    retq
1629;
1630; AVX-LABEL: foldv4i32:
1631; AVX:       # BB#0:
1632; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
1633; AVX-NEXT:    retq
1634;
1635; AVX512VLCD-LABEL: foldv4i32:
1636; AVX512VLCD:       ## BB#0:
1637; AVX512VLCD-NEXT:    vmovdqa32 {{.*#+}} xmm0 = [23,0,32,24]
1638; AVX512VLCD-NEXT:    retq
1639;
1640; AVX512CD-LABEL: foldv4i32:
1641; AVX512CD:       ## BB#0:
1642; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
1643; AVX512CD-NEXT:    retq
1644;
1645; X32-SSE-LABEL: foldv4i32:
1646; X32-SSE:       # BB#0:
1647; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
1648; X32-SSE-NEXT:    retl
1649  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
1650  ret <4 x i32> %out
1651}
1652
; Same constant input as foldv4i32 (<256, -1, 0, 255>), but with the second
; (i1) operand of llvm.ctlz set to -1 (true). Lane 2 of the input is zero;
; the checks nevertheless pin 32 for that lane, i.e. the same constant
; vector [23,0,32,24] that foldv4i32 expects.
; NOTE(review): per the LLVM LangRef the result for a zero lane is not
; defined when the flag is true, so the 32 reflects what the folder
; currently emits rather than a guaranteed value - confirm if this changes.
1653define <4 x i32> @foldv4i32u() nounwind {
1654; SSE-LABEL: foldv4i32u:
1655; SSE:       # BB#0:
1656; SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
1657; SSE-NEXT:    retq
1658;
1659; AVX-LABEL: foldv4i32u:
1660; AVX:       # BB#0:
1661; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
1662; AVX-NEXT:    retq
1663;
1664; AVX512VLCD-LABEL: foldv4i32u:
1665; AVX512VLCD:       ## BB#0:
1666; AVX512VLCD-NEXT:    vmovdqa32 {{.*#+}} xmm0 = [23,0,32,24]
1667; AVX512VLCD-NEXT:    retq
1668;
1669; AVX512CD-LABEL: foldv4i32u:
1670; AVX512CD:       ## BB#0:
1671; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [23,0,32,24]
1672; AVX512CD-NEXT:    retq
1673;
1674; X32-SSE-LABEL: foldv4i32u:
1675; X32-SSE:       # BB#0:
1676; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [23,0,32,24]
1677; X32-SSE-NEXT:    retl
1678  %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
1679  ret <4 x i32> %out
1680}
1681
; Constant-folding test: llvm.ctlz.v8i16 applied to
; <256, -1, 0, 255, -65536, 7, 24, 88> with the second (i1) operand set
; to 0. The checks expect the constant vector [7,0,16,8,16,13,11,9].
; NOTE(review): the literal i16 -65536 (lane 4) does not fit in 16 bits; the
; expected 16 for that lane matches it being read as 0 - presumably the IR
; parser truncates out-of-range literals; confirm with the llc in use.
1682define <8 x i16> @foldv8i16() nounwind {
1683; SSE-LABEL: foldv8i16:
1684; SSE:       # BB#0:
1685; SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1686; SSE-NEXT:    retq
1687;
1688; AVX-LABEL: foldv8i16:
1689; AVX:       # BB#0:
1690; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1691; AVX-NEXT:    retq
1692;
1693; AVX512VLCD-LABEL: foldv8i16:
1694; AVX512VLCD:       ## BB#0:
1695; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1696; AVX512VLCD-NEXT:    retq
1697;
1698; AVX512CD-LABEL: foldv8i16:
1699; AVX512CD:       ## BB#0:
1700; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1701; AVX512CD-NEXT:    retq
1702;
1703; X32-SSE-LABEL: foldv8i16:
1704; X32-SSE:       # BB#0:
1705; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1706; X32-SSE-NEXT:    retl
1707  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
1708  ret <8 x i16> %out
1709}
1710
; Same constant input as foldv8i16, but with the second (i1) operand of
; llvm.ctlz set to -1 (true). The checks expect the same constant vector
; [7,0,16,8,16,13,11,9], including 16 for lane 2 (literal 0) and lane 4
; (literal i16 -65536).
; NOTE(review): i16 -65536 does not fit in 16 bits and appears to be read
; as 0; and per the LLVM LangRef the result for a zero lane is not defined
; when the flag is true, so the 16s reflect current folder output - confirm.
1711define <8 x i16> @foldv8i16u() nounwind {
1712; SSE-LABEL: foldv8i16u:
1713; SSE:       # BB#0:
1714; SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1715; SSE-NEXT:    retq
1716;
1717; AVX-LABEL: foldv8i16u:
1718; AVX:       # BB#0:
1719; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1720; AVX-NEXT:    retq
1721;
1722; AVX512VLCD-LABEL: foldv8i16u:
1723; AVX512VLCD:       ## BB#0:
1724; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1725; AVX512VLCD-NEXT:    retq
1726;
1727; AVX512CD-LABEL: foldv8i16u:
1728; AVX512CD:       ## BB#0:
1729; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1730; AVX512CD-NEXT:    retq
1731;
1732; X32-SSE-LABEL: foldv8i16u:
1733; X32-SSE:       # BB#0:
1734; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1735; X32-SSE-NEXT:    retl
1736  %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
1737  ret <8 x i16> %out
1738}
1739
; Constant-folding test: llvm.ctlz.v16i8 applied to a 16-lane constant with
; the second (i1) operand set to 0. The checks expect the constant vector
; [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]. Lanes with the top bit set (-1, 255,
; -2, 254) give 0; the literal-0 lane gives the full bit width, 8.
; NOTE(review): the literals i8 256 (lane 0) and i8 -65536 (lane 4) do not
; fit in 8 bits; the expected 8 for those lanes matches them being read as
; 0 - presumably the IR parser truncates out-of-range literals; confirm
; with the llc in use.
1740define <16 x i8> @foldv16i8() nounwind {
1741; SSE-LABEL: foldv16i8:
1742; SSE:       # BB#0:
1743; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1744; SSE-NEXT:    retq
1745;
1746; AVX-LABEL: foldv16i8:
1747; AVX:       # BB#0:
1748; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1749; AVX-NEXT:    retq
1750;
1751; AVX512VLCD-LABEL: foldv16i8:
1752; AVX512VLCD:       ## BB#0:
1753; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1754; AVX512VLCD-NEXT:    retq
1755;
1756; AVX512CD-LABEL: foldv16i8:
1757; AVX512CD:       ## BB#0:
1758; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1759; AVX512CD-NEXT:    retq
1760;
1761; X32-SSE-LABEL: foldv16i8:
1762; X32-SSE:       # BB#0:
1763; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1764; X32-SSE-NEXT:    retl
1765  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
1766  ret <16 x i8> %out
1767}
1768
; Same constant input as foldv16i8, but with the second (i1) operand of
; llvm.ctlz set to -1 (true). The checks expect the same constant vector
; [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2], including 8 for lane 2 (literal 0) and
; for lanes 0 and 4 (literals i8 256 and i8 -65536).
; NOTE(review): i8 256 and i8 -65536 do not fit in 8 bits and appear to be
; read as 0; and per the LLVM LangRef the result for a zero lane is not
; defined when the flag is true, so the 8s reflect current folder output -
; confirm.
1769define <16 x i8> @foldv16i8u() nounwind {
1770; SSE-LABEL: foldv16i8u:
1771; SSE:       # BB#0:
1772; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1773; SSE-NEXT:    retq
1774;
1775; AVX-LABEL: foldv16i8u:
1776; AVX:       # BB#0:
1777; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1778; AVX-NEXT:    retq
1779;
1780; AVX512VLCD-LABEL: foldv16i8u:
1781; AVX512VLCD:       ## BB#0:
1782; AVX512VLCD-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1783; AVX512VLCD-NEXT:    retq
1784;
1785; AVX512CD-LABEL: foldv16i8u:
1786; AVX512CD:       ## BB#0:
1787; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1788; AVX512CD-NEXT:    retq
1789;
1790; X32-SSE-LABEL: foldv16i8u:
1791; X32-SSE:       # BB#0:
1792; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1793; X32-SSE-NEXT:    retl
1794  %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
1795  ret <16 x i8> %out
1796}
1797
; Declarations of the llvm.ctlz intrinsic overloads called by the tests in
; this file, one per 128-bit vector type. Each takes the source vector and
; an i1 flag operand and returns a vector of the same type.
1798declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)
1799declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
1800declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1)
1801declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1)
1802