; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

; Population count of each 64-bit lane of a <2 x i64> vector, requested through
; the llvm.ctpop.v2i64 intrinsic.
; Expected lowering, as pinned by the checks in the body -
;   * The SSE2 and SSE3 runs use the shift / mask / add bit-counting sequence
;     and finish with psadbw against a zeroed register to sum the byte counts
;     of each quadword.
;   * The SSSE3, SSE4.1 and AVX runs mask out the low and high nibble of every
;     byte, look both up with pshufb in a 16-entry table of nibble popcounts,
;     add the two results, and finish with psadbw against a zeroed register.
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlq $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $4, %xmm1
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv2i64:
; SSE3:       # BB#0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlq $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubq %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlq $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddq %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlq $4, %xmm1
; SSE3-NEXT:    paddq %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    psadbw %xmm3, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    psadbw %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %out
}

; Population count of each 32-bit lane of a <4 x i32> vector, requested through
; the llvm.ctpop.v4i32 intrinsic.
; Expected lowering, as pinned by the checks in the body -
;   * The SSE2 and SSE3 runs form the counts with the shift / mask / add
;     bit-counting sequence; the SSSE3, SSE4.1 and AVX runs form per-byte
;     counts with two pshufb lookups into a nibble popcount table.
;   * Every run then interleaves the dwords with a zeroed register
;     (punpckhdq / punpckldq), applies psadbw against zero to each half, and
;     merges the two halves back into one vector with packuswb.
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrld $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    psadbw %xmm0, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv4i32:
; SSE3:       # BB#0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrld $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubd %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrld $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrld $4, %xmm1
; SSE3-NEXT:    paddd %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    psadbw %xmm0, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    packuswb %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT:    psadbw %xmm0, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    psadbw %xmm0, %xmm1
; SSSE3-NEXT:    packuswb %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    pshufb %xmm3, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm4, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE41-NEXT:    psadbw %xmm0, %xmm2
; SSE41-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE41-NEXT:    psadbw %xmm0, %xmm1
; SSE41-NEXT:    packuswb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
  ret <4 x i32> %out
}

; Population count of each 16-bit lane of an <8 x i16> vector, requested
; through the llvm.ctpop.v8i16 intrinsic.
; Expected lowering, as pinned by the checks in the body -
;   * The SSE2 and SSE3 runs form the counts with the shift / mask / add
;     bit-counting sequence; the SSSE3, SSE4.1 and AVX runs form per-byte
;     counts with two pshufb lookups into a nibble popcount table.
;   * Every run then folds the two byte counts of each word together with
;     psllw $8, paddb and psrlw $8; no psadbw step is expected here.
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv8i16:
; SSE3:       # BB#0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubw %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddw %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddw %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    psllw $8, %xmm0
; SSE3-NEXT:    paddb %xmm1, %xmm0
; SSE3-NEXT:    psrlw $8, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    psllw $8, %xmm0
; SSSE3-NEXT:    paddb %xmm3, %xmm0
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    paddb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
  ret <8 x i16> %out
}

; Population count of each byte of a <16 x i8> vector, requested through the
; llvm.ctpop.v16i8 intrinsic.
; Expected lowering, as pinned by the checks in the body -
;   * The SSE2 and SSE3 runs use the shift / mask / add bit-counting sequence
;     with byte-wise arithmetic (psubb / paddb).
;   * The SSSE3, SSE4.1 and AVX runs use two pshufb lookups into a nibble
;     popcount table followed by paddb.
; The per-byte counts are the final result, so no horizontal summing step is
; expected in any run.
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv16i8:
; SSE3:       # BB#0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubb %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddb %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    pshufb %xmm3, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm4, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
  ret <16 x i8> %out
}

; Constant-folding check for llvm.ctpop.v2i64. The operand is the constant
; vector <256, -1>, so the checks expect no popcount code at all, only a
; single constant load (movaps / vmovaps) of the lane results [1,64].
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT:    retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
  ret <2 x i64> %out
}

; Constant-folding check for llvm.ctpop.v4i32. The operand is the constant
; vector <256, -1, 0, 255>, so the checks expect no popcount code at all, only
; a single constant load (movaps / vmovaps) of the lane results [1,32,0,8].
define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT:    retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
  ret <4 x i32> %out
}

; Constant-folding check for llvm.ctpop.v8i16. The operand is a constant
; vector, so the checks expect no popcount code at all, only a single constant
; load (movaps / vmovaps) of the lane results [1,16,0,8,0,3,2,3].
; The literal i16 -65536 does not fit in 16 bits; the expected result 0 for
; that lane matches the literal being truncated to the element width.
; NOTE(review) - confirm the IR parser of the LLVM version in use still accepts
; out-of-range integer literals before regenerating these checks.
define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT:    retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
  ret <8 x i16> %out
}

; Constant-folding check for llvm.ctpop.v16i8. The operand is a constant
; vector, so the checks expect no popcount code at all, only a single constant
; load (movaps / vmovaps) of the lane results
; [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1].
; Several literals (i8 256, i8 255, i8 -65536, i8 254) lie outside the signed
; 8-bit range; the expected results (0 for 256 and -65536, 8 for 255, 7 for
; 254) match those literals being truncated to the element width.
; NOTE(review) - confirm the IR parser of the LLVM version in use still accepts
; out-of-range integer literals before regenerating these checks.
define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT:    retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
  ret <16 x i8> %out
}

; Declarations of the 128-bit vector population-count intrinsics called by the
; functions in this file, one per element width (i64, i32, i16, i8).
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)