; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2                | FileCheck -check-prefix=AVX2 %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx  -mattr=-popcnt | FileCheck -check-prefix=AVX1-NOPOPCNT %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -mattr=-popcnt | FileCheck -check-prefix=AVX2-NOPOPCNT %s

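; Vector version of the scalar bit-twiddling population count:
; v = v - ((v >> 1) & 0x55555555)
; v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
; v = (v + (v >> 4)) & 0x0F0F0F0F
; v = v + (v >> 8)
; v = v + (v >> 16)
; v = v + (v >> 32) ; i64 only
; followed by a final mask of the accumulated count (the trailing vpand in
; each check sequence below).
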
; test0 - population count of an <8 x i32> vector (256-bit, %ymm registers).
; Only the llc invocation that uses the AVX2 check prefix (-mattr=avx2) is
; verified for this function. The expected code is the shift/mask/add
; expansion, with vpbroadcastd appearing before each mask use.
define <8 x i32> @test0(<8 x i32> %x) {
; AVX2-LABEL: @test0
entry:
; v = v - ((v >> 1) & mask)
; AVX2: vpsrld $1, %ymm
; AVX2-NEXT: vpbroadcastd
; AVX2-NEXT: vpand
; AVX2-NEXT: vpsubd
; v = (v & mask) + ((v >> 2) & mask)
; AVX2-NEXT: vpbroadcastd
; AVX2-NEXT: vpand
; AVX2-NEXT: vpsrld $2
; AVX2-NEXT: vpand
; AVX2-NEXT: vpaddd
; v = (v + (v >> 4)) & mask
; AVX2-NEXT: vpsrld $4
; AVX2-NEXT: vpaddd
; AVX2-NEXT: vpbroadcastd
; AVX2-NEXT: vpand
; v = v + (v >> 8)
; AVX2-NEXT: vpsrld $8
; AVX2-NEXT: vpaddd
; v = v + (v >> 16)
; AVX2-NEXT: vpsrld $16
; AVX2-NEXT: vpaddd
; final mask of the accumulated count
; AVX2-NEXT: vpbroadcastd
; AVX2-NEXT: vpand
  %y = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %x)
  ret <8 x i32> %y
}

; test1 - population count of a <4 x i64> vector (256-bit, %ymm registers).
; Verified only under the AVX2-NOPOPCNT check prefix (-mattr=avx2 with
; -mattr=-popcnt). Same shift/mask/add expansion as the 32-bit element case,
; using the quadword instruction forms plus one extra ">> 32" folding step.
define <4 x i64> @test1(<4 x i64> %x) {
; AVX2-NOPOPCNT-LABEL: @test1
entry:
; v = v - ((v >> 1) & mask)
; AVX2-NOPOPCNT: vpsrlq $1, %ymm
; AVX2-NOPOPCNT-NEXT: vpbroadcastq
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsubq
; v = (v & mask) + ((v >> 2) & mask)
; AVX2-NOPOPCNT-NEXT: vpbroadcastq
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsrlq $2
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpaddq
; v = (v + (v >> 4)) & mask
; AVX2-NOPOPCNT-NEXT: vpsrlq $4
; AVX2-NOPOPCNT-NEXT: vpaddq
; AVX2-NOPOPCNT-NEXT: vpbroadcastq
; AVX2-NOPOPCNT-NEXT: vpand
; v = v + (v >> 8)
; AVX2-NOPOPCNT-NEXT: vpsrlq $8
; AVX2-NOPOPCNT-NEXT: vpaddq
; v = v + (v >> 16)
; AVX2-NOPOPCNT-NEXT: vpsrlq $16
; AVX2-NOPOPCNT-NEXT: vpaddq
; v = v + (v >> 32), needed for 64-bit elements only
; AVX2-NOPOPCNT-NEXT: vpsrlq $32
; AVX2-NOPOPCNT-NEXT: vpaddq
; final mask of the accumulated count
; AVX2-NOPOPCNT-NEXT: vpbroadcastq
; AVX2-NOPOPCNT-NEXT: vpand
  %y = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %x)
  ret <4 x i64> %y
}

; test2 - population count of a <4 x i32> vector (128-bit, %xmm registers).
; Verified under both no-popcnt configurations. The two expected sequences
; perform the same shift/mask/add steps; they differ only in the instructions
; around the masks: the AVX2 run expects vpbroadcastd before each mask use,
; while the AVX1 run expects no broadcast and a single vmovdqa (presumably
; the masks are then read as plain 128-bit constants - not visible from the
; checks themselves).
define <4 x i32> @test2(<4 x i32> %x) {
; AVX2-NOPOPCNT-LABEL: @test2
; AVX1-NOPOPCNT-LABEL: @test2
entry:
; --- expected sequence for the AVX2, no-popcnt run ---
; v = v - ((v >> 1) & mask)
; AVX2-NOPOPCNT: vpsrld $1, %xmm
; AVX2-NOPOPCNT-NEXT: vpbroadcastd
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsubd
; v = (v & mask) + ((v >> 2) & mask)
; AVX2-NOPOPCNT-NEXT: vpbroadcastd
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsrld $2
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpaddd
; v = (v + (v >> 4)) & mask
; AVX2-NOPOPCNT-NEXT: vpsrld $4
; AVX2-NOPOPCNT-NEXT: vpaddd
; AVX2-NOPOPCNT-NEXT: vpbroadcastd
; AVX2-NOPOPCNT-NEXT: vpand
; v = v + (v >> 8)
; AVX2-NOPOPCNT-NEXT: vpsrld $8
; AVX2-NOPOPCNT-NEXT: vpaddd
; v = v + (v >> 16)
; AVX2-NOPOPCNT-NEXT: vpsrld $16
; AVX2-NOPOPCNT-NEXT: vpaddd
; final mask of the accumulated count
; AVX2-NOPOPCNT-NEXT: vpbroadcastd
; AVX2-NOPOPCNT-NEXT: vpand
; --- expected sequence for the AVX1, no-popcnt run ---
; v = v - ((v >> 1) & mask)
; AVX1-NOPOPCNT: vpsrld $1, %xmm
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpsubd
; v = (v & mask) + ((v >> 2) & mask)
; AVX1-NOPOPCNT-NEXT: vmovdqa
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpsrld $2
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpaddd
; v = (v + (v >> 4)) & mask
; AVX1-NOPOPCNT-NEXT: vpsrld $4
; AVX1-NOPOPCNT-NEXT: vpaddd
; AVX1-NOPOPCNT-NEXT: vpand
; v = v + (v >> 8)
; AVX1-NOPOPCNT-NEXT: vpsrld $8
; AVX1-NOPOPCNT-NEXT: vpaddd
; v = v + (v >> 16)
; AVX1-NOPOPCNT-NEXT: vpsrld $16
; AVX1-NOPOPCNT-NEXT: vpaddd
; final mask of the accumulated count
; AVX1-NOPOPCNT-NEXT: vpand
  %y = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
  ret <4 x i32> %y
}

; test3 - population count of a <2 x i64> vector (128-bit, %xmm registers).
; Verified under both no-popcnt configurations. Both runs expect the same
; instruction sequence here: neither check group contains a broadcast, and
; each expects one vmovdqa ahead of the second masking step. The extra
; ">> 32" folding step is required because the elements are 64 bits wide.
define <2 x i64> @test3(<2 x i64> %x) {
; AVX2-NOPOPCNT-LABEL: @test3
; AVX1-NOPOPCNT-LABEL: @test3
entry:
; --- expected sequence for the AVX2, no-popcnt run ---
; v = v - ((v >> 1) & mask)
; AVX2-NOPOPCNT: vpsrlq $1, %xmm
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsubq
; v = (v & mask) + ((v >> 2) & mask)
; AVX2-NOPOPCNT-NEXT: vmovdqa
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpsrlq $2
; AVX2-NOPOPCNT-NEXT: vpand
; AVX2-NOPOPCNT-NEXT: vpaddq
; v = (v + (v >> 4)) & mask
; AVX2-NOPOPCNT-NEXT: vpsrlq $4
; AVX2-NOPOPCNT-NEXT: vpaddq
; AVX2-NOPOPCNT-NEXT: vpand
; v = v + (v >> 8)
; AVX2-NOPOPCNT-NEXT: vpsrlq $8
; AVX2-NOPOPCNT-NEXT: vpaddq
; v = v + (v >> 16)
; AVX2-NOPOPCNT-NEXT: vpsrlq $16
; AVX2-NOPOPCNT-NEXT: vpaddq
; v = v + (v >> 32), needed for 64-bit elements only
; AVX2-NOPOPCNT-NEXT: vpsrlq $32
; AVX2-NOPOPCNT-NEXT: vpaddq
; final mask of the accumulated count
; AVX2-NOPOPCNT-NEXT: vpand
; --- expected sequence for the AVX1, no-popcnt run ---
; v = v - ((v >> 1) & mask)
; AVX1-NOPOPCNT: vpsrlq $1, %xmm
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpsubq
; v = (v & mask) + ((v >> 2) & mask)
; AVX1-NOPOPCNT-NEXT: vmovdqa
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpsrlq $2
; AVX1-NOPOPCNT-NEXT: vpand
; AVX1-NOPOPCNT-NEXT: vpaddq
; v = (v + (v >> 4)) & mask
; AVX1-NOPOPCNT-NEXT: vpsrlq $4
; AVX1-NOPOPCNT-NEXT: vpaddq
; AVX1-NOPOPCNT-NEXT: vpand
; v = v + (v >> 8)
; AVX1-NOPOPCNT-NEXT: vpsrlq $8
; AVX1-NOPOPCNT-NEXT: vpaddq
; v = v + (v >> 16)
; AVX1-NOPOPCNT-NEXT: vpsrlq $16
; AVX1-NOPOPCNT-NEXT: vpaddq
; v = v + (v >> 32), needed for 64-bit elements only
; AVX1-NOPOPCNT-NEXT: vpsrlq $32
; AVX1-NOPOPCNT-NEXT: vpaddq
; final mask of the accumulated count
; AVX1-NOPOPCNT-NEXT: vpand
  %y = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
  ret <2 x i64> %y
}

; Vector population-count intrinsics called by the test functions in this file.

; 128-bit vector forms (called by test2 and test3).
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)

; 256-bit vector forms (called by test0 and test1).
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)