1; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
2
; 256-bit add of four i64 lanes: expected to select vpaddq on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpaddq:
; CHECK: vpaddq %ymm
define <4 x i64> @test_vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %x = add <4 x i64> %i, %j
  ret <4 x i64> %x
}
8
; 256-bit add of eight i32 lanes: expected to select vpaddd on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpaddd:
; CHECK: vpaddd %ymm
define <8 x i32> @test_vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %x = add <8 x i32> %i, %j
  ret <8 x i32> %x
}
14
; 256-bit add of sixteen i16 lanes: expected to select vpaddw on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpaddw:
; CHECK: vpaddw %ymm
define <16 x i16> @test_vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = add <16 x i16> %i, %j
  ret <16 x i16> %x
}
20
; 256-bit add of thirty-two i8 lanes: expected to select vpaddb on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpaddb:
; CHECK: vpaddb %ymm
define <32 x i8> @test_vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %x = add <32 x i8> %i, %j
  ret <32 x i8> %x
}
26
; 256-bit subtract of four i64 lanes: expected to select vpsubq on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpsubq:
; CHECK: vpsubq %ymm
define <4 x i64> @test_vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %x = sub <4 x i64> %i, %j
  ret <4 x i64> %x
}
32
; 256-bit subtract of eight i32 lanes: expected to select vpsubd on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpsubd:
; CHECK: vpsubd %ymm
define <8 x i32> @test_vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %x = sub <8 x i32> %i, %j
  ret <8 x i32> %x
}
38
; 256-bit subtract of sixteen i16 lanes: expected to select vpsubw on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpsubw:
; CHECK: vpsubw %ymm
define <16 x i16> @test_vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = sub <16 x i16> %i, %j
  ret <16 x i16> %x
}
44
; 256-bit subtract of thirty-two i8 lanes: expected to select vpsubb on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpsubb:
; CHECK: vpsubb %ymm
define <32 x i8> @test_vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %x = sub <32 x i8> %i, %j
  ret <32 x i8> %x
}
50
; 256-bit multiply of eight i32 lanes: expected to select vpmulld on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpmulld:
; CHECK: vpmulld %ymm
define <8 x i32> @test_vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %x = mul <8 x i32> %i, %j
  ret <8 x i32> %x
}
56
; 256-bit multiply of sixteen i16 lanes: expected to select vpmullw on a ymm register.
; The label check pins the instruction match to this function's output.
; CHECK-LABEL: test_vpmullw:
; CHECK: vpmullw %ymm
define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %x = mul <16 x i16> %i, %j
  ret <16 x i16> %x
}
62
; Multiply of sixteen i8 lanes. The expected sequence sign-extends both
; operands to 16-bit lanes (vpmovsxbw), multiplies with vpmullw, then packs the
; even bytes of each 128-bit half back together with vpshufb + vpunpcklqdq.
; The label pattern has no trailing colon because the hyphenated symbol may be
; emitted in quoted form.
; CHECK-LABEL: mul-v16i8
; CHECK:       # BB#0:
; CHECK-NEXT:  vpmovsxbw %xmm1, %ymm1
; CHECK-NEXT:  vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:  vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:  vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:  vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT:  vpshufb %xmm2, %xmm1, %xmm1
; CHECK-NEXT:  vpshufb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:  vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:  vzeroupper
; CHECK-NEXT:  retq
define <16 x i8> @mul-v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
  %x = mul <16 x i8> %i, %j
  ret <16 x i8> %x
}
79
; Multiply of thirty-two i8 lanes. The expected sequence handles the upper and
; lower 128-bit halves separately: each half is sign-extended to 16-bit lanes
; (vpmovsxbw), multiplied with vpmullw, and packed back to bytes with
; vpshufb + vpunpcklqdq; vinserti128 then recombines the two halves.
; The label pattern has no trailing colon because the hyphenated symbol may be
; emitted in quoted form.
; CHECK-LABEL: mul-v32i8
; CHECK:       # BB#0:
; CHECK-NEXT:  vextracti128 $1, %ymm1, %xmm2
; CHECK-NEXT:  vpmovsxbw %xmm2, %ymm2
; CHECK-NEXT:  vextracti128 $1, %ymm0, %xmm3
; CHECK-NEXT:  vpmovsxbw %xmm3, %ymm3
; CHECK-NEXT:  vpmullw %ymm2, %ymm3, %ymm2
; CHECK-NEXT:  vextracti128 $1, %ymm2, %xmm3
; CHECK-NEXT:  vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT:  vpshufb %xmm4, %xmm3, %xmm3
; CHECK-NEXT:  vpshufb %xmm4, %xmm2, %xmm2
; CHECK-NEXT:  vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; CHECK-NEXT:  vpmovsxbw %xmm1, %ymm1
; CHECK-NEXT:  vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:  vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:  vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:  vpshufb %xmm4, %xmm1, %xmm1
; CHECK-NEXT:  vpshufb %xmm4, %xmm0, %xmm0
; CHECK-NEXT:  vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:  vinserti128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:  retq
define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %x = mul <32 x i8> %i, %j
  ret <32 x i8> %x
}
105
; Multiply of four i64 lanes. The expected sequence uses three vpmuludq
; multiplies combined with 32-bit shifts (vpsrlq/vpsllq) and vpaddq.
; The label pattern has no trailing colon because the hyphenated symbol may be
; emitted in quoted form.
; CHECK-LABEL: mul-v4i64
; CHECK: vpmuludq %ymm
; CHECK-NEXT: vpsrlq $32, %ymm
; CHECK-NEXT: vpmuludq %ymm
; CHECK-NEXT: vpsllq $32, %ymm
; CHECK-NEXT: vpaddq %ymm
; CHECK-NEXT: vpsrlq $32, %ymm
; CHECK-NEXT: vpmuludq %ymm
; CHECK-NEXT: vpsllq $32, %ymm
; CHECK-NEXT: vpaddq %ymm
define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %x = mul <4 x i64> %i, %j
  ret <4 x i64> %x
}
120
; Multiply of eight i32 lanes by a splat of 2: expected to become vpaddd.
; The trailing colon in the label pattern keeps it from also matching
; mul_const10 / mul_const11.
; CHECK-LABEL: mul_const1:
; CHECK: vpaddd
; CHECK: ret
define <8 x i32> @mul_const1(<8 x i32> %x) {
  %y = mul <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i32> %y
}
128
; Multiply of four i64 lanes by a splat of 4: expected to become a left shift
; by 2 (vpsllq).
; CHECK-LABEL: mul_const2:
; CHECK: vpsllq  $2
; CHECK: ret
define <4 x i64> @mul_const2(<4 x i64> %x) {
  %y = mul <4 x i64> %x, <i64 4, i64 4, i64 4, i64 4>
  ret <4 x i64> %y
}
136
; Multiply of sixteen i16 lanes by a splat of 8: expected to become a left
; shift by 3 (vpsllw).
; CHECK-LABEL: mul_const3:
; CHECK: vpsllw  $3
; CHECK: ret
define <16 x i16> @mul_const3(<16 x i16> %x) {
  %y = mul <16 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <16 x i16> %y
}
144
; Multiply of four i64 lanes by a splat of -1: expected to become vpxor
; followed by vpsubq (presumably zeroing a register and computing 0 - x).
; CHECK-LABEL: mul_const4:
; CHECK: vpxor
; CHECK: vpsubq
; CHECK: ret
define <4 x i64> @mul_const4(<4 x i64> %x) {
  %y = mul <4 x i64> %x, <i64 -1, i64 -1, i64 -1, i64 -1>
  ret <4 x i64> %y
}
153
; Multiply of eight i32 lanes by an all-zero vector: expected to become a
; single vxorps immediately followed by the return.
; CHECK-LABEL: mul_const5:
; CHECK: vxorps
; CHECK-NEXT: ret
define <8 x i32> @mul_const5(<8 x i32> %x) {
  %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %y
}
161
; Multiply of eight i32 lanes by a non-splat constant mixing 0 and 2: expected
; to stay a real multiply (vpmulld).
; CHECK-LABEL: mul_const6:
; CHECK: vpmulld
; CHECK: ret
define <8 x i32> @mul_const6(<8 x i32> %x) {
  %y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0>
  ret <8 x i32> %y
}
169
; Multiply of eight i64 lanes (a 512-bit vector) by a splat of 2: expected to
; become two vpaddq instructions.
; CHECK-LABEL: mul_const7:
; CHECK: vpaddq
; CHECK: vpaddq
; CHECK: ret
define <8 x i64> @mul_const7(<8 x i64> %x) {
  %y = mul <8 x i64> %x, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  ret <8 x i64> %y
}
178
; Multiply of eight i16 lanes (a 128-bit vector) by a splat of 8: expected to
; become a left shift by 3 (vpsllw).
; CHECK-LABEL: mul_const8:
; CHECK: vpsllw  $3
; CHECK: ret
define <8 x i16> @mul_const8(<8 x i16> %x) {
  %y = mul <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %y
}
186
; Multiply of eight i32 lanes by a constant that is 2 in lane 0 and 0
; elsewhere: expected to stay a real multiply (vpmulld).
; CHECK-LABEL: mul_const9:
; CHECK: vpmulld
; CHECK: ret
define <8 x i32> @mul_const9(<8 x i32> %x) {
  %y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %y
}
194
; Multiply of four i32 lanes by a splat of 0x01010101 (not a power of two):
; expected to stay a real multiply (vpmulld).
; CHECK-LABEL: mul_const10:
; CHECK: vpmulld
; CHECK: ret
define <4 x i32> @mul_const10(<4 x i32> %x) {
  ; %x * 0x01010101
  %m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
  ret <4 x i32> %m
}
203
; Multiply of four i32 lanes by a splat of 0x80808080 (not a power of two):
; expected to stay a real multiply (vpmulld).
; CHECK-LABEL: mul_const11:
; CHECK: vpmulld
; CHECK: ret
define <4 x i32> @mul_const11(<4 x i32> %x) {
  ; %x * 0x80808080
  %m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152>
  ret <4 x i32> %m
}
212