1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
7
8;
9; add
10;
11
; trunc(add(%a0, %a1)) - adds two <4 x i64> vectors and truncates the sum to
; <4 x i32>. The autogenerated check lines pin the llc output for each
; configuration listed in the run lines at the top of the file; the two
; AVX-512 configurations share one check block here.
; NOTE(review) - the result type in the name is spelled "4i32" while the
; 16-element tests spell it "v16i8"; kept as-is because the name is part of
; every label check.
12define <4 x i32> @trunc_add_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
13; SSE-LABEL: trunc_add_v4i64_4i32:
14; SSE:       # BB#0:
15; SSE-NEXT:    paddq %xmm2, %xmm0
16; SSE-NEXT:    paddq %xmm3, %xmm1
17; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
18; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
19; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
20; SSE-NEXT:    retq
21;
22; AVX1-LABEL: trunc_add_v4i64_4i32:
23; AVX1:       # BB#0:
24; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm2
25; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
26; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
27; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
28; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
29; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
30; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
31; AVX1-NEXT:    vzeroupper
32; AVX1-NEXT:    retq
33;
34; AVX2-LABEL: trunc_add_v4i64_4i32:
35; AVX2:       # BB#0:
36; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
37; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
38; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
39; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
40; AVX2-NEXT:    vzeroupper
41; AVX2-NEXT:    retq
42;
43; AVX512-LABEL: trunc_add_v4i64_4i32:
44; AVX512:       # BB#0:
45; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
46; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
47; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
48; AVX512-NEXT:    retq
49  %1 = add <4 x i64> %a0, %a1
50  %2 = trunc <4 x i64> %1 to <4 x i32>
51  ret <4 x i32> %2
52}
53
; trunc(add(%a0, %a1)) - adds two <8 x i64> vectors and truncates the sum to
; <8 x i16>. The autogenerated check lines pin the llc output for each
; configuration listed in the run lines at the top of the file; the two
; AVX-512 configurations share one check block (a single 512-bit add followed
; by vpmovqw).
54define <8 x i16> @trunc_add_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
55; SSE-LABEL: trunc_add_v8i64_8i16:
56; SSE:       # BB#0:
57; SSE-NEXT:    paddq %xmm6, %xmm2
58; SSE-NEXT:    paddq %xmm4, %xmm0
59; SSE-NEXT:    paddq %xmm7, %xmm3
60; SSE-NEXT:    paddq %xmm5, %xmm1
61; SSE-NEXT:    pextrw $4, %xmm1, %eax
62; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
63; SSE-NEXT:    pextrw $4, %xmm0, %ecx
64; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
65; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
66; SSE-NEXT:    pextrw $4, %xmm3, %edx
67; SSE-NEXT:    movd %edx, %xmm1
68; SSE-NEXT:    movd %eax, %xmm3
69; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
70; SSE-NEXT:    pextrw $4, %xmm2, %eax
71; SSE-NEXT:    movd %eax, %xmm1
72; SSE-NEXT:    movd %ecx, %xmm2
73; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
74; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
75; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
76; SSE-NEXT:    retq
77;
78; AVX1-LABEL: trunc_add_v8i64_8i16:
79; AVX1:       # BB#0:
80; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
81; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
82; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
83; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
84; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
85; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
86; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
87; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
88; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
89; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
90; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
91; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
92; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
93; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
94; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
95; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
96; AVX1-NEXT:    vzeroupper
97; AVX1-NEXT:    retq
98;
99; AVX2-LABEL: trunc_add_v8i64_8i16:
100; AVX2:       # BB#0:
101; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
102; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
103; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
104; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
105; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
106; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
107; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
108; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
109; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
110; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
111; AVX2-NEXT:    vzeroupper
112; AVX2-NEXT:    retq
113;
114; AVX512-LABEL: trunc_add_v8i64_8i16:
115; AVX512:       # BB#0:
116; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
117; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
118; AVX512-NEXT:    retq
119  %1 = add <8 x i64> %a0, %a1
120  %2 = trunc <8 x i64> %1 to <8 x i16>
121  ret <8 x i16> %2
122}
123
; trunc(add(%a0, %a1)) - adds two <8 x i32> vectors and truncates the sum to
; <8 x i16>. The autogenerated check lines pin the llc output for each
; configuration listed in the run lines at the top of the file; the two
; AVX-512 configurations share one check block.
124define <8 x i16> @trunc_add_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
125; SSE-LABEL: trunc_add_v8i32_8i16:
126; SSE:       # BB#0:
127; SSE-NEXT:    paddd %xmm2, %xmm0
128; SSE-NEXT:    paddd %xmm3, %xmm1
129; SSE-NEXT:    pslld $16, %xmm1
130; SSE-NEXT:    psrad $16, %xmm1
131; SSE-NEXT:    pslld $16, %xmm0
132; SSE-NEXT:    psrad $16, %xmm0
133; SSE-NEXT:    packssdw %xmm1, %xmm0
134; SSE-NEXT:    retq
135;
136; AVX1-LABEL: trunc_add_v8i32_8i16:
137; AVX1:       # BB#0:
138; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
139; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
140; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
141; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
142; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
143; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
144; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
145; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
146; AVX1-NEXT:    vzeroupper
147; AVX1-NEXT:    retq
148;
149; AVX2-LABEL: trunc_add_v8i32_8i16:
150; AVX2:       # BB#0:
151; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
152; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
153; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
154; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
155; AVX2-NEXT:    vzeroupper
156; AVX2-NEXT:    retq
157;
158; AVX512-LABEL: trunc_add_v8i32_8i16:
159; AVX512:       # BB#0:
160; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
161; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
162; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
163; AVX512-NEXT:    retq
164  %1 = add <8 x i32> %a0, %a1
165  %2 = trunc <8 x i32> %1 to <8 x i16>
166  ret <8 x i16> %2
167}
168
; trunc(add(%a0, %a1)) - adds two <16 x i64> vectors and truncates the sum to
; <16 x i8>. The autogenerated check lines pin the llc output for each
; configuration listed in the run lines at the top of the file. In the SSE2
; output the second operand is read from the stack ((%rsp)-relative paddq);
; the two AVX-512 configurations share one check block.
169define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
170; SSE-LABEL: trunc_add_v16i64_v16i8:
171; SSE:       # BB#0:
172; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
173; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
174; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
175; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
176; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
177; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
178; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
179; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
180; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
181; SSE-NEXT:    pand %xmm8, %xmm7
182; SSE-NEXT:    pand %xmm8, %xmm6
183; SSE-NEXT:    packuswb %xmm7, %xmm6
184; SSE-NEXT:    pand %xmm8, %xmm5
185; SSE-NEXT:    pand %xmm8, %xmm4
186; SSE-NEXT:    packuswb %xmm5, %xmm4
187; SSE-NEXT:    packuswb %xmm6, %xmm4
188; SSE-NEXT:    pand %xmm8, %xmm3
189; SSE-NEXT:    pand %xmm8, %xmm2
190; SSE-NEXT:    packuswb %xmm3, %xmm2
191; SSE-NEXT:    pand %xmm8, %xmm1
192; SSE-NEXT:    pand %xmm8, %xmm0
193; SSE-NEXT:    packuswb %xmm1, %xmm0
194; SSE-NEXT:    packuswb %xmm2, %xmm0
195; SSE-NEXT:    packuswb %xmm4, %xmm0
196; SSE-NEXT:    retq
197;
198; AVX1-LABEL: trunc_add_v16i64_v16i8:
199; AVX1:       # BB#0:
200; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
201; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
202; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
203; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
204; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
205; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
206; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
207; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
208; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
209; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
210; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
211; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
212; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
213; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
214; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
215; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
216; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
217; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
218; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
219; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
220; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
221; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
222; AVX1-NEXT:    vpackuswb %xmm2, %xmm5, %xmm2
223; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
224; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
225; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
226; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
227; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
228; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
229; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
230; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
231; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
232; AVX1-NEXT:    vzeroupper
233; AVX1-NEXT:    retq
234;
235; AVX2-LABEL: trunc_add_v16i64_v16i8:
236; AVX2:       # BB#0:
237; AVX2-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
238; AVX2-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
239; AVX2-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
240; AVX2-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
241; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
242; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
243; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
244; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
245; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
246; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
247; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
248; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
249; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
250; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
251; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
252; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
253; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
254; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
255; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
256; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
257; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
258; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
259; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
260; AVX2-NEXT:    vzeroupper
261; AVX2-NEXT:    retq
262;
263; AVX512-LABEL: trunc_add_v16i64_v16i8:
264; AVX512:       # BB#0:
265; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
266; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
267; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
268; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
269; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
270; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
271; AVX512-NEXT:    retq
272  %1 = add <16 x i64> %a0, %a1
273  %2 = trunc <16 x i64> %1 to <16 x i8>
274  ret <16 x i8> %2
275}
276
; trunc(add(%a0, %a1)) - adds two <16 x i32> vectors and truncates the sum to
; <16 x i8>. The autogenerated check lines pin the llc output for each
; configuration listed in the run lines at the top of the file; the two
; AVX-512 configurations share one check block (a single 512-bit add followed
; by vpmovdb).
277define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
278; SSE-LABEL: trunc_add_v16i32_v16i8:
279; SSE:       # BB#0:
280; SSE-NEXT:    paddd %xmm4, %xmm0
281; SSE-NEXT:    paddd %xmm5, %xmm1
282; SSE-NEXT:    paddd %xmm6, %xmm2
283; SSE-NEXT:    paddd %xmm7, %xmm3
284; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
285; SSE-NEXT:    pand %xmm4, %xmm3
286; SSE-NEXT:    pand %xmm4, %xmm2
287; SSE-NEXT:    packuswb %xmm3, %xmm2
288; SSE-NEXT:    pand %xmm4, %xmm1
289; SSE-NEXT:    pand %xmm4, %xmm0
290; SSE-NEXT:    packuswb %xmm1, %xmm0
291; SSE-NEXT:    packuswb %xmm2, %xmm0
292; SSE-NEXT:    retq
293;
294; AVX1-LABEL: trunc_add_v16i32_v16i8:
295; AVX1:       # BB#0:
296; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
297; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
298; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
299; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
300; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
301; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
302; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
303; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
304; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
305; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
306; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
307; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
308; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
309; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
310; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
311; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
312; AVX1-NEXT:    vzeroupper
313; AVX1-NEXT:    retq
314;
315; AVX2-LABEL: trunc_add_v16i32_v16i8:
316; AVX2:       # BB#0:
317; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
318; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
319; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
320; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
321; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
322; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
323; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
324; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
325; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
326; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
327; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
328; AVX2-NEXT:    vzeroupper
329; AVX2-NEXT:    retq
330;
331; AVX512-LABEL: trunc_add_v16i32_v16i8:
332; AVX512:       # BB#0:
333; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
334; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
335; AVX512-NEXT:    retq
336  %1 = add <16 x i32> %a0, %a1
337  %2 = trunc <16 x i32> %1 to <16 x i8>
338  ret <16 x i8> %2
339}
340
; trunc(add(%a0, %a1)) - adds two <16 x i16> vectors and truncates the sum to
; <16 x i8>. The autogenerated check lines pin the llc output for each
; configuration listed in the run lines at the top of the file. Unlike the
; other tests in this section, the two AVX-512 configurations have separate
; check blocks because their output differs - the avx512f run extends to
; 32-bit lanes (vpmovsxwd) and truncates with vpmovdb, while the avx512bw run
; truncates directly with vpmovwb.
341define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
342; SSE-LABEL: trunc_add_v16i16_v16i8:
343; SSE:       # BB#0:
344; SSE-NEXT:    paddw %xmm2, %xmm0
345; SSE-NEXT:    paddw %xmm3, %xmm1
346; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
347; SSE-NEXT:    pand %xmm2, %xmm1
348; SSE-NEXT:    pand %xmm2, %xmm0
349; SSE-NEXT:    packuswb %xmm1, %xmm0
350; SSE-NEXT:    retq
351;
352; AVX1-LABEL: trunc_add_v16i16_v16i8:
353; AVX1:       # BB#0:
354; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
355; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
356; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
357; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
358; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
359; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
360; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
361; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
362; AVX1-NEXT:    vzeroupper
363; AVX1-NEXT:    retq
364;
365; AVX2-LABEL: trunc_add_v16i16_v16i8:
366; AVX2:       # BB#0:
367; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
368; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
369; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
370; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
371; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
372; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
373; AVX2-NEXT:    vzeroupper
374; AVX2-NEXT:    retq
375;
376; AVX512F-LABEL: trunc_add_v16i16_v16i8:
377; AVX512F:       # BB#0:
378; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
379; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
380; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
381; AVX512F-NEXT:    retq
382;
383; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
384; AVX512BW:       # BB#0:
385; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
386; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
387; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
388; AVX512BW-NEXT:    retq
389  %1 = add <16 x i16> %a0, %a1
390  %2 = trunc <16 x i16> %1 to <16 x i8>
391  ret <16 x i8> %2
392}
393
394;
395; add to constant
396;
397
; trunc(add(%a0, C)) - adds the constant vector <0, 1, 2, 3> to a <4 x i64>
; argument and truncates the sum to <4 x i32>. The autogenerated check lines
; pin the llc output for each configuration listed in the run lines at the
; top of the file; the two AVX-512 configurations share one check block.
; In the SSE2 and AVX1 output the low constant half <0, 1> is built in a
; register (movl $1 / movq / byte shift) while the remaining constant is
; loaded RIP-relative.
398define <4 x i32> @trunc_add_const_v4i64_4i32(<4 x i64> %a0) nounwind {
399; SSE-LABEL: trunc_add_const_v4i64_4i32:
400; SSE:       # BB#0:
401; SSE-NEXT:    movl $1, %eax
402; SSE-NEXT:    movd %rax, %xmm2
403; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
404; SSE-NEXT:    paddq %xmm0, %xmm2
405; SSE-NEXT:    paddq {{.*}}(%rip), %xmm1
406; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
407; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
408; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
409; SSE-NEXT:    retq
410;
411; AVX1-LABEL: trunc_add_const_v4i64_4i32:
412; AVX1:       # BB#0:
413; AVX1-NEXT:    movl $1, %eax
414; AVX1-NEXT:    vmovq %rax, %xmm1
415; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
416; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
417; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
418; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
419; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
420; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
421; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
422; AVX1-NEXT:    vzeroupper
423; AVX1-NEXT:    retq
424;
425; AVX2-LABEL: trunc_add_const_v4i64_4i32:
426; AVX2:       # BB#0:
427; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
428; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
429; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
430; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
431; AVX2-NEXT:    vzeroupper
432; AVX2-NEXT:    retq
433;
434; AVX512-LABEL: trunc_add_const_v4i64_4i32:
435; AVX512:       # BB#0:
436; AVX512-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
437; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
438; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
439; AVX512-NEXT:    retq
440  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
441  %2 = trunc <4 x i64> %1 to <4 x i32>
442  ret <4 x i32> %2
443}
444
; trunc(add(%a0, C)) - adds the constant vector <0, 1, ..., 7> to an
; <8 x i64> argument and truncates the sum to <8 x i16>. The autogenerated
; check lines pin the llc output for each configuration listed in the run
; lines at the top of the file; the two AVX-512 configurations share one
; check block.
; NOTE(review) - the function name says v16i64 -> v16i16, but the signature
; and body operate on 8 elements (<8 x i64> -> <8 x i16>). The name looks
; like a copy/paste slip; renaming would also require updating every label
; check, so it is left unchanged here.
445define <8 x i16> @trunc_add_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
446; SSE-LABEL: trunc_add_const_v16i64_v16i16:
447; SSE:       # BB#0:
448; SSE-NEXT:    movdqa %xmm0, %xmm4
449; SSE-NEXT:    movl $1, %eax
450; SSE-NEXT:    movd %rax, %xmm0
451; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
452; SSE-NEXT:    paddq %xmm4, %xmm0
453; SSE-NEXT:    paddq {{.*}}(%rip), %xmm2
454; SSE-NEXT:    paddq {{.*}}(%rip), %xmm3
455; SSE-NEXT:    paddq {{.*}}(%rip), %xmm1
456; SSE-NEXT:    pextrw $4, %xmm1, %eax
457; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
458; SSE-NEXT:    pextrw $4, %xmm0, %ecx
459; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
460; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
461; SSE-NEXT:    pextrw $4, %xmm3, %edx
462; SSE-NEXT:    movd %edx, %xmm1
463; SSE-NEXT:    movd %eax, %xmm3
464; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
465; SSE-NEXT:    movd %ecx, %xmm1
466; SSE-NEXT:    pextrw $4, %xmm2, %eax
467; SSE-NEXT:    movd %eax, %xmm2
468; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
469; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
470; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
471; SSE-NEXT:    retq
472;
473; AVX1-LABEL: trunc_add_const_v16i64_v16i16:
474; AVX1:       # BB#0:
475; AVX1-NEXT:    movl $1, %eax
476; AVX1-NEXT:    vmovq %rax, %xmm2
477; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
478; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
479; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
480; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
481; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm1, %xmm3
482; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
483; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm1, %xmm1
484; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
485; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
486; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
487; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
488; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
489; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
490; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
491; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
492; AVX1-NEXT:    vzeroupper
493; AVX1-NEXT:    retq
494;
495; AVX2-LABEL: trunc_add_const_v16i64_v16i16:
496; AVX2:       # BB#0:
497; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm1, %ymm1
498; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
499; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
500; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
501; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
502; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
503; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
504; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
505; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
506; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
507; AVX2-NEXT:    vzeroupper
508; AVX2-NEXT:    retq
509;
510; AVX512-LABEL: trunc_add_const_v16i64_v16i16:
511; AVX512:       # BB#0:
512; AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
513; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
514; AVX512-NEXT:    retq
515  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
516  %2 = trunc <8 x i64> %1 to <8 x i16>
517  ret <8 x i16> %2
518}
519
; trunc(add(%a0, C)) - adds the constant vector <0, 1, ..., 7> to an
; <8 x i32> argument and truncates the sum to <8 x i16>. The autogenerated
; check lines pin the llc output for each configuration listed in the run
; lines at the top of the file; the two AVX-512 configurations share one
; check block.
; NOTE(review) - the function name says v16i32 -> v16i16, but the signature
; and body operate on 8 elements (<8 x i32> -> <8 x i16>). The name looks
; like a copy/paste slip; renaming would also require updating every label
; check, so it is left unchanged here.
520define <8 x i16> @trunc_add_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
521; SSE-LABEL: trunc_add_const_v16i32_v16i16:
522; SSE:       # BB#0:
523; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
524; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
525; SSE-NEXT:    pslld $16, %xmm1
526; SSE-NEXT:    psrad $16, %xmm1
527; SSE-NEXT:    pslld $16, %xmm0
528; SSE-NEXT:    psrad $16, %xmm0
529; SSE-NEXT:    packssdw %xmm1, %xmm0
530; SSE-NEXT:    retq
531;
532; AVX1-LABEL: trunc_add_const_v16i32_v16i16:
533; AVX1:       # BB#0:
534; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm1
535; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
536; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
537; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
538; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
539; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
540; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
541; AVX1-NEXT:    vzeroupper
542; AVX1-NEXT:    retq
543;
544; AVX2-LABEL: trunc_add_const_v16i32_v16i16:
545; AVX2:       # BB#0:
546; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
547; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
548; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
549; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
550; AVX2-NEXT:    vzeroupper
551; AVX2-NEXT:    retq
552;
553; AVX512-LABEL: trunc_add_const_v16i32_v16i16:
554; AVX512:       # BB#0:
555; AVX512-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
556; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
557; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
558; AVX512-NEXT:    retq
559  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
560  %2 = trunc <8 x i32> %1 to <8 x i16>
561  ret <8 x i16> %2
562}
563
; trunc(add(%a0, C)) - adds the constant vector <0, 1, ..., 15> to a
; <16 x i64> argument and truncates the sum to <16 x i8>. The autogenerated
; check lines pin the llc output for each configuration listed in the run
; lines at the top of the file; the two AVX-512 configurations share one
; check block. In the SSE2 and AVX1 output the first constant pair <0, 1> is
; built in a register (movl $1 / movq / byte shift) while the remaining
; constants are loaded RIP-relative.
564define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
565; SSE-LABEL: trunc_add_const_v16i64_v16i8:
566; SSE:       # BB#0:
567; SSE-NEXT:    movl $1, %eax
568; SSE-NEXT:    movd %rax, %xmm8
569; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
570; SSE-NEXT:    paddq %xmm8, %xmm0
571; SSE-NEXT:    paddq {{.*}}(%rip), %xmm1
572; SSE-NEXT:    paddq {{.*}}(%rip), %xmm2
573; SSE-NEXT:    paddq {{.*}}(%rip), %xmm3
574; SSE-NEXT:    paddq {{.*}}(%rip), %xmm4
575; SSE-NEXT:    paddq {{.*}}(%rip), %xmm5
576; SSE-NEXT:    paddq {{.*}}(%rip), %xmm6
577; SSE-NEXT:    paddq {{.*}}(%rip), %xmm7
578; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
579; SSE-NEXT:    pand %xmm8, %xmm7
580; SSE-NEXT:    pand %xmm8, %xmm6
581; SSE-NEXT:    packuswb %xmm7, %xmm6
582; SSE-NEXT:    pand %xmm8, %xmm5
583; SSE-NEXT:    pand %xmm8, %xmm4
584; SSE-NEXT:    packuswb %xmm5, %xmm4
585; SSE-NEXT:    packuswb %xmm6, %xmm4
586; SSE-NEXT:    pand %xmm8, %xmm3
587; SSE-NEXT:    pand %xmm8, %xmm2
588; SSE-NEXT:    packuswb %xmm3, %xmm2
589; SSE-NEXT:    pand %xmm8, %xmm1
590; SSE-NEXT:    pand %xmm8, %xmm0
591; SSE-NEXT:    packuswb %xmm1, %xmm0
592; SSE-NEXT:    packuswb %xmm2, %xmm0
593; SSE-NEXT:    packuswb %xmm4, %xmm0
594; SSE-NEXT:    retq
595;
596; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
597; AVX1:       # BB#0:
598; AVX1-NEXT:    movl $1, %eax
599; AVX1-NEXT:    vmovq %rax, %xmm4
600; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
601; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
602; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
603; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
604; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm1, %xmm5
605; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
606; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm1, %xmm1
607; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm2, %xmm6
608; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
609; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm2, %xmm2
610; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm3, %xmm7
611; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
612; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm3, %xmm3
613; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
614; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
615; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
616; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm3
617; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
618; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
619; AVX1-NEXT:    vpackuswb %xmm2, %xmm6, %xmm2
620; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
621; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
622; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
623; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
624; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
625; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
626; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
627; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
628; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
629; AVX1-NEXT:    vzeroupper
630; AVX1-NEXT:    retq
631;
632; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
633; AVX2:       # BB#0:
634; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm1, %ymm1
635; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
636; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm3, %ymm3
637; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm2, %ymm2
638; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
639; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
640; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
641; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
642; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
643; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
644; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
645; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
646; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
647; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
648; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
649; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
650; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
651; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
652; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
653; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
654; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
655; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
656; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
657; AVX2-NEXT:    vzeroupper
658; AVX2-NEXT:    retq
659;
660; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
661; AVX512:       # BB#0:
662; AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm1, %zmm1
663; AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
664; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
665; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
666; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
667; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
668; AVX512-NEXT:    retq
669  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
670  %2 = trunc <16 x i64> %1 to <16 x i8>
671  ret <16 x i8> %2
672}
673
; trunc(add(%a0, C)) - adds the constant vector <0, 1, ..., 15> to a
; <16 x i32> argument and truncates the sum to <16 x i8>. The autogenerated
; check lines pin the llc output for each configuration listed in the run
; lines at the top of the file; the two AVX-512 configurations share one
; check block. Every configuration loads its constant operands RIP-relative.
674define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
675; SSE-LABEL: trunc_add_const_v16i32_v16i8:
676; SSE:       # BB#0:
677; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
678; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
679; SSE-NEXT:    paddd {{.*}}(%rip), %xmm2
680; SSE-NEXT:    paddd {{.*}}(%rip), %xmm3
681; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
682; SSE-NEXT:    pand %xmm4, %xmm3
683; SSE-NEXT:    pand %xmm4, %xmm2
684; SSE-NEXT:    packuswb %xmm3, %xmm2
685; SSE-NEXT:    pand %xmm4, %xmm1
686; SSE-NEXT:    pand %xmm4, %xmm0
687; SSE-NEXT:    packuswb %xmm1, %xmm0
688; SSE-NEXT:    packuswb %xmm2, %xmm0
689; SSE-NEXT:    retq
690;
691; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
692; AVX1:       # BB#0:
693; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm2
694; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
695; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
696; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm3
697; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
698; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
699; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
700; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
701; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
702; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
703; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
704; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
705; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
706; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
707; AVX1-NEXT:    vzeroupper
708; AVX1-NEXT:    retq
709;
710; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
711; AVX2:       # BB#0:
712; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
713; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm1, %ymm1
714; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
715; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
716; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
717; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
718; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
719; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
720; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
721; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
722; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
723; AVX2-NEXT:    vzeroupper
724; AVX2-NEXT:    retq
725;
726; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
727; AVX512:       # BB#0:
728; AVX512-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
729; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
730; AVX512-NEXT:    retq
731  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
732  %2 = trunc <16 x i32> %1 to <16 x i8>
733  ret <16 x i8> %2
734}
735
; Test input that adds the constant vector <0, 1, ..., 15> to a <16 x i16>
; argument and truncates the sum to <16 x i8>.
; The SSE2 checks expect a 255 mask plus packuswb, while the AVX1 and AVX2
; checks expect a vpshufb of each 128-bit half joined by vpunpcklqdq. The
; two AVX512 configurations are checked separately because their expected
; output differs. The AVX512F sequence sign-extends to zmm with vpmovsxwd
; and then narrows with vpmovdb, whereas the AVX512BW sequence narrows
; directly with vpmovwb.
736define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
737; SSE-LABEL: trunc_add_const_v16i16_v16i8:
738; SSE:       # BB#0:
739; SSE-NEXT:    paddw {{.*}}(%rip), %xmm0
740; SSE-NEXT:    paddw {{.*}}(%rip), %xmm1
741; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
742; SSE-NEXT:    pand %xmm2, %xmm1
743; SSE-NEXT:    pand %xmm2, %xmm0
744; SSE-NEXT:    packuswb %xmm1, %xmm0
745; SSE-NEXT:    retq
746;
747; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
748; AVX1:       # BB#0:
749; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm1
750; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
751; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
752; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
753; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
754; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
755; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
756; AVX1-NEXT:    vzeroupper
757; AVX1-NEXT:    retq
758;
759; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
760; AVX2:       # BB#0:
761; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
762; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
763; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
764; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
765; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
766; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
767; AVX2-NEXT:    vzeroupper
768; AVX2-NEXT:    retq
769;
770; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
771; AVX512F:       # BB#0:
772; AVX512F-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
773; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
774; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
775; AVX512F-NEXT:    retq
776;
777; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
778; AVX512BW:       # BB#0:
779; AVX512BW-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
780; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
781; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
782; AVX512BW-NEXT:    retq
783  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
784  %2 = trunc <16 x i16> %1 to <16 x i8>
785  ret <16 x i8> %2
786}
787
788;
789; sub
790;
791
; Test input that subtracts %a1 from %a0 (both <4 x i64>) and truncates the
; difference to <4 x i32>.
; The SSE2 and AVX1 checks expect the subtraction split into 128-bit halves
; that are narrowed with dword shuffles and then recombined. The AVX2 checks
; expect one 256-bit vpsubq followed by vpshufd and vpermq, and the AVX512
; checks expect vpsubq followed by a vpmovqd truncate.
792define <4 x i32> @trunc_sub_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
793; SSE-LABEL: trunc_sub_v4i64_4i32:
794; SSE:       # BB#0:
795; SSE-NEXT:    psubq %xmm2, %xmm0
796; SSE-NEXT:    psubq %xmm3, %xmm1
797; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
798; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
799; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
800; SSE-NEXT:    retq
801;
802; AVX1-LABEL: trunc_sub_v4i64_4i32:
803; AVX1:       # BB#0:
804; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm2
805; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
806; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
807; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
808; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
809; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
810; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
811; AVX1-NEXT:    vzeroupper
812; AVX1-NEXT:    retq
813;
814; AVX2-LABEL: trunc_sub_v4i64_4i32:
815; AVX2:       # BB#0:
816; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
817; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
818; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
819; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
820; AVX2-NEXT:    vzeroupper
821; AVX2-NEXT:    retq
822;
823; AVX512-LABEL: trunc_sub_v4i64_4i32:
824; AVX512:       # BB#0:
825; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
826; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
827; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
828; AVX512-NEXT:    retq
829  %1 = sub <4 x i64> %a0, %a1
830  %2 = trunc <4 x i64> %1 to <4 x i32>
831  ret <4 x i32> %2
832}
833
; Test input that subtracts %a1 from %a0 (both <8 x i64>) and truncates the
; difference to <8 x i16>.
; The SSE2 checks expect the result to be assembled from pextrw, movd and
; punpcklwd steps. The AVX1 checks expect the upper words of each qword to
; be blended with zero before vpackusdw, the AVX2 checks expect dword and
; byte shuffles, and the AVX512 checks expect vpsubq plus one vpmovqw.
834define <8 x i16> @trunc_sub_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
835; SSE-LABEL: trunc_sub_v8i64_8i16:
836; SSE:       # BB#0:
837; SSE-NEXT:    psubq %xmm6, %xmm2
838; SSE-NEXT:    psubq %xmm4, %xmm0
839; SSE-NEXT:    psubq %xmm7, %xmm3
840; SSE-NEXT:    psubq %xmm5, %xmm1
841; SSE-NEXT:    pextrw $4, %xmm1, %eax
842; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
843; SSE-NEXT:    pextrw $4, %xmm0, %ecx
844; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
845; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
846; SSE-NEXT:    pextrw $4, %xmm3, %edx
847; SSE-NEXT:    movd %edx, %xmm1
848; SSE-NEXT:    movd %eax, %xmm3
849; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
850; SSE-NEXT:    pextrw $4, %xmm2, %eax
851; SSE-NEXT:    movd %eax, %xmm1
852; SSE-NEXT:    movd %ecx, %xmm2
853; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
854; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
855; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
856; SSE-NEXT:    retq
857;
858; AVX1-LABEL: trunc_sub_v8i64_8i16:
859; AVX1:       # BB#0:
860; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
861; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
862; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
863; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
864; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
865; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
866; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
867; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
868; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
869; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
870; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
871; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
872; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
873; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
874; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
875; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
876; AVX1-NEXT:    vzeroupper
877; AVX1-NEXT:    retq
878;
879; AVX2-LABEL: trunc_sub_v8i64_8i16:
880; AVX2:       # BB#0:
881; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
882; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
883; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
884; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
885; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
886; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
887; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
888; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
889; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
890; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
891; AVX2-NEXT:    vzeroupper
892; AVX2-NEXT:    retq
893;
894; AVX512-LABEL: trunc_sub_v8i64_8i16:
895; AVX512:       # BB#0:
896; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
897; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
898; AVX512-NEXT:    retq
899  %1 = sub <8 x i64> %a0, %a1
900  %2 = trunc <8 x i64> %1 to <8 x i16>
901  ret <8 x i16> %2
902}
903
; Test input that subtracts %a1 from %a0 (both <8 x i32>) and truncates the
; difference to <8 x i16>.
; The SSE2 checks expect each dword to be shifted left then arithmetically
; right by 16 before packssdw. The AVX1 and AVX2 checks expect vpshufb based
; narrowing, and the AVX512 checks expect vpsubd followed by vpmovdw.
904define <8 x i16> @trunc_sub_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
905; SSE-LABEL: trunc_sub_v8i32_8i16:
906; SSE:       # BB#0:
907; SSE-NEXT:    psubd %xmm2, %xmm0
908; SSE-NEXT:    psubd %xmm3, %xmm1
909; SSE-NEXT:    pslld $16, %xmm1
910; SSE-NEXT:    psrad $16, %xmm1
911; SSE-NEXT:    pslld $16, %xmm0
912; SSE-NEXT:    psrad $16, %xmm0
913; SSE-NEXT:    packssdw %xmm1, %xmm0
914; SSE-NEXT:    retq
915;
916; AVX1-LABEL: trunc_sub_v8i32_8i16:
917; AVX1:       # BB#0:
918; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
919; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
920; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
921; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
922; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
923; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
924; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
925; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
926; AVX1-NEXT:    vzeroupper
927; AVX1-NEXT:    retq
928;
929; AVX2-LABEL: trunc_sub_v8i32_8i16:
930; AVX2:       # BB#0:
931; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
932; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
933; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
934; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
935; AVX2-NEXT:    vzeroupper
936; AVX2-NEXT:    retq
937;
938; AVX512-LABEL: trunc_sub_v8i32_8i16:
939; AVX512:       # BB#0:
940; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
941; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
942; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
943; AVX512-NEXT:    retq
944  %1 = sub <8 x i32> %a0, %a1
945  %2 = trunc <8 x i32> %1 to <8 x i16>
946  ret <8 x i16> %2
947}
948
; Test input that subtracts %a1 from %a0 (both <16 x i64>) and truncates the
; difference to <16 x i8>.
; In the SSE2 checks every psubq takes its second operand from a stack slot
; addressed off %rsp, and the result is narrowed with a 255 mask and a tree
; of packuswb steps. The AVX1 checks expect the same mask-and-pack tree on
; 128-bit halves, and the AVX2 checks expect dword and byte shuffles. The
; AVX512 checks expect two vpmovqd truncates merged by vinserti64x4 and
; then narrowed with one vpmovdb.
949define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
950; SSE-LABEL: trunc_sub_v16i64_v16i8:
951; SSE:       # BB#0:
952; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
953; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
954; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
955; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
956; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
957; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
958; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
959; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
960; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
961; SSE-NEXT:    pand %xmm8, %xmm7
962; SSE-NEXT:    pand %xmm8, %xmm6
963; SSE-NEXT:    packuswb %xmm7, %xmm6
964; SSE-NEXT:    pand %xmm8, %xmm5
965; SSE-NEXT:    pand %xmm8, %xmm4
966; SSE-NEXT:    packuswb %xmm5, %xmm4
967; SSE-NEXT:    packuswb %xmm6, %xmm4
968; SSE-NEXT:    pand %xmm8, %xmm3
969; SSE-NEXT:    pand %xmm8, %xmm2
970; SSE-NEXT:    packuswb %xmm3, %xmm2
971; SSE-NEXT:    pand %xmm8, %xmm1
972; SSE-NEXT:    pand %xmm8, %xmm0
973; SSE-NEXT:    packuswb %xmm1, %xmm0
974; SSE-NEXT:    packuswb %xmm2, %xmm0
975; SSE-NEXT:    packuswb %xmm4, %xmm0
976; SSE-NEXT:    retq
977;
978; AVX1-LABEL: trunc_sub_v16i64_v16i8:
979; AVX1:       # BB#0:
980; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
981; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
982; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
983; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
984; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
985; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
986; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
987; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
988; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
989; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
990; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
991; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
992; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
993; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
994; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
995; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
996; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
997; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
998; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
999; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
1000; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
1001; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
1002; AVX1-NEXT:    vpackuswb %xmm2, %xmm5, %xmm2
1003; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
1004; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
1005; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
1006; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
1007; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
1008; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
1009; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
1010; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1011; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1012; AVX1-NEXT:    vzeroupper
1013; AVX1-NEXT:    retq
1014;
1015; AVX2-LABEL: trunc_sub_v16i64_v16i8:
1016; AVX2:       # BB#0:
1017; AVX2-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
1018; AVX2-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
1019; AVX2-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
1020; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
1021; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
1022; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
1023; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
1024; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
1025; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1026; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1027; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1028; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1029; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1030; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
1031; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1032; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1033; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
1034; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
1035; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1036; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1037; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1038; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
1039; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1040; AVX2-NEXT:    vzeroupper
1041; AVX2-NEXT:    retq
1042;
1043; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1044; AVX512:       # BB#0:
1045; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
1046; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
1047; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1048; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
1049; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1050; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1051; AVX512-NEXT:    retq
1052  %1 = sub <16 x i64> %a0, %a1
1053  %2 = trunc <16 x i64> %1 to <16 x i8>
1054  ret <16 x i8> %2
1055}
1056
; Test input that subtracts %a1 from %a0 (both <16 x i32>) and truncates the
; difference to <16 x i8>.
; The SSE2 and AVX1 checks expect each lane to be masked with 255 and
; narrowed through packuswb steps. The AVX2 checks expect vpshufb/vpermq
; shuffles joined by vpunpcklqdq, and the AVX512 checks expect one vpsubd
; followed by one vpmovdb.
1057define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1058; SSE-LABEL: trunc_sub_v16i32_v16i8:
1059; SSE:       # BB#0:
1060; SSE-NEXT:    psubd %xmm4, %xmm0
1061; SSE-NEXT:    psubd %xmm5, %xmm1
1062; SSE-NEXT:    psubd %xmm6, %xmm2
1063; SSE-NEXT:    psubd %xmm7, %xmm3
1064; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1065; SSE-NEXT:    pand %xmm4, %xmm3
1066; SSE-NEXT:    pand %xmm4, %xmm2
1067; SSE-NEXT:    packuswb %xmm3, %xmm2
1068; SSE-NEXT:    pand %xmm4, %xmm1
1069; SSE-NEXT:    pand %xmm4, %xmm0
1070; SSE-NEXT:    packuswb %xmm1, %xmm0
1071; SSE-NEXT:    packuswb %xmm2, %xmm0
1072; SSE-NEXT:    retq
1073;
1074; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1075; AVX1:       # BB#0:
1076; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
1077; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1078; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1079; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
1080; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
1081; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1082; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1083; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
1084; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1085; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1086; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1087; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
1088; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1089; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
1090; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
1091; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1092; AVX1-NEXT:    vzeroupper
1093; AVX1-NEXT:    retq
1094;
1095; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1096; AVX2:       # BB#0:
1097; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
1098; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
1099; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1100; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1101; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1102; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1103; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1104; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1105; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1106; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1107; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1108; AVX2-NEXT:    vzeroupper
1109; AVX2-NEXT:    retq
1110;
1111; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1112; AVX512:       # BB#0:
1113; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
1114; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1115; AVX512-NEXT:    retq
1116  %1 = sub <16 x i32> %a0, %a1
1117  %2 = trunc <16 x i32> %1 to <16 x i8>
1118  ret <16 x i8> %2
1119}
1120
; Test input that subtracts %a1 from %a0 (both <16 x i16>) and truncates the
; difference to <16 x i8>.
; The SSE2 checks expect a 255 mask plus packuswb, while the AVX1 and AVX2
; checks expect a vpshufb of each 128-bit half joined by vpunpcklqdq. The
; two AVX512 configurations are checked separately because their expected
; output differs. The AVX512F sequence sign-extends to zmm with vpmovsxwd
; and then narrows with vpmovdb, whereas the AVX512BW sequence narrows
; directly with vpmovwb.
1121define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1122; SSE-LABEL: trunc_sub_v16i16_v16i8:
1123; SSE:       # BB#0:
1124; SSE-NEXT:    psubw %xmm2, %xmm0
1125; SSE-NEXT:    psubw %xmm3, %xmm1
1126; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1127; SSE-NEXT:    pand %xmm2, %xmm1
1128; SSE-NEXT:    pand %xmm2, %xmm0
1129; SSE-NEXT:    packuswb %xmm1, %xmm0
1130; SSE-NEXT:    retq
1131;
1132; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1133; AVX1:       # BB#0:
1134; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
1135; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1136; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1137; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
1138; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1139; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1140; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
1141; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1142; AVX1-NEXT:    vzeroupper
1143; AVX1-NEXT:    retq
1144;
1145; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1146; AVX2:       # BB#0:
1147; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1148; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1149; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1150; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1151; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1152; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1153; AVX2-NEXT:    vzeroupper
1154; AVX2-NEXT:    retq
1155;
1156; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1157; AVX512F:       # BB#0:
1158; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1159; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
1160; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1161; AVX512F-NEXT:    retq
1162;
1163; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1164; AVX512BW:       # BB#0:
1165; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1166; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1167; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1168; AVX512BW-NEXT:    retq
1169  %1 = sub <16 x i16> %a0, %a1
1170  %2 = trunc <16 x i16> %1 to <16 x i8>
1171  ret <16 x i8> %2
1172}
1173
1174;
1175; sub to constant
1176;
1177
; Test input that subtracts the constant vector <0, 1, 2, 3> from a
; <4 x i64> argument and truncates the difference to <4 x i32>.
; In the SSE2 and AVX1 checks the first constant pair <0, 1> is built in a
; register (movl $1, move to xmm, byte shift left by 8) while the second
; pair is matched as a RIP-relative memory operand. The AVX2 and AVX512
; checks expect a single vpsubq with a RIP-relative operand, followed by
; shuffles on AVX2 and by vpmovqd on AVX512.
1178define <4 x i32> @trunc_sub_const_v4i64_4i32(<4 x i64> %a0) nounwind {
1179; SSE-LABEL: trunc_sub_const_v4i64_4i32:
1180; SSE:       # BB#0:
1181; SSE-NEXT:    movl $1, %eax
1182; SSE-NEXT:    movd %rax, %xmm2
1183; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
1184; SSE-NEXT:    psubq %xmm2, %xmm0
1185; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
1186; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1187; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1188; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1189; SSE-NEXT:    retq
1190;
1191; AVX1-LABEL: trunc_sub_const_v4i64_4i32:
1192; AVX1:       # BB#0:
1193; AVX1-NEXT:    movl $1, %eax
1194; AVX1-NEXT:    vmovq %rax, %xmm1
1195; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
1196; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
1197; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1198; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
1199; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1200; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
1201; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1202; AVX1-NEXT:    vzeroupper
1203; AVX1-NEXT:    retq
1204;
1205; AVX2-LABEL: trunc_sub_const_v4i64_4i32:
1206; AVX2:       # BB#0:
1207; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
1208; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1209; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1210; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1211; AVX2-NEXT:    vzeroupper
1212; AVX2-NEXT:    retq
1213;
1214; AVX512-LABEL: trunc_sub_const_v4i64_4i32:
1215; AVX512:       # BB#0:
1216; AVX512-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
1217; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1218; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1219; AVX512-NEXT:    retq
1220  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1221  %2 = trunc <4 x i64> %1 to <4 x i32>
1222  ret <4 x i32> %2
1223}
1224
; Test input that subtracts the constant vector <0, 1, ..., 7> from an
; <8 x i64> argument and truncates the difference to <8 x i16>.
; NOTE(review): the function name says v16i64_v16i16, but the signature and
; body operate on <8 x i64> and return <8 x i16>. The non-constant
; counterpart is named trunc_sub_v8i64_8i16, so this looks like a naming
; slip. Renaming would also require updating every LABEL check below.
; In the SSE2 and AVX1 checks the first constant pair <0, 1> is built in a
; register (movl $1, move to xmm, byte shift left by 8) and the remaining
; constants are matched as RIP-relative memory operands. The AVX512 checks
; expect one vpsubq followed by one vpmovqw.
1225define <8 x i16> @trunc_sub_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
1226; SSE-LABEL: trunc_sub_const_v16i64_v16i16:
1227; SSE:       # BB#0:
1228; SSE-NEXT:    movl $1, %eax
1229; SSE-NEXT:    movd %rax, %xmm4
1230; SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
1231; SSE-NEXT:    psubq %xmm4, %xmm0
1232; SSE-NEXT:    psubq {{.*}}(%rip), %xmm2
1233; SSE-NEXT:    psubq {{.*}}(%rip), %xmm3
1234; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
1235; SSE-NEXT:    pextrw $4, %xmm1, %eax
1236; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1237; SSE-NEXT:    pextrw $4, %xmm0, %ecx
1238; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1239; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1240; SSE-NEXT:    pextrw $4, %xmm3, %edx
1241; SSE-NEXT:    movd %edx, %xmm1
1242; SSE-NEXT:    movd %eax, %xmm3
1243; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1244; SSE-NEXT:    movd %ecx, %xmm1
1245; SSE-NEXT:    pextrw $4, %xmm2, %eax
1246; SSE-NEXT:    movd %eax, %xmm2
1247; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1248; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1249; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1250; SSE-NEXT:    retq
1251;
1252; AVX1-LABEL: trunc_sub_const_v16i64_v16i16:
1253; AVX1:       # BB#0:
1254; AVX1-NEXT:    movl $1, %eax
1255; AVX1-NEXT:    vmovq %rax, %xmm2
1256; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
1257; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
1258; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1259; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
1260; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm3
1261; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1262; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
1263; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1264; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
1265; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
1266; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
1267; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
1268; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
1269; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
1270; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1271; AVX1-NEXT:    vzeroupper
1272; AVX1-NEXT:    retq
1273;
1274; AVX2-LABEL: trunc_sub_const_v16i64_v16i16:
1275; AVX2:       # BB#0:
1276; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
1277; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
1278; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1279; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1280; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
1281; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
1282; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1283; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1284; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1285; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1286; AVX2-NEXT:    vzeroupper
1287; AVX2-NEXT:    retq
1288;
1289; AVX512-LABEL: trunc_sub_const_v16i64_v16i16:
1290; AVX512:       # BB#0:
1291; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
1292; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
1293; AVX512-NEXT:    retq
1294  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1295  %2 = trunc <8 x i64> %1 to <8 x i16>
1296  ret <8 x i16> %2
1297}
1298
; Test input that subtracts the constant vector <0, 1, ..., 7> from an
; <8 x i32> argument and truncates the difference to <8 x i16>.
; NOTE(review): the function name says v16i32_v16i16, but the signature and
; body operate on <8 x i32> and return <8 x i16>. The non-constant
; counterpart is named trunc_sub_v8i32_8i16, so this looks like a naming
; slip. Renaming would also require updating every LABEL check below.
; The SSE2 checks expect each dword to be shifted left then arithmetically
; right by 16 before packssdw. The AVX1 and AVX2 checks expect vpshufb based
; narrowing, and the AVX512 checks expect vpsubd followed by vpmovdw.
1299define <8 x i16> @trunc_sub_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
1300; SSE-LABEL: trunc_sub_const_v16i32_v16i16:
1301; SSE:       # BB#0:
1302; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
1303; SSE-NEXT:    psubd {{.*}}(%rip), %xmm1
1304; SSE-NEXT:    pslld $16, %xmm1
1305; SSE-NEXT:    psrad $16, %xmm1
1306; SSE-NEXT:    pslld $16, %xmm0
1307; SSE-NEXT:    psrad $16, %xmm0
1308; SSE-NEXT:    packssdw %xmm1, %xmm0
1309; SSE-NEXT:    retq
1310;
1311; AVX1-LABEL: trunc_sub_const_v16i32_v16i16:
1312; AVX1:       # BB#0:
1313; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm1
1314; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1315; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
1316; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1317; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1318; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1319; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1320; AVX1-NEXT:    vzeroupper
1321; AVX1-NEXT:    retq
1322;
1323; AVX2-LABEL: trunc_sub_const_v16i32_v16i16:
1324; AVX2:       # BB#0:
1325; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
1326; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1327; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1328; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1329; AVX2-NEXT:    vzeroupper
1330; AVX2-NEXT:    retq
1331;
1332; AVX512-LABEL: trunc_sub_const_v16i32_v16i16:
1333; AVX512:       # BB#0:
1334; AVX512-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
1335; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1336; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1337; AVX512-NEXT:    retq
1338  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1339  %2 = trunc <8 x i32> %1 to <8 x i16>
1340  ret <8 x i16> %2
1341}
1342
; Test input that subtracts the constant vector <0, 1, ..., 15> from a
; <16 x i64> argument and truncates the difference to <16 x i8>.
; In the SSE2 and AVX1 checks the first constant pair <0, 1> is built in a
; register (movl $1, move to xmm, byte shift left by 8) and the remaining
; constants are matched as RIP-relative memory operands. Both then narrow
; with a 255 mask and a tree of packuswb steps. The AVX2 checks expect
; dword and byte shuffles, and the AVX512 checks expect two vpmovqd
; truncates merged by vinserti64x4 and then narrowed with one vpmovdb.
1343define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1344; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1345; SSE:       # BB#0:
1346; SSE-NEXT:    movl $1, %eax
1347; SSE-NEXT:    movd %rax, %xmm8
1348; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
1349; SSE-NEXT:    psubq %xmm8, %xmm0
1350; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
1351; SSE-NEXT:    psubq {{.*}}(%rip), %xmm2
1352; SSE-NEXT:    psubq {{.*}}(%rip), %xmm3
1353; SSE-NEXT:    psubq {{.*}}(%rip), %xmm4
1354; SSE-NEXT:    psubq {{.*}}(%rip), %xmm5
1355; SSE-NEXT:    psubq {{.*}}(%rip), %xmm6
1356; SSE-NEXT:    psubq {{.*}}(%rip), %xmm7
1357; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1358; SSE-NEXT:    pand %xmm8, %xmm7
1359; SSE-NEXT:    pand %xmm8, %xmm6
1360; SSE-NEXT:    packuswb %xmm7, %xmm6
1361; SSE-NEXT:    pand %xmm8, %xmm5
1362; SSE-NEXT:    pand %xmm8, %xmm4
1363; SSE-NEXT:    packuswb %xmm5, %xmm4
1364; SSE-NEXT:    packuswb %xmm6, %xmm4
1365; SSE-NEXT:    pand %xmm8, %xmm3
1366; SSE-NEXT:    pand %xmm8, %xmm2
1367; SSE-NEXT:    packuswb %xmm3, %xmm2
1368; SSE-NEXT:    pand %xmm8, %xmm1
1369; SSE-NEXT:    pand %xmm8, %xmm0
1370; SSE-NEXT:    packuswb %xmm1, %xmm0
1371; SSE-NEXT:    packuswb %xmm2, %xmm0
1372; SSE-NEXT:    packuswb %xmm4, %xmm0
1373; SSE-NEXT:    retq
1374;
1375; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1376; AVX1:       # BB#0:
1377; AVX1-NEXT:    movl $1, %eax
1378; AVX1-NEXT:    vmovq %rax, %xmm4
1379; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
1380; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
1381; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1382; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
1383; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm5
1384; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1385; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
1386; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm6
1387; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1388; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm2
1389; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm7
1390; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1391; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm3
1392; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1393; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
1394; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
1395; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm3
1396; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1397; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
1398; AVX1-NEXT:    vpackuswb %xmm2, %xmm6, %xmm2
1399; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
1400; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
1401; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
1402; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
1403; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1404; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
1405; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
1406; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1407; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1408; AVX1-NEXT:    vzeroupper
1409; AVX1-NEXT:    retq
1410;
1411; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
1412; AVX2:       # BB#0:
1413; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
1414; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
1415; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm3, %ymm3
1416; AVX2-NEXT:    vpsubq {{.*}}(%rip), %ymm2, %ymm2
1417; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
1418; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
1419; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
1420; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
1421; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1422; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1423; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1424; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1425; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1426; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
1427; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1428; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1429; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
1430; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
1431; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1432; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1433; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1434; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
1435; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1436; AVX2-NEXT:    vzeroupper
1437; AVX2-NEXT:    retq
1438;
1439; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1440; AVX512:       # BB#0:
1441; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
1442; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
1443; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1444; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
1445; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1446; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1447; AVX512-NEXT:    retq
1448  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1449  %2 = trunc <16 x i64> %1 to <16 x i8>
1450  ret <16 x i8> %2
1451}
1452
; Subtracts the constant vector <0, 1, ..., 15> from a <16 x i32> argument and
; truncates the difference to <16 x i8>.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py and record the llc output expected for each
; RUN configuration in the file header (SSE2, AVX1, AVX2, AVX-512).
; In all recorded outputs the subtraction is done at 32-bit element width
; (psubd / vpsubd against a RIP-relative constant) before the narrowing
; sequence (pand + packuswb, vpshufb, or vpmovdb).
1453define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1454; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1455; SSE:       # BB#0:
1456; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
1457; SSE-NEXT:    psubd {{.*}}(%rip), %xmm1
1458; SSE-NEXT:    psubd {{.*}}(%rip), %xmm2
1459; SSE-NEXT:    psubd {{.*}}(%rip), %xmm3
1460; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1461; SSE-NEXT:    pand %xmm4, %xmm3
1462; SSE-NEXT:    pand %xmm4, %xmm2
1463; SSE-NEXT:    packuswb %xmm3, %xmm2
1464; SSE-NEXT:    pand %xmm4, %xmm1
1465; SSE-NEXT:    pand %xmm4, %xmm0
1466; SSE-NEXT:    packuswb %xmm1, %xmm0
1467; SSE-NEXT:    packuswb %xmm2, %xmm0
1468; SSE-NEXT:    retq
1469;
1470; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1471; AVX1:       # BB#0:
1472; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm2
1473; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1474; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
1475; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm3
1476; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1477; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm1
1478; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1479; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
1480; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
1481; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
1482; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1483; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1484; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
1485; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1486; AVX1-NEXT:    vzeroupper
1487; AVX1-NEXT:    retq
1488;
1489; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1490; AVX2:       # BB#0:
1491; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
1492; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm1, %ymm1
1493; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1494; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1495; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1496; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1497; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1498; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1499; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1500; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1501; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1502; AVX2-NEXT:    vzeroupper
1503; AVX2-NEXT:    retq
1504;
1505; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1506; AVX512:       # BB#0:
1507; AVX512-NEXT:    vpsubd {{.*}}(%rip), %zmm0, %zmm0
1508; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1509; AVX512-NEXT:    retq
1510  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1511  %2 = trunc <16 x i32> %1 to <16 x i8>
1512  ret <16 x i8> %2
1513}
1514
; Subtracts the constant vector <0, 1, ..., 15> from a <16 x i16> argument and
; truncates the difference to <16 x i8>.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py. This test has separate expectations for the
; two AVX-512 RUN configurations: with avx512f the recorded output widens the
; words to dwords (vpmovsxwd) and narrows with vpmovdb, while with avx512bw it
; narrows directly with vpmovwb.
1515define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1516; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1517; SSE:       # BB#0:
1518; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
1519; SSE-NEXT:    psubw {{.*}}(%rip), %xmm1
1520; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1521; SSE-NEXT:    pand %xmm2, %xmm1
1522; SSE-NEXT:    pand %xmm2, %xmm0
1523; SSE-NEXT:    packuswb %xmm1, %xmm0
1524; SSE-NEXT:    retq
1525;
1526; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1527; AVX1:       # BB#0:
1528; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm1
1529; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1530; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1531; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1532; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1533; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1534; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1535; AVX1-NEXT:    vzeroupper
1536; AVX1-NEXT:    retq
1537;
1538; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1539; AVX2:       # BB#0:
1540; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
1541; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1542; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1543; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1544; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1545; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1546; AVX2-NEXT:    vzeroupper
1547; AVX2-NEXT:    retq
1548;
1549; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1550; AVX512F:       # BB#0:
1551; AVX512F-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
1552; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
1553; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1554; AVX512F-NEXT:    retq
1555;
1556; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1557; AVX512BW:       # BB#0:
1558; AVX512BW-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
1559; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1560; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1561; AVX512BW-NEXT:    retq
1562  %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1563  %2 = trunc <16 x i16> %1 to <16 x i8>
1564  ret <16 x i8> %2
1565}
1566
1567;
1568; mul
1569;
1570
; Multiplies two <4 x i64> arguments lane-wise and truncates the product to
; <4 x i32>.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py and record the llc output expected for each
; RUN configuration in the file header.
; In every recorded output the 64-bit multiply is expanded into three
; pmuludq / vpmuludq partial products combined with 32-bit shifts and adds.
; NOTE(review): the two cross products are shifted left by 32 before being
; added, so they cannot change the low 32 bits that the trunc keeps; the
; expectations therefore capture codegen that does not narrow the multiply.
; Regenerate the assertions if that lowering changes.
1571define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1572; SSE-LABEL: trunc_mul_v4i64_4i32:
1573; SSE:       # BB#0:
1574; SSE-NEXT:    movdqa %xmm0, %xmm4
1575; SSE-NEXT:    pmuludq %xmm2, %xmm4
1576; SSE-NEXT:    movdqa %xmm2, %xmm5
1577; SSE-NEXT:    psrlq $32, %xmm5
1578; SSE-NEXT:    pmuludq %xmm0, %xmm5
1579; SSE-NEXT:    psllq $32, %xmm5
1580; SSE-NEXT:    paddq %xmm4, %xmm5
1581; SSE-NEXT:    psrlq $32, %xmm0
1582; SSE-NEXT:    pmuludq %xmm2, %xmm0
1583; SSE-NEXT:    psllq $32, %xmm0
1584; SSE-NEXT:    paddq %xmm5, %xmm0
1585; SSE-NEXT:    movdqa %xmm1, %xmm2
1586; SSE-NEXT:    pmuludq %xmm3, %xmm2
1587; SSE-NEXT:    movdqa %xmm3, %xmm4
1588; SSE-NEXT:    psrlq $32, %xmm4
1589; SSE-NEXT:    pmuludq %xmm1, %xmm4
1590; SSE-NEXT:    psllq $32, %xmm4
1591; SSE-NEXT:    paddq %xmm2, %xmm4
1592; SSE-NEXT:    psrlq $32, %xmm1
1593; SSE-NEXT:    pmuludq %xmm3, %xmm1
1594; SSE-NEXT:    psllq $32, %xmm1
1595; SSE-NEXT:    paddq %xmm4, %xmm1
1596; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1597; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1598; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1599; SSE-NEXT:    retq
1600;
1601; AVX1-LABEL: trunc_mul_v4i64_4i32:
1602; AVX1:       # BB#0:
1603; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
1604; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
1605; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
1606; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
1607; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
1608; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
1609; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm3
1610; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
1611; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
1612; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1613; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1614; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
1615; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
1616; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
1617; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
1618; AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
1619; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
1620; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
1621; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
1622; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
1623; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
1624; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1625; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1626; AVX1-NEXT:    vzeroupper
1627; AVX1-NEXT:    retq
1628;
1629; AVX2-LABEL: trunc_mul_v4i64_4i32:
1630; AVX2:       # BB#0:
1631; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
1632; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
1633; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
1634; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
1635; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
1636; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
1637; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1638; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
1639; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
1640; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1641; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1642; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1643; AVX2-NEXT:    vzeroupper
1644; AVX2-NEXT:    retq
1645;
1646; AVX512-LABEL: trunc_mul_v4i64_4i32:
1647; AVX512:       # BB#0:
1648; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
1649; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm3
1650; AVX512-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
1651; AVX512-NEXT:    vpsllq $32, %ymm3, %ymm3
1652; AVX512-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
1653; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
1654; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1655; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
1656; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
1657; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1658; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1659; AVX512-NEXT:    retq
1660  %1 = mul <4 x i64> %a0, %a1
1661  %2 = trunc <4 x i64> %1 to <4 x i32>
1662  ret <4 x i32> %2
1663}
1664
; Multiplies two <8 x i64> arguments lane-wise and truncates the product to
; <8 x i16>.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py and record the llc output expected for each
; RUN configuration in the file header.
; In every recorded output the 64-bit multiply is expanded into three
; pmuludq / vpmuludq partial products combined with 32-bit shifts and adds,
; followed by a per-configuration narrowing sequence (pextrw + punpcklwd,
; vpblendw + vpackusdw, vpshufd + vpermq + vpshufb, or vpmovqw).
; NOTE(review): the cross products are shifted left by 32 before being added,
; so they cannot change the low 16 bits that the trunc keeps; the expectations
; therefore capture codegen that does not narrow the multiply. Regenerate the
; assertions if that lowering changes.
1665define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1666; SSE-LABEL: trunc_mul_v8i64_8i16:
1667; SSE:       # BB#0:
1668; SSE-NEXT:    movdqa %xmm2, %xmm8
1669; SSE-NEXT:    pmuludq %xmm6, %xmm8
1670; SSE-NEXT:    movdqa %xmm6, %xmm9
1671; SSE-NEXT:    psrlq $32, %xmm9
1672; SSE-NEXT:    pmuludq %xmm2, %xmm9
1673; SSE-NEXT:    psllq $32, %xmm9
1674; SSE-NEXT:    paddq %xmm8, %xmm9
1675; SSE-NEXT:    psrlq $32, %xmm2
1676; SSE-NEXT:    pmuludq %xmm6, %xmm2
1677; SSE-NEXT:    psllq $32, %xmm2
1678; SSE-NEXT:    paddq %xmm9, %xmm2
1679; SSE-NEXT:    movdqa %xmm0, %xmm8
1680; SSE-NEXT:    pmuludq %xmm4, %xmm8
1681; SSE-NEXT:    movdqa %xmm4, %xmm6
1682; SSE-NEXT:    psrlq $32, %xmm6
1683; SSE-NEXT:    pmuludq %xmm0, %xmm6
1684; SSE-NEXT:    psllq $32, %xmm6
1685; SSE-NEXT:    paddq %xmm8, %xmm6
1686; SSE-NEXT:    psrlq $32, %xmm0
1687; SSE-NEXT:    pmuludq %xmm4, %xmm0
1688; SSE-NEXT:    psllq $32, %xmm0
1689; SSE-NEXT:    paddq %xmm6, %xmm0
1690; SSE-NEXT:    movdqa %xmm3, %xmm4
1691; SSE-NEXT:    pmuludq %xmm7, %xmm4
1692; SSE-NEXT:    movdqa %xmm7, %xmm6
1693; SSE-NEXT:    psrlq $32, %xmm6
1694; SSE-NEXT:    pmuludq %xmm3, %xmm6
1695; SSE-NEXT:    psllq $32, %xmm6
1696; SSE-NEXT:    paddq %xmm4, %xmm6
1697; SSE-NEXT:    psrlq $32, %xmm3
1698; SSE-NEXT:    pmuludq %xmm7, %xmm3
1699; SSE-NEXT:    psllq $32, %xmm3
1700; SSE-NEXT:    paddq %xmm6, %xmm3
1701; SSE-NEXT:    movdqa %xmm1, %xmm4
1702; SSE-NEXT:    pmuludq %xmm5, %xmm4
1703; SSE-NEXT:    movdqa %xmm5, %xmm6
1704; SSE-NEXT:    psrlq $32, %xmm6
1705; SSE-NEXT:    pmuludq %xmm1, %xmm6
1706; SSE-NEXT:    psllq $32, %xmm6
1707; SSE-NEXT:    paddq %xmm4, %xmm6
1708; SSE-NEXT:    psrlq $32, %xmm1
1709; SSE-NEXT:    pmuludq %xmm5, %xmm1
1710; SSE-NEXT:    psllq $32, %xmm1
1711; SSE-NEXT:    paddq %xmm6, %xmm1
1712; SSE-NEXT:    pextrw $4, %xmm1, %eax
1713; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
1714; SSE-NEXT:    pextrw $4, %xmm0, %ecx
1715; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1716; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1717; SSE-NEXT:    pextrw $4, %xmm3, %edx
1718; SSE-NEXT:    movd %edx, %xmm1
1719; SSE-NEXT:    movd %eax, %xmm3
1720; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1721; SSE-NEXT:    pextrw $4, %xmm2, %eax
1722; SSE-NEXT:    movd %eax, %xmm1
1723; SSE-NEXT:    movd %ecx, %xmm2
1724; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1725; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1726; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1727; SSE-NEXT:    retq
1728;
1729; AVX1-LABEL: trunc_mul_v8i64_8i16:
1730; AVX1:       # BB#0:
1731; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm4
1732; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm5
1733; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm5
1734; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
1735; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
1736; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm5
1737; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm5
1738; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
1739; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
1740; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1741; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1742; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm5
1743; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
1744; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
1745; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
1746; AVX1-NEXT:    vpaddq %xmm6, %xmm5, %xmm5
1747; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
1748; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
1749; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
1750; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
1751; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm2
1752; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
1753; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm5
1754; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
1755; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
1756; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm5
1757; AVX1-NEXT:    vpmuludq %xmm3, %xmm5, %xmm5
1758; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
1759; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
1760; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1761; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1762; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm5
1763; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm6
1764; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm6
1765; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
1766; AVX1-NEXT:    vpaddq %xmm6, %xmm5, %xmm5
1767; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
1768; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
1769; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
1770; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
1771; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
1772; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
1773; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
1774; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
1775; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
1776; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
1777; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
1778; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1779; AVX1-NEXT:    vzeroupper
1780; AVX1-NEXT:    retq
1781;
1782; AVX2-LABEL: trunc_mul_v8i64_8i16:
1783; AVX2:       # BB#0:
1784; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm4
1785; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm5
1786; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm5
1787; AVX2-NEXT:    vpsllq $32, %ymm5, %ymm5
1788; AVX2-NEXT:    vpaddq %ymm5, %ymm4, %ymm4
1789; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
1790; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
1791; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
1792; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
1793; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm3
1794; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm4
1795; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
1796; AVX2-NEXT:    vpsllq $32, %ymm4, %ymm4
1797; AVX2-NEXT:    vpaddq %ymm4, %ymm3, %ymm3
1798; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
1799; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
1800; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
1801; AVX2-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
1802; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
1803; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
1804; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
1805; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
1806; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1807; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1808; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1809; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1810; AVX2-NEXT:    vzeroupper
1811; AVX2-NEXT:    retq
1812;
1813; AVX512-LABEL: trunc_mul_v8i64_8i16:
1814; AVX512:       # BB#0:
1815; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2
1816; AVX512-NEXT:    vpsrlq $32, %zmm1, %zmm3
1817; AVX512-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
1818; AVX512-NEXT:    vpsllq $32, %zmm3, %zmm3
1819; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
1820; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
1821; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
1822; AVX512-NEXT:    vpsllq $32, %zmm0, %zmm0
1823; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
1824; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
1825; AVX512-NEXT:    retq
1826  %1 = mul <8 x i64> %a0, %a1
1827  %2 = trunc <8 x i64> %1 to <8 x i16>
1828  ret <8 x i16> %2
1829}
1830
; Multiplies two <8 x i32> arguments lane-wise and truncates the product to
; <8 x i16>.
; The assertion lines inside the function are autogenerated by
; utils/update_llc_test_checks.py and record the llc output expected for each
; RUN configuration in the file header.
; The SSE2 expectations build the 32-bit products from pmuludq on shuffled
; lane pairs and narrow with pslld/psrad + packssdw; the AVX1, AVX2 and
; AVX-512 expectations use vpmulld directly and then narrow with vpshufb
; (AVX1/AVX2) or vpmovdw (AVX-512).
1831define <8 x i16> @trunc_mul_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1832; SSE-LABEL: trunc_mul_v8i32_8i16:
1833; SSE:       # BB#0:
1834; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1835; SSE-NEXT:    pmuludq %xmm2, %xmm0
1836; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1837; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1838; SSE-NEXT:    pmuludq %xmm4, %xmm2
1839; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1840; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1841; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1842; SSE-NEXT:    pmuludq %xmm3, %xmm1
1843; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1844; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1845; SSE-NEXT:    pmuludq %xmm2, %xmm3
1846; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1847; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1848; SSE-NEXT:    pslld $16, %xmm1
1849; SSE-NEXT:    psrad $16, %xmm1
1850; SSE-NEXT:    pslld $16, %xmm0
1851; SSE-NEXT:    psrad $16, %xmm0
1852; SSE-NEXT:    packssdw %xmm1, %xmm0
1853; SSE-NEXT:    retq
1854;
1855; AVX1-LABEL: trunc_mul_v8i32_8i16:
1856; AVX1:       # BB#0:
1857; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm2
1858; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1859; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1860; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1861; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1862; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1863; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
1864; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1865; AVX1-NEXT:    vzeroupper
1866; AVX1-NEXT:    retq
1867;
1868; AVX2-LABEL: trunc_mul_v8i32_8i16:
1869; AVX2:       # BB#0:
1870; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1871; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1872; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1873; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1874; AVX2-NEXT:    vzeroupper
1875; AVX2-NEXT:    retq
1876;
1877; AVX512-LABEL: trunc_mul_v8i32_8i16:
1878; AVX512:       # BB#0:
1879; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1880; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1881; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1882; AVX512-NEXT:    retq
1883  %1 = mul <8 x i32> %a0, %a1
1884  %2 = trunc <8 x i32> %1 to <8 x i16>
1885  ret <8 x i16> %2
1886}
1887
1888define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1889; SSE-LABEL: trunc_mul_v16i64_v16i8:
1890; SSE:       # BB#0:
1891; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1892; SSE-NEXT:    movdqa %xmm0, %xmm9
1893; SSE-NEXT:    pmuludq %xmm8, %xmm9
1894; SSE-NEXT:    movdqa %xmm8, %xmm10
1895; SSE-NEXT:    psrlq $32, %xmm10
1896; SSE-NEXT:    pmuludq %xmm0, %xmm10
1897; SSE-NEXT:    psllq $32, %xmm10
1898; SSE-NEXT:    paddq %xmm10, %xmm9
1899; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
1900; SSE-NEXT:    psrlq $32, %xmm0
1901; SSE-NEXT:    pmuludq %xmm8, %xmm0
1902; SSE-NEXT:    psllq $32, %xmm0
1903; SSE-NEXT:    paddq %xmm9, %xmm0
1904; SSE-NEXT:    movdqa %xmm1, %xmm8
1905; SSE-NEXT:    pmuludq %xmm10, %xmm8
1906; SSE-NEXT:    movdqa %xmm10, %xmm9
1907; SSE-NEXT:    psrlq $32, %xmm9
1908; SSE-NEXT:    pmuludq %xmm1, %xmm9
1909; SSE-NEXT:    psllq $32, %xmm9
1910; SSE-NEXT:    paddq %xmm8, %xmm9
1911; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1912; SSE-NEXT:    psrlq $32, %xmm1
1913; SSE-NEXT:    pmuludq %xmm10, %xmm1
1914; SSE-NEXT:    psllq $32, %xmm1
1915; SSE-NEXT:    paddq %xmm9, %xmm1
1916; SSE-NEXT:    movdqa %xmm2, %xmm9
1917; SSE-NEXT:    pmuludq %xmm8, %xmm9
1918; SSE-NEXT:    movdqa %xmm8, %xmm10
1919; SSE-NEXT:    psrlq $32, %xmm10
1920; SSE-NEXT:    pmuludq %xmm2, %xmm10
1921; SSE-NEXT:    psllq $32, %xmm10
1922; SSE-NEXT:    paddq %xmm9, %xmm10
1923; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1924; SSE-NEXT:    psrlq $32, %xmm2
1925; SSE-NEXT:    pmuludq %xmm8, %xmm2
1926; SSE-NEXT:    psllq $32, %xmm2
1927; SSE-NEXT:    paddq %xmm10, %xmm2
1928; SSE-NEXT:    movdqa %xmm3, %xmm8
1929; SSE-NEXT:    pmuludq %xmm9, %xmm8
1930; SSE-NEXT:    movdqa %xmm9, %xmm10
1931; SSE-NEXT:    psrlq $32, %xmm10
1932; SSE-NEXT:    pmuludq %xmm3, %xmm10
1933; SSE-NEXT:    psllq $32, %xmm10
1934; SSE-NEXT:    paddq %xmm8, %xmm10
1935; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1936; SSE-NEXT:    psrlq $32, %xmm3
1937; SSE-NEXT:    pmuludq %xmm9, %xmm3
1938; SSE-NEXT:    psllq $32, %xmm3
1939; SSE-NEXT:    paddq %xmm10, %xmm3
1940; SSE-NEXT:    movdqa %xmm4, %xmm9
1941; SSE-NEXT:    pmuludq %xmm8, %xmm9
1942; SSE-NEXT:    movdqa %xmm8, %xmm10
1943; SSE-NEXT:    psrlq $32, %xmm10
1944; SSE-NEXT:    pmuludq %xmm4, %xmm10
1945; SSE-NEXT:    psllq $32, %xmm10
1946; SSE-NEXT:    paddq %xmm9, %xmm10
1947; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1948; SSE-NEXT:    psrlq $32, %xmm4
1949; SSE-NEXT:    pmuludq %xmm8, %xmm4
1950; SSE-NEXT:    psllq $32, %xmm4
1951; SSE-NEXT:    paddq %xmm10, %xmm4
1952; SSE-NEXT:    movdqa %xmm5, %xmm8
1953; SSE-NEXT:    pmuludq %xmm9, %xmm8
1954; SSE-NEXT:    movdqa %xmm9, %xmm10
1955; SSE-NEXT:    psrlq $32, %xmm10
1956; SSE-NEXT:    pmuludq %xmm5, %xmm10
1957; SSE-NEXT:    psllq $32, %xmm10
1958; SSE-NEXT:    paddq %xmm8, %xmm10
1959; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
1960; SSE-NEXT:    psrlq $32, %xmm5
1961; SSE-NEXT:    pmuludq %xmm9, %xmm5
1962; SSE-NEXT:    psllq $32, %xmm5
1963; SSE-NEXT:    paddq %xmm10, %xmm5
1964; SSE-NEXT:    movdqa %xmm6, %xmm9
1965; SSE-NEXT:    pmuludq %xmm8, %xmm9
1966; SSE-NEXT:    movdqa %xmm8, %xmm10
1967; SSE-NEXT:    psrlq $32, %xmm10
1968; SSE-NEXT:    pmuludq %xmm6, %xmm10
1969; SSE-NEXT:    psllq $32, %xmm10
1970; SSE-NEXT:    paddq %xmm9, %xmm10
1971; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
1972; SSE-NEXT:    psrlq $32, %xmm6
1973; SSE-NEXT:    pmuludq %xmm8, %xmm6
1974; SSE-NEXT:    psllq $32, %xmm6
1975; SSE-NEXT:    paddq %xmm10, %xmm6
1976; SSE-NEXT:    movdqa %xmm7, %xmm8
1977; SSE-NEXT:    pmuludq %xmm9, %xmm8
1978; SSE-NEXT:    movdqa %xmm9, %xmm10
1979; SSE-NEXT:    psrlq $32, %xmm10
1980; SSE-NEXT:    pmuludq %xmm7, %xmm10
1981; SSE-NEXT:    psllq $32, %xmm10
1982; SSE-NEXT:    paddq %xmm8, %xmm10
1983; SSE-NEXT:    psrlq $32, %xmm7
1984; SSE-NEXT:    pmuludq %xmm9, %xmm7
1985; SSE-NEXT:    psllq $32, %xmm7
1986; SSE-NEXT:    paddq %xmm10, %xmm7
1987; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1988; SSE-NEXT:    pand %xmm8, %xmm7
1989; SSE-NEXT:    pand %xmm8, %xmm6
1990; SSE-NEXT:    packuswb %xmm7, %xmm6
1991; SSE-NEXT:    pand %xmm8, %xmm5
1992; SSE-NEXT:    pand %xmm8, %xmm4
1993; SSE-NEXT:    packuswb %xmm5, %xmm4
1994; SSE-NEXT:    packuswb %xmm6, %xmm4
1995; SSE-NEXT:    pand %xmm8, %xmm3
1996; SSE-NEXT:    pand %xmm8, %xmm2
1997; SSE-NEXT:    packuswb %xmm3, %xmm2
1998; SSE-NEXT:    pand %xmm8, %xmm1
1999; SSE-NEXT:    pand %xmm8, %xmm0
2000; SSE-NEXT:    packuswb %xmm1, %xmm0
2001; SSE-NEXT:    packuswb %xmm2, %xmm0
2002; SSE-NEXT:    packuswb %xmm4, %xmm0
2003; SSE-NEXT:    retq
2004;
2005; AVX1-LABEL: trunc_mul_v16i64_v16i8:
2006; AVX1:       # BB#0:
2007; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm8
2008; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm9
2009; AVX1-NEXT:    vpmuludq %xmm9, %xmm0, %xmm9
2010; AVX1-NEXT:    vpsllq $32, %xmm9, %xmm9
2011; AVX1-NEXT:    vpaddq %xmm9, %xmm8, %xmm8
2012; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm9
2013; AVX1-NEXT:    vpmuludq %xmm4, %xmm9, %xmm9
2014; AVX1-NEXT:    vpsllq $32, %xmm9, %xmm9
2015; AVX1-NEXT:    vpaddq %xmm9, %xmm8, %xmm8
2016; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm10
2017; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2018; AVX1-NEXT:    vpmuludq %xmm10, %xmm0, %xmm9
2019; AVX1-NEXT:    vpsrlq $32, %xmm10, %xmm4
2020; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
2021; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2022; AVX1-NEXT:    vpaddq %xmm4, %xmm9, %xmm4
2023; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
2024; AVX1-NEXT:    vpmuludq %xmm10, %xmm0, %xmm0
2025; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
2026; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm9
2027; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm4
2028; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm0
2029; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
2030; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
2031; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
2032; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
2033; AVX1-NEXT:    vpmuludq %xmm5, %xmm4, %xmm4
2034; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2035; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm10
2036; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm0
2037; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2038; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm5
2039; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm4
2040; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
2041; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2042; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
2043; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
2044; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
2045; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
2046; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm1
2047; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm0
2048; AVX1-NEXT:    vpsrlq $32, %xmm6, %xmm4
2049; AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
2050; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2051; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
2052; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
2053; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm4
2054; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2055; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm5
2056; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
2057; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2058; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm4
2059; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
2060; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm6
2061; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
2062; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm4
2063; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
2064; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm0
2065; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
2066; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
2067; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm2
2068; AVX1-NEXT:    vpsrlq $32, %xmm7, %xmm4
2069; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
2070; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2071; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
2072; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm4
2073; AVX1-NEXT:    vpmuludq %xmm7, %xmm4, %xmm4
2074; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2075; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
2076; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm4
2077; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2078; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm6
2079; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm7
2080; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm7
2081; AVX1-NEXT:    vpsllq $32, %xmm7, %xmm7
2082; AVX1-NEXT:    vpaddq %xmm7, %xmm6, %xmm6
2083; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm3
2084; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm3
2085; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
2086; AVX1-NEXT:    vpaddq %xmm3, %xmm6, %xmm3
2087; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2088; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2089; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2090; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
2091; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2092; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
2093; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
2094; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2095; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2096; AVX1-NEXT:    vpand %xmm4, %xmm10, %xmm2
2097; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
2098; AVX1-NEXT:    vpand %xmm4, %xmm9, %xmm2
2099; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
2100; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
2101; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
2102; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2103; AVX1-NEXT:    vzeroupper
2104; AVX1-NEXT:    retq
2105;
2106; AVX2-LABEL: trunc_mul_v16i64_v16i8:
2107; AVX2:       # BB#0:
2108; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm8
2109; AVX2-NEXT:    vpsrlq $32, %ymm5, %ymm9
2110; AVX2-NEXT:    vpmuludq %ymm9, %ymm1, %ymm9
2111; AVX2-NEXT:    vpsllq $32, %ymm9, %ymm9
2112; AVX2-NEXT:    vpaddq %ymm9, %ymm8, %ymm8
2113; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
2114; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm1
2115; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
2116; AVX2-NEXT:    vpaddq %ymm1, %ymm8, %ymm1
2117; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm5
2118; AVX2-NEXT:    vpsrlq $32, %ymm4, %ymm8
2119; AVX2-NEXT:    vpmuludq %ymm8, %ymm0, %ymm8
2120; AVX2-NEXT:    vpsllq $32, %ymm8, %ymm8
2121; AVX2-NEXT:    vpaddq %ymm8, %ymm5, %ymm5
2122; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
2123; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm0
2124; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
2125; AVX2-NEXT:    vpaddq %ymm0, %ymm5, %ymm0
2126; AVX2-NEXT:    vpmuludq %ymm7, %ymm3, %ymm4
2127; AVX2-NEXT:    vpsrlq $32, %ymm7, %ymm5
2128; AVX2-NEXT:    vpmuludq %ymm5, %ymm3, %ymm5
2129; AVX2-NEXT:    vpsllq $32, %ymm5, %ymm5
2130; AVX2-NEXT:    vpaddq %ymm5, %ymm4, %ymm4
2131; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm3
2132; AVX2-NEXT:    vpmuludq %ymm7, %ymm3, %ymm3
2133; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
2134; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
2135; AVX2-NEXT:    vpmuludq %ymm6, %ymm2, %ymm4
2136; AVX2-NEXT:    vpsrlq $32, %ymm6, %ymm5
2137; AVX2-NEXT:    vpmuludq %ymm5, %ymm2, %ymm5
2138; AVX2-NEXT:    vpsllq $32, %ymm5, %ymm5
2139; AVX2-NEXT:    vpaddq %ymm5, %ymm4, %ymm4
2140; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
2141; AVX2-NEXT:    vpmuludq %ymm6, %ymm2, %ymm2
2142; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
2143; AVX2-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
2144; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
2145; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
2146; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
2147; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
2148; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
2149; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2150; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
2151; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2152; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2153; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
2154; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
2155; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
2156; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
2157; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
2158; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2159; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
2160; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2161; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
2162; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2163; AVX2-NEXT:    vzeroupper
2164; AVX2-NEXT:    retq
2165;
2166; AVX512-LABEL: trunc_mul_v16i64_v16i8:
2167; AVX512:       # BB#0:
2168; AVX512-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
2169; AVX512-NEXT:    vpsrlq $32, %zmm3, %zmm5
2170; AVX512-NEXT:    vpmuludq %zmm5, %zmm1, %zmm5
2171; AVX512-NEXT:    vpsllq $32, %zmm5, %zmm5
2172; AVX512-NEXT:    vpaddq %zmm5, %zmm4, %zmm4
2173; AVX512-NEXT:    vpsrlq $32, %zmm1, %zmm1
2174; AVX512-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
2175; AVX512-NEXT:    vpsllq $32, %zmm1, %zmm1
2176; AVX512-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
2177; AVX512-NEXT:    vpmuludq %zmm2, %zmm0, %zmm3
2178; AVX512-NEXT:    vpsrlq $32, %zmm2, %zmm4
2179; AVX512-NEXT:    vpmuludq %zmm4, %zmm0, %zmm4
2180; AVX512-NEXT:    vpsllq $32, %zmm4, %zmm4
2181; AVX512-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
2182; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
2183; AVX512-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
2184; AVX512-NEXT:    vpsllq $32, %zmm0, %zmm0
2185; AVX512-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
2186; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2187; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
2188; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2189; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2190; AVX512-NEXT:    retq
2191  %1 = mul <16 x i64> %a0, %a1
2192  %2 = trunc <16 x i64> %1 to <16 x i8>
2193  ret <16 x i8> %2
2194}
2195
; trunc_mul_v16i32_v16i8 - multiply two <16 x i32> vectors element-wise and
; truncate the product to <16 x i8>.
;
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
;
; What the recorded output shows for each run configuration
;  - SSE checks    - pmuludq on the even lanes and on the odd lanes (extracted
;                    with pshufd [1,1,3,3]), recombined with punpckldq, then a
;                    pand with a low-byte mask and three packuswb steps.
;  - AVX1 checks   - four 128-bit vpmulld (upper halves via vextractf128), then
;                    the same mask + vpackuswb narrowing.
;  - AVX2 checks   - two 256-bit vpmulld, narrowing done with vpshufb/vpermq.
;  - AVX512 checks - one 512-bit vpmulld followed by vpmovdb.
2196define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2197; SSE-LABEL: trunc_mul_v16i32_v16i8:
2198; SSE:       # BB#0:
2199; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
2200; SSE-NEXT:    pmuludq %xmm4, %xmm0
2201; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2202; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2203; SSE-NEXT:    pmuludq %xmm8, %xmm4
2204; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2205; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2206; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2207; SSE-NEXT:    pmuludq %xmm5, %xmm1
2208; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2209; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2210; SSE-NEXT:    pmuludq %xmm4, %xmm5
2211; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2212; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2213; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2214; SSE-NEXT:    pmuludq %xmm6, %xmm2
2215; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2216; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
2217; SSE-NEXT:    pmuludq %xmm4, %xmm5
2218; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2219; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2220; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2221; SSE-NEXT:    pmuludq %xmm7, %xmm3
2222; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2223; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
2224; SSE-NEXT:    pmuludq %xmm4, %xmm5
2225; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2226; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2227; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2228; SSE-NEXT:    pand %xmm4, %xmm3
2229; SSE-NEXT:    pand %xmm4, %xmm2
2230; SSE-NEXT:    packuswb %xmm3, %xmm2
2231; SSE-NEXT:    pand %xmm4, %xmm1
2232; SSE-NEXT:    pand %xmm4, %xmm0
2233; SSE-NEXT:    packuswb %xmm1, %xmm0
2234; SSE-NEXT:    packuswb %xmm2, %xmm0
2235; SSE-NEXT:    retq
2236;
2237; AVX1-LABEL: trunc_mul_v16i32_v16i8:
2238; AVX1:       # BB#0:
2239; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
2240; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2241; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2242; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
2243; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm2
2244; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2245; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2246; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
2247; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2248; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
2249; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
2250; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
2251; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
2252; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
2253; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
2254; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2255; AVX1-NEXT:    vzeroupper
2256; AVX1-NEXT:    retq
2257;
2258; AVX2-LABEL: trunc_mul_v16i32_v16i8:
2259; AVX2:       # BB#0:
2260; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
2261; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
2262; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2263; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2264; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2265; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2266; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2267; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2268; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2269; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2270; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2271; AVX2-NEXT:    vzeroupper
2272; AVX2-NEXT:    retq
2273;
2274; AVX512-LABEL: trunc_mul_v16i32_v16i8:
2275; AVX512:       # BB#0:
2276; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
2277; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2278; AVX512-NEXT:    retq
; IR under test - the multiply happens at i32 width and the trunc then narrows
; every lane of the product to i8.
2279  %1 = mul <16 x i32> %a0, %a1
2280  %2 = trunc <16 x i32> %1 to <16 x i8>
2281  ret <16 x i8> %2
2282}
2283
; trunc_mul_v16i16_v16i8 - multiply two <16 x i16> vectors element-wise and
; truncate the product to <16 x i8>.
;
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
;
; This is the one function in this part of the file whose 512-bit expectations
; are split per feature set instead of sharing the common AVX512 prefix
;  - AVX512F checks  - vpmullw, then sign-extend to 32-bit lanes (vpmovsxwd)
;                      and narrow with vpmovdb.
;  - AVX512BW checks - vpmullw, then a direct word-to-byte vpmovwb.
; The SSE checks narrow with pand + packuswb; the AVX1/AVX2 checks narrow with
; vpshufb + vpunpcklqdq.
2284define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2285; SSE-LABEL: trunc_mul_v16i16_v16i8:
2286; SSE:       # BB#0:
2287; SSE-NEXT:    pmullw %xmm2, %xmm0
2288; SSE-NEXT:    pmullw %xmm3, %xmm1
2289; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2290; SSE-NEXT:    pand %xmm2, %xmm1
2291; SSE-NEXT:    pand %xmm2, %xmm0
2292; SSE-NEXT:    packuswb %xmm1, %xmm0
2293; SSE-NEXT:    retq
2294;
2295; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2296; AVX1:       # BB#0:
2297; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2298; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2299; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2300; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2301; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2302; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2303; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
2304; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2305; AVX1-NEXT:    vzeroupper
2306; AVX1-NEXT:    retq
2307;
2308; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2309; AVX2:       # BB#0:
2310; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2311; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2312; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2313; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2314; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2315; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2316; AVX2-NEXT:    vzeroupper
2317; AVX2-NEXT:    retq
2318;
2319; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2320; AVX512F:       # BB#0:
2321; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2322; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
2323; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2324; AVX512F-NEXT:    retq
2325;
2326; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2327; AVX512BW:       # BB#0:
2328; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2329; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2330; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2331; AVX512BW-NEXT:    retq
; IR under test - the multiply happens at i16 width and the trunc then narrows
; every lane of the product to i8.
2332  %1 = mul <16 x i16> %a0, %a1
2333  %2 = trunc <16 x i16> %1 to <16 x i8>
2334  ret <16 x i8> %2
2335}
2336
2337;
2338; mul to constant
2339;
2340
; trunc_mul_const_v4i64_4i32 - multiply a <4 x i64> vector by the constant
; vector <0, 1, 2, 3> and truncate the product to <4 x i32>.
;
; NOTE(review): the function name omits the "v" before "4i32", unlike the
; "_v16i8" style used by neighbouring tests. It is left as-is because the
; LABEL checks below embed the name.
;
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
;
; What the recorded output shows
;  - Every configuration expands the i64 multiply into pmuludq / psrlq $32 /
;    psllq $32 / paddq sequences before narrowing.
;  - The SSE and AVX1 checks materialise the <0, 1> half of the constant with
;    movl $1 + movd/vmovq + pslldq and load the <2, 3> half as a vector
;    constant; the AVX2 and AVX512 checks load the whole [0,1,2,3] constant.
2341define <4 x i32> @trunc_mul_const_v4i64_4i32(<4 x i64> %a0) nounwind {
2342; SSE-LABEL: trunc_mul_const_v4i64_4i32:
2343; SSE:       # BB#0:
2344; SSE-NEXT:    movl $1, %eax
2345; SSE-NEXT:    movd %rax, %xmm2
2346; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
2347; SSE-NEXT:    movdqa %xmm0, %xmm3
2348; SSE-NEXT:    pmuludq %xmm2, %xmm3
2349; SSE-NEXT:    psrlq $32, %xmm0
2350; SSE-NEXT:    pmuludq %xmm2, %xmm0
2351; SSE-NEXT:    psllq $32, %xmm0
2352; SSE-NEXT:    paddq %xmm3, %xmm0
2353; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,3]
2354; SSE-NEXT:    movdqa %xmm1, %xmm3
2355; SSE-NEXT:    pmuludq %xmm2, %xmm3
2356; SSE-NEXT:    psrlq $32, %xmm1
2357; SSE-NEXT:    pmuludq %xmm2, %xmm1
2358; SSE-NEXT:    psllq $32, %xmm1
2359; SSE-NEXT:    paddq %xmm3, %xmm1
2360; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2361; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2362; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2363; SSE-NEXT:    retq
2364;
2365; AVX1-LABEL: trunc_mul_const_v4i64_4i32:
2366; AVX1:       # BB#0:
2367; AVX1-NEXT:    movl $1, %eax
2368; AVX1-NEXT:    vmovq %rax, %xmm1
2369; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
2370; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
2371; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
2372; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
2373; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
2374; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
2375; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2376; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3]
2377; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm3
2378; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
2379; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
2380; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
2381; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
2382; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2383; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2384; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2385; AVX1-NEXT:    vzeroupper
2386; AVX1-NEXT:    retq
2387;
2388; AVX2-LABEL: trunc_mul_const_v4i64_4i32:
2389; AVX2:       # BB#0:
2390; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
2391; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
2392; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
2393; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
2394; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
2395; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
2396; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
2397; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
2398; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2399; AVX2-NEXT:    vzeroupper
2400; AVX2-NEXT:    retq
2401;
2402; AVX512-LABEL: trunc_mul_const_v4i64_4i32:
2403; AVX512:       # BB#0:
2404; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
2405; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
2406; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
2407; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
2408; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
2409; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
2410; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2411; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2412; AVX512-NEXT:    retq
; IR under test - lane i of %a0 is multiplied by the constant i at i64 width,
; then the trunc narrows every lane of the product to i32.
2413  %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2414  %2 = trunc <4 x i64> %1 to <4 x i32>
2415  ret <4 x i32> %2
2416}
2417
; trunc_mul_const_v16i64_v16i16 - multiply a vector of i64 by the constant
; vector <0, 1, ..., 7> and truncate the product to i16 lanes.
;
; NOTE(review): despite the "v16i64_v16i16" in the name, the signature below
; takes <8 x i64> and returns <8 x i16>, so the test covers 8 lanes, not 16.
; The name is left as-is because the LABEL checks below embed it.
;
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
;
; What the recorded output shows
;  - Every configuration expands the i64 multiply into pmuludq / psrlq $32 /
;    psllq $32 / paddq sequences before narrowing.
;  - SSE checks    - narrowing via pextrw/movd/punpcklwd.
;  - AVX1 checks   - vpblendw against a zeroed register, then vpackusdw.
;  - AVX2 checks   - vpshufd/vpermq/vinserti128, then one vpshufb + vpermq.
;  - AVX512 checks - a single vpmovqw after the multiply expansion.
2418define <8 x i16> @trunc_mul_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
2419; SSE-LABEL: trunc_mul_const_v16i64_v16i16:
2420; SSE:       # BB#0:
2421; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [4,5]
2422; SSE-NEXT:    movdqa %xmm2, %xmm5
2423; SSE-NEXT:    pmuludq %xmm4, %xmm5
2424; SSE-NEXT:    psrlq $32, %xmm2
2425; SSE-NEXT:    pmuludq %xmm4, %xmm2
2426; SSE-NEXT:    psllq $32, %xmm2
2427; SSE-NEXT:    paddq %xmm5, %xmm2
2428; SSE-NEXT:    movl $1, %eax
2429; SSE-NEXT:    movd %rax, %xmm4
2430; SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
2431; SSE-NEXT:    movdqa %xmm0, %xmm5
2432; SSE-NEXT:    pmuludq %xmm4, %xmm5
2433; SSE-NEXT:    psrlq $32, %xmm0
2434; SSE-NEXT:    pmuludq %xmm4, %xmm0
2435; SSE-NEXT:    psllq $32, %xmm0
2436; SSE-NEXT:    paddq %xmm5, %xmm0
2437; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [6,7]
2438; SSE-NEXT:    movdqa %xmm3, %xmm5
2439; SSE-NEXT:    pmuludq %xmm4, %xmm5
2440; SSE-NEXT:    psrlq $32, %xmm3
2441; SSE-NEXT:    pmuludq %xmm4, %xmm3
2442; SSE-NEXT:    psllq $32, %xmm3
2443; SSE-NEXT:    paddq %xmm5, %xmm3
2444; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,3]
2445; SSE-NEXT:    movdqa %xmm1, %xmm5
2446; SSE-NEXT:    pmuludq %xmm4, %xmm5
2447; SSE-NEXT:    psrlq $32, %xmm1
2448; SSE-NEXT:    pmuludq %xmm4, %xmm1
2449; SSE-NEXT:    psllq $32, %xmm1
2450; SSE-NEXT:    paddq %xmm5, %xmm1
2451; SSE-NEXT:    pextrw $4, %xmm1, %eax
2452; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
2453; SSE-NEXT:    pextrw $4, %xmm0, %ecx
2454; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2455; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2456; SSE-NEXT:    pextrw $4, %xmm3, %edx
2457; SSE-NEXT:    movd %edx, %xmm1
2458; SSE-NEXT:    movd %eax, %xmm3
2459; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2460; SSE-NEXT:    pextrw $4, %xmm2, %eax
2461; SSE-NEXT:    movd %eax, %xmm1
2462; SSE-NEXT:    movd %ecx, %xmm2
2463; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2464; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2465; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2466; SSE-NEXT:    retq
2467;
2468; AVX1-LABEL: trunc_mul_const_v16i64_v16i16:
2469; AVX1:       # BB#0:
2470; AVX1-NEXT:    movl $1, %eax
2471; AVX1-NEXT:    vmovq %rax, %xmm2
2472; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
2473; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm3
2474; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm4
2475; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
2476; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
2477; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
2478; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2479; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3]
2480; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm4
2481; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
2482; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
2483; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
2484; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
2485; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5]
2486; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm4
2487; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm5
2488; AVX1-NEXT:    vpmuludq %xmm3, %xmm5, %xmm3
2489; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
2490; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
2491; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2492; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [6,7]
2493; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm5
2494; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
2495; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm1
2496; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
2497; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
2498; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
2499; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
2500; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
2501; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2502; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
2503; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
2504; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
2505; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2506; AVX1-NEXT:    vzeroupper
2507; AVX1-NEXT:    retq
2508;
2509; AVX2-LABEL: trunc_mul_const_v16i64_v16i16:
2510; AVX2:       # BB#0:
2511; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,5,6,7]
2512; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm3
2513; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
2514; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm1
2515; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
2516; AVX2-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
2517; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
2518; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm3
2519; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
2520; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
2521; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
2522; AVX2-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
2523; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
2524; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
2525; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
2526; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
2527; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2528; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
2529; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2530; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2531; AVX2-NEXT:    vzeroupper
2532; AVX2-NEXT:    retq
2533;
2534; AVX512-LABEL: trunc_mul_const_v16i64_v16i16:
2535; AVX512:       # BB#0:
2536; AVX512-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7]
2537; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2
2538; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
2539; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
2540; AVX512-NEXT:    vpsllq $32, %zmm0, %zmm0
2541; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
2542; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2543; AVX512-NEXT:    retq
; IR under test - lane i of %a0 is multiplied by the constant i at i64 width,
; then the trunc narrows every lane of the product to i16.
2544  %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2545  %2 = trunc <8 x i64> %1 to <8 x i16>
2546  ret <8 x i16> %2
2547}
2548
; trunc_mul_const_v16i32_v16i16 - multiply a vector of i32 by the constant
; vector <0, 1, ..., 7> and truncate the product to i16 lanes.
;
; NOTE(review): despite the "v16i32_v16i16" in the name, the signature below
; takes <8 x i32> and returns <8 x i16>, so the test covers 8 lanes, not 16.
; The name is left as-is because the LABEL checks below embed it.
;
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
;
; What the recorded output shows
;  - SSE checks    - pmuludq on even and odd lanes recombined with punpckldq,
;                    then pslld $16 / psrad $16 and packssdw to narrow.
;  - AVX1 checks   - two vpmulld with a RIP-relative constant operand, then
;                    vpshufb + vpunpcklqdq.
;  - AVX2 checks   - one 256-bit vpmulld from memory, then vpshufb + vpermq.
;  - AVX512 checks - one 256-bit vpmulld from memory, then vpmovdw.
2549define <8 x i16> @trunc_mul_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
2550; SSE-LABEL: trunc_mul_const_v16i32_v16i16:
2551; SSE:       # BB#0:
2552; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3]
2553; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2554; SSE-NEXT:    pmuludq %xmm2, %xmm0
2555; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2556; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2557; SSE-NEXT:    pmuludq %xmm3, %xmm2
2558; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2559; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2560; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [4,5,6,7]
2561; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
2562; SSE-NEXT:    pmuludq %xmm2, %xmm1
2563; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2564; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2565; SSE-NEXT:    pmuludq %xmm3, %xmm2
2566; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2567; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2568; SSE-NEXT:    pslld $16, %xmm1
2569; SSE-NEXT:    psrad $16, %xmm1
2570; SSE-NEXT:    pslld $16, %xmm0
2571; SSE-NEXT:    psrad $16, %xmm0
2572; SSE-NEXT:    packssdw %xmm1, %xmm0
2573; SSE-NEXT:    retq
2574;
2575; AVX1-LABEL: trunc_mul_const_v16i32_v16i16:
2576; AVX1:       # BB#0:
2577; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm1
2578; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2579; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2580; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2581; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2582; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2583; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2584; AVX1-NEXT:    vzeroupper
2585; AVX1-NEXT:    retq
2586;
2587; AVX2-LABEL: trunc_mul_const_v16i32_v16i16:
2588; AVX2:       # BB#0:
2589; AVX2-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
2590; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
2591; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2592; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2593; AVX2-NEXT:    vzeroupper
2594; AVX2-NEXT:    retq
2595;
2596; AVX512-LABEL: trunc_mul_const_v16i32_v16i16:
2597; AVX512:       # BB#0:
2598; AVX512-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
2599; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2600; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2601; AVX512-NEXT:    retq
; IR under test - lane i of %a0 is multiplied by the constant i at i32 width,
; then the trunc narrows every lane of the product to i16.
2602  %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2603  %2 = trunc <8 x i32> %1 to <8 x i16>
2604  ret <8 x i16> %2
2605}
2606
; trunc_mul_const_v16i64_v16i8 - multiply a <16 x i64> vector by the constant
; vector <0, 1, ..., 15> and truncate the product to <16 x i8>.
;
; The assertions below are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
;
; What the recorded output shows
;  - Every configuration expands the i64 multiply into pmuludq / psrlq $32 /
;    psllq $32 / paddq sequences, one per vector register of input, before
;    narrowing.
;  - SSE and AVX1 checks - the <0, 1> constant is materialised with movl $1 +
;                          movd/vmovq + pslldq, the remaining pairs are loaded
;                          as vector constants; narrowing is a pand with a
;                          low-byte-per-qword mask followed by a tree of
;                          packuswb.
;  - AVX2 checks         - narrowing via vpshufd/vpermq/vinserti128 and two
;                          rounds of vpshufb.
;  - AVX512 checks       - vpmovqd on each half, vinserti64x4 to rejoin, then
;                          vpmovdb.
2607define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2608; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2609; SSE:       # BB#0:
2610; SSE-NEXT:    movl $1, %eax
2611; SSE-NEXT:    movd %rax, %xmm8
2612; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
2613; SSE-NEXT:    movdqa %xmm0, %xmm9
2614; SSE-NEXT:    pmuludq %xmm8, %xmm9
2615; SSE-NEXT:    psrlq $32, %xmm0
2616; SSE-NEXT:    pmuludq %xmm8, %xmm0
2617; SSE-NEXT:    psllq $32, %xmm0
2618; SSE-NEXT:    paddq %xmm9, %xmm0
2619; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [2,3]
2620; SSE-NEXT:    movdqa %xmm1, %xmm9
2621; SSE-NEXT:    pmuludq %xmm8, %xmm9
2622; SSE-NEXT:    psrlq $32, %xmm1
2623; SSE-NEXT:    pmuludq %xmm8, %xmm1
2624; SSE-NEXT:    psllq $32, %xmm1
2625; SSE-NEXT:    paddq %xmm9, %xmm1
2626; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [4,5]
2627; SSE-NEXT:    movdqa %xmm2, %xmm9
2628; SSE-NEXT:    pmuludq %xmm8, %xmm9
2629; SSE-NEXT:    psrlq $32, %xmm2
2630; SSE-NEXT:    pmuludq %xmm8, %xmm2
2631; SSE-NEXT:    psllq $32, %xmm2
2632; SSE-NEXT:    paddq %xmm9, %xmm2
2633; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [6,7]
2634; SSE-NEXT:    movdqa %xmm3, %xmm9
2635; SSE-NEXT:    pmuludq %xmm8, %xmm9
2636; SSE-NEXT:    psrlq $32, %xmm3
2637; SSE-NEXT:    pmuludq %xmm8, %xmm3
2638; SSE-NEXT:    psllq $32, %xmm3
2639; SSE-NEXT:    paddq %xmm9, %xmm3
2640; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [8,9]
2641; SSE-NEXT:    movdqa %xmm4, %xmm9
2642; SSE-NEXT:    pmuludq %xmm8, %xmm9
2643; SSE-NEXT:    psrlq $32, %xmm4
2644; SSE-NEXT:    pmuludq %xmm8, %xmm4
2645; SSE-NEXT:    psllq $32, %xmm4
2646; SSE-NEXT:    paddq %xmm9, %xmm4
2647; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [10,11]
2648; SSE-NEXT:    movdqa %xmm5, %xmm9
2649; SSE-NEXT:    pmuludq %xmm8, %xmm9
2650; SSE-NEXT:    psrlq $32, %xmm5
2651; SSE-NEXT:    pmuludq %xmm8, %xmm5
2652; SSE-NEXT:    psllq $32, %xmm5
2653; SSE-NEXT:    paddq %xmm9, %xmm5
2654; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [12,13]
2655; SSE-NEXT:    movdqa %xmm6, %xmm9
2656; SSE-NEXT:    pmuludq %xmm8, %xmm9
2657; SSE-NEXT:    psrlq $32, %xmm6
2658; SSE-NEXT:    pmuludq %xmm8, %xmm6
2659; SSE-NEXT:    psllq $32, %xmm6
2660; SSE-NEXT:    paddq %xmm9, %xmm6
2661; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [14,15]
2662; SSE-NEXT:    movdqa %xmm7, %xmm9
2663; SSE-NEXT:    pmuludq %xmm8, %xmm9
2664; SSE-NEXT:    psrlq $32, %xmm7
2665; SSE-NEXT:    pmuludq %xmm8, %xmm7
2666; SSE-NEXT:    psllq $32, %xmm7
2667; SSE-NEXT:    paddq %xmm9, %xmm7
2668; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2669; SSE-NEXT:    pand %xmm8, %xmm7
2670; SSE-NEXT:    pand %xmm8, %xmm6
2671; SSE-NEXT:    packuswb %xmm7, %xmm6
2672; SSE-NEXT:    pand %xmm8, %xmm5
2673; SSE-NEXT:    pand %xmm8, %xmm4
2674; SSE-NEXT:    packuswb %xmm5, %xmm4
2675; SSE-NEXT:    packuswb %xmm6, %xmm4
2676; SSE-NEXT:    pand %xmm8, %xmm3
2677; SSE-NEXT:    pand %xmm8, %xmm2
2678; SSE-NEXT:    packuswb %xmm3, %xmm2
2679; SSE-NEXT:    pand %xmm8, %xmm1
2680; SSE-NEXT:    pand %xmm8, %xmm0
2681; SSE-NEXT:    packuswb %xmm1, %xmm0
2682; SSE-NEXT:    packuswb %xmm2, %xmm0
2683; SSE-NEXT:    packuswb %xmm4, %xmm0
2684; SSE-NEXT:    retq
2685;
2686; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2687; AVX1:       # BB#0:
2688; AVX1-NEXT:    movl $1, %eax
2689; AVX1-NEXT:    vmovq %rax, %xmm4
2690; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
2691; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm5
2692; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
2693; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
2694; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2695; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm8
2696; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2697; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [2,3]
2698; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm6
2699; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
2700; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm0
2701; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
2702; AVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm9
2703; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5]
2704; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm6
2705; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm7
2706; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
2707; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
2708; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
2709; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2710; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7]
2711; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm7
2712; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
2713; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm1
2714; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
2715; AVX1-NEXT:    vpaddq %xmm1, %xmm7, %xmm1
2716; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9]
2717; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm7
2718; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
2719; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm4
2720; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
2721; AVX1-NEXT:    vpaddq %xmm4, %xmm7, %xmm4
2722; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2723; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11]
2724; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm7
2725; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
2726; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm2
2727; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
2728; AVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
2729; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [12,13]
2730; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm7
2731; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm0
2732; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm0
2733; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
2734; AVX1-NEXT:    vpaddq %xmm0, %xmm7, %xmm0
2735; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2736; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [14,15]
2737; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm7
2738; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm3
2739; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm3
2740; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
2741; AVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
2742; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2743; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
2744; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
2745; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
2746; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
2747; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm3
2748; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
2749; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
2750; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
2751; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm2
2752; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
2753; AVX1-NEXT:    vpand %xmm6, %xmm9, %xmm2
2754; AVX1-NEXT:    vpand %xmm6, %xmm8, %xmm3
2755; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
2756; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
2757; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2758; AVX1-NEXT:    vzeroupper
2759; AVX1-NEXT:    retq
2760;
2761; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
2762; AVX2:       # BB#0:
2763; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,5,6,7]
2764; AVX2-NEXT:    vpmuludq %ymm4, %ymm1, %ymm5
2765; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
2766; AVX2-NEXT:    vpmuludq %ymm4, %ymm1, %ymm1
2767; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
2768; AVX2-NEXT:    vpaddq %ymm1, %ymm5, %ymm1
2769; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,2,3]
2770; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm5
2771; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
2772; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm0
2773; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
2774; AVX2-NEXT:    vpaddq %ymm0, %ymm5, %ymm0
2775; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [12,13,14,15]
2776; AVX2-NEXT:    vpmuludq %ymm4, %ymm3, %ymm5
2777; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm3
2778; AVX2-NEXT:    vpmuludq %ymm4, %ymm3, %ymm3
2779; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
2780; AVX2-NEXT:    vpaddq %ymm3, %ymm5, %ymm3
2781; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,9,10,11]
2782; AVX2-NEXT:    vpmuludq %ymm4, %ymm2, %ymm5
2783; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
2784; AVX2-NEXT:    vpmuludq %ymm4, %ymm2, %ymm2
2785; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
2786; AVX2-NEXT:    vpaddq %ymm2, %ymm5, %ymm2
2787; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
2788; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
2789; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
2790; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
2791; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
2792; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2793; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
2794; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2795; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2796; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
2797; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
2798; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
2799; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
2800; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
2801; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2802; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
2803; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2804; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
2805; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2806; AVX2-NEXT:    vzeroupper
2807; AVX2-NEXT:    retq
2808;
2809; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
2810; AVX512:       # BB#0:
2811; AVX512-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15]
2812; AVX512-NEXT:    vpmuludq %zmm2, %zmm1, %zmm3
2813; AVX512-NEXT:    vpsrlq $32, %zmm1, %zmm1
2814; AVX512-NEXT:    vpmuludq %zmm2, %zmm1, %zmm1
2815; AVX512-NEXT:    vpsllq $32, %zmm1, %zmm1
2816; AVX512-NEXT:    vpaddq %zmm1, %zmm3, %zmm1
2817; AVX512-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7]
2818; AVX512-NEXT:    vpmuludq %zmm2, %zmm0, %zmm3
2819; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
2820; AVX512-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
2821; AVX512-NEXT:    vpsllq $32, %zmm0, %zmm0
2822; AVX512-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
2823; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2824; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
2825; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2826; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2827; AVX512-NEXT:    retq
; IR under test - lane i of %a0 is multiplied by the constant i at i64 width,
; then the trunc narrows every lane of the product to i8.
2828  %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2829  %2 = trunc <16 x i64> %1 to <16 x i8>
2830  ret <16 x i8> %2
2831}
2832
; Test: multiply a <16 x i32> argument by the constant vector <0, 1, ..., 15>
; and truncate the product to <16 x i8>.
; The check lines below hold the expected llc output for the RUN
; configurations listed at the top of the file. They are autogenerated (see
; the NOTE on the first line of the file), so regenerate them rather than
; editing them by hand.
; Expected shapes recorded below:
;   - SSE2 run: each 32-bit multiply is assembled from two pmuludq results
;     plus shuffles; the narrowing is done by masking and packuswb.
;   - AVX1 run: vpmulld on 128-bit halves, then mask and vpackuswb.
;   - AVX2 run: vpmulld on 256-bit registers, then vpshufb/vpermq narrowing.
;   - AVX512 runs: one vpmulld on zmm followed by one vpmovdb.
2833define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2834; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2835; SSE:       # BB#0:
2836; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,2,3]
2837; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2838; SSE-NEXT:    pmuludq %xmm4, %xmm0
2839; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2840; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2841; SSE-NEXT:    pmuludq %xmm5, %xmm4
2842; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2843; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2844; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [4,5,6,7]
2845; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
2846; SSE-NEXT:    pmuludq %xmm4, %xmm1
2847; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2848; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2849; SSE-NEXT:    pmuludq %xmm5, %xmm4
2850; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2851; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2852; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,9,10,11]
2853; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
2854; SSE-NEXT:    pmuludq %xmm4, %xmm2
2855; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2856; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2857; SSE-NEXT:    pmuludq %xmm5, %xmm4
2858; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2859; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2860; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [12,13,14,15]
2861; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2862; SSE-NEXT:    pmuludq %xmm4, %xmm3
2863; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2864; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2865; SSE-NEXT:    pmuludq %xmm5, %xmm4
2866; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2867; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2868; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2869; SSE-NEXT:    pand %xmm4, %xmm3
2870; SSE-NEXT:    pand %xmm4, %xmm2
2871; SSE-NEXT:    packuswb %xmm3, %xmm2
2872; SSE-NEXT:    pand %xmm4, %xmm1
2873; SSE-NEXT:    pand %xmm4, %xmm0
2874; SSE-NEXT:    packuswb %xmm1, %xmm0
2875; SSE-NEXT:    packuswb %xmm2, %xmm0
2876; SSE-NEXT:    retq
2877;
2878; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2879; AVX1:       # BB#0:
2880; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
2881; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2882; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2883; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm3
2884; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2885; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
2886; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2887; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2888; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2889; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
2890; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2891; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2892; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
2893; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2894; AVX1-NEXT:    vzeroupper
2895; AVX1-NEXT:    retq
2896;
2897; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2898; AVX2:       # BB#0:
2899; AVX2-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
2900; AVX2-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
2901; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
2902; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2903; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2904; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2905; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2906; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2907; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2908; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2909; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2910; AVX2-NEXT:    vzeroupper
2911; AVX2-NEXT:    retq
2912;
2913; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2914; AVX512:       # BB#0:
2915; AVX512-NEXT:    vpmulld {{.*}}(%rip), %zmm0, %zmm0
2916; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2917; AVX512-NEXT:    retq
; IR under test: per-lane multiply by the constants 0..15, then narrowing trunc.
2918  %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2919  %2 = trunc <16 x i32> %1 to <16 x i8>
2920  ret <16 x i8> %2
2921}
2922
; Test: multiply a <16 x i16> argument by the constant vector <0, 1, ..., 15>
; and truncate the product to <16 x i8>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; The two AVX512 configurations have separate expectations here:
;   - AVX512F run: the i16 product is sign-extended to i32 lanes (vpmovsxwd)
;     and then narrowed with vpmovdb.
;   - AVX512BW run: the product is narrowed directly with vpmovwb.
2923define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2924; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2925; SSE:       # BB#0:
2926; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
2927; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
2928; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2929; SSE-NEXT:    pand %xmm2, %xmm1
2930; SSE-NEXT:    pand %xmm2, %xmm0
2931; SSE-NEXT:    packuswb %xmm1, %xmm0
2932; SSE-NEXT:    retq
2933;
2934; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2935; AVX1:       # BB#0:
2936; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
2937; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2938; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2939; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2940; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2941; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2942; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2943; AVX1-NEXT:    vzeroupper
2944; AVX1-NEXT:    retq
2945;
2946; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2947; AVX2:       # BB#0:
2948; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2949; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2950; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2951; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2952; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2953; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2954; AVX2-NEXT:    vzeroupper
2955; AVX2-NEXT:    retq
2956;
2957; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2958; AVX512F:       # BB#0:
2959; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2960; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
2961; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2962; AVX512F-NEXT:    retq
2963;
2964; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2965; AVX512BW:       # BB#0:
2966; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2967; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2968; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
2969; AVX512BW-NEXT:    retq
; IR under test: per-lane multiply by the constants 0..15, then narrowing trunc.
2970  %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2971  %2 = trunc <16 x i16> %1 to <16 x i8>
2972  ret <16 x i8> %2
2973}
2974
2975;
2976; and
2977;
2978
; Test: bitwise AND of two <4 x i64> arguments, truncated to <4 x i32>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; In every expectation below the AND is performed at the original i64 width
; and the narrowing happens afterwards (shuffles, or vpmovqd for AVX512).
2979define <4 x i32> @trunc_and_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2980; SSE-LABEL: trunc_and_v4i64_4i32:
2981; SSE:       # BB#0:
2982; SSE-NEXT:    pand %xmm2, %xmm0
2983; SSE-NEXT:    pand %xmm3, %xmm1
2984; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2985; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2986; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2987; SSE-NEXT:    retq
2988;
2989; AVX1-LABEL: trunc_and_v4i64_4i32:
2990; AVX1:       # BB#0:
2991; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2992; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2993; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
2994; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2995; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2996; AVX1-NEXT:    vzeroupper
2997; AVX1-NEXT:    retq
2998;
2999; AVX2-LABEL: trunc_and_v4i64_4i32:
3000; AVX2:       # BB#0:
3001; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
3002; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3003; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3004; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3005; AVX2-NEXT:    vzeroupper
3006; AVX2-NEXT:    retq
3007;
3008; AVX512-LABEL: trunc_and_v4i64_4i32:
3009; AVX512:       # BB#0:
3010; AVX512-NEXT:    vandps %ymm1, %ymm0, %ymm0
3011; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3012; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3013; AVX512-NEXT:    retq
; IR under test: lane-wise AND of the two arguments, then narrowing trunc.
3014  %1 = and <4 x i64> %a0, %a1
3015  %2 = trunc <4 x i64> %1 to <4 x i32>
3016  ret <4 x i32> %2
3017}
3018
; Test: bitwise AND of two <8 x i64> arguments, truncated to <8 x i16>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; Expected shapes recorded below:
;   - SSE2 run: the AND is followed by a pextrw/movd/punpcklwd sequence that
;     gathers the low 16 bits of each 64-bit lane.
;   - AVX1 run: upper words are zeroed with vpblendw against a zero register
;     and the lanes are narrowed with vpackusdw.
;   - AVX512 runs: a single vpandq followed by a single vpmovqw.
3019define <8 x i16> @trunc_and_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3020; SSE-LABEL: trunc_and_v8i64_8i16:
3021; SSE:       # BB#0:
3022; SSE-NEXT:    pand %xmm6, %xmm2
3023; SSE-NEXT:    pand %xmm4, %xmm0
3024; SSE-NEXT:    pand %xmm7, %xmm3
3025; SSE-NEXT:    pand %xmm5, %xmm1
3026; SSE-NEXT:    pextrw $4, %xmm1, %eax
3027; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3028; SSE-NEXT:    pextrw $4, %xmm0, %ecx
3029; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3030; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3031; SSE-NEXT:    pextrw $4, %xmm3, %edx
3032; SSE-NEXT:    movd %edx, %xmm1
3033; SSE-NEXT:    movd %eax, %xmm3
3034; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3035; SSE-NEXT:    pextrw $4, %xmm2, %eax
3036; SSE-NEXT:    movd %eax, %xmm1
3037; SSE-NEXT:    movd %ecx, %xmm2
3038; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3039; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3040; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3041; SSE-NEXT:    retq
3042;
3043; AVX1-LABEL: trunc_and_v8i64_8i16:
3044; AVX1:       # BB#0:
3045; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3046; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
3047; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3048; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3049; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3050; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3051; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
3052; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3053; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3054; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3055; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3056; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3057; AVX1-NEXT:    vzeroupper
3058; AVX1-NEXT:    retq
3059;
3060; AVX2-LABEL: trunc_and_v8i64_8i16:
3061; AVX2:       # BB#0:
3062; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
3063; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3064; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3065; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3066; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3067; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3068; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3069; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3070; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3071; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3072; AVX2-NEXT:    vzeroupper
3073; AVX2-NEXT:    retq
3074;
3075; AVX512-LABEL: trunc_and_v8i64_8i16:
3076; AVX512:       # BB#0:
3077; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
3078; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3079; AVX512-NEXT:    retq
; IR under test: lane-wise AND of the two arguments, then narrowing trunc.
3080  %1 = and <8 x i64> %a0, %a1
3081  %2 = trunc <8 x i64> %1 to <8 x i16>
3082  ret <8 x i16> %2
3083}
3084
; Test: bitwise AND of two <8 x i32> arguments, truncated to <8 x i16>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; Expected shapes recorded below:
;   - SSE2 run: after the AND, each half is shifted left then arithmetically
;     right by 16 (pslld/psrad) before packssdw.
;   - AVX512 runs: vandps on ymm followed by vpmovdw.
3085define <8 x i16> @trunc_and_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3086; SSE-LABEL: trunc_and_v8i32_8i16:
3087; SSE:       # BB#0:
3088; SSE-NEXT:    pand %xmm2, %xmm0
3089; SSE-NEXT:    pand %xmm3, %xmm1
3090; SSE-NEXT:    pslld $16, %xmm1
3091; SSE-NEXT:    psrad $16, %xmm1
3092; SSE-NEXT:    pslld $16, %xmm0
3093; SSE-NEXT:    psrad $16, %xmm0
3094; SSE-NEXT:    packssdw %xmm1, %xmm0
3095; SSE-NEXT:    retq
3096;
3097; AVX1-LABEL: trunc_and_v8i32_8i16:
3098; AVX1:       # BB#0:
3099; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
3100; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3101; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3102; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3103; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3104; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3105; AVX1-NEXT:    vzeroupper
3106; AVX1-NEXT:    retq
3107;
3108; AVX2-LABEL: trunc_and_v8i32_8i16:
3109; AVX2:       # BB#0:
3110; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
3111; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3112; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3113; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3114; AVX2-NEXT:    vzeroupper
3115; AVX2-NEXT:    retq
3116;
3117; AVX512-LABEL: trunc_and_v8i32_8i16:
3118; AVX512:       # BB#0:
3119; AVX512-NEXT:    vandps %ymm1, %ymm0, %ymm0
3120; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3121; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3122; AVX512-NEXT:    retq
; IR under test: lane-wise AND of the two arguments, then narrowing trunc.
3123  %1 = and <8 x i32> %a0, %a1
3124  %2 = trunc <8 x i32> %1 to <8 x i16>
3125  ret <8 x i16> %2
3126}
3127
; Test: bitwise AND of two <16 x i64> arguments, truncated to <16 x i8>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; Expected shapes recorded below:
;   - SSE2 run: the second operand of every pand is a stack slot
;     ({{[0-9]+}}(%rsp)); the results are masked and narrowed with three
;     levels of packuswb.
;   - AVX512 runs: two vpandq, two vpmovqd, a vinserti64x4 to rejoin the
;     halves, and a final vpmovdb.
3128define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3129; SSE-LABEL: trunc_and_v16i64_v16i8:
3130; SSE:       # BB#0:
3131; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm0
3132; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm1
3133; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm2
3134; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm3
3135; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm4
3136; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm5
3137; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm6
3138; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm7
3139; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3140; SSE-NEXT:    pand %xmm8, %xmm7
3141; SSE-NEXT:    pand %xmm8, %xmm6
3142; SSE-NEXT:    packuswb %xmm7, %xmm6
3143; SSE-NEXT:    pand %xmm8, %xmm5
3144; SSE-NEXT:    pand %xmm8, %xmm4
3145; SSE-NEXT:    packuswb %xmm5, %xmm4
3146; SSE-NEXT:    packuswb %xmm6, %xmm4
3147; SSE-NEXT:    pand %xmm8, %xmm3
3148; SSE-NEXT:    pand %xmm8, %xmm2
3149; SSE-NEXT:    packuswb %xmm3, %xmm2
3150; SSE-NEXT:    pand %xmm8, %xmm1
3151; SSE-NEXT:    pand %xmm8, %xmm0
3152; SSE-NEXT:    packuswb %xmm1, %xmm0
3153; SSE-NEXT:    packuswb %xmm2, %xmm0
3154; SSE-NEXT:    packuswb %xmm4, %xmm0
3155; SSE-NEXT:    retq
3156;
3157; AVX1-LABEL: trunc_and_v16i64_v16i8:
3158; AVX1:       # BB#0:
3159; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3160; AVX1-NEXT:    vandps %ymm5, %ymm1, %ymm1
3161; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
3162; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
3163; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
3164; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3165; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3166; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3167; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
3168; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
3169; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3170; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
3171; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
3172; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
3173; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3174; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3175; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
3176; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
3177; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3178; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3179; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
3180; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
3181; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3182; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3183; AVX1-NEXT:    vzeroupper
3184; AVX1-NEXT:    retq
3185;
3186; AVX2-LABEL: trunc_and_v16i64_v16i8:
3187; AVX2:       # BB#0:
3188; AVX2-NEXT:    vpand %ymm5, %ymm1, %ymm1
3189; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
3190; AVX2-NEXT:    vpand %ymm7, %ymm3, %ymm3
3191; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
3192; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
3193; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
3194; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
3195; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
3196; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
3197; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3198; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3199; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3200; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3201; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
3202; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3203; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3204; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3205; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3206; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3207; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3208; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3209; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
3210; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3211; AVX2-NEXT:    vzeroupper
3212; AVX2-NEXT:    retq
3213;
3214; AVX512-LABEL: trunc_and_v16i64_v16i8:
3215; AVX512:       # BB#0:
3216; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
3217; AVX512-NEXT:    vpandq %zmm2, %zmm0, %zmm0
3218; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3219; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
3220; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3221; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3222; AVX512-NEXT:    retq
; IR under test: lane-wise AND of the two arguments, then narrowing trunc.
3223  %1 = and <16 x i64> %a0, %a1
3224  %2 = trunc <16 x i64> %1 to <16 x i8>
3225  ret <16 x i8> %2
3226}
3227
; Test: bitwise AND of two <16 x i32> arguments, truncated to <16 x i8>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; Expected shapes recorded below:
;   - SSE2 run: the byte mask is applied to xmm4..xmm7 first and the result is
;     then ANDed with xmm0..xmm3, followed by packuswb narrowing.
;   - AVX1/AVX2 runs: the operands are ANDed at full width first and the
;     narrowing (mask + pack, or vpshufb/vpermq) happens afterwards.
;   - AVX512 runs: a single vpandd followed by a single vpmovdb.
3228define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3229; SSE-LABEL: trunc_and_v16i32_v16i8:
3230; SSE:       # BB#0:
3231; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3232; SSE-NEXT:    pand %xmm8, %xmm7
3233; SSE-NEXT:    pand %xmm3, %xmm7
3234; SSE-NEXT:    pand %xmm8, %xmm6
3235; SSE-NEXT:    pand %xmm2, %xmm6
3236; SSE-NEXT:    packuswb %xmm7, %xmm6
3237; SSE-NEXT:    pand %xmm8, %xmm5
3238; SSE-NEXT:    pand %xmm1, %xmm5
3239; SSE-NEXT:    pand %xmm8, %xmm4
3240; SSE-NEXT:    pand %xmm4, %xmm0
3241; SSE-NEXT:    packuswb %xmm5, %xmm0
3242; SSE-NEXT:    packuswb %xmm6, %xmm0
3243; SSE-NEXT:    retq
3244;
3245; AVX1-LABEL: trunc_and_v16i32_v16i8:
3246; AVX1:       # BB#0:
3247; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3248; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
3249; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3250; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3251; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3252; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
3253; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
3254; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3255; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3256; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
3257; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3258; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3259; AVX1-NEXT:    vzeroupper
3260; AVX1-NEXT:    retq
3261;
3262; AVX2-LABEL: trunc_and_v16i32_v16i8:
3263; AVX2:       # BB#0:
3264; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3265; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
3266; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3267; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3268; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3269; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3270; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3271; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3272; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3273; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3274; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3275; AVX2-NEXT:    vzeroupper
3276; AVX2-NEXT:    retq
3277;
3278; AVX512-LABEL: trunc_and_v16i32_v16i8:
3279; AVX512:       # BB#0:
3280; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
3281; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3282; AVX512-NEXT:    retq
; IR under test: lane-wise AND of the two arguments, then narrowing trunc.
3283  %1 = and <16 x i32> %a0, %a1
3284  %2 = trunc <16 x i32> %1 to <16 x i8>
3285  ret <16 x i8> %2
3286}
3287
; Test: bitwise AND of two <16 x i16> arguments, truncated to <16 x i8>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; The two AVX512 configurations have separate expectations here:
;   - AVX512F run: the AND result is sign-extended to i32 lanes (vpmovsxwd)
;     and then narrowed with vpmovdb.
;   - AVX512BW run: the AND result is narrowed directly with vpmovwb.
3288define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3289; SSE-LABEL: trunc_and_v16i16_v16i8:
3290; SSE:       # BB#0:
3291; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3292; SSE-NEXT:    pand %xmm4, %xmm3
3293; SSE-NEXT:    pand %xmm1, %xmm3
3294; SSE-NEXT:    pand %xmm4, %xmm2
3295; SSE-NEXT:    pand %xmm2, %xmm0
3296; SSE-NEXT:    packuswb %xmm3, %xmm0
3297; SSE-NEXT:    retq
3298;
3299; AVX1-LABEL: trunc_and_v16i16_v16i8:
3300; AVX1:       # BB#0:
3301; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
3302; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3303; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3304; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3305; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3306; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3307; AVX1-NEXT:    vzeroupper
3308; AVX1-NEXT:    retq
3309;
3310; AVX2-LABEL: trunc_and_v16i16_v16i8:
3311; AVX2:       # BB#0:
3312; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
3313; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3314; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3315; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3316; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3317; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3318; AVX2-NEXT:    vzeroupper
3319; AVX2-NEXT:    retq
3320;
3321; AVX512F-LABEL: trunc_and_v16i16_v16i8:
3322; AVX512F:       # BB#0:
3323; AVX512F-NEXT:    vandps %ymm1, %ymm0, %ymm0
3324; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
3325; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3326; AVX512F-NEXT:    retq
3327;
3328; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
3329; AVX512BW:       # BB#0:
3330; AVX512BW-NEXT:    vandps %ymm1, %ymm0, %ymm0
3331; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3332; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3333; AVX512BW-NEXT:    retq
; IR under test: lane-wise AND of the two arguments, then narrowing trunc.
3334  %1 = and <16 x i16> %a0, %a1
3335  %2 = trunc <16 x i16> %1 to <16 x i8>
3336  ret <16 x i8> %2
3337}
3338
3339;
3340; and to constant
3341;
3342
; Test: AND a <4 x i64> argument with the constant vector <0, 1, 2, 3> and
; truncate the result to <4 x i32>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; Expected shapes recorded below:
;   - SSE2 run: the mask for the low half is built in registers (movl $1,
;     movd, pslldq) while the mask for the high half is a RIP-relative memory
;     operand.
;   - AVX1/AVX2/AVX512 runs: the whole mask is a single RIP-relative memory
;     operand of the AND instruction.
3343define <4 x i32> @trunc_and_const_v4i64_4i32(<4 x i64> %a0) nounwind {
3344; SSE-LABEL: trunc_and_const_v4i64_4i32:
3345; SSE:       # BB#0:
3346; SSE-NEXT:    movl $1, %eax
3347; SSE-NEXT:    movd %rax, %xmm2
3348; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
3349; SSE-NEXT:    pand %xmm0, %xmm2
3350; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
3351; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3352; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3353; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3354; SSE-NEXT:    retq
3355;
3356; AVX1-LABEL: trunc_and_const_v4i64_4i32:
3357; AVX1:       # BB#0:
3358; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3359; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3360; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
3361; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3362; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
3363; AVX1-NEXT:    vzeroupper
3364; AVX1-NEXT:    retq
3365;
3366; AVX2-LABEL: trunc_and_const_v4i64_4i32:
3367; AVX2:       # BB#0:
3368; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3369; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3370; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3371; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3372; AVX2-NEXT:    vzeroupper
3373; AVX2-NEXT:    retq
3374;
3375; AVX512-LABEL: trunc_and_const_v4i64_4i32:
3376; AVX512:       # BB#0:
3377; AVX512-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3378; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3379; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3380; AVX512-NEXT:    retq
; IR under test: lane-wise AND with the constants 0..3, then narrowing trunc.
3381  %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3382  %2 = trunc <4 x i64> %1 to <4 x i32>
3383  ret <4 x i32> %2
3384}
3385
; Test: AND an <8 x i64> argument with the constant vector <0, 1, ..., 7> and
; truncate the result to <8 x i16>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; NOTE(review): the function name says "v16i64_v16i16", but the signature and
; body operate on 8 lanes (<8 x i64> in, <8 x i16> out). The non-constant
; counterpart in this file is named trunc_and_v8i64_8i16. A rename would also
; have to update every label check line below, so it is left as-is here.
3386define <8 x i16> @trunc_and_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
3387; SSE-LABEL: trunc_and_const_v16i64_v16i16:
3388; SSE:       # BB#0:
3389; SSE-NEXT:    movdqa %xmm0, %xmm4
3390; SSE-NEXT:    movl $1, %eax
3391; SSE-NEXT:    movd %rax, %xmm0
3392; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
3393; SSE-NEXT:    pand %xmm4, %xmm0
3394; SSE-NEXT:    pand {{.*}}(%rip), %xmm2
3395; SSE-NEXT:    pand {{.*}}(%rip), %xmm3
3396; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
3397; SSE-NEXT:    pextrw $4, %xmm1, %eax
3398; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3399; SSE-NEXT:    pextrw $4, %xmm0, %ecx
3400; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3401; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3402; SSE-NEXT:    pextrw $4, %xmm3, %edx
3403; SSE-NEXT:    movd %edx, %xmm1
3404; SSE-NEXT:    movd %eax, %xmm3
3405; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3406; SSE-NEXT:    movd %ecx, %xmm1
3407; SSE-NEXT:    pextrw $4, %xmm2, %eax
3408; SSE-NEXT:    movd %eax, %xmm2
3409; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
3410; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3411; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3412; SSE-NEXT:    retq
3413;
3414; AVX1-LABEL: trunc_and_const_v16i64_v16i16:
3415; AVX1:       # BB#0:
3416; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3417; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
3418; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3419; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3420; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3421; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3422; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
3423; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3424; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3425; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3426; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3427; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3428; AVX1-NEXT:    vzeroupper
3429; AVX1-NEXT:    retq
3430;
3431; AVX2-LABEL: trunc_and_const_v16i64_v16i16:
3432; AVX2:       # BB#0:
3433; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
3434; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3435; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3436; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3437; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3438; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3439; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3440; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3441; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3442; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3443; AVX2-NEXT:    vzeroupper
3444; AVX2-NEXT:    retq
3445;
3446; AVX512-LABEL: trunc_and_const_v16i64_v16i16:
3447; AVX512:       # BB#0:
3448; AVX512-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
3449; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3450; AVX512-NEXT:    retq
; IR under test: lane-wise AND with the constants 0..7, then narrowing trunc.
3451  %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3452  %2 = trunc <8 x i64> %1 to <8 x i16>
3453  ret <8 x i16> %2
3454}
3455
; Test: AND an <8 x i32> argument with the constant vector <0, 1, ..., 7> and
; truncate the result to <8 x i16>.
; The check lines are autogenerated expectations for the RUN configurations at
; the top of the file; regenerate them rather than editing them by hand.
; NOTE(review): the function name says "v16i32_v16i16", but the signature and
; body operate on 8 lanes (<8 x i32> in, <8 x i16> out). The non-constant
; counterpart in this file is named trunc_and_v8i32_8i16. A rename would also
; have to update every label check line below, so it is left as-is here.
3456define <8 x i16> @trunc_and_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
3457; SSE-LABEL: trunc_and_const_v16i32_v16i16:
3458; SSE:       # BB#0:
3459; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3460; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
3461; SSE-NEXT:    pslld $16, %xmm1
3462; SSE-NEXT:    psrad $16, %xmm1
3463; SSE-NEXT:    pslld $16, %xmm0
3464; SSE-NEXT:    psrad $16, %xmm0
3465; SSE-NEXT:    packssdw %xmm1, %xmm0
3466; SSE-NEXT:    retq
3467;
3468; AVX1-LABEL: trunc_and_const_v16i32_v16i16:
3469; AVX1:       # BB#0:
3470; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3471; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3472; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3473; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3474; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3475; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3476; AVX1-NEXT:    vzeroupper
3477; AVX1-NEXT:    retq
3478;
3479; AVX2-LABEL: trunc_and_const_v16i32_v16i16:
3480; AVX2:       # BB#0:
3481; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3482; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3483; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3484; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3485; AVX2-NEXT:    vzeroupper
3486; AVX2-NEXT:    retq
3487;
3488; AVX512-LABEL: trunc_and_const_v16i32_v16i16:
3489; AVX512:       # BB#0:
3490; AVX512-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3491; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3492; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3493; AVX512-NEXT:    retq
; IR under test: lane-wise AND with the constants 0..7, then narrowing trunc.
3494  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3495  %2 = trunc <8 x i32> %1 to <8 x i16>
3496  ret <8 x i16> %2
3497}
3498
3499define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3500; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3501; SSE:       # BB#0:
3502; SSE-NEXT:    movl $1, %eax
3503; SSE-NEXT:    movd %rax, %xmm8
3504; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
3505; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
3506; SSE-NEXT:    pand {{.*}}(%rip), %xmm2
3507; SSE-NEXT:    pand {{.*}}(%rip), %xmm3
3508; SSE-NEXT:    pand {{.*}}(%rip), %xmm4
3509; SSE-NEXT:    pand {{.*}}(%rip), %xmm5
3510; SSE-NEXT:    pand {{.*}}(%rip), %xmm6
3511; SSE-NEXT:    pand {{.*}}(%rip), %xmm7
3512; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3513; SSE-NEXT:    pand %xmm9, %xmm7
3514; SSE-NEXT:    pand %xmm9, %xmm6
3515; SSE-NEXT:    packuswb %xmm7, %xmm6
3516; SSE-NEXT:    pand %xmm9, %xmm5
3517; SSE-NEXT:    pand %xmm9, %xmm4
3518; SSE-NEXT:    packuswb %xmm5, %xmm4
3519; SSE-NEXT:    packuswb %xmm6, %xmm4
3520; SSE-NEXT:    pand %xmm9, %xmm3
3521; SSE-NEXT:    pand %xmm9, %xmm2
3522; SSE-NEXT:    packuswb %xmm3, %xmm2
3523; SSE-NEXT:    pand %xmm9, %xmm1
3524; SSE-NEXT:    pand %xmm9, %xmm8
3525; SSE-NEXT:    pand %xmm8, %xmm0
3526; SSE-NEXT:    packuswb %xmm1, %xmm0
3527; SSE-NEXT:    packuswb %xmm2, %xmm0
3528; SSE-NEXT:    packuswb %xmm4, %xmm0
3529; SSE-NEXT:    retq
3530;
3531; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3532; AVX1:       # BB#0:
3533; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3534; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
3535; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm2, %ymm2
3536; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm3, %ymm3
3537; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
3538; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3539; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3540; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3541; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
3542; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
3543; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3544; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
3545; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
3546; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
3547; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3548; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3549; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
3550; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
3551; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3552; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3553; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
3554; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
3555; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3556; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3557; AVX1-NEXT:    vzeroupper
3558; AVX1-NEXT:    retq
3559;
3560; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
3561; AVX2:       # BB#0:
3562; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
3563; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3564; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
3565; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
3566; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
3567; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
3568; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
3569; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
3570; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
3571; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3572; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3573; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3574; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3575; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
3576; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3577; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3578; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3579; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3580; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3581; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3582; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3583; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
3584; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3585; AVX2-NEXT:    vzeroupper
3586; AVX2-NEXT:    retq
3587;
3588; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3589; AVX512:       # BB#0:
3590; AVX512-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
3591; AVX512-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
3592; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3593; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
3594; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3595; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3596; AVX512-NEXT:    retq
3597  %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3598  %2 = trunc <16 x i64> %1 to <16 x i8>
3599  ret <16 x i8> %2
3600}
3601
; Test: AND a <16 x i32> argument with the constant vector <0, 1, ..., 15>,
; then truncate the result to <16 x i8>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. In every variant the constant is
; applied through RIP-relative memory operands before the narrowing sequence;
; the AVX512 variant narrows with a single vpmovdb.
3602define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3603; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3604; SSE:       # BB#0:
3605; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3606; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
3607; SSE-NEXT:    pand {{.*}}(%rip), %xmm2
3608; SSE-NEXT:    pand {{.*}}(%rip), %xmm3
3609; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3610; SSE-NEXT:    pand %xmm4, %xmm3
3611; SSE-NEXT:    pand %xmm4, %xmm2
3612; SSE-NEXT:    packuswb %xmm3, %xmm2
3613; SSE-NEXT:    pand %xmm4, %xmm1
3614; SSE-NEXT:    pand %xmm4, %xmm0
3615; SSE-NEXT:    packuswb %xmm1, %xmm0
3616; SSE-NEXT:    packuswb %xmm2, %xmm0
3617; SSE-NEXT:    retq
3618;
3619; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3620; AVX1:       # BB#0:
3621; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3622; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
3623; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3624; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3625; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3626; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
3627; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
3628; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3629; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3630; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
3631; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3632; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3633; AVX1-NEXT:    vzeroupper
3634; AVX1-NEXT:    retq
3635;
3636; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3637; AVX2:       # BB#0:
3638; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3639; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
3640; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3641; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3642; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3643; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3644; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
3645; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3646; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3647; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
3648; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3649; AVX2-NEXT:    vzeroupper
3650; AVX2-NEXT:    retq
3651;
3652; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3653; AVX512:       # BB#0:
3654; AVX512-NEXT:    vpandd {{.*}}(%rip), %zmm0, %zmm0
3655; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3656; AVX512-NEXT:    retq
3657  %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3658  %2 = trunc <16 x i32> %1 to <16 x i8>
3659  ret <16 x i8> %2
3660}
3661
; Test: AND a <16 x i16> argument with the constant vector <0, 1, ..., 15>,
; then truncate the result to <16 x i8>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. The two AVX512 configurations are
; checked separately here because their output differs. The avx512f run
; sign-extends to 32-bit lanes (vpmovsxwd) and narrows with vpmovdb, while the
; avx512bw run narrows directly with vpmovwb.
3662define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3663; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3664; SSE:       # BB#0:
3665; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3666; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
3667; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
3668; SSE-NEXT:    pand %xmm2, %xmm1
3669; SSE-NEXT:    pand %xmm2, %xmm0
3670; SSE-NEXT:    packuswb %xmm1, %xmm0
3671; SSE-NEXT:    retq
3672;
3673; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3674; AVX1:       # BB#0:
3675; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3676; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3677; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3678; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3679; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3680; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3681; AVX1-NEXT:    vzeroupper
3682; AVX1-NEXT:    retq
3683;
3684; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3685; AVX2:       # BB#0:
3686; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3687; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3688; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3689; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3690; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3691; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3692; AVX2-NEXT:    vzeroupper
3693; AVX2-NEXT:    retq
3694;
3695; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3696; AVX512F:       # BB#0:
3697; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3698; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
3699; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3700; AVX512F-NEXT:    retq
3701;
3702; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3703; AVX512BW:       # BB#0:
3704; AVX512BW-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3705; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3706; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3707; AVX512BW-NEXT:    retq
3708  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3709  %2 = trunc <16 x i16> %1 to <16 x i8>
3710  ret <16 x i8> %2
3711}
3712
3713;
3714; xor
3715;
3716
; Test: XOR two <4 x i64> arguments, then truncate the result to <4 x i32>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. All variants perform the xor at the
; source width first and only then narrow the result.
3717define <4 x i32> @trunc_xor_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3718; SSE-LABEL: trunc_xor_v4i64_4i32:
3719; SSE:       # BB#0:
3720; SSE-NEXT:    pxor %xmm2, %xmm0
3721; SSE-NEXT:    pxor %xmm3, %xmm1
3722; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3723; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3724; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3725; SSE-NEXT:    retq
3726;
3727; AVX1-LABEL: trunc_xor_v4i64_4i32:
3728; AVX1:       # BB#0:
3729; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3730; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3731; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
3732; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3733; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
3734; AVX1-NEXT:    vzeroupper
3735; AVX1-NEXT:    retq
3736;
3737; AVX2-LABEL: trunc_xor_v4i64_4i32:
3738; AVX2:       # BB#0:
3739; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3740; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3741; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3742; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3743; AVX2-NEXT:    vzeroupper
3744; AVX2-NEXT:    retq
3745;
3746; AVX512-LABEL: trunc_xor_v4i64_4i32:
3747; AVX512:       # BB#0:
3748; AVX512-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3749; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3750; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3751; AVX512-NEXT:    retq
3752  %1 = xor <4 x i64> %a0, %a1
3753  %2 = trunc <4 x i64> %1 to <4 x i32>
3754  ret <4 x i32> %2
3755}
3756
; Test: XOR two <8 x i64> arguments, then truncate the result to <8 x i16>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. The SSE2 sequence gathers the low
; words with pextrw/movd/punpcklwd, whereas the AVX512 variant needs only
; vpxorq followed by vpmovqw.
3757define <8 x i16> @trunc_xor_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3758; SSE-LABEL: trunc_xor_v8i64_8i16:
3759; SSE:       # BB#0:
3760; SSE-NEXT:    pxor %xmm6, %xmm2
3761; SSE-NEXT:    pxor %xmm4, %xmm0
3762; SSE-NEXT:    pxor %xmm7, %xmm3
3763; SSE-NEXT:    pxor %xmm5, %xmm1
3764; SSE-NEXT:    pextrw $4, %xmm1, %eax
3765; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
3766; SSE-NEXT:    pextrw $4, %xmm0, %ecx
3767; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3768; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3769; SSE-NEXT:    pextrw $4, %xmm3, %edx
3770; SSE-NEXT:    movd %edx, %xmm1
3771; SSE-NEXT:    movd %eax, %xmm3
3772; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3773; SSE-NEXT:    pextrw $4, %xmm2, %eax
3774; SSE-NEXT:    movd %eax, %xmm1
3775; SSE-NEXT:    movd %ecx, %xmm2
3776; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3777; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3778; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3779; SSE-NEXT:    retq
3780;
3781; AVX1-LABEL: trunc_xor_v8i64_8i16:
3782; AVX1:       # BB#0:
3783; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3784; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3785; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3786; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3787; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3788; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
3789; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
3790; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3791; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
3792; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
3793; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3794; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3795; AVX1-NEXT:    vzeroupper
3796; AVX1-NEXT:    retq
3797;
3798; AVX2-LABEL: trunc_xor_v8i64_8i16:
3799; AVX2:       # BB#0:
3800; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3801; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3802; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3803; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3804; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3805; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3806; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3807; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3808; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3809; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3810; AVX2-NEXT:    vzeroupper
3811; AVX2-NEXT:    retq
3812;
3813; AVX512-LABEL: trunc_xor_v8i64_8i16:
3814; AVX512:       # BB#0:
3815; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
3816; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3817; AVX512-NEXT:    retq
3818  %1 = xor <8 x i64> %a0, %a1
3819  %2 = trunc <8 x i64> %1 to <8 x i16>
3820  ret <8 x i16> %2
3821}
3822
; Test: XOR two <8 x i32> arguments, then truncate the result to <8 x i16>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. The SSE2 sequence shifts each lane
; left then arithmetic-right by 16 before packssdw; the AVX512 variant
; narrows with vpmovdw.
3823define <8 x i16> @trunc_xor_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3824; SSE-LABEL: trunc_xor_v8i32_8i16:
3825; SSE:       # BB#0:
3826; SSE-NEXT:    pxor %xmm2, %xmm0
3827; SSE-NEXT:    pxor %xmm3, %xmm1
3828; SSE-NEXT:    pslld $16, %xmm1
3829; SSE-NEXT:    psrad $16, %xmm1
3830; SSE-NEXT:    pslld $16, %xmm0
3831; SSE-NEXT:    psrad $16, %xmm0
3832; SSE-NEXT:    packssdw %xmm1, %xmm0
3833; SSE-NEXT:    retq
3834;
3835; AVX1-LABEL: trunc_xor_v8i32_8i16:
3836; AVX1:       # BB#0:
3837; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3838; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3839; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
3840; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3841; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3842; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3843; AVX1-NEXT:    vzeroupper
3844; AVX1-NEXT:    retq
3845;
3846; AVX2-LABEL: trunc_xor_v8i32_8i16:
3847; AVX2:       # BB#0:
3848; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3849; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
3850; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3851; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3852; AVX2-NEXT:    vzeroupper
3853; AVX2-NEXT:    retq
3854;
3855; AVX512-LABEL: trunc_xor_v8i32_8i16:
3856; AVX512:       # BB#0:
3857; AVX512-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3858; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3859; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
3860; AVX512-NEXT:    retq
3861  %1 = xor <8 x i32> %a0, %a1
3862  %2 = trunc <8 x i32> %1 to <8 x i16>
3863  ret <8 x i16> %2
3864}
3865
; Test: XOR two <16 x i64> arguments, then truncate the result to <16 x i8>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. In the SSE2 variant the second
; operand is read through stack-relative (%rsp) memory operands; the stack
; offsets are matched with a regex and so are not pinned by the test.
3866define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3867; SSE-LABEL: trunc_xor_v16i64_v16i8:
3868; SSE:       # BB#0:
3869; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm0
3870; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm1
3871; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm2
3872; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm3
3873; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm4
3874; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm5
3875; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm6
3876; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm7
3877; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3878; SSE-NEXT:    pand %xmm8, %xmm7
3879; SSE-NEXT:    pand %xmm8, %xmm6
3880; SSE-NEXT:    packuswb %xmm7, %xmm6
3881; SSE-NEXT:    pand %xmm8, %xmm5
3882; SSE-NEXT:    pand %xmm8, %xmm4
3883; SSE-NEXT:    packuswb %xmm5, %xmm4
3884; SSE-NEXT:    packuswb %xmm6, %xmm4
3885; SSE-NEXT:    pand %xmm8, %xmm3
3886; SSE-NEXT:    pand %xmm8, %xmm2
3887; SSE-NEXT:    packuswb %xmm3, %xmm2
3888; SSE-NEXT:    pand %xmm8, %xmm1
3889; SSE-NEXT:    pand %xmm8, %xmm0
3890; SSE-NEXT:    packuswb %xmm1, %xmm0
3891; SSE-NEXT:    packuswb %xmm2, %xmm0
3892; SSE-NEXT:    packuswb %xmm4, %xmm0
3893; SSE-NEXT:    retq
3894;
3895; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3896; AVX1:       # BB#0:
3897; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
3898; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
3899; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
3900; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
3901; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
3902; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3903; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3904; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3905; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
3906; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
3907; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
3908; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
3909; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
3910; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
3911; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3912; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3913; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
3914; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
3915; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3916; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
3917; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
3918; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
3919; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3920; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3921; AVX1-NEXT:    vzeroupper
3922; AVX1-NEXT:    retq
3923;
3924; AVX2-LABEL: trunc_xor_v16i64_v16i8:
3925; AVX2:       # BB#0:
3926; AVX2-NEXT:    vpxor %ymm5, %ymm1, %ymm1
3927; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm0
3928; AVX2-NEXT:    vpxor %ymm7, %ymm3, %ymm3
3929; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
3930; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
3931; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
3932; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
3933; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
3934; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
3935; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
3936; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3937; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3938; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
3939; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
3940; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
3941; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
3942; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
3943; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
3944; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3945; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3946; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3947; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
3948; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3949; AVX2-NEXT:    vzeroupper
3950; AVX2-NEXT:    retq
3951;
3952; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3953; AVX512:       # BB#0:
3954; AVX512-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
3955; AVX512-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
3956; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3957; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
3958; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
3959; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3960; AVX512-NEXT:    retq
3961  %1 = xor <16 x i64> %a0, %a1
3962  %2 = trunc <16 x i64> %1 to <16 x i8>
3963  ret <16 x i8> %2
3964}
3965
; Test: XOR two <16 x i32> arguments, then truncate the result to <16 x i8>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. The SSE2 and AVX1 sequences mask each
; 32-bit lane to its low byte and combine with packuswb; the AVX512 variant
; uses vpxord followed by a single vpmovdb.
3966define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3967; SSE-LABEL: trunc_xor_v16i32_v16i8:
3968; SSE:       # BB#0:
3969; SSE-NEXT:    pxor %xmm4, %xmm0
3970; SSE-NEXT:    pxor %xmm5, %xmm1
3971; SSE-NEXT:    pxor %xmm6, %xmm2
3972; SSE-NEXT:    pxor %xmm7, %xmm3
3973; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3974; SSE-NEXT:    pand %xmm4, %xmm3
3975; SSE-NEXT:    pand %xmm4, %xmm2
3976; SSE-NEXT:    packuswb %xmm3, %xmm2
3977; SSE-NEXT:    pand %xmm4, %xmm1
3978; SSE-NEXT:    pand %xmm4, %xmm0
3979; SSE-NEXT:    packuswb %xmm1, %xmm0
3980; SSE-NEXT:    packuswb %xmm2, %xmm0
3981; SSE-NEXT:    retq
3982;
3983; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3984; AVX1:       # BB#0:
3985; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3986; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3987; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
3988; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3989; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3990; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
3991; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
3992; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3993; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
3994; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
3995; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3996; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3997; AVX1-NEXT:    vzeroupper
3998; AVX1-NEXT:    retq
3999;
4000; AVX2-LABEL: trunc_xor_v16i32_v16i8:
4001; AVX2:       # BB#0:
4002; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
4003; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
4004; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4005; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4006; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4007; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4008; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
4009; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4010; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4011; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
4012; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4013; AVX2-NEXT:    vzeroupper
4014; AVX2-NEXT:    retq
4015;
4016; AVX512-LABEL: trunc_xor_v16i32_v16i8:
4017; AVX512:       # BB#0:
4018; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
4019; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4020; AVX512-NEXT:    retq
4021  %1 = xor <16 x i32> %a0, %a1
4022  %2 = trunc <16 x i32> %1 to <16 x i8>
4023  ret <16 x i8> %2
4024}
4025
; Test: XOR two <16 x i16> arguments, then truncate the result to <16 x i8>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. The two AVX512 configurations are
; checked separately here because their output differs. The avx512f run
; sign-extends to 32-bit lanes (vpmovsxwd) and narrows with vpmovdb, while the
; avx512bw run narrows directly with vpmovwb.
4026define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4027; SSE-LABEL: trunc_xor_v16i16_v16i8:
4028; SSE:       # BB#0:
4029; SSE-NEXT:    pxor %xmm2, %xmm0
4030; SSE-NEXT:    pxor %xmm3, %xmm1
4031; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4032; SSE-NEXT:    pand %xmm2, %xmm1
4033; SSE-NEXT:    pand %xmm2, %xmm0
4034; SSE-NEXT:    packuswb %xmm1, %xmm0
4035; SSE-NEXT:    retq
4036;
4037; AVX1-LABEL: trunc_xor_v16i16_v16i8:
4038; AVX1:       # BB#0:
4039; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
4040; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4041; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4042; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4043; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4044; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4045; AVX1-NEXT:    vzeroupper
4046; AVX1-NEXT:    retq
4047;
4048; AVX2-LABEL: trunc_xor_v16i16_v16i8:
4049; AVX2:       # BB#0:
4050; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
4051; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4052; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4053; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4054; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4055; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4056; AVX2-NEXT:    vzeroupper
4057; AVX2-NEXT:    retq
4058;
4059; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
4060; AVX512F:       # BB#0:
4061; AVX512F-NEXT:    vxorps %ymm1, %ymm0, %ymm0
4062; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
4063; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4064; AVX512F-NEXT:    retq
4065;
4066; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
4067; AVX512BW:       # BB#0:
4068; AVX512BW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
4069; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4070; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4071; AVX512BW-NEXT:    retq
4072  %1 = xor <16 x i16> %a0, %a1
4073  %2 = trunc <16 x i16> %1 to <16 x i8>
4074  ret <16 x i8> %2
4075}
4076
4077;
4078; xor to constant
4079;
4080
; Test: XOR a <4 x i64> argument with the constant vector <0, 1, 2, 3>, then
; truncate the result to <4 x i32>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header. In the SSE2 variant the low half of
; the constant is built in a register (movl $1 / movd / pslldq) while the
; high half is a RIP-relative memory operand; the AVX variants use a single
; RIP-relative constant.
4081define <4 x i32> @trunc_xor_const_v4i64_4i32(<4 x i64> %a0) nounwind {
4082; SSE-LABEL: trunc_xor_const_v4i64_4i32:
4083; SSE:       # BB#0:
4084; SSE-NEXT:    movl $1, %eax
4085; SSE-NEXT:    movd %rax, %xmm2
4086; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
4087; SSE-NEXT:    pxor %xmm0, %xmm2
4088; SSE-NEXT:    pxor {{.*}}(%rip), %xmm1
4089; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4090; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4091; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4092; SSE-NEXT:    retq
4093;
4094; AVX1-LABEL: trunc_xor_const_v4i64_4i32:
4095; AVX1:       # BB#0:
4096; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4097; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4098; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
4099; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4100; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4101; AVX1-NEXT:    vzeroupper
4102; AVX1-NEXT:    retq
4103;
4104; AVX2-LABEL: trunc_xor_const_v4i64_4i32:
4105; AVX2:       # BB#0:
4106; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
4107; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4108; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4109; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4110; AVX2-NEXT:    vzeroupper
4111; AVX2-NEXT:    retq
4112;
4113; AVX512-LABEL: trunc_xor_const_v4i64_4i32:
4114; AVX512:       # BB#0:
4115; AVX512-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4116; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4117; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4118; AVX512-NEXT:    retq
4119  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4120  %2 = trunc <4 x i64> %1 to <4 x i32>
4121  ret <4 x i32> %2
4122}
4123
; Test: XOR an <8 x i64> argument with the constant vector <0, 1, ..., 7>,
; then truncate the result to <8 x i16>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header.
; NOTE(review): the function name says v16i64 -> v16i16, but the signature
; below takes <8 x i64> and returns <8 x i16>. The name is left unchanged here
; because every label assertion in this block references it; renaming would
; require regenerating the assertions.
4124define <8 x i16> @trunc_xor_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
4125; SSE-LABEL: trunc_xor_const_v16i64_v16i16:
4126; SSE:       # BB#0:
4127; SSE-NEXT:    movdqa %xmm0, %xmm4
4128; SSE-NEXT:    movl $1, %eax
4129; SSE-NEXT:    movd %rax, %xmm0
4130; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4131; SSE-NEXT:    pxor %xmm4, %xmm0
4132; SSE-NEXT:    pxor {{.*}}(%rip), %xmm2
4133; SSE-NEXT:    pxor {{.*}}(%rip), %xmm3
4134; SSE-NEXT:    pxor {{.*}}(%rip), %xmm1
4135; SSE-NEXT:    pextrw $4, %xmm1, %eax
4136; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4137; SSE-NEXT:    pextrw $4, %xmm0, %ecx
4138; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4139; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4140; SSE-NEXT:    pextrw $4, %xmm3, %edx
4141; SSE-NEXT:    movd %edx, %xmm1
4142; SSE-NEXT:    movd %eax, %xmm3
4143; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4144; SSE-NEXT:    movd %ecx, %xmm1
4145; SSE-NEXT:    pextrw $4, %xmm2, %eax
4146; SSE-NEXT:    movd %eax, %xmm2
4147; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4148; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4149; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4150; SSE-NEXT:    retq
4151;
4152; AVX1-LABEL: trunc_xor_const_v16i64_v16i16:
4153; AVX1:       # BB#0:
4154; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4155; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm1, %ymm1
4156; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4157; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
4158; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4159; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4160; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
4161; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4162; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4163; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4164; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4165; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4166; AVX1-NEXT:    vzeroupper
4167; AVX1-NEXT:    retq
4168;
4169; AVX2-LABEL: trunc_xor_const_v16i64_v16i16:
4170; AVX2:       # BB#0:
4171; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm1, %ymm1
4172; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
4173; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4174; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4175; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4176; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4177; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4178; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4179; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4180; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4181; AVX2-NEXT:    vzeroupper
4182; AVX2-NEXT:    retq
4183;
4184; AVX512-LABEL: trunc_xor_const_v16i64_v16i16:
4185; AVX512:       # BB#0:
4186; AVX512-NEXT:    vpxorq {{.*}}(%rip), %zmm0, %zmm0
4187; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4188; AVX512-NEXT:    retq
4189  %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4190  %2 = trunc <8 x i64> %1 to <8 x i16>
4191  ret <8 x i16> %2
4192}
4193
; Test: XOR an <8 x i32> argument with the constant vector <0, 1, ..., 7>,
; then truncate the result to <8 x i16>.
; The autogenerated assertions below record the expected llc output for each
; RUN configuration in the file header.
; NOTE(review): the function name says v16i32 -> v16i16, but the signature
; below takes <8 x i32> and returns <8 x i16>. The name is left unchanged here
; because every label assertion in this block references it; renaming would
; require regenerating the assertions.
4194define <8 x i16> @trunc_xor_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
4195; SSE-LABEL: trunc_xor_const_v16i32_v16i16:
4196; SSE:       # BB#0:
4197; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4198; SSE-NEXT:    pxor {{.*}}(%rip), %xmm1
4199; SSE-NEXT:    pslld $16, %xmm1
4200; SSE-NEXT:    psrad $16, %xmm1
4201; SSE-NEXT:    pslld $16, %xmm0
4202; SSE-NEXT:    psrad $16, %xmm0
4203; SSE-NEXT:    packssdw %xmm1, %xmm0
4204; SSE-NEXT:    retq
4205;
4206; AVX1-LABEL: trunc_xor_const_v16i32_v16i16:
4207; AVX1:       # BB#0:
4208; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4209; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4210; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4211; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4212; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4213; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4214; AVX1-NEXT:    vzeroupper
4215; AVX1-NEXT:    retq
4216;
4217; AVX2-LABEL: trunc_xor_const_v16i32_v16i16:
4218; AVX2:       # BB#0:
4219; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
4220; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4221; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4222; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4223; AVX2-NEXT:    vzeroupper
4224; AVX2-NEXT:    retq
4225;
4226; AVX512-LABEL: trunc_xor_const_v16i32_v16i16:
4227; AVX512:       # BB#0:
4228; AVX512-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4229; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4230; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4231; AVX512-NEXT:    retq
4232  %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4233  %2 = trunc <8 x i32> %1 to <8 x i16>
4234  ret <8 x i16> %2
4235}
4236
; trunc_xor_const_v16i64_v16i8 - xor a <16 x i64> argument with the constant
; vector <0, 1, ..., 15>, then truncate every lane to i8.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4237define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4238; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
4239; SSE:       # BB#0:
4240; SSE-NEXT:    movl $1, %eax
4241; SSE-NEXT:    movd %rax, %xmm8
4242; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
4243; SSE-NEXT:    pxor %xmm8, %xmm0
4244; SSE-NEXT:    pxor {{.*}}(%rip), %xmm1
4245; SSE-NEXT:    pxor {{.*}}(%rip), %xmm2
4246; SSE-NEXT:    pxor {{.*}}(%rip), %xmm3
4247; SSE-NEXT:    pxor {{.*}}(%rip), %xmm4
4248; SSE-NEXT:    pxor {{.*}}(%rip), %xmm5
4249; SSE-NEXT:    pxor {{.*}}(%rip), %xmm6
4250; SSE-NEXT:    pxor {{.*}}(%rip), %xmm7
4251; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4252; SSE-NEXT:    pand %xmm8, %xmm7
4253; SSE-NEXT:    pand %xmm8, %xmm6
4254; SSE-NEXT:    packuswb %xmm7, %xmm6
4255; SSE-NEXT:    pand %xmm8, %xmm5
4256; SSE-NEXT:    pand %xmm8, %xmm4
4257; SSE-NEXT:    packuswb %xmm5, %xmm4
4258; SSE-NEXT:    packuswb %xmm6, %xmm4
4259; SSE-NEXT:    pand %xmm8, %xmm3
4260; SSE-NEXT:    pand %xmm8, %xmm2
4261; SSE-NEXT:    packuswb %xmm3, %xmm2
4262; SSE-NEXT:    pand %xmm8, %xmm1
4263; SSE-NEXT:    pand %xmm8, %xmm0
4264; SSE-NEXT:    packuswb %xmm1, %xmm0
4265; SSE-NEXT:    packuswb %xmm2, %xmm0
4266; SSE-NEXT:    packuswb %xmm4, %xmm0
4267; SSE-NEXT:    retq
4268;
4269; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
4270; AVX1:       # BB#0:
4271; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4272; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm1, %ymm1
4273; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm2, %ymm2
4274; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm3, %ymm3
4275; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
4276; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4277; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4278; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4279; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
4280; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
4281; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4282; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
4283; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
4284; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
4285; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4286; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4287; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
4288; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
4289; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4290; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4291; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
4292; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
4293; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4294; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4295; AVX1-NEXT:    vzeroupper
4296; AVX1-NEXT:    retq
4297;
4298; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
4299; AVX2:       # BB#0:
4300; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm1, %ymm1
4301; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
4302; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm3, %ymm3
4303; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm2, %ymm2
4304; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
4305; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
4306; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
4307; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
4308; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
4309; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4310; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4311; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4312; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4313; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4314; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4315; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4316; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4317; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4318; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4319; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4320; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4321; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
4322; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4323; AVX2-NEXT:    vzeroupper
4324; AVX2-NEXT:    retq
4325;
4326; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
4327; AVX512:       # BB#0:
4328; AVX512-NEXT:    vpxorq {{.*}}(%rip), %zmm1, %zmm1
4329; AVX512-NEXT:    vpxorq {{.*}}(%rip), %zmm0, %zmm0
4330; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4331; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
4332; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4333; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4334; AVX512-NEXT:    retq
; IR under test: lane-wise xor with a constant, then a narrowing trunc to i8.
4335  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4336  %2 = trunc <16 x i64> %1 to <16 x i8>
4337  ret <16 x i8> %2
4338}
4339
; trunc_xor_const_v16i32_v16i8 - xor a <16 x i32> argument with the constant
; vector <0, 1, ..., 15>, then truncate every lane to i8.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4340define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4341; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
4342; SSE:       # BB#0:
4343; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4344; SSE-NEXT:    pxor {{.*}}(%rip), %xmm1
4345; SSE-NEXT:    pxor {{.*}}(%rip), %xmm2
4346; SSE-NEXT:    pxor {{.*}}(%rip), %xmm3
4347; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4348; SSE-NEXT:    pand %xmm4, %xmm3
4349; SSE-NEXT:    pand %xmm4, %xmm2
4350; SSE-NEXT:    packuswb %xmm3, %xmm2
4351; SSE-NEXT:    pand %xmm4, %xmm1
4352; SSE-NEXT:    pand %xmm4, %xmm0
4353; SSE-NEXT:    packuswb %xmm1, %xmm0
4354; SSE-NEXT:    packuswb %xmm2, %xmm0
4355; SSE-NEXT:    retq
4356;
4357; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
4358; AVX1:       # BB#0:
4359; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4360; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm1, %ymm1
4361; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4362; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4363; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4364; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
4365; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
4366; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4367; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4368; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
4369; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4370; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4371; AVX1-NEXT:    vzeroupper
4372; AVX1-NEXT:    retq
4373;
4374; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
4375; AVX2:       # BB#0:
4376; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
4377; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm1, %ymm1
4378; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4379; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4380; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4381; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4382; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
4383; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4384; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4385; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
4386; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4387; AVX2-NEXT:    vzeroupper
4388; AVX2-NEXT:    retq
4389;
4390; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
4391; AVX512:       # BB#0:
4392; AVX512-NEXT:    vpxord {{.*}}(%rip), %zmm0, %zmm0
4393; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4394; AVX512-NEXT:    retq
; IR under test: lane-wise xor with a constant, then a narrowing trunc to i8.
4395  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4396  %2 = trunc <16 x i32> %1 to <16 x i8>
4397  ret <16 x i8> %2
4398}
4399
; trunc_xor_const_v16i16_v16i8 - xor a <16 x i16> argument with the constant
; vector <0, 1, ..., 15>, then truncate every lane to i8.
; The autogenerated checks below pin the llc output per check prefix; here the
; AVX512F and AVX512BW prefixes carry separate expectations because the two
; configurations use different truncation sequences (see the checks below).
4400define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4401; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
4402; SSE:       # BB#0:
4403; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4404; SSE-NEXT:    pxor {{.*}}(%rip), %xmm1
4405; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4406; SSE-NEXT:    pand %xmm2, %xmm1
4407; SSE-NEXT:    pand %xmm2, %xmm0
4408; SSE-NEXT:    packuswb %xmm1, %xmm0
4409; SSE-NEXT:    retq
4410;
4411; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
4412; AVX1:       # BB#0:
4413; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4414; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4415; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4416; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4417; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4418; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4419; AVX1-NEXT:    vzeroupper
4420; AVX1-NEXT:    retq
4421;
4422; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
4423; AVX2:       # BB#0:
4424; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
4425; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4426; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4427; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4428; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4429; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4430; AVX2-NEXT:    vzeroupper
4431; AVX2-NEXT:    retq
4432;
4433; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
4434; AVX512F:       # BB#0:
4435; AVX512F-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4436; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
4437; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4438; AVX512F-NEXT:    retq
4439;
4440; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
4441; AVX512BW:       # BB#0:
4442; AVX512BW-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
4443; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4444; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4445; AVX512BW-NEXT:    retq
; IR under test: lane-wise xor with a constant, then a narrowing trunc to i8.
4446  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4447  %2 = trunc <16 x i16> %1 to <16 x i8>
4448  ret <16 x i8> %2
4449}
4450
4451;
4452; or
4453;
4454
; trunc_or_v4i64_4i32 - or two <4 x i64> arguments lane-wise, then truncate
; every lane to i32.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4455define <4 x i32> @trunc_or_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
4456; SSE-LABEL: trunc_or_v4i64_4i32:
4457; SSE:       # BB#0:
4458; SSE-NEXT:    por %xmm2, %xmm0
4459; SSE-NEXT:    por %xmm3, %xmm1
4460; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4461; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4462; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4463; SSE-NEXT:    retq
4464;
4465; AVX1-LABEL: trunc_or_v4i64_4i32:
4466; AVX1:       # BB#0:
4467; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4468; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4469; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
4470; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4471; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4472; AVX1-NEXT:    vzeroupper
4473; AVX1-NEXT:    retq
4474;
4475; AVX2-LABEL: trunc_or_v4i64_4i32:
4476; AVX2:       # BB#0:
4477; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4478; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4479; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4480; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4481; AVX2-NEXT:    vzeroupper
4482; AVX2-NEXT:    retq
4483;
4484; AVX512-LABEL: trunc_or_v4i64_4i32:
4485; AVX512:       # BB#0:
4486; AVX512-NEXT:    vorps %ymm1, %ymm0, %ymm0
4487; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4488; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4489; AVX512-NEXT:    retq
; IR under test: lane-wise or of both arguments, then a narrowing trunc to i32.
4490  %1 = or <4 x i64> %a0, %a1
4491  %2 = trunc <4 x i64> %1 to <4 x i32>
4492  ret <4 x i32> %2
4493}
4494
; trunc_or_v8i64_8i16 - or two <8 x i64> arguments lane-wise, then truncate
; every lane to i16.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4495define <8 x i16> @trunc_or_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
4496; SSE-LABEL: trunc_or_v8i64_8i16:
4497; SSE:       # BB#0:
4498; SSE-NEXT:    por %xmm6, %xmm2
4499; SSE-NEXT:    por %xmm4, %xmm0
4500; SSE-NEXT:    por %xmm7, %xmm3
4501; SSE-NEXT:    por %xmm5, %xmm1
4502; SSE-NEXT:    pextrw $4, %xmm1, %eax
4503; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4504; SSE-NEXT:    pextrw $4, %xmm0, %ecx
4505; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4506; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4507; SSE-NEXT:    pextrw $4, %xmm3, %edx
4508; SSE-NEXT:    movd %edx, %xmm1
4509; SSE-NEXT:    movd %eax, %xmm3
4510; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4511; SSE-NEXT:    pextrw $4, %xmm2, %eax
4512; SSE-NEXT:    movd %eax, %xmm1
4513; SSE-NEXT:    movd %ecx, %xmm2
4514; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4515; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4516; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4517; SSE-NEXT:    retq
4518;
4519; AVX1-LABEL: trunc_or_v8i64_8i16:
4520; AVX1:       # BB#0:
4521; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4522; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4523; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4524; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
4525; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4526; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4527; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
4528; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4529; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4530; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4531; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4532; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4533; AVX1-NEXT:    vzeroupper
4534; AVX1-NEXT:    retq
4535;
4536; AVX2-LABEL: trunc_or_v8i64_8i16:
4537; AVX2:       # BB#0:
4538; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4539; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4540; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4541; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4542; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4543; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4544; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4545; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4546; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4547; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4548; AVX2-NEXT:    vzeroupper
4549; AVX2-NEXT:    retq
4550;
4551; AVX512-LABEL: trunc_or_v8i64_8i16:
4552; AVX512:       # BB#0:
4553; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
4554; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4555; AVX512-NEXT:    retq
; IR under test: lane-wise or of both arguments, then a narrowing trunc to i16.
4556  %1 = or <8 x i64> %a0, %a1
4557  %2 = trunc <8 x i64> %1 to <8 x i16>
4558  ret <8 x i16> %2
4559}
4560
; trunc_or_v8i32_8i16 - or two <8 x i32> arguments lane-wise, then truncate
; every lane to i16.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4561define <8 x i16> @trunc_or_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4562; SSE-LABEL: trunc_or_v8i32_8i16:
4563; SSE:       # BB#0:
4564; SSE-NEXT:    por %xmm2, %xmm0
4565; SSE-NEXT:    por %xmm3, %xmm1
4566; SSE-NEXT:    pslld $16, %xmm1
4567; SSE-NEXT:    psrad $16, %xmm1
4568; SSE-NEXT:    pslld $16, %xmm0
4569; SSE-NEXT:    psrad $16, %xmm0
4570; SSE-NEXT:    packssdw %xmm1, %xmm0
4571; SSE-NEXT:    retq
4572;
4573; AVX1-LABEL: trunc_or_v8i32_8i16:
4574; AVX1:       # BB#0:
4575; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4576; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4577; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4578; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4579; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4580; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4581; AVX1-NEXT:    vzeroupper
4582; AVX1-NEXT:    retq
4583;
4584; AVX2-LABEL: trunc_or_v8i32_8i16:
4585; AVX2:       # BB#0:
4586; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4587; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4588; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4589; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4590; AVX2-NEXT:    vzeroupper
4591; AVX2-NEXT:    retq
4592;
4593; AVX512-LABEL: trunc_or_v8i32_8i16:
4594; AVX512:       # BB#0:
4595; AVX512-NEXT:    vorps %ymm1, %ymm0, %ymm0
4596; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4597; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4598; AVX512-NEXT:    retq
; IR under test: lane-wise or of both arguments, then a narrowing trunc to i16.
4599  %1 = or <8 x i32> %a0, %a1
4600  %2 = trunc <8 x i32> %1 to <8 x i16>
4601  ret <8 x i16> %2
4602}
4603
; trunc_or_v16i64_v16i8 - or two <16 x i64> arguments lane-wise, then truncate
; every lane to i8.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
; In the SSE expectations the second operand of each por is read from the
; stack, matched with a {{[0-9]+}}(%rsp) pattern so the exact offset is not pinned.
4604define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4605; SSE-LABEL: trunc_or_v16i64_v16i8:
4606; SSE:       # BB#0:
4607; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
4608; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
4609; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm2
4610; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm3
4611; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm4
4612; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm5
4613; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm6
4614; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm7
4615; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4616; SSE-NEXT:    pand %xmm8, %xmm7
4617; SSE-NEXT:    pand %xmm8, %xmm6
4618; SSE-NEXT:    packuswb %xmm7, %xmm6
4619; SSE-NEXT:    pand %xmm8, %xmm5
4620; SSE-NEXT:    pand %xmm8, %xmm4
4621; SSE-NEXT:    packuswb %xmm5, %xmm4
4622; SSE-NEXT:    packuswb %xmm6, %xmm4
4623; SSE-NEXT:    pand %xmm8, %xmm3
4624; SSE-NEXT:    pand %xmm8, %xmm2
4625; SSE-NEXT:    packuswb %xmm3, %xmm2
4626; SSE-NEXT:    pand %xmm8, %xmm1
4627; SSE-NEXT:    pand %xmm8, %xmm0
4628; SSE-NEXT:    packuswb %xmm1, %xmm0
4629; SSE-NEXT:    packuswb %xmm2, %xmm0
4630; SSE-NEXT:    packuswb %xmm4, %xmm0
4631; SSE-NEXT:    retq
4632;
4633; AVX1-LABEL: trunc_or_v16i64_v16i8:
4634; AVX1:       # BB#0:
4635; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
4636; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
4637; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
4638; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
4639; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
4640; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4641; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4642; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4643; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
4644; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
4645; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
4646; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
4647; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
4648; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
4649; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4650; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4651; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
4652; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
4653; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4654; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
4655; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
4656; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
4657; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4658; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4659; AVX1-NEXT:    vzeroupper
4660; AVX1-NEXT:    retq
4661;
4662; AVX2-LABEL: trunc_or_v16i64_v16i8:
4663; AVX2:       # BB#0:
4664; AVX2-NEXT:    vpor %ymm5, %ymm1, %ymm1
4665; AVX2-NEXT:    vpor %ymm4, %ymm0, %ymm0
4666; AVX2-NEXT:    vpor %ymm7, %ymm3, %ymm3
4667; AVX2-NEXT:    vpor %ymm6, %ymm2, %ymm2
4668; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
4669; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
4670; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
4671; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
4672; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
4673; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4674; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4675; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4676; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4677; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
4678; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4679; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4680; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4681; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4682; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4683; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4684; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4685; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
4686; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4687; AVX2-NEXT:    vzeroupper
4688; AVX2-NEXT:    retq
4689;
4690; AVX512-LABEL: trunc_or_v16i64_v16i8:
4691; AVX512:       # BB#0:
4692; AVX512-NEXT:    vporq %zmm3, %zmm1, %zmm1
4693; AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
4694; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4695; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
4696; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
4697; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4698; AVX512-NEXT:    retq
; IR under test: lane-wise or of both arguments, then a narrowing trunc to i8.
4699  %1 = or <16 x i64> %a0, %a1
4700  %2 = trunc <16 x i64> %1 to <16 x i8>
4701  ret <16 x i8> %2
4702}
4703
; trunc_or_v16i32_v16i8 - or two <16 x i32> arguments lane-wise, then truncate
; every lane to i8.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4704define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4705; SSE-LABEL: trunc_or_v16i32_v16i8:
4706; SSE:       # BB#0:
4707; SSE-NEXT:    por %xmm4, %xmm0
4708; SSE-NEXT:    por %xmm5, %xmm1
4709; SSE-NEXT:    por %xmm6, %xmm2
4710; SSE-NEXT:    por %xmm7, %xmm3
4711; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4712; SSE-NEXT:    pand %xmm4, %xmm3
4713; SSE-NEXT:    pand %xmm4, %xmm2
4714; SSE-NEXT:    packuswb %xmm3, %xmm2
4715; SSE-NEXT:    pand %xmm4, %xmm1
4716; SSE-NEXT:    pand %xmm4, %xmm0
4717; SSE-NEXT:    packuswb %xmm1, %xmm0
4718; SSE-NEXT:    packuswb %xmm2, %xmm0
4719; SSE-NEXT:    retq
4720;
4721; AVX1-LABEL: trunc_or_v16i32_v16i8:
4722; AVX1:       # BB#0:
4723; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4724; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4725; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4726; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4727; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4728; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
4729; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
4730; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4731; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
4732; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
4733; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4734; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4735; AVX1-NEXT:    vzeroupper
4736; AVX1-NEXT:    retq
4737;
4738; AVX2-LABEL: trunc_or_v16i32_v16i8:
4739; AVX2:       # BB#0:
4740; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4741; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4742; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
4743; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4744; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4745; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4746; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
4747; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4748; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4749; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
4750; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4751; AVX2-NEXT:    vzeroupper
4752; AVX2-NEXT:    retq
4753;
4754; AVX512-LABEL: trunc_or_v16i32_v16i8:
4755; AVX512:       # BB#0:
4756; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
4757; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4758; AVX512-NEXT:    retq
; IR under test: lane-wise or of both arguments, then a narrowing trunc to i8.
4759  %1 = or <16 x i32> %a0, %a1
4760  %2 = trunc <16 x i32> %1 to <16 x i8>
4761  ret <16 x i8> %2
4762}
4763
; trunc_or_v16i16_v16i8 - or two <16 x i16> arguments lane-wise, then truncate
; every lane to i8.
; The autogenerated checks below pin the llc output per check prefix; here the
; AVX512F and AVX512BW prefixes carry separate expectations because the two
; configurations use different truncation sequences (see the checks below).
4764define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4765; SSE-LABEL: trunc_or_v16i16_v16i8:
4766; SSE:       # BB#0:
4767; SSE-NEXT:    por %xmm2, %xmm0
4768; SSE-NEXT:    por %xmm3, %xmm1
4769; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
4770; SSE-NEXT:    pand %xmm2, %xmm1
4771; SSE-NEXT:    pand %xmm2, %xmm0
4772; SSE-NEXT:    packuswb %xmm1, %xmm0
4773; SSE-NEXT:    retq
4774;
4775; AVX1-LABEL: trunc_or_v16i16_v16i8:
4776; AVX1:       # BB#0:
4777; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4778; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4779; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4780; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4781; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4782; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4783; AVX1-NEXT:    vzeroupper
4784; AVX1-NEXT:    retq
4785;
4786; AVX2-LABEL: trunc_or_v16i16_v16i8:
4787; AVX2:       # BB#0:
4788; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4789; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4790; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
4791; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4792; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4793; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4794; AVX2-NEXT:    vzeroupper
4795; AVX2-NEXT:    retq
4796;
4797; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4798; AVX512F:       # BB#0:
4799; AVX512F-NEXT:    vorps %ymm1, %ymm0, %ymm0
4800; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
4801; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4802; AVX512F-NEXT:    retq
4803;
4804; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4805; AVX512BW:       # BB#0:
4806; AVX512BW-NEXT:    vorps %ymm1, %ymm0, %ymm0
4807; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4808; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4809; AVX512BW-NEXT:    retq
; IR under test: lane-wise or of both arguments, then a narrowing trunc to i8.
4810  %1 = or <16 x i16> %a0, %a1
4811  %2 = trunc <16 x i16> %1 to <16 x i8>
4812  ret <16 x i8> %2
4813}
4814
4815;
4816; or to constant
4817;
4818
; trunc_or_const_v4i64_4i32 - or a <4 x i64> argument with the constant vector
; <0, 1, 2, 3>, then truncate every lane to i32.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4819define <4 x i32> @trunc_or_const_v4i64_4i32(<4 x i64> %a0) nounwind {
4820; SSE-LABEL: trunc_or_const_v4i64_4i32:
4821; SSE:       # BB#0:
4822; SSE-NEXT:    movl $1, %eax
4823; SSE-NEXT:    movd %rax, %xmm2
4824; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
4825; SSE-NEXT:    por %xmm0, %xmm2
4826; SSE-NEXT:    por {{.*}}(%rip), %xmm1
4827; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4828; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4829; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4830; SSE-NEXT:    retq
4831;
4832; AVX1-LABEL: trunc_or_const_v4i64_4i32:
4833; AVX1:       # BB#0:
4834; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
4835; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4836; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
4837; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4838; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4839; AVX1-NEXT:    vzeroupper
4840; AVX1-NEXT:    retq
4841;
4842; AVX2-LABEL: trunc_or_const_v4i64_4i32:
4843; AVX2:       # BB#0:
4844; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm0, %ymm0
4845; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4846; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4847; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4848; AVX2-NEXT:    vzeroupper
4849; AVX2-NEXT:    retq
4850;
4851; AVX512-LABEL: trunc_or_const_v4i64_4i32:
4852; AVX512:       # BB#0:
4853; AVX512-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
4854; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4855; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4856; AVX512-NEXT:    retq
; IR under test: lane-wise or with a constant, then a narrowing trunc to i32.
4857  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4858  %2 = trunc <4 x i64> %1 to <4 x i32>
4859  ret <4 x i32> %2
4860}
4861
; trunc_or_const_v16i64_v16i16 - or a <8 x i64> argument with the constant
; vector <0, 1, ..., 7>, then truncate every lane to i16.
; NOTE(review): the function name advertises v16i64 -> v16i16, but the
; signature and IR below operate on <8 x i64> -> <8 x i16>. A rename would have
; to touch the define line and every -LABEL check together.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4862define <8 x i16> @trunc_or_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
4863; SSE-LABEL: trunc_or_const_v16i64_v16i16:
4864; SSE:       # BB#0:
4865; SSE-NEXT:    movdqa %xmm0, %xmm4
4866; SSE-NEXT:    movl $1, %eax
4867; SSE-NEXT:    movd %rax, %xmm0
4868; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
4869; SSE-NEXT:    por %xmm4, %xmm0
4870; SSE-NEXT:    por {{.*}}(%rip), %xmm2
4871; SSE-NEXT:    por {{.*}}(%rip), %xmm3
4872; SSE-NEXT:    por {{.*}}(%rip), %xmm1
4873; SSE-NEXT:    pextrw $4, %xmm1, %eax
4874; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4875; SSE-NEXT:    pextrw $4, %xmm0, %ecx
4876; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
4877; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4878; SSE-NEXT:    pextrw $4, %xmm3, %edx
4879; SSE-NEXT:    movd %edx, %xmm1
4880; SSE-NEXT:    movd %eax, %xmm3
4881; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4882; SSE-NEXT:    movd %ecx, %xmm1
4883; SSE-NEXT:    pextrw $4, %xmm2, %eax
4884; SSE-NEXT:    movd %eax, %xmm2
4885; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4886; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
4887; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4888; SSE-NEXT:    retq
4889;
4890; AVX1-LABEL: trunc_or_const_v16i64_v16i16:
4891; AVX1:       # BB#0:
4892; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
4893; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
4894; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
4895; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
4896; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4897; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
4898; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
4899; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4900; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
4901; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
4902; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4903; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4904; AVX1-NEXT:    vzeroupper
4905; AVX1-NEXT:    retq
4906;
4907; AVX2-LABEL: trunc_or_const_v16i64_v16i16:
4908; AVX2:       # BB#0:
4909; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm1, %ymm1
4910; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm0, %ymm0
4911; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
4912; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
4913; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
4914; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
4915; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4916; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4917; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4918; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4919; AVX2-NEXT:    vzeroupper
4920; AVX2-NEXT:    retq
4921;
4922; AVX512-LABEL: trunc_or_const_v16i64_v16i16:
4923; AVX512:       # BB#0:
4924; AVX512-NEXT:    vporq {{.*}}(%rip), %zmm0, %zmm0
4925; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4926; AVX512-NEXT:    retq
; IR under test: lane-wise or with a constant, then a narrowing trunc to i16.
4927  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4928  %2 = trunc <8 x i64> %1 to <8 x i16>
4929  ret <8 x i16> %2
4930}
4931
; trunc_or_const_v16i32_v16i16 - or a <8 x i32> argument with the constant
; vector <0, 1, ..., 7>, then truncate every lane to i16.
; NOTE(review): the function name advertises v16i32 -> v16i16, but the
; signature and IR below operate on <8 x i32> -> <8 x i16>. A rename would have
; to touch the define line and every -LABEL check together.
; The autogenerated checks below pin the llc output for the SSE, AVX1, AVX2 and
; AVX512 check prefixes declared by the llc invocations at the top of the file.
4932define <8 x i16> @trunc_or_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
4933; SSE-LABEL: trunc_or_const_v16i32_v16i16:
4934; SSE:       # BB#0:
4935; SSE-NEXT:    por {{.*}}(%rip), %xmm0
4936; SSE-NEXT:    por {{.*}}(%rip), %xmm1
4937; SSE-NEXT:    pslld $16, %xmm1
4938; SSE-NEXT:    psrad $16, %xmm1
4939; SSE-NEXT:    pslld $16, %xmm0
4940; SSE-NEXT:    psrad $16, %xmm0
4941; SSE-NEXT:    packssdw %xmm1, %xmm0
4942; SSE-NEXT:    retq
4943;
4944; AVX1-LABEL: trunc_or_const_v16i32_v16i16:
4945; AVX1:       # BB#0:
4946; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
4947; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4948; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
4949; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4950; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4951; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4952; AVX1-NEXT:    vzeroupper
4953; AVX1-NEXT:    retq
4954;
4955; AVX2-LABEL: trunc_or_const_v16i32_v16i16:
4956; AVX2:       # BB#0:
4957; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm0, %ymm0
4958; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
4959; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4960; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4961; AVX2-NEXT:    vzeroupper
4962; AVX2-NEXT:    retq
4963;
4964; AVX512-LABEL: trunc_or_const_v16i32_v16i16:
4965; AVX512:       # BB#0:
4966; AVX512-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
4967; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4968; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
4969; AVX512-NEXT:    retq
; IR under test: lane-wise or with a constant, then a narrowing trunc to i16.
4970  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4971  %2 = trunc <8 x i32> %1 to <8 x i16>
4972  ret <8 x i16> %2
4973}
4974
; trunc_or_const_v16i64_v16i8: ORs each i64 lane of %a0 with its lane index
; (the constant vector <0, 1, ..., 15>) and truncates the result to <16 x i8>.
; The SSE/AVX1/AVX2/AVX512 CHECK lines pin the expected llc output for the
; matching RUN configurations at the top of the file. They are autogenerated
; by utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
; In the expected SSE output the first constant pair <0, 1> is built in a
; register (movl $1 / movd / pslldq) while the other seven ORs take their
; constant operand from a RIP-relative memory location.
4975define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4976; SSE-LABEL: trunc_or_const_v16i64_v16i8:
4977; SSE:       # BB#0:
4978; SSE-NEXT:    movl $1, %eax
4979; SSE-NEXT:    movd %rax, %xmm8
4980; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
4981; SSE-NEXT:    por %xmm8, %xmm0
4982; SSE-NEXT:    por {{.*}}(%rip), %xmm1
4983; SSE-NEXT:    por {{.*}}(%rip), %xmm2
4984; SSE-NEXT:    por {{.*}}(%rip), %xmm3
4985; SSE-NEXT:    por {{.*}}(%rip), %xmm4
4986; SSE-NEXT:    por {{.*}}(%rip), %xmm5
4987; SSE-NEXT:    por {{.*}}(%rip), %xmm6
4988; SSE-NEXT:    por {{.*}}(%rip), %xmm7
4989; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4990; SSE-NEXT:    pand %xmm8, %xmm7
4991; SSE-NEXT:    pand %xmm8, %xmm6
4992; SSE-NEXT:    packuswb %xmm7, %xmm6
4993; SSE-NEXT:    pand %xmm8, %xmm5
4994; SSE-NEXT:    pand %xmm8, %xmm4
4995; SSE-NEXT:    packuswb %xmm5, %xmm4
4996; SSE-NEXT:    packuswb %xmm6, %xmm4
4997; SSE-NEXT:    pand %xmm8, %xmm3
4998; SSE-NEXT:    pand %xmm8, %xmm2
4999; SSE-NEXT:    packuswb %xmm3, %xmm2
5000; SSE-NEXT:    pand %xmm8, %xmm1
5001; SSE-NEXT:    pand %xmm8, %xmm0
5002; SSE-NEXT:    packuswb %xmm1, %xmm0
5003; SSE-NEXT:    packuswb %xmm2, %xmm0
5004; SSE-NEXT:    packuswb %xmm4, %xmm0
5005; SSE-NEXT:    retq
5006;
5007; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
5008; AVX1:       # BB#0:
5009; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
5010; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
5011; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm2, %ymm2
5012; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm3, %ymm3
5013; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
5014; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
5015; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
5016; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
5017; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
5018; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
5019; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
5020; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
5021; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
5022; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
5023; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
5024; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
5025; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
5026; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
5027; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
5028; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
5029; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
5030; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
5031; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5032; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
5033; AVX1-NEXT:    vzeroupper
5034; AVX1-NEXT:    retq
5035;
5036; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
5037; AVX2:       # BB#0:
5038; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm1, %ymm1
5039; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm0, %ymm0
5040; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm3, %ymm3
5041; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm2, %ymm2
5042; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
5043; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
5044; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
5045; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
5046; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
5047; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
5048; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
5049; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
5050; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
5051; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
5052; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
5053; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
5054; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
5055; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
5056; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
5057; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
5058; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5059; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
5060; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
5061; AVX2-NEXT:    vzeroupper
5062; AVX2-NEXT:    retq
5063;
5064; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
5065; AVX512:       # BB#0:
5066; AVX512-NEXT:    vporq {{.*}}(%rip), %zmm1, %zmm1
5067; AVX512-NEXT:    vporq {{.*}}(%rip), %zmm0, %zmm0
5068; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
5069; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
5070; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
5071; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
5072; AVX512-NEXT:    retq
5073  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
5074  %2 = trunc <16 x i64> %1 to <16 x i8>
5075  ret <16 x i8> %2
5076}
5077
; trunc_or_const_v16i32_v16i8: ORs each i32 lane of %a0 with its lane index
; (the constant vector <0, 1, ..., 15>) and truncates the result to <16 x i8>.
; The SSE/AVX1/AVX2/AVX512 CHECK lines pin the expected llc output for the
; matching RUN configurations at the top of the file. They are autogenerated
; by utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
; The expected AVX512 output is a single vpord on the full zmm register
; followed by a vpmovdb truncation.
5078define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
5079; SSE-LABEL: trunc_or_const_v16i32_v16i8:
5080; SSE:       # BB#0:
5081; SSE-NEXT:    por {{.*}}(%rip), %xmm0
5082; SSE-NEXT:    por {{.*}}(%rip), %xmm1
5083; SSE-NEXT:    por {{.*}}(%rip), %xmm2
5084; SSE-NEXT:    por {{.*}}(%rip), %xmm3
5085; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
5086; SSE-NEXT:    pand %xmm4, %xmm3
5087; SSE-NEXT:    pand %xmm4, %xmm2
5088; SSE-NEXT:    packuswb %xmm3, %xmm2
5089; SSE-NEXT:    pand %xmm4, %xmm1
5090; SSE-NEXT:    pand %xmm4, %xmm0
5091; SSE-NEXT:    packuswb %xmm1, %xmm0
5092; SSE-NEXT:    packuswb %xmm2, %xmm0
5093; SSE-NEXT:    retq
5094;
5095; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
5096; AVX1:       # BB#0:
5097; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
5098; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
5099; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
5100; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
5101; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
5102; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
5103; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
5104; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
5105; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
5106; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
5107; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
5108; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5109; AVX1-NEXT:    vzeroupper
5110; AVX1-NEXT:    retq
5111;
5112; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
5113; AVX2:       # BB#0:
5114; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm0, %ymm0
5115; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm1, %ymm1
5116; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
5117; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
5118; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5119; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
5120; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
5121; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
5122; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5123; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
5124; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5125; AVX2-NEXT:    vzeroupper
5126; AVX2-NEXT:    retq
5127;
5128; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
5129; AVX512:       # BB#0:
5130; AVX512-NEXT:    vpord {{.*}}(%rip), %zmm0, %zmm0
5131; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
5132; AVX512-NEXT:    retq
5133  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
5134  %2 = trunc <16 x i32> %1 to <16 x i8>
5135  ret <16 x i8> %2
5136}
5137
; trunc_or_const_v16i16_v16i8: ORs each i16 lane of %a0 with its lane index
; (the constant vector <0, 1, ..., 15>) and truncates the result to <16 x i8>.
; The CHECK lines pin the expected llc output per RUN configuration and are
; autogenerated by utils/update_llc_test_checks.py; regenerate them instead of
; hand-editing. Unlike the wider-element variants, the two AVX-512 runs are
; checked separately here: the AVX512F expectation widens to 32-bit lanes
; (vpmovsxwd) and truncates with vpmovdb, while the AVX512BW expectation
; truncates directly with vpmovwb.
5138define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
5139; SSE-LABEL: trunc_or_const_v16i16_v16i8:
5140; SSE:       # BB#0:
5141; SSE-NEXT:    por {{.*}}(%rip), %xmm0
5142; SSE-NEXT:    por {{.*}}(%rip), %xmm1
5143; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
5144; SSE-NEXT:    pand %xmm2, %xmm1
5145; SSE-NEXT:    pand %xmm2, %xmm0
5146; SSE-NEXT:    packuswb %xmm1, %xmm0
5147; SSE-NEXT:    retq
5148;
5149; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
5150; AVX1:       # BB#0:
5151; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
5152; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
5153; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
5154; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
5155; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
5156; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5157; AVX1-NEXT:    vzeroupper
5158; AVX1-NEXT:    retq
5159;
5160; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
5161; AVX2:       # BB#0:
5162; AVX2-NEXT:    vpor {{.*}}(%rip), %ymm0, %ymm0
5163; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5164; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
5165; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
5166; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
5167; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5168; AVX2-NEXT:    vzeroupper
5169; AVX2-NEXT:    retq
5170;
5171; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
5172; AVX512F:       # BB#0:
5173; AVX512F-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
5174; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
5175; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
5176; AVX512F-NEXT:    retq
5177;
5178; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
5179; AVX512BW:       # BB#0:
5180; AVX512BW-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
5181; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
5182; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5183; AVX512BW-NEXT:    retq
5184  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
5185  %2 = trunc <16 x i16> %1 to <16 x i8>
5186  ret <16 x i8> %2
5187}
5188
5189;
5190; complex patterns - often created by the vectorizer
5191;
5192
; mul_add_v4i64_v4i32: sign-extends both <4 x i32> arguments to <4 x i64>,
; multiplies them, adds the constant vector <-3, -1, 1, 3>, and truncates the
; sum back to <4 x i32>.
; The SSE/AVX1/AVX2/AVX512 CHECK lines pin the expected llc output for the
; matching RUN configurations at the top of the file. They are autogenerated
; by utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
; In every configuration the expected output still performs the multiply in
; 64-bit lanes (sign extension followed by pmuludq / shift / paddq sequences)
; and only truncates at the end.
; NOTE(review): presumably this records current codegen as a baseline for
; narrowing the arithmetic ahead of the truncate - confirm intent.
5193define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5194; SSE-LABEL: mul_add_v4i64_v4i32:
5195; SSE:       # BB#0:
5196; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
5197; SSE-NEXT:    movdqa %xmm2, %xmm3
5198; SSE-NEXT:    psrad $31, %xmm3
5199; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
5200; SSE-NEXT:    movdqa %xmm0, %xmm3
5201; SSE-NEXT:    psrad $31, %xmm3
5202; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
5203; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
5204; SSE-NEXT:    movdqa %xmm3, %xmm4
5205; SSE-NEXT:    psrad $31, %xmm4
5206; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
5207; SSE-NEXT:    movdqa %xmm1, %xmm4
5208; SSE-NEXT:    psrad $31, %xmm4
5209; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
5210; SSE-NEXT:    movdqa %xmm0, %xmm4
5211; SSE-NEXT:    pmuludq %xmm1, %xmm4
5212; SSE-NEXT:    movdqa %xmm1, %xmm5
5213; SSE-NEXT:    psrlq $32, %xmm5
5214; SSE-NEXT:    pmuludq %xmm0, %xmm5
5215; SSE-NEXT:    psllq $32, %xmm5
5216; SSE-NEXT:    paddq %xmm4, %xmm5
5217; SSE-NEXT:    psrlq $32, %xmm0
5218; SSE-NEXT:    pmuludq %xmm1, %xmm0
5219; SSE-NEXT:    psllq $32, %xmm0
5220; SSE-NEXT:    paddq %xmm5, %xmm0
5221; SSE-NEXT:    movdqa %xmm2, %xmm1
5222; SSE-NEXT:    pmuludq %xmm3, %xmm1
5223; SSE-NEXT:    movdqa %xmm3, %xmm4
5224; SSE-NEXT:    psrlq $32, %xmm4
5225; SSE-NEXT:    pmuludq %xmm2, %xmm4
5226; SSE-NEXT:    psllq $32, %xmm4
5227; SSE-NEXT:    paddq %xmm1, %xmm4
5228; SSE-NEXT:    psrlq $32, %xmm2
5229; SSE-NEXT:    pmuludq %xmm3, %xmm2
5230; SSE-NEXT:    psllq $32, %xmm2
5231; SSE-NEXT:    paddq %xmm4, %xmm2
5232; SSE-NEXT:    paddq {{.*}}(%rip), %xmm2
5233; SSE-NEXT:    paddq {{.*}}(%rip), %xmm0
5234; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5235; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
5236; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5237; SSE-NEXT:    retq
5238;
5239; AVX1-LABEL: mul_add_v4i64_v4i32:
5240; AVX1:       # BB#0:
5241; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
5242; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
5243; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
5244; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
5245; AVX1-NEXT:    vpmovsxdq %xmm3, %xmm3
5246; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
5247; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm4
5248; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm5
5249; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm5
5250; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
5251; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
5252; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
5253; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
5254; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
5255; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
5256; AVX1-NEXT:    vpmuludq %xmm3, %xmm2, %xmm1
5257; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm4
5258; AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
5259; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
5260; AVX1-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
5261; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
5262; AVX1-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
5263; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
5264; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
5265; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm1, %xmm1
5266; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
5267; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5268; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
5269; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
5270; AVX1-NEXT:    retq
5271;
5272; AVX2-LABEL: mul_add_v4i64_v4i32:
5273; AVX2:       # BB#0:
5274; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
5275; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
5276; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
5277; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
5278; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
5279; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
5280; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
5281; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
5282; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
5283; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
5284; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
5285; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
5286; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
5287; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
5288; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5289; AVX2-NEXT:    vzeroupper
5290; AVX2-NEXT:    retq
5291;
5292; AVX512-LABEL: mul_add_v4i64_v4i32:
5293; AVX512:       # BB#0:
5294; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
5295; AVX512-NEXT:    vpmovsxdq %xmm1, %ymm1
5296; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
5297; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm3
5298; AVX512-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
5299; AVX512-NEXT:    vpsllq $32, %ymm3, %ymm3
5300; AVX512-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
5301; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
5302; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
5303; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
5304; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
5305; AVX512-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
5306; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
5307; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
5308; AVX512-NEXT:    retq
5309  %1 = sext <4 x i32> %a0 to <4 x i64>
5310  %2 = sext <4 x i32> %a1 to <4 x i64>
5311  %3 = mul <4 x i64> %1, %2
5312  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
5313  %5 = trunc <4 x i64> %4 to <4 x i32>
5314  ret <4 x i32> %5
5315}
5316