1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
9
10;
11; add
12;
13
14define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
15; SSE-LABEL: trunc_add_v4i64_v4i32:
16; SSE:       # %bb.0:
17; SSE-NEXT:    paddq %xmm3, %xmm1
18; SSE-NEXT:    paddq %xmm2, %xmm0
19; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
20; SSE-NEXT:    retq
21;
22; AVX1-LABEL: trunc_add_v4i64_v4i32:
23; AVX1:       # %bb.0:
24; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
25; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
26; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
27; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
28; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
29; AVX1-NEXT:    vzeroupper
30; AVX1-NEXT:    retq
31;
32; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
33; AVX2-SLOW:       # %bb.0:
34; AVX2-SLOW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
35; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
36; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
37; AVX2-SLOW-NEXT:    vzeroupper
38; AVX2-SLOW-NEXT:    retq
39;
40; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
41; AVX2-FAST:       # %bb.0:
42; AVX2-FAST-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
43; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
44; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
45; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
46; AVX2-FAST-NEXT:    vzeroupper
47; AVX2-FAST-NEXT:    retq
48;
49; AVX512-LABEL: trunc_add_v4i64_v4i32:
50; AVX512:       # %bb.0:
51; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
52; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
53; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
54; AVX512-NEXT:    vzeroupper
55; AVX512-NEXT:    retq
56  %1 = add <4 x i64> %a0, %a1
57  %2 = trunc <4 x i64> %1 to <4 x i32>
58  ret <4 x i32> %2
59}
60
61define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
62; SSE-LABEL: trunc_add_v8i64_v8i16:
63; SSE:       # %bb.0:
64; SSE-NEXT:    paddq %xmm6, %xmm2
65; SSE-NEXT:    paddq %xmm7, %xmm3
66; SSE-NEXT:    paddq %xmm4, %xmm0
67; SSE-NEXT:    paddq %xmm5, %xmm1
68; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
69; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
70; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
71; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
72; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
73; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
74; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
75; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
76; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
77; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
78; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
79; SSE-NEXT:    retq
80;
81; AVX1-LABEL: trunc_add_v8i64_v8i16:
82; AVX1:       # %bb.0:
83; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
84; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
85; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
86; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
87; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
88; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
89; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
90; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
91; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
92; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
93; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
94; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
95; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
96; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
97; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
98; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
99; AVX1-NEXT:    vzeroupper
100; AVX1-NEXT:    retq
101;
102; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
103; AVX2-SLOW:       # %bb.0:
104; AVX2-SLOW-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
105; AVX2-SLOW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
106; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
107; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
108; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
109; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
110; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
111; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
112; AVX2-SLOW-NEXT:    vzeroupper
113; AVX2-SLOW-NEXT:    retq
114;
115; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
116; AVX2-FAST:       # %bb.0:
117; AVX2-FAST-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
118; AVX2-FAST-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
119; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
120; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
121; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
122; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
123; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
124; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
125; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
126; AVX2-FAST-NEXT:    vzeroupper
127; AVX2-FAST-NEXT:    retq
128;
129; AVX512-LABEL: trunc_add_v8i64_v8i16:
130; AVX512:       # %bb.0:
131; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
132; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
133; AVX512-NEXT:    vzeroupper
134; AVX512-NEXT:    retq
135  %1 = add <8 x i64> %a0, %a1
136  %2 = trunc <8 x i64> %1 to <8 x i16>
137  ret <8 x i16> %2
138}
139
140define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
141; SSE-LABEL: trunc_add_v8i32_v8i16:
142; SSE:       # %bb.0:
143; SSE-NEXT:    paddd %xmm2, %xmm0
144; SSE-NEXT:    paddd %xmm3, %xmm1
145; SSE-NEXT:    pslld $16, %xmm1
146; SSE-NEXT:    psrad $16, %xmm1
147; SSE-NEXT:    pslld $16, %xmm0
148; SSE-NEXT:    psrad $16, %xmm0
149; SSE-NEXT:    packssdw %xmm1, %xmm0
150; SSE-NEXT:    retq
151;
152; AVX1-LABEL: trunc_add_v8i32_v8i16:
153; AVX1:       # %bb.0:
154; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
155; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
156; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
157; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
158; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
159; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
160; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
161; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
162; AVX1-NEXT:    vzeroupper
163; AVX1-NEXT:    retq
164;
165; AVX2-LABEL: trunc_add_v8i32_v8i16:
166; AVX2:       # %bb.0:
167; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
168; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
169; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
170; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
171; AVX2-NEXT:    vzeroupper
172; AVX2-NEXT:    retq
173;
174; AVX512-LABEL: trunc_add_v8i32_v8i16:
175; AVX512:       # %bb.0:
176; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
177; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
178; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
179; AVX512-NEXT:    vzeroupper
180; AVX512-NEXT:    retq
181  %1 = add <8 x i32> %a0, %a1
182  %2 = trunc <8 x i32> %1 to <8 x i16>
183  ret <8 x i16> %2
184}
185
186define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
187; SSE-LABEL: trunc_add_v16i64_v16i8:
188; SSE:       # %bb.0:
189; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
190; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
191; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
192; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
193; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
194; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
195; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
196; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
197; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
198; SSE-NEXT:    pand %xmm8, %xmm7
199; SSE-NEXT:    pand %xmm8, %xmm6
200; SSE-NEXT:    packuswb %xmm7, %xmm6
201; SSE-NEXT:    pand %xmm8, %xmm5
202; SSE-NEXT:    pand %xmm8, %xmm4
203; SSE-NEXT:    packuswb %xmm5, %xmm4
204; SSE-NEXT:    packuswb %xmm6, %xmm4
205; SSE-NEXT:    pand %xmm8, %xmm3
206; SSE-NEXT:    pand %xmm8, %xmm2
207; SSE-NEXT:    packuswb %xmm3, %xmm2
208; SSE-NEXT:    pand %xmm8, %xmm1
209; SSE-NEXT:    pand %xmm8, %xmm0
210; SSE-NEXT:    packuswb %xmm1, %xmm0
211; SSE-NEXT:    packuswb %xmm2, %xmm0
212; SSE-NEXT:    packuswb %xmm4, %xmm0
213; SSE-NEXT:    retq
214;
215; AVX1-LABEL: trunc_add_v16i64_v16i8:
216; AVX1:       # %bb.0:
217; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
218; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
219; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
220; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
221; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
222; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
223; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
224; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
225; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
226; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
227; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
228; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
229; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
230; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
231; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
232; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
233; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
234; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
235; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
236; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
237; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
238; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
239; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
240; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
241; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
242; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
243; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
244; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
245; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
246; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
247; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
248; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
249; AVX1-NEXT:    vzeroupper
250; AVX1-NEXT:    retq
251;
252; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
253; AVX2-SLOW:       # %bb.0:
254; AVX2-SLOW-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
255; AVX2-SLOW-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
256; AVX2-SLOW-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
257; AVX2-SLOW-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
258; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
259; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
260; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
261; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
262; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
263; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
264; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
265; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
266; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
267; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
268; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
269; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
270; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
271; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
272; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
273; AVX2-SLOW-NEXT:    vzeroupper
274; AVX2-SLOW-NEXT:    retq
275;
276; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8:
277; AVX2-FAST:       # %bb.0:
278; AVX2-FAST-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
279; AVX2-FAST-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
280; AVX2-FAST-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
281; AVX2-FAST-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
282; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
283; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
284; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
285; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
286; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
287; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
288; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
289; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
290; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
291; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
292; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
293; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
294; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
295; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
296; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
297; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
298; AVX2-FAST-NEXT:    vzeroupper
299; AVX2-FAST-NEXT:    retq
300;
301; AVX512-LABEL: trunc_add_v16i64_v16i8:
302; AVX512:       # %bb.0:
303; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
304; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
305; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
306; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
307; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
308; AVX512-NEXT:    vzeroupper
309; AVX512-NEXT:    retq
310  %1 = add <16 x i64> %a0, %a1
311  %2 = trunc <16 x i64> %1 to <16 x i8>
312  ret <16 x i8> %2
313}
314
315define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
316; SSE-LABEL: trunc_add_v16i32_v16i8:
317; SSE:       # %bb.0:
318; SSE-NEXT:    paddd %xmm4, %xmm0
319; SSE-NEXT:    paddd %xmm5, %xmm1
320; SSE-NEXT:    paddd %xmm6, %xmm2
321; SSE-NEXT:    paddd %xmm7, %xmm3
322; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
323; SSE-NEXT:    pand %xmm4, %xmm3
324; SSE-NEXT:    pand %xmm4, %xmm2
325; SSE-NEXT:    packuswb %xmm3, %xmm2
326; SSE-NEXT:    pand %xmm4, %xmm1
327; SSE-NEXT:    pand %xmm4, %xmm0
328; SSE-NEXT:    packuswb %xmm1, %xmm0
329; SSE-NEXT:    packuswb %xmm2, %xmm0
330; SSE-NEXT:    retq
331;
332; AVX1-LABEL: trunc_add_v16i32_v16i8:
333; AVX1:       # %bb.0:
334; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
335; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
336; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
337; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
338; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
339; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
340; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
341; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
342; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
343; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
344; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
345; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
346; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
347; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
348; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
349; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
350; AVX1-NEXT:    vzeroupper
351; AVX1-NEXT:    retq
352;
353; AVX2-LABEL: trunc_add_v16i32_v16i8:
354; AVX2:       # %bb.0:
355; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
356; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
357; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
358; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
359; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
360; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
361; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
362; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
363; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
364; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
365; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
366; AVX2-NEXT:    vzeroupper
367; AVX2-NEXT:    retq
368;
369; AVX512-LABEL: trunc_add_v16i32_v16i8:
370; AVX512:       # %bb.0:
371; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
372; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
373; AVX512-NEXT:    vzeroupper
374; AVX512-NEXT:    retq
375  %1 = add <16 x i32> %a0, %a1
376  %2 = trunc <16 x i32> %1 to <16 x i8>
377  ret <16 x i8> %2
378}
379
380define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
381; SSE-LABEL: trunc_add_v16i16_v16i8:
382; SSE:       # %bb.0:
383; SSE-NEXT:    paddw %xmm2, %xmm0
384; SSE-NEXT:    paddw %xmm3, %xmm1
385; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
386; SSE-NEXT:    pand %xmm2, %xmm1
387; SSE-NEXT:    pand %xmm2, %xmm0
388; SSE-NEXT:    packuswb %xmm1, %xmm0
389; SSE-NEXT:    retq
390;
391; AVX1-LABEL: trunc_add_v16i16_v16i8:
392; AVX1:       # %bb.0:
393; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
394; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
395; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
396; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
397; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
398; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
399; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
400; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
401; AVX1-NEXT:    vzeroupper
402; AVX1-NEXT:    retq
403;
404; AVX2-LABEL: trunc_add_v16i16_v16i8:
405; AVX2:       # %bb.0:
406; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
407; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
408; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
409; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
410; AVX2-NEXT:    vzeroupper
411; AVX2-NEXT:    retq
412;
413; AVX512F-LABEL: trunc_add_v16i16_v16i8:
414; AVX512F:       # %bb.0:
415; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
416; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
417; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
418; AVX512F-NEXT:    vzeroupper
419; AVX512F-NEXT:    retq
420;
421; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
422; AVX512BW:       # %bb.0:
423; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
424; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
425; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
426; AVX512BW-NEXT:    vzeroupper
427; AVX512BW-NEXT:    retq
428;
429; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
430; AVX512DQ:       # %bb.0:
431; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
432; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
433; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
434; AVX512DQ-NEXT:    vzeroupper
435; AVX512DQ-NEXT:    retq
436  %1 = add <16 x i16> %a0, %a1
437  %2 = trunc <16 x i16> %1 to <16 x i8>
438  ret <16 x i8> %2
439}
440
441define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
442; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
443; SSE:       # %bb.0:
444; SSE-NEXT:    pslld $16, %xmm2
445; SSE-NEXT:    psrad $16, %xmm2
446; SSE-NEXT:    pslld $16, %xmm1
447; SSE-NEXT:    psrad $16, %xmm1
448; SSE-NEXT:    packssdw %xmm2, %xmm1
449; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
450; SSE-NEXT:    psraw $8, %xmm0
451; SSE-NEXT:    paddw %xmm1, %xmm0
452; SSE-NEXT:    retq
453;
454; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
455; AVX1:       # %bb.0:
456; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
457; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
458; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
459; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
460; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
461; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
462; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
463; AVX1-NEXT:    vzeroupper
464; AVX1-NEXT:    retq
465;
466; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
467; AVX2:       # %bb.0:
468; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
469; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
470; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
471; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
472; AVX2-NEXT:    vzeroupper
473; AVX2-NEXT:    retq
474;
475; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
476; AVX512:       # %bb.0:
477; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
478; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
479; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
480; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
481; AVX512-NEXT:    vzeroupper
482; AVX512-NEXT:    retq
483  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
484  %2 = sext <8 x i8> %1 to <8 x i32>
485  %3 = add <8 x i32> %2, %a1
486  %4 = trunc <8 x i32> %3 to <8 x i16>
487  ret <8 x i16> %4
488}
489
490;
491; add to constant
492;
493
494define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
495; SSE-LABEL: trunc_add_const_v4i64_v4i32:
496; SSE:       # %bb.0:
497; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
498; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
499; SSE-NEXT:    retq
500;
501; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
502; AVX1:       # %bb.0:
503; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
504; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
505; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
506; AVX1-NEXT:    vzeroupper
507; AVX1-NEXT:    retq
508;
509; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
510; AVX2-SLOW:       # %bb.0:
511; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
512; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
513; AVX2-SLOW-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
514; AVX2-SLOW-NEXT:    vzeroupper
515; AVX2-SLOW-NEXT:    retq
516;
517; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32:
518; AVX2-FAST:       # %bb.0:
519; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
520; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
521; AVX2-FAST-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
522; AVX2-FAST-NEXT:    vzeroupper
523; AVX2-FAST-NEXT:    retq
524;
525; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
526; AVX512:       # %bb.0:
527; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
528; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
529; AVX512-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
530; AVX512-NEXT:    vzeroupper
531; AVX512-NEXT:    retq
532  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
533  %2 = trunc <4 x i64> %1 to <4 x i32>
534  ret <4 x i32> %2
535}
536
537define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
538; SSE-LABEL: trunc_add_const_v8i64_v8i16:
539; SSE:       # %bb.0:
540; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
541; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
542; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
543; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
544; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
545; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
546; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
547; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
548; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
549; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
550; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
551; SSE-NEXT:    paddw {{.*}}(%rip), %xmm0
552; SSE-NEXT:    retq
553;
554; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
555; AVX1:       # %bb.0:
556; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
557; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
558; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
559; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
560; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
561; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
562; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
563; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
564; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
565; AVX1-NEXT:    vzeroupper
566; AVX1-NEXT:    retq
567;
568; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
569; AVX2-SLOW:       # %bb.0:
570; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
571; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
572; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
573; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
574; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
575; AVX2-SLOW-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
576; AVX2-SLOW-NEXT:    vzeroupper
577; AVX2-SLOW-NEXT:    retq
578;
579; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16:
580; AVX2-FAST:       # %bb.0:
581; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
582; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
583; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
584; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
585; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
586; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
587; AVX2-FAST-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
588; AVX2-FAST-NEXT:    vzeroupper
589; AVX2-FAST-NEXT:    retq
590;
591; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
592; AVX512:       # %bb.0:
593; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
594; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
595; AVX512-NEXT:    vzeroupper
596; AVX512-NEXT:    retq
597  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
598  %2 = trunc <8 x i64> %1 to <8 x i16>
599  ret <8 x i16> %2
600}
601
602define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
603; SSE-LABEL: trunc_add_const_v8i32_v8i16:
604; SSE:       # %bb.0:
605; SSE-NEXT:    pslld $16, %xmm1
606; SSE-NEXT:    psrad $16, %xmm1
607; SSE-NEXT:    pslld $16, %xmm0
608; SSE-NEXT:    psrad $16, %xmm0
609; SSE-NEXT:    packssdw %xmm1, %xmm0
610; SSE-NEXT:    paddw {{.*}}(%rip), %xmm0
611; SSE-NEXT:    retq
612;
613; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
614; AVX1:       # %bb.0:
615; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
616; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
617; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
618; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
619; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
620; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
621; AVX1-NEXT:    vzeroupper
622; AVX1-NEXT:    retq
623;
624; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
625; AVX2:       # %bb.0:
626; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
627; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
628; AVX2-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
629; AVX2-NEXT:    vzeroupper
630; AVX2-NEXT:    retq
631;
632; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
633; AVX512:       # %bb.0:
634; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
635; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
636; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
637; AVX512-NEXT:    vzeroupper
638; AVX512-NEXT:    retq
639  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
640  %2 = trunc <8 x i32> %1 to <8 x i16>
641  ret <8 x i16> %2
642}
643
644define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
645; SSE-LABEL: trunc_add_const_v16i64_v16i8:
646; SSE:       # %bb.0:
647; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
648; SSE-NEXT:    pand %xmm8, %xmm7
649; SSE-NEXT:    pand %xmm8, %xmm6
650; SSE-NEXT:    packuswb %xmm7, %xmm6
651; SSE-NEXT:    pand %xmm8, %xmm5
652; SSE-NEXT:    pand %xmm8, %xmm4
653; SSE-NEXT:    packuswb %xmm5, %xmm4
654; SSE-NEXT:    packuswb %xmm6, %xmm4
655; SSE-NEXT:    pand %xmm8, %xmm3
656; SSE-NEXT:    pand %xmm8, %xmm2
657; SSE-NEXT:    packuswb %xmm3, %xmm2
658; SSE-NEXT:    pand %xmm8, %xmm1
659; SSE-NEXT:    pand %xmm8, %xmm0
660; SSE-NEXT:    packuswb %xmm1, %xmm0
661; SSE-NEXT:    packuswb %xmm2, %xmm0
662; SSE-NEXT:    packuswb %xmm4, %xmm0
663; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
664; SSE-NEXT:    retq
665;
666; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
667; AVX1:       # %bb.0:
668; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
669; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
670; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
671; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
672; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
673; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
674; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
675; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
676; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
677; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
678; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
679; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
680; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
681; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
682; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
683; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
684; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
685; AVX1-NEXT:    vzeroupper
686; AVX1-NEXT:    retq
687;
688; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
689; AVX2-SLOW:       # %bb.0:
690; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
691; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
692; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
693; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
694; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
695; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
696; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
697; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
698; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
699; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
700; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
701; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
702; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
703; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
704; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
705; AVX2-SLOW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
706; AVX2-SLOW-NEXT:    vzeroupper
707; AVX2-SLOW-NEXT:    retq
708;
709; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8:
710; AVX2-FAST:       # %bb.0:
711; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
712; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
713; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
714; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
715; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
716; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
717; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
718; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
719; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
720; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
721; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
722; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
723; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
724; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
725; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
726; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
727; AVX2-FAST-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
728; AVX2-FAST-NEXT:    vzeroupper
729; AVX2-FAST-NEXT:    retq
730;
731; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
732; AVX512:       # %bb.0:
733; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
734; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
735; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
736; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
737; AVX512-NEXT:    vzeroupper
738; AVX512-NEXT:    retq
739  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
740  %2 = trunc <16 x i64> %1 to <16 x i8>
741  ret <16 x i8> %2
742}
743
744define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
745; SSE-LABEL: trunc_add_const_v16i32_v16i8:
746; SSE:       # %bb.0:
747; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
748; SSE-NEXT:    pand %xmm4, %xmm3
749; SSE-NEXT:    pand %xmm4, %xmm2
750; SSE-NEXT:    packuswb %xmm3, %xmm2
751; SSE-NEXT:    pand %xmm4, %xmm1
752; SSE-NEXT:    pand %xmm4, %xmm0
753; SSE-NEXT:    packuswb %xmm1, %xmm0
754; SSE-NEXT:    packuswb %xmm2, %xmm0
755; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
756; SSE-NEXT:    retq
757;
758; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
759; AVX1:       # %bb.0:
760; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
761; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
762; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
763; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
764; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
765; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
766; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
767; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
768; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
769; AVX1-NEXT:    vzeroupper
770; AVX1-NEXT:    retq
771;
772; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
773; AVX2:       # %bb.0:
774; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
775; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
776; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
777; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
778; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
779; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
780; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
781; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
782; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
783; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
784; AVX2-NEXT:    vzeroupper
785; AVX2-NEXT:    retq
786;
787; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
788; AVX512:       # %bb.0:
789; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
790; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
791; AVX512-NEXT:    vzeroupper
792; AVX512-NEXT:    retq
793  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
794  %2 = trunc <16 x i32> %1 to <16 x i8>
795  ret <16 x i8> %2
796}
797
798define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
799; SSE-LABEL: trunc_add_const_v16i16_v16i8:
800; SSE:       # %bb.0:
801; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
802; SSE-NEXT:    pand %xmm2, %xmm1
803; SSE-NEXT:    pand %xmm2, %xmm0
804; SSE-NEXT:    packuswb %xmm1, %xmm0
805; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
806; SSE-NEXT:    retq
807;
808; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
809; AVX1:       # %bb.0:
810; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
811; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
812; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
813; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
814; AVX1-NEXT:    vzeroupper
815; AVX1-NEXT:    retq
816;
817; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
818; AVX2:       # %bb.0:
819; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
820; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
821; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
822; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
823; AVX2-NEXT:    vzeroupper
824; AVX2-NEXT:    retq
825;
826; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
827; AVX512F:       # %bb.0:
828; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
829; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
830; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
831; AVX512F-NEXT:    vzeroupper
832; AVX512F-NEXT:    retq
833;
834; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
835; AVX512BW:       # %bb.0:
836; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
837; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
838; AVX512BW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
839; AVX512BW-NEXT:    vzeroupper
840; AVX512BW-NEXT:    retq
841;
842; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
843; AVX512DQ:       # %bb.0:
844; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
845; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
846; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
847; AVX512DQ-NEXT:    vzeroupper
848; AVX512DQ-NEXT:    retq
849  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
850  %2 = trunc <16 x i16> %1 to <16 x i8>
851  ret <16 x i8> %2
852}
853
854;
855; sub
856;
857
858define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
859; SSE-LABEL: trunc_sub_v4i64_v4i32:
860; SSE:       # %bb.0:
861; SSE-NEXT:    psubq %xmm3, %xmm1
862; SSE-NEXT:    psubq %xmm2, %xmm0
863; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
864; SSE-NEXT:    retq
865;
866; AVX1-LABEL: trunc_sub_v4i64_v4i32:
867; AVX1:       # %bb.0:
868; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
869; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
870; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
871; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
872; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
873; AVX1-NEXT:    vzeroupper
874; AVX1-NEXT:    retq
875;
876; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
877; AVX2-SLOW:       # %bb.0:
878; AVX2-SLOW-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
879; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
880; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
881; AVX2-SLOW-NEXT:    vzeroupper
882; AVX2-SLOW-NEXT:    retq
883;
884; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32:
885; AVX2-FAST:       # %bb.0:
886; AVX2-FAST-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
887; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
888; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
889; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
890; AVX2-FAST-NEXT:    vzeroupper
891; AVX2-FAST-NEXT:    retq
892;
893; AVX512-LABEL: trunc_sub_v4i64_v4i32:
894; AVX512:       # %bb.0:
895; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
896; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
897; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
898; AVX512-NEXT:    vzeroupper
899; AVX512-NEXT:    retq
900  %1 = sub <4 x i64> %a0, %a1
901  %2 = trunc <4 x i64> %1 to <4 x i32>
902  ret <4 x i32> %2
903}
904
905define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
906; SSE-LABEL: trunc_sub_v8i64_v8i16:
907; SSE:       # %bb.0:
908; SSE-NEXT:    psubq %xmm6, %xmm2
909; SSE-NEXT:    psubq %xmm7, %xmm3
910; SSE-NEXT:    psubq %xmm4, %xmm0
911; SSE-NEXT:    psubq %xmm5, %xmm1
912; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
913; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
914; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
915; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
916; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
917; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
918; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
919; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
920; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
921; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
922; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
923; SSE-NEXT:    retq
924;
925; AVX1-LABEL: trunc_sub_v8i64_v8i16:
926; AVX1:       # %bb.0:
927; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
928; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
929; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
930; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
931; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
932; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
933; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
934; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
935; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
936; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
937; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
938; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
939; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
940; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
941; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
942; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
943; AVX1-NEXT:    vzeroupper
944; AVX1-NEXT:    retq
945;
946; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
947; AVX2-SLOW:       # %bb.0:
948; AVX2-SLOW-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
949; AVX2-SLOW-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
950; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
951; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
952; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
953; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
954; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
955; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
956; AVX2-SLOW-NEXT:    vzeroupper
957; AVX2-SLOW-NEXT:    retq
958;
959; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16:
960; AVX2-FAST:       # %bb.0:
961; AVX2-FAST-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
962; AVX2-FAST-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
963; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
964; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
965; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
966; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
967; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
968; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
969; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
970; AVX2-FAST-NEXT:    vzeroupper
971; AVX2-FAST-NEXT:    retq
972;
973; AVX512-LABEL: trunc_sub_v8i64_v8i16:
974; AVX512:       # %bb.0:
975; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
976; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
977; AVX512-NEXT:    vzeroupper
978; AVX512-NEXT:    retq
979  %1 = sub <8 x i64> %a0, %a1
980  %2 = trunc <8 x i64> %1 to <8 x i16>
981  ret <8 x i16> %2
982}
983
984define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
985; SSE-LABEL: trunc_sub_v8i32_v8i16:
986; SSE:       # %bb.0:
987; SSE-NEXT:    psubd %xmm2, %xmm0
988; SSE-NEXT:    psubd %xmm3, %xmm1
989; SSE-NEXT:    pslld $16, %xmm1
990; SSE-NEXT:    psrad $16, %xmm1
991; SSE-NEXT:    pslld $16, %xmm0
992; SSE-NEXT:    psrad $16, %xmm0
993; SSE-NEXT:    packssdw %xmm1, %xmm0
994; SSE-NEXT:    retq
995;
996; AVX1-LABEL: trunc_sub_v8i32_v8i16:
997; AVX1:       # %bb.0:
998; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
999; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1000; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1001; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1002; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1003; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1004; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
1005; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1006; AVX1-NEXT:    vzeroupper
1007; AVX1-NEXT:    retq
1008;
1009; AVX2-LABEL: trunc_sub_v8i32_v8i16:
1010; AVX2:       # %bb.0:
1011; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
1012; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1013; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1014; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1015; AVX2-NEXT:    vzeroupper
1016; AVX2-NEXT:    retq
1017;
1018; AVX512-LABEL: trunc_sub_v8i32_v8i16:
1019; AVX512:       # %bb.0:
1020; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
1021; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1022; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1023; AVX512-NEXT:    vzeroupper
1024; AVX512-NEXT:    retq
1025  %1 = sub <8 x i32> %a0, %a1
1026  %2 = trunc <8 x i32> %1 to <8 x i16>
1027  ret <8 x i16> %2
1028}
1029
1030define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1031; SSE-LABEL: trunc_sub_v16i64_v16i8:
1032; SSE:       # %bb.0:
1033; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
1034; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
1035; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
1036; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
1037; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
1038; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
1039; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
1040; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
1041; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1042; SSE-NEXT:    pand %xmm8, %xmm7
1043; SSE-NEXT:    pand %xmm8, %xmm6
1044; SSE-NEXT:    packuswb %xmm7, %xmm6
1045; SSE-NEXT:    pand %xmm8, %xmm5
1046; SSE-NEXT:    pand %xmm8, %xmm4
1047; SSE-NEXT:    packuswb %xmm5, %xmm4
1048; SSE-NEXT:    packuswb %xmm6, %xmm4
1049; SSE-NEXT:    pand %xmm8, %xmm3
1050; SSE-NEXT:    pand %xmm8, %xmm2
1051; SSE-NEXT:    packuswb %xmm3, %xmm2
1052; SSE-NEXT:    pand %xmm8, %xmm1
1053; SSE-NEXT:    pand %xmm8, %xmm0
1054; SSE-NEXT:    packuswb %xmm1, %xmm0
1055; SSE-NEXT:    packuswb %xmm2, %xmm0
1056; SSE-NEXT:    packuswb %xmm4, %xmm0
1057; SSE-NEXT:    retq
1058;
1059; AVX1-LABEL: trunc_sub_v16i64_v16i8:
1060; AVX1:       # %bb.0:
1061; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
1062; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
1063; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1064; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
1065; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
1066; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
1067; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1068; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
1069; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
1070; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
1071; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1072; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
1073; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
1074; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
1075; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1076; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
1077; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
1078; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
1079; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
1080; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
1081; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
1082; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
1083; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
1084; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1085; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
1086; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
1087; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
1088; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
1089; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
1090; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
1091; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1092; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1093; AVX1-NEXT:    vzeroupper
1094; AVX1-NEXT:    retq
1095;
1096; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
1097; AVX2-SLOW:       # %bb.0:
1098; AVX2-SLOW-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
1099; AVX2-SLOW-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
1100; AVX2-SLOW-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
1101; AVX2-SLOW-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
1102; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
1103; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1104; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
1105; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1106; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1107; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1108; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1109; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
1110; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
1111; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1112; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
1113; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1114; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1115; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
1116; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1117; AVX2-SLOW-NEXT:    vzeroupper
1118; AVX2-SLOW-NEXT:    retq
1119;
1120; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8:
1121; AVX2-FAST:       # %bb.0:
1122; AVX2-FAST-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
1123; AVX2-FAST-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
1124; AVX2-FAST-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
1125; AVX2-FAST-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
1126; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1127; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
1128; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
1129; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1130; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1131; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1132; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1133; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1134; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
1135; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
1136; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
1137; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1138; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1139; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1140; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
1141; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1142; AVX2-FAST-NEXT:    vzeroupper
1143; AVX2-FAST-NEXT:    retq
1144;
1145; AVX512-LABEL: trunc_sub_v16i64_v16i8:
1146; AVX512:       # %bb.0:
1147; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
1148; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
1149; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
1150; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
1151; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1152; AVX512-NEXT:    vzeroupper
1153; AVX512-NEXT:    retq
1154  %1 = sub <16 x i64> %a0, %a1
1155  %2 = trunc <16 x i64> %1 to <16 x i8>
1156  ret <16 x i8> %2
1157}
1158
1159define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
1160; SSE-LABEL: trunc_sub_v16i32_v16i8:
1161; SSE:       # %bb.0:
1162; SSE-NEXT:    psubd %xmm4, %xmm0
1163; SSE-NEXT:    psubd %xmm5, %xmm1
1164; SSE-NEXT:    psubd %xmm6, %xmm2
1165; SSE-NEXT:    psubd %xmm7, %xmm3
1166; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1167; SSE-NEXT:    pand %xmm4, %xmm3
1168; SSE-NEXT:    pand %xmm4, %xmm2
1169; SSE-NEXT:    packuswb %xmm3, %xmm2
1170; SSE-NEXT:    pand %xmm4, %xmm1
1171; SSE-NEXT:    pand %xmm4, %xmm0
1172; SSE-NEXT:    packuswb %xmm1, %xmm0
1173; SSE-NEXT:    packuswb %xmm2, %xmm0
1174; SSE-NEXT:    retq
1175;
1176; AVX1-LABEL: trunc_sub_v16i32_v16i8:
1177; AVX1:       # %bb.0:
1178; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
1179; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1180; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1181; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
1182; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
1183; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1184; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1185; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
1186; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
1187; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1188; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1189; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
1190; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1191; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
1192; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
1193; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1194; AVX1-NEXT:    vzeroupper
1195; AVX1-NEXT:    retq
1196;
1197; AVX2-LABEL: trunc_sub_v16i32_v16i8:
1198; AVX2:       # %bb.0:
1199; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
1200; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
1201; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1202; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1203; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1204; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1205; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
1206; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1207; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1208; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
1209; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1210; AVX2-NEXT:    vzeroupper
1211; AVX2-NEXT:    retq
1212;
1213; AVX512-LABEL: trunc_sub_v16i32_v16i8:
1214; AVX512:       # %bb.0:
1215; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
1216; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1217; AVX512-NEXT:    vzeroupper
1218; AVX512-NEXT:    retq
1219  %1 = sub <16 x i32> %a0, %a1
1220  %2 = trunc <16 x i32> %1 to <16 x i8>
1221  ret <16 x i8> %2
1222}
1223
1224define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
1225; SSE-LABEL: trunc_sub_v16i16_v16i8:
1226; SSE:       # %bb.0:
1227; SSE-NEXT:    psubw %xmm2, %xmm0
1228; SSE-NEXT:    psubw %xmm3, %xmm1
1229; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1230; SSE-NEXT:    pand %xmm2, %xmm1
1231; SSE-NEXT:    pand %xmm2, %xmm0
1232; SSE-NEXT:    packuswb %xmm1, %xmm0
1233; SSE-NEXT:    retq
1234;
1235; AVX1-LABEL: trunc_sub_v16i16_v16i8:
1236; AVX1:       # %bb.0:
1237; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
1238; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1239; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1240; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
1241; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1242; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
1243; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
1244; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
1245; AVX1-NEXT:    vzeroupper
1246; AVX1-NEXT:    retq
1247;
1248; AVX2-LABEL: trunc_sub_v16i16_v16i8:
1249; AVX2:       # %bb.0:
1250; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1251; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
1252; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1253; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1254; AVX2-NEXT:    vzeroupper
1255; AVX2-NEXT:    retq
1256;
1257; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
1258; AVX512F:       # %bb.0:
1259; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1260; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1261; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1262; AVX512F-NEXT:    vzeroupper
1263; AVX512F-NEXT:    retq
1264;
1265; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
1266; AVX512BW:       # %bb.0:
1267; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1268; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1269; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1270; AVX512BW-NEXT:    vzeroupper
1271; AVX512BW-NEXT:    retq
1272;
1273; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
1274; AVX512DQ:       # %bb.0:
1275; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
1276; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1277; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1278; AVX512DQ-NEXT:    vzeroupper
1279; AVX512DQ-NEXT:    retq
1280  %1 = sub <16 x i16> %a0, %a1
1281  %2 = trunc <16 x i16> %1 to <16 x i8>
1282  ret <16 x i8> %2
1283}
1284
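; The zext/sub/trunc sequence only needs the low 8 bits of each lane, so it
; should fold to a single byte subtract.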
1285define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
1286; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
1287; SSE:       # %bb.0:
1288; SSE-NEXT:    psubb %xmm1, %xmm0
1289; SSE-NEXT:    retq
1290;
1291; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
1292; AVX:       # %bb.0:
1293; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1294; AVX-NEXT:    retq
1295  %a = zext <16 x i8> %x to <16 x i16>
1296  %b = zext <16 x i8> %y to <16 x i16>
1297  %c = sub <16 x i16> %a, %b
1298  %d = trunc <16 x i16> %c to <16 x i8>
1299  ret <16 x i8> %d
1300}
1301
1302;
1303; sub to constant
1304;
1305
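; Truncation distributes over the subtract, so the constant can be narrowed
; and subtracted after the truncate.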
1306define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
1307; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
1308; SSE:       # %bb.0:
1309; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1310; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
1311; SSE-NEXT:    retq
1312;
1313; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
1314; AVX1:       # %bb.0:
1315; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1316; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1317; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
1318; AVX1-NEXT:    vzeroupper
1319; AVX1-NEXT:    retq
1320;
1321; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
1322; AVX2-SLOW:       # %bb.0:
1323; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
1324; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1325; AVX2-SLOW-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
1326; AVX2-SLOW-NEXT:    vzeroupper
1327; AVX2-SLOW-NEXT:    retq
1328;
1329; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
1330; AVX2-FAST:       # %bb.0:
1331; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
1332; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
1333; AVX2-FAST-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
1334; AVX2-FAST-NEXT:    vzeroupper
1335; AVX2-FAST-NEXT:    retq
1336;
1337; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
1338; AVX512:       # %bb.0:
1339; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1340; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
1341; AVX512-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
1342; AVX512-NEXT:    vzeroupper
1343; AVX512-NEXT:    retq
1344  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
1345  %2 = trunc <4 x i64> %1 to <4 x i32>
1346  ret <4 x i32> %2
1347}
1348
1349define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
1350; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
1351; SSE:       # %bb.0:
1352; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1353; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1354; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1355; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1356; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1357; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1358; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1359; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1360; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1361; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1362; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1363; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
1364; SSE-NEXT:    retq
1365;
1366; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
1367; AVX1:       # %bb.0:
1368; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
1369; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
1370; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1371; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1372; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
1373; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1374; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1375; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1376; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1377; AVX1-NEXT:    vzeroupper
1378; AVX1-NEXT:    retq
1379;
1380; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
1381; AVX2-SLOW:       # %bb.0:
1382; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
1383; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1384; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
1385; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1386; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1387; AVX2-SLOW-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1388; AVX2-SLOW-NEXT:    vzeroupper
1389; AVX2-SLOW-NEXT:    retq
1390;
1391; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
1392; AVX2-FAST:       # %bb.0:
1393; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1394; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
1395; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
1396; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1397; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1398; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1399; AVX2-FAST-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1400; AVX2-FAST-NEXT:    vzeroupper
1401; AVX2-FAST-NEXT:    retq
1402;
1403; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
1404; AVX512:       # %bb.0:
1405; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
1406; AVX512-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1407; AVX512-NEXT:    vzeroupper
1408; AVX512-NEXT:    retq
1409  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
1410  %2 = trunc <8 x i64> %1 to <8 x i16>
1411  ret <8 x i16> %2
1412}
1413
1414define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
1415; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
1416; SSE:       # %bb.0:
1417; SSE-NEXT:    pslld $16, %xmm1
1418; SSE-NEXT:    psrad $16, %xmm1
1419; SSE-NEXT:    pslld $16, %xmm0
1420; SSE-NEXT:    psrad $16, %xmm0
1421; SSE-NEXT:    packssdw %xmm1, %xmm0
1422; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
1423; SSE-NEXT:    retq
1424;
1425; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
1426; AVX1:       # %bb.0:
1427; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1428; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1429; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1430; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1431; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1432; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1433; AVX1-NEXT:    vzeroupper
1434; AVX1-NEXT:    retq
1435;
1436; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
1437; AVX2:       # %bb.0:
1438; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1439; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1440; AVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1441; AVX2-NEXT:    vzeroupper
1442; AVX2-NEXT:    retq
1443;
1444; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
1445; AVX512:       # %bb.0:
1446; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1447; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1448; AVX512-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
1449; AVX512-NEXT:    vzeroupper
1450; AVX512-NEXT:    retq
1451  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1452  %2 = trunc <8 x i32> %1 to <8 x i16>
1453  ret <8 x i16> %2
1454}
1455
1456define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
1457; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
1458; SSE:       # %bb.0:
1459; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1460; SSE-NEXT:    pand %xmm8, %xmm7
1461; SSE-NEXT:    pand %xmm8, %xmm6
1462; SSE-NEXT:    packuswb %xmm7, %xmm6
1463; SSE-NEXT:    pand %xmm8, %xmm5
1464; SSE-NEXT:    pand %xmm8, %xmm4
1465; SSE-NEXT:    packuswb %xmm5, %xmm4
1466; SSE-NEXT:    packuswb %xmm6, %xmm4
1467; SSE-NEXT:    pand %xmm8, %xmm3
1468; SSE-NEXT:    pand %xmm8, %xmm2
1469; SSE-NEXT:    packuswb %xmm3, %xmm2
1470; SSE-NEXT:    pand %xmm8, %xmm1
1471; SSE-NEXT:    pand %xmm8, %xmm0
1472; SSE-NEXT:    packuswb %xmm1, %xmm0
1473; SSE-NEXT:    packuswb %xmm2, %xmm0
1474; SSE-NEXT:    packuswb %xmm4, %xmm0
1475; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
1476; SSE-NEXT:    retq
1477;
1478; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
1479; AVX1:       # %bb.0:
1480; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
1481; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1482; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1483; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
1484; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1485; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1486; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
1487; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1488; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1489; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1490; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1491; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
1492; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1493; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
1494; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1495; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1496; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1497; AVX1-NEXT:    vzeroupper
1498; AVX1-NEXT:    retq
1499;
1500; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
1501; AVX2-SLOW:       # %bb.0:
1502; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
1503; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
1504; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
1505; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1506; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1507; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1508; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1509; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
1510; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
1511; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1512; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
1513; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1514; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1515; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
1516; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1517; AVX2-SLOW-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1518; AVX2-SLOW-NEXT:    vzeroupper
1519; AVX2-SLOW-NEXT:    retq
1520;
1521; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8:
1522; AVX2-FAST:       # %bb.0:
1523; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1524; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
1525; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
1526; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1527; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1528; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1529; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1530; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1531; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
1532; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
1533; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
1534; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1535; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1536; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1537; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
1538; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1539; AVX2-FAST-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1540; AVX2-FAST-NEXT:    vzeroupper
1541; AVX2-FAST-NEXT:    retq
1542;
1543; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
1544; AVX512:       # %bb.0:
1545; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
1546; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
1547; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1548; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1549; AVX512-NEXT:    vzeroupper
1550; AVX512-NEXT:    retq
1551  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
1552  %2 = trunc <16 x i64> %1 to <16 x i8>
1553  ret <16 x i8> %2
1554}
1555
1556define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
1557; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
1558; SSE:       # %bb.0:
1559; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1560; SSE-NEXT:    pand %xmm4, %xmm3
1561; SSE-NEXT:    pand %xmm4, %xmm2
1562; SSE-NEXT:    packuswb %xmm3, %xmm2
1563; SSE-NEXT:    pand %xmm4, %xmm1
1564; SSE-NEXT:    pand %xmm4, %xmm0
1565; SSE-NEXT:    packuswb %xmm1, %xmm0
1566; SSE-NEXT:    packuswb %xmm2, %xmm0
1567; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
1568; SSE-NEXT:    retq
1569;
1570; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
1571; AVX1:       # %bb.0:
1572; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
1573; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
1574; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1575; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1576; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
1577; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1578; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
1579; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1580; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1581; AVX1-NEXT:    vzeroupper
1582; AVX1-NEXT:    retq
1583;
1584; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
1585; AVX2:       # %bb.0:
1586; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1587; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1588; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
1589; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1590; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
1591; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1592; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1593; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
1594; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1595; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1596; AVX2-NEXT:    vzeroupper
1597; AVX2-NEXT:    retq
1598;
1599; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
1600; AVX512:       # %bb.0:
1601; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
1602; AVX512-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1603; AVX512-NEXT:    vzeroupper
1604; AVX512-NEXT:    retq
1605  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1606  %2 = trunc <16 x i32> %1 to <16 x i8>
1607  ret <16 x i8> %2
1608}
1609
1610define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
1611; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
1612; SSE:       # %bb.0:
1613; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
1614; SSE-NEXT:    pand %xmm2, %xmm1
1615; SSE-NEXT:    pand %xmm2, %xmm0
1616; SSE-NEXT:    packuswb %xmm1, %xmm0
1617; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
1618; SSE-NEXT:    retq
1619;
1620; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
1621; AVX1:       # %bb.0:
1622; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
1623; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1624; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1625; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1626; AVX1-NEXT:    vzeroupper
1627; AVX1-NEXT:    retq
1628;
1629; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
1630; AVX2:       # %bb.0:
1631; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
1632; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1633; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1634; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1635; AVX2-NEXT:    vzeroupper
1636; AVX2-NEXT:    retq
1637;
1638; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
1639; AVX512F:       # %bb.0:
1640; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1641; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
1642; AVX512F-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1643; AVX512F-NEXT:    vzeroupper
1644; AVX512F-NEXT:    retq
1645;
1646; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
1647; AVX512BW:       # %bb.0:
1648; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1649; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1650; AVX512BW-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1651; AVX512BW-NEXT:    vzeroupper
1652; AVX512BW-NEXT:    retq
1653;
1654; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
1655; AVX512DQ:       # %bb.0:
1656; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1657; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1658; AVX512DQ-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1659; AVX512DQ-NEXT:    vzeroupper
1660; AVX512DQ-NEXT:    retq
1661  %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1662  %2 = trunc <16 x i16> %1 to <16 x i8>
1663  ret <16 x i8> %2
1664}
1665
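; As above, the extended subtract of a constant should narrow back to a byte
; subtract against the truncated constant.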
1666define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
1667; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1668; SSE:       # %bb.0:
1669; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
1670; SSE-NEXT:    retq
1671;
1672; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
1673; AVX:       # %bb.0:
1674; AVX-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
1675; AVX-NEXT:    retq
1676  %a = zext <16 x i8> %x to <16 x i16>
1677  %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1678  %c = trunc <16 x i16> %b to <16 x i8>
1679  ret <16 x i8> %c
1680}
1681
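; Keeping the constant on the left-hand side should still allow the subtract
; to narrow to byte elements.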
1682define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
1683; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1684; SSE:       # %bb.0:
1685; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1686; SSE-NEXT:    psubb %xmm0, %xmm1
1687; SSE-NEXT:    movdqa %xmm1, %xmm0
1688; SSE-NEXT:    retq
1689;
1690; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
1691; AVX:       # %bb.0:
1692; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1693; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
1694; AVX-NEXT:    retq
1695  %a = zext <16 x i8> %x to <16 x i16>
1696  %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
1697  %c = trunc <16 x i16> %b to <16 x i8>
1698  ret <16 x i8> %c
1699}
1700
1701;
1702; mul
1703;
1704
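; Truncation of a product only depends on the low bits of the operands, so
; the AVX targets narrow to 32-bit elements before multiplying; AVX512DQ can
; instead multiply in 64-bit lanes with vpmullq.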
1705define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1706; SSE-LABEL: trunc_mul_v4i64_v4i32:
1707; SSE:       # %bb.0:
1708; SSE-NEXT:    pmuludq %xmm3, %xmm1
1709; SSE-NEXT:    pmuludq %xmm2, %xmm0
1710; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1711; SSE-NEXT:    retq
1712;
1713; AVX1-LABEL: trunc_mul_v4i64_v4i32:
1714; AVX1:       # %bb.0:
1715; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1716; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1717; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1718; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1719; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1720; AVX1-NEXT:    vzeroupper
1721; AVX1-NEXT:    retq
1722;
1723; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
1724; AVX2-SLOW:       # %bb.0:
1725; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
1726; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1727; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
1728; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1729; AVX2-SLOW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1730; AVX2-SLOW-NEXT:    vzeroupper
1731; AVX2-SLOW-NEXT:    retq
1732;
1733; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32:
1734; AVX2-FAST:       # %bb.0:
1735; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
1736; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
1737; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
1738; AVX2-FAST-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1739; AVX2-FAST-NEXT:    vzeroupper
1740; AVX2-FAST-NEXT:    retq
1741;
1742; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
1743; AVX512F:       # %bb.0:
1744; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1745; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1746; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
1747; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
1748; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1749; AVX512F-NEXT:    vzeroupper
1750; AVX512F-NEXT:    retq
1751;
1752; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
1753; AVX512BW:       # %bb.0:
1754; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1755; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1756; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
1757; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
1758; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1759; AVX512BW-NEXT:    vzeroupper
1760; AVX512BW-NEXT:    retq
1761;
1762; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
1763; AVX512DQ:       # %bb.0:
1764; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
1765; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1766; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
1767; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
1768; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1769; AVX512DQ-NEXT:    vzeroupper
1770; AVX512DQ-NEXT:    retq
1771  %1 = mul <4 x i64> %a0, %a1
1772  %2 = trunc <4 x i64> %1 to <4 x i32>
1773  ret <4 x i32> %2
1774}
1775
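; AVX512F/BW truncate the i64 operands to i16 with vpmovqw and use a 16-bit
; multiply; AVX512DQ multiplies in 64-bit lanes first.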
1776define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
1777; SSE-LABEL: trunc_mul_v8i64_v8i16:
1778; SSE:       # %bb.0:
1779; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1780; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
1781; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1782; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
1783; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1784; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1785; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
1786; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1787; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
1788; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1789; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
1790; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1791; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1792; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1793; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
1794; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1795; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
1796; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
1797; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
1798; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1799; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1800; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1801; SSE-NEXT:    pmullw %xmm6, %xmm0
1802; SSE-NEXT:    retq
1803;
1804; AVX1-LABEL: trunc_mul_v8i64_v8i16:
1805; AVX1:       # %bb.0:
1806; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
1807; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1808; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1809; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
1810; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1811; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1812; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
1813; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
1814; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1815; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1816; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
1817; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
1818; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1819; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
1820; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
1821; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1822; AVX1-NEXT:    vzeroupper
1823; AVX1-NEXT:    retq
1824;
1825; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
1826; AVX2-SLOW:       # %bb.0:
1827; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
1828; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
1829; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
1830; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1831; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1832; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1833; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
1834; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1835; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6]
1836; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1837; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1838; AVX2-SLOW-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1839; AVX2-SLOW-NEXT:    vzeroupper
1840; AVX2-SLOW-NEXT:    retq
1841;
1842; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16:
1843; AVX2-FAST:       # %bb.0:
1844; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
1845; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
1846; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
1847; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1848; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
1849; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1850; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1851; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
1852; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
1853; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1854; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1855; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1856; AVX2-FAST-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1857; AVX2-FAST-NEXT:    vzeroupper
1858; AVX2-FAST-NEXT:    retq
1859;
1860; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
1861; AVX512F:       # %bb.0:
1862; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
1863; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
1864; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1865; AVX512F-NEXT:    vzeroupper
1866; AVX512F-NEXT:    retq
1867;
1868; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
1869; AVX512BW:       # %bb.0:
1870; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
1871; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
1872; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1873; AVX512BW-NEXT:    vzeroupper
1874; AVX512BW-NEXT:    retq
1875;
1876; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
1877; AVX512DQ:       # %bb.0:
1878; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
1879; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
1880; AVX512DQ-NEXT:    vzeroupper
1881; AVX512DQ-NEXT:    retq
1882  %1 = mul <8 x i64> %a0, %a1
1883  %2 = trunc <8 x i64> %1 to <8 x i16>
1884  ret <8 x i16> %2
1885}
1886
1887define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
1888; SSE-LABEL: trunc_mul_v8i32_v8i16:
1889; SSE:       # %bb.0:
1890; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1891; SSE-NEXT:    pmuludq %xmm2, %xmm0
1892; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1893; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1894; SSE-NEXT:    pmuludq %xmm4, %xmm2
1895; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1896; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1897; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1898; SSE-NEXT:    pmuludq %xmm3, %xmm1
1899; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1900; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
1901; SSE-NEXT:    pmuludq %xmm2, %xmm3
1902; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1903; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1904; SSE-NEXT:    pslld $16, %xmm1
1905; SSE-NEXT:    psrad $16, %xmm1
1906; SSE-NEXT:    pslld $16, %xmm0
1907; SSE-NEXT:    psrad $16, %xmm0
1908; SSE-NEXT:    packssdw %xmm1, %xmm0
1909; SSE-NEXT:    retq
1910;
1911; AVX1-LABEL: trunc_mul_v8i32_v8i16:
1912; AVX1:       # %bb.0:
1913; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm2
1914; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1915; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1916; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
1917; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
1918; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1919; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
1920; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1921; AVX1-NEXT:    vzeroupper
1922; AVX1-NEXT:    retq
1923;
1924; AVX2-LABEL: trunc_mul_v8i32_v8i16:
1925; AVX2:       # %bb.0:
1926; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1927; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
1928; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1929; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1930; AVX2-NEXT:    vzeroupper
1931; AVX2-NEXT:    retq
1932;
1933; AVX512-LABEL: trunc_mul_v8i32_v8i16:
1934; AVX512:       # %bb.0:
1935; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1936; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
1937; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1938; AVX512-NEXT:    vzeroupper
1939; AVX512-NEXT:    retq
1940  %1 = mul <8 x i32> %a0, %a1
1941  %2 = trunc <8 x i32> %1 to <8 x i16>
1942  ret <8 x i16> %2
1943}
1944
1945define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
1946; SSE-LABEL: trunc_mul_v16i64_v16i8:
1947; SSE:       # %bb.0:
1948; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm0
1949; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm1
1950; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm2
1951; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm3
1952; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm4
1953; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm5
1954; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm6
1955; SSE-NEXT:    pmuludq {{[0-9]+}}(%rsp), %xmm7
1956; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
1957; SSE-NEXT:    pand %xmm8, %xmm7
1958; SSE-NEXT:    pand %xmm8, %xmm6
1959; SSE-NEXT:    packuswb %xmm7, %xmm6
1960; SSE-NEXT:    pand %xmm8, %xmm5
1961; SSE-NEXT:    pand %xmm8, %xmm4
1962; SSE-NEXT:    packuswb %xmm5, %xmm4
1963; SSE-NEXT:    packuswb %xmm6, %xmm4
1964; SSE-NEXT:    pand %xmm8, %xmm3
1965; SSE-NEXT:    pand %xmm8, %xmm2
1966; SSE-NEXT:    packuswb %xmm3, %xmm2
1967; SSE-NEXT:    pand %xmm8, %xmm1
1968; SSE-NEXT:    pand %xmm8, %xmm0
1969; SSE-NEXT:    packuswb %xmm1, %xmm0
1970; SSE-NEXT:    packuswb %xmm2, %xmm0
1971; SSE-NEXT:    packuswb %xmm4, %xmm0
1972; SSE-NEXT:    retq
1973;
1974; AVX1-LABEL: trunc_mul_v16i64_v16i8:
1975; AVX1:       # %bb.0:
1976; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm8
1977; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
1978; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1979; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm0
1980; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm4
1981; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
1982; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1983; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
1984; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm5
1985; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
1986; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
1987; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm2
1988; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm6
1989; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
1990; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
1991; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm3
1992; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
1993; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
1994; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
1995; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
1996; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
1997; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
1998; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
1999; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2000; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
2001; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
2002; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2003; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
2004; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
2005; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
2006; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2007; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2008; AVX1-NEXT:    vzeroupper
2009; AVX1-NEXT:    retq
2010;
2011; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
2012; AVX2-SLOW:       # %bb.0:
2013; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm7, %xmm8
2014; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2]
2015; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm7
2016; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
2017; AVX2-SLOW-NEXT:    vpmulld %xmm8, %xmm3, %xmm3
2018; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm6, %xmm7
2019; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
2020; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm7
2021; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
2022; AVX2-SLOW-NEXT:    vpmulld %xmm6, %xmm2, %xmm2
2023; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
2024; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2025; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
2026; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2027; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2028; AVX2-SLOW-NEXT:    vpand %xmm6, %xmm2, %xmm2
2029; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm5, %xmm7
2030; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2]
2031; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm7
2032; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
2033; AVX2-SLOW-NEXT:    vpmulld %xmm5, %xmm1, %xmm1
2034; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm4, %xmm5
2035; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
2036; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm5
2037; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
2038; AVX2-SLOW-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
2039; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2040; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
2041; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2042; AVX2-SLOW-NEXT:    vpand %xmm6, %xmm0, %xmm0
2043; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2044; AVX2-SLOW-NEXT:    vzeroupper
2045; AVX2-SLOW-NEXT:    retq
2046;
2047; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8:
2048; AVX2-FAST:       # %bb.0:
2049; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7]
2050; AVX2-FAST-NEXT:    vpermd %ymm7, %ymm8, %ymm7
2051; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm8, %ymm3
2052; AVX2-FAST-NEXT:    vpmulld %xmm7, %xmm3, %xmm3
2053; AVX2-FAST-NEXT:    vpermd %ymm6, %ymm8, %ymm6
2054; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm8, %ymm2
2055; AVX2-FAST-NEXT:    vpmulld %xmm6, %xmm2, %xmm2
2056; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
2057; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2058; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
2059; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2060; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2061; AVX2-FAST-NEXT:    vpand %xmm6, %xmm2, %xmm2
2062; AVX2-FAST-NEXT:    vpermd %ymm5, %ymm8, %ymm5
2063; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm8, %ymm1
2064; AVX2-FAST-NEXT:    vpmulld %xmm5, %xmm1, %xmm1
2065; AVX2-FAST-NEXT:    vpermd %ymm4, %ymm8, %ymm4
2066; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm8, %ymm0
2067; AVX2-FAST-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
2068; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2069; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
2070; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2071; AVX2-FAST-NEXT:    vpand %xmm6, %xmm0, %xmm0
2072; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2073; AVX2-FAST-NEXT:    vzeroupper
2074; AVX2-FAST-NEXT:    retq
2075;
2076; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
2077; AVX512F:       # %bb.0:
2078; AVX512F-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
2079; AVX512F-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
2080; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
2081; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
2082; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2083; AVX512F-NEXT:    vzeroupper
2084; AVX512F-NEXT:    retq
2085;
2086; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
2087; AVX512BW:       # %bb.0:
2088; AVX512BW-NEXT:    vpmuludq %zmm2, %zmm0, %zmm0
2089; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
2090; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
2091; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
2092; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2093; AVX512BW-NEXT:    vzeroupper
2094; AVX512BW-NEXT:    retq
2095;
2096; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
2097; AVX512DQ:       # %bb.0:
2098; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm0, %zmm0
2099; AVX512DQ-NEXT:    vpmullq %zmm3, %zmm1, %zmm1
2100; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
2101; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
2102; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2103; AVX512DQ-NEXT:    vzeroupper
2104; AVX512DQ-NEXT:    retq
2105  %1 = mul <16 x i64> %a0, %a1
2106  %2 = trunc <16 x i64> %1 to <16 x i8>
2107  ret <16 x i8> %2
2108}
2109
2110define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
2111; SSE-LABEL: trunc_mul_v16i32_v16i8:
2112; SSE:       # %bb.0:
2113; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
2114; SSE-NEXT:    pmuludq %xmm4, %xmm0
2115; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2116; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2117; SSE-NEXT:    pmuludq %xmm8, %xmm4
2118; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2119; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2120; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2121; SSE-NEXT:    pmuludq %xmm5, %xmm1
2122; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2123; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
2124; SSE-NEXT:    pmuludq %xmm4, %xmm5
2125; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2126; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2127; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
2128; SSE-NEXT:    pmuludq %xmm6, %xmm2
2129; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2130; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
2131; SSE-NEXT:    pmuludq %xmm4, %xmm5
2132; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2133; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2134; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
2135; SSE-NEXT:    pmuludq %xmm7, %xmm3
2136; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2137; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
2138; SSE-NEXT:    pmuludq %xmm4, %xmm5
2139; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2140; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2141; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2142; SSE-NEXT:    pand %xmm4, %xmm3
2143; SSE-NEXT:    pand %xmm4, %xmm2
2144; SSE-NEXT:    packuswb %xmm3, %xmm2
2145; SSE-NEXT:    pand %xmm4, %xmm1
2146; SSE-NEXT:    pand %xmm4, %xmm0
2147; SSE-NEXT:    packuswb %xmm1, %xmm0
2148; SSE-NEXT:    packuswb %xmm2, %xmm0
2149; SSE-NEXT:    retq
2150;
2151; AVX1-LABEL: trunc_mul_v16i32_v16i8:
2152; AVX1:       # %bb.0:
2153; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
2154; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2155; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2156; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
2157; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm2
2158; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2159; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2160; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
2161; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
2162; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
2163; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
2164; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
2165; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
2166; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
2167; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
2168; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2169; AVX1-NEXT:    vzeroupper
2170; AVX1-NEXT:    retq
2171;
2172; AVX2-LABEL: trunc_mul_v16i32_v16i8:
2173; AVX2:       # %bb.0:
2174; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
2175; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
2176; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2177; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2178; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2179; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2180; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
2181; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2182; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2183; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
2184; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2185; AVX2-NEXT:    vzeroupper
2186; AVX2-NEXT:    retq
2187;
2188; AVX512-LABEL: trunc_mul_v16i32_v16i8:
2189; AVX512:       # %bb.0:
2190; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
2191; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2192; AVX512-NEXT:    vzeroupper
2193; AVX512-NEXT:    retq
2194  %1 = mul <16 x i32> %a0, %a1
2195  %2 = trunc <16 x i32> %1 to <16 x i8>
2196  ret <16 x i8> %2
2197}
2198
2199define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
2200; SSE-LABEL: trunc_mul_v16i16_v16i8:
2201; SSE:       # %bb.0:
2202; SSE-NEXT:    pmullw %xmm2, %xmm0
2203; SSE-NEXT:    pmullw %xmm3, %xmm1
2204; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2205; SSE-NEXT:    pand %xmm2, %xmm1
2206; SSE-NEXT:    pand %xmm2, %xmm0
2207; SSE-NEXT:    packuswb %xmm1, %xmm0
2208; SSE-NEXT:    retq
2209;
2210; AVX1-LABEL: trunc_mul_v16i16_v16i8:
2211; AVX1:       # %bb.0:
2212; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2213; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2214; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2215; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2216; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2217; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
2218; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
2219; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2220; AVX1-NEXT:    vzeroupper
2221; AVX1-NEXT:    retq
2222;
2223; AVX2-LABEL: trunc_mul_v16i16_v16i8:
2224; AVX2:       # %bb.0:
2225; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2226; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
2227; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2228; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2229; AVX2-NEXT:    vzeroupper
2230; AVX2-NEXT:    retq
2231;
2232; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
2233; AVX512F:       # %bb.0:
2234; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2235; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2236; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2237; AVX512F-NEXT:    vzeroupper
2238; AVX512F-NEXT:    retq
2239;
2240; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
2241; AVX512BW:       # %bb.0:
2242; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2243; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2244; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2245; AVX512BW-NEXT:    vzeroupper
2246; AVX512BW-NEXT:    retq
2247;
2248; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
2249; AVX512DQ:       # %bb.0:
2250; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
2251; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2252; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2253; AVX512DQ-NEXT:    vzeroupper
2254; AVX512DQ-NEXT:    retq
2255  %1 = mul <16 x i16> %a0, %a1
2256  %2 = trunc <16 x i16> %1 to <16 x i8>
2257  ret <16 x i8> %2
2258}
2259
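; Only the low 16 bits of each product are kept, so this should become a
; 16-bit multiply of the zero-extended bytes and the truncated i32 operand.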
2260define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
2261; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2262; SSE:       # %bb.0:
2263; SSE-NEXT:    pxor %xmm3, %xmm3
2264; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2265; SSE-NEXT:    pslld $16, %xmm2
2266; SSE-NEXT:    psrad $16, %xmm2
2267; SSE-NEXT:    pslld $16, %xmm1
2268; SSE-NEXT:    psrad $16, %xmm1
2269; SSE-NEXT:    packssdw %xmm2, %xmm1
2270; SSE-NEXT:    pmullw %xmm1, %xmm0
2271; SSE-NEXT:    retq
2272;
2273; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2274; AVX1:       # %bb.0:
2275; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2276; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2277; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2278; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2279; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2280; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2281; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2282; AVX1-NEXT:    vzeroupper
2283; AVX1-NEXT:    retq
2284;
2285; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2286; AVX2:       # %bb.0:
2287; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2288; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2289; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2290; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2291; AVX2-NEXT:    vzeroupper
2292; AVX2-NEXT:    retq
2293;
2294; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
2295; AVX512:       # %bb.0:
2296; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
2297; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
2298; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2299; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
2300; AVX512-NEXT:    vzeroupper
2301; AVX512-NEXT:    retq
2302  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2303  %2 = zext <8 x i8> %1 to <8 x i32>
2304  %3 = mul <8 x i32> %2, %a1
2305  %4 = trunc <8 x i32> %3 to <8 x i16>
2306  ret <8 x i16> %4
2307}
2308
2309;
2310; mul to constant
2311;
2312
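; Constant multiplies can likewise be performed on the truncated elements
; against a narrowed constant.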
2313define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
2314; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
2315; SSE:       # %bb.0:
2316; SSE-NEXT:    xorps %xmm2, %xmm2
2317; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
2318; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm1
2319; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
2320; SSE-NEXT:    movaps %xmm2, %xmm0
2321; SSE-NEXT:    retq
2322;
2323; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
2324; AVX1:       # %bb.0:
2325; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2326; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2327; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2328; AVX1-NEXT:    vzeroupper
2329; AVX1-NEXT:    retq
2330;
2331; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
2332; AVX2-SLOW:       # %bb.0:
2333; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2334; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2335; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2336; AVX2-SLOW-NEXT:    vzeroupper
2337; AVX2-SLOW-NEXT:    retq
2338;
2339; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32:
2340; AVX2-FAST:       # %bb.0:
2341; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2342; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
2343; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2344; AVX2-FAST-NEXT:    vzeroupper
2345; AVX2-FAST-NEXT:    retq
2346;
2347; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2348; AVX512:       # %bb.0:
2349; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2350; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2351; AVX512-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2352; AVX512-NEXT:    vzeroupper
2353; AVX512-NEXT:    retq
2354  %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
2355  %2 = trunc <4 x i64> %1 to <4 x i32>
2356  ret <4 x i32> %2
2357}
2358
2359define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
2360; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
2361; SSE:       # %bb.0:
2362; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2363; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2364; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2365; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2366; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2367; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2368; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2369; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2370; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2371; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2372; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2373; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
2374; SSE-NEXT:    retq
2375;
2376; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
2377; AVX1:       # %bb.0:
2378; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
2379; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
2380; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2381; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2382; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2383; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2384; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2385; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2386; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2387; AVX1-NEXT:    vzeroupper
2388; AVX1-NEXT:    retq
2389;
2390; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
2391; AVX2-SLOW:       # %bb.0:
2392; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
2393; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2394; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
2395; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2396; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2397; AVX2-SLOW-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2398; AVX2-SLOW-NEXT:    vzeroupper
2399; AVX2-SLOW-NEXT:    retq
2400;
2401; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16:
2402; AVX2-FAST:       # %bb.0:
2403; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
2404; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
2405; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
2406; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2407; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2408; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2409; AVX2-FAST-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2410; AVX2-FAST-NEXT:    vzeroupper
2411; AVX2-FAST-NEXT:    retq
2412;
2413; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2414; AVX512:       # %bb.0:
2415; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2416; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2417; AVX512-NEXT:    vzeroupper
2418; AVX512-NEXT:    retq
2419  %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
2420  %2 = trunc <8 x i64> %1 to <8 x i16>
2421  ret <8 x i16> %2
2422}
2423
2424define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
2425; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
2426; SSE:       # %bb.0:
2427; SSE-NEXT:    pslld $16, %xmm1
2428; SSE-NEXT:    psrad $16, %xmm1
2429; SSE-NEXT:    pslld $16, %xmm0
2430; SSE-NEXT:    psrad $16, %xmm0
2431; SSE-NEXT:    packssdw %xmm1, %xmm0
2432; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
2433; SSE-NEXT:    retq
2434;
2435; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
2436; AVX1:       # %bb.0:
2437; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2438; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2439; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2440; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2441; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2442; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2443; AVX1-NEXT:    vzeroupper
2444; AVX1-NEXT:    retq
2445;
2446; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
2447; AVX2:       # %bb.0:
2448; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2449; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2450; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2451; AVX2-NEXT:    vzeroupper
2452; AVX2-NEXT:    retq
2453;
2454; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
2455; AVX512:       # %bb.0:
2456; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2457; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2458; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2459; AVX512-NEXT:    vzeroupper
2460; AVX512-NEXT:    retq
2461  %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2462  %2 = trunc <8 x i32> %1 to <8 x i16>
2463  ret <8 x i16> %2
2464}
2465
2466define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
2467; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
2468; SSE:       # %bb.0:
2469; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm1
2470; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm2
2471; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm3
2472; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm4
2473; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm5
2474; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm6
2475; SSE-NEXT:    pmuludq {{.*}}(%rip), %xmm7
2476; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2477; SSE-NEXT:    pand %xmm8, %xmm7
2478; SSE-NEXT:    pand %xmm8, %xmm6
2479; SSE-NEXT:    packuswb %xmm7, %xmm6
2480; SSE-NEXT:    pand %xmm8, %xmm5
2481; SSE-NEXT:    pand %xmm8, %xmm4
2482; SSE-NEXT:    packuswb %xmm5, %xmm4
2483; SSE-NEXT:    packuswb %xmm6, %xmm4
2484; SSE-NEXT:    pand %xmm8, %xmm3
2485; SSE-NEXT:    pand %xmm8, %xmm2
2486; SSE-NEXT:    packuswb %xmm3, %xmm2
2487; SSE-NEXT:    pand %xmm8, %xmm1
2488; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2489; SSE-NEXT:    packuswb %xmm1, %xmm0
2490; SSE-NEXT:    packuswb %xmm2, %xmm0
2491; SSE-NEXT:    packuswb %xmm4, %xmm0
2492; SSE-NEXT:    retq
2493;
2494; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
2495; AVX1:       # %bb.0:
2496; AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm8
2497; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2498; AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
2499; AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm5
2500; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2501; AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm1, %xmm1
2502; AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm6
2503; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2504; AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm2, %xmm2
2505; AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm3, %xmm7
2506; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2507; AVX1-NEXT:    vpmuludq {{.*}}(%rip), %xmm3, %xmm3
2508; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255]
2509; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2510; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
2511; AVX1-NEXT:    vpackusdw %xmm3, %xmm7, %xmm3
2512; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2513; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
2514; AVX1-NEXT:    vpackusdw %xmm2, %xmm6, %xmm2
2515; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2516; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2517; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
2518; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2519; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2520; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
2521; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
2522; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2523; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2524; AVX1-NEXT:    vzeroupper
2525; AVX1-NEXT:    retq
2526;
2527; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
2528; AVX2-SLOW:       # %bb.0:
2529; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm4
2530; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
2531; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
2532; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm4
2533; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
2534; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
2535; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
2536; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2537; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
2538; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2539; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2540; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
2541; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm5
2542; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
2543; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2544; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm5
2545; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
2546; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
2547; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2548; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
2549; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2550; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
2551; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2552; AVX2-SLOW-NEXT:    vzeroupper
2553; AVX2-SLOW-NEXT:    retq
2554;
2555; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8:
2556; AVX2-FAST:       # %bb.0:
2557; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
2558; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
2559; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
2560; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
2561; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
2562; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
2563; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2564; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
2565; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2566; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
2567; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
2568; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
2569; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2570; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
2571; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
2572; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2573; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
2574; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2575; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
2576; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2577; AVX2-FAST-NEXT:    vzeroupper
2578; AVX2-FAST-NEXT:    retq
2579;
2580; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
2581; AVX512F:       # %bb.0:
2582; AVX512F-NEXT:    vpmuludq {{.*}}(%rip), %zmm0, %zmm0
2583; AVX512F-NEXT:    vpmuludq {{.*}}(%rip), %zmm1, %zmm1
2584; AVX512F-NEXT:    vpmovqb %zmm1, %xmm1
2585; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
2586; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2587; AVX512F-NEXT:    vzeroupper
2588; AVX512F-NEXT:    retq
2589;
2590; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
2591; AVX512BW:       # %bb.0:
2592; AVX512BW-NEXT:    vpmuludq {{.*}}(%rip), %zmm0, %zmm0
2593; AVX512BW-NEXT:    vpmuludq {{.*}}(%rip), %zmm1, %zmm1
2594; AVX512BW-NEXT:    vpmovqb %zmm1, %xmm1
2595; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
2596; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2597; AVX512BW-NEXT:    vzeroupper
2598; AVX512BW-NEXT:    retq
2599;
2600; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
2601; AVX512DQ:       # %bb.0:
2602; AVX512DQ-NEXT:    vpmullq {{.*}}(%rip), %zmm0, %zmm0
2603; AVX512DQ-NEXT:    vpmullq {{.*}}(%rip), %zmm1, %zmm1
2604; AVX512DQ-NEXT:    vpmovqb %zmm1, %xmm1
2605; AVX512DQ-NEXT:    vpmovqb %zmm0, %xmm0
2606; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2607; AVX512DQ-NEXT:    vzeroupper
2608; AVX512DQ-NEXT:    retq
2609  %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
2610  %2 = trunc <16 x i64> %1 to <16 x i8>
2611  ret <16 x i8> %2
2612}
2613
2614define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
2615; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
2616; SSE:       # %bb.0:
2617; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,2,3]
2618; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
2619; SSE-NEXT:    pmuludq %xmm4, %xmm0
2620; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2621; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2622; SSE-NEXT:    pmuludq %xmm5, %xmm4
2623; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2624; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2625; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [4,5,6,7]
2626; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
2627; SSE-NEXT:    pmuludq %xmm4, %xmm1
2628; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2629; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2630; SSE-NEXT:    pmuludq %xmm5, %xmm4
2631; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2632; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2633; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,9,10,11]
2634; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
2635; SSE-NEXT:    pmuludq %xmm4, %xmm2
2636; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2637; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2638; SSE-NEXT:    pmuludq %xmm5, %xmm4
2639; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2640; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2641; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [12,13,14,15]
2642; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
2643; SSE-NEXT:    pmuludq %xmm4, %xmm3
2644; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2645; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
2646; SSE-NEXT:    pmuludq %xmm5, %xmm4
2647; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2648; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2649; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
2650; SSE-NEXT:    pand %xmm4, %xmm3
2651; SSE-NEXT:    pand %xmm4, %xmm2
2652; SSE-NEXT:    packuswb %xmm3, %xmm2
2653; SSE-NEXT:    pand %xmm4, %xmm1
2654; SSE-NEXT:    pand %xmm4, %xmm0
2655; SSE-NEXT:    packuswb %xmm1, %xmm0
2656; SSE-NEXT:    packuswb %xmm2, %xmm0
2657; SSE-NEXT:    retq
2658;
2659; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
2660; AVX1:       # %bb.0:
2661; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
2662; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2663; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
2664; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm3
2665; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2666; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
2667; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255]
2668; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
2669; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
2670; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
2671; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
2672; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
2673; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
2674; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2675; AVX1-NEXT:    vzeroupper
2676; AVX1-NEXT:    retq
2677;
2678; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
2679; AVX2:       # %bb.0:
2680; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2681; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2682; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2683; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
2684; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2685; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
2686; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2687; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2688; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2689; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
2690; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2691; AVX2-NEXT:    vzeroupper
2692; AVX2-NEXT:    retq
2693;
2694; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
2695; AVX512:       # %bb.0:
2696; AVX512-NEXT:    vpmulld {{.*}}(%rip), %zmm0, %zmm0
2697; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
2698; AVX512-NEXT:    vzeroupper
2699; AVX512-NEXT:    retq
2700  %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2701  %2 = trunc <16 x i32> %1 to <16 x i8>
2702  ret <16 x i8> %2
2703}
2704
2705define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
2706; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
2707; SSE:       # %bb.0:
2708; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
2709; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
2710; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
2711; SSE-NEXT:    pand %xmm2, %xmm1
2712; SSE-NEXT:    pand %xmm2, %xmm0
2713; SSE-NEXT:    packuswb %xmm1, %xmm0
2714; SSE-NEXT:    retq
2715;
2716; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
2717; AVX1:       # %bb.0:
2718; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
2719; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2720; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
2721; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2722; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
2723; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
2724; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
2725; AVX1-NEXT:    vzeroupper
2726; AVX1-NEXT:    retq
2727;
2728; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
2729; AVX2:       # %bb.0:
2730; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2731; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
2732; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2733; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2734; AVX2-NEXT:    vzeroupper
2735; AVX2-NEXT:    retq
2736;
2737; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
2738; AVX512F:       # %bb.0:
2739; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2740; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2741; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2742; AVX512F-NEXT:    vzeroupper
2743; AVX512F-NEXT:    retq
2744;
2745; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
2746; AVX512BW:       # %bb.0:
2747; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2748; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2749; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2750; AVX512BW-NEXT:    vzeroupper
2751; AVX512BW-NEXT:    retq
2752;
2753; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
2754; AVX512DQ:       # %bb.0:
2755; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
2756; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2757; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2758; AVX512DQ-NEXT:    vzeroupper
2759; AVX512DQ-NEXT:    retq
2760  %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
2761  %2 = trunc <16 x i16> %1 to <16 x i8>
2762  ret <16 x i8> %2
2763}
2764
2765;
2766; and
2767;
2768
2769define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2770; SSE-LABEL: trunc_and_v4i64_v4i32:
2771; SSE:       # %bb.0:
2772; SSE-NEXT:    andps %xmm3, %xmm1
2773; SSE-NEXT:    andps %xmm2, %xmm0
2774; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2775; SSE-NEXT:    retq
2776;
2777; AVX1-LABEL: trunc_and_v4i64_v4i32:
2778; AVX1:       # %bb.0:
2779; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2780; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2781; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2782; AVX1-NEXT:    vzeroupper
2783; AVX1-NEXT:    retq
2784;
2785; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
2786; AVX2-SLOW:       # %bb.0:
2787; AVX2-SLOW-NEXT:    vandps %ymm1, %ymm0, %ymm0
2788; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
2789; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2790; AVX2-SLOW-NEXT:    vzeroupper
2791; AVX2-SLOW-NEXT:    retq
2792;
2793; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32:
2794; AVX2-FAST:       # %bb.0:
2795; AVX2-FAST-NEXT:    vandps %ymm1, %ymm0, %ymm0
2796; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
2797; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2798; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2799; AVX2-FAST-NEXT:    vzeroupper
2800; AVX2-FAST-NEXT:    retq
2801;
2802; AVX512-LABEL: trunc_and_v4i64_v4i32:
2803; AVX512:       # %bb.0:
2804; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2805; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2806; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2807; AVX512-NEXT:    vzeroupper
2808; AVX512-NEXT:    retq
2809  %1 = and <4 x i64> %a0, %a1
2810  %2 = trunc <4 x i64> %1 to <4 x i32>
2811  ret <4 x i32> %2
2812}
2813
2814define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
2815; SSE-LABEL: trunc_and_v8i64_v8i16:
2816; SSE:       # %bb.0:
2817; SSE-NEXT:    pand %xmm6, %xmm2
2818; SSE-NEXT:    pand %xmm7, %xmm3
2819; SSE-NEXT:    pand %xmm4, %xmm0
2820; SSE-NEXT:    pand %xmm5, %xmm1
2821; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2822; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
2823; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2824; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
2825; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
2826; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
2827; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
2828; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2829; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
2830; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2831; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
2832; SSE-NEXT:    retq
2833;
2834; AVX1-LABEL: trunc_and_v8i64_v8i16:
2835; AVX1:       # %bb.0:
2836; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
2837; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
2838; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2839; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2840; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2841; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
2842; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
2843; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2844; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
2845; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2846; AVX1-NEXT:    vzeroupper
2847; AVX1-NEXT:    retq
2848;
2849; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
2850; AVX2-SLOW:       # %bb.0:
2851; AVX2-SLOW-NEXT:    vandps %ymm3, %ymm1, %ymm1
2852; AVX2-SLOW-NEXT:    vandps %ymm2, %ymm0, %ymm0
2853; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
2854; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2855; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
2856; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2857; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2858; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2859; AVX2-SLOW-NEXT:    vzeroupper
2860; AVX2-SLOW-NEXT:    retq
2861;
2862; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16:
2863; AVX2-FAST:       # %bb.0:
2864; AVX2-FAST-NEXT:    vpand %ymm3, %ymm1, %ymm1
2865; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
2866; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
2867; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
2868; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
2869; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2870; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2871; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2872; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2873; AVX2-FAST-NEXT:    vzeroupper
2874; AVX2-FAST-NEXT:    retq
2875;
2876; AVX512-LABEL: trunc_and_v8i64_v8i16:
2877; AVX512:       # %bb.0:
2878; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2879; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
2880; AVX512-NEXT:    vzeroupper
2881; AVX512-NEXT:    retq
2882  %1 = and <8 x i64> %a0, %a1
2883  %2 = trunc <8 x i64> %1 to <8 x i16>
2884  ret <8 x i16> %2
2885}
2886
2887define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
2888; SSE-LABEL: trunc_and_v8i32_v8i16:
2889; SSE:       # %bb.0:
2890; SSE-NEXT:    pand %xmm2, %xmm0
2891; SSE-NEXT:    pand %xmm3, %xmm1
2892; SSE-NEXT:    pslld $16, %xmm1
2893; SSE-NEXT:    psrad $16, %xmm1
2894; SSE-NEXT:    pslld $16, %xmm0
2895; SSE-NEXT:    psrad $16, %xmm0
2896; SSE-NEXT:    packssdw %xmm1, %xmm0
2897; SSE-NEXT:    retq
2898;
2899; AVX1-LABEL: trunc_and_v8i32_v8i16:
2900; AVX1:       # %bb.0:
2901; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
2902; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2903; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
2904; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
2905; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
2906; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2907; AVX1-NEXT:    vzeroupper
2908; AVX1-NEXT:    retq
2909;
2910; AVX2-LABEL: trunc_and_v8i32_v8i16:
2911; AVX2:       # %bb.0:
2912; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
2913; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
2914; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2915; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2916; AVX2-NEXT:    vzeroupper
2917; AVX2-NEXT:    retq
2918;
2919; AVX512-LABEL: trunc_and_v8i32_v8i16:
2920; AVX512:       # %bb.0:
2921; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
2922; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
2923; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2924; AVX512-NEXT:    vzeroupper
2925; AVX512-NEXT:    retq
2926  %1 = and <8 x i32> %a0, %a1
2927  %2 = trunc <8 x i32> %1 to <8 x i16>
2928  ret <8 x i16> %2
2929}
2930
2931define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
2932; SSE-LABEL: trunc_and_v16i64_v16i8:
2933; SSE:       # %bb.0:
2934; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm0
2935; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm1
2936; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm2
2937; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm3
2938; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm4
2939; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm5
2940; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm6
2941; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm7
2942; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
2943; SSE-NEXT:    pand %xmm8, %xmm7
2944; SSE-NEXT:    pand %xmm8, %xmm6
2945; SSE-NEXT:    packuswb %xmm7, %xmm6
2946; SSE-NEXT:    pand %xmm8, %xmm5
2947; SSE-NEXT:    pand %xmm8, %xmm4
2948; SSE-NEXT:    packuswb %xmm5, %xmm4
2949; SSE-NEXT:    packuswb %xmm6, %xmm4
2950; SSE-NEXT:    pand %xmm8, %xmm3
2951; SSE-NEXT:    pand %xmm8, %xmm2
2952; SSE-NEXT:    packuswb %xmm3, %xmm2
2953; SSE-NEXT:    pand %xmm8, %xmm1
2954; SSE-NEXT:    pand %xmm8, %xmm0
2955; SSE-NEXT:    packuswb %xmm1, %xmm0
2956; SSE-NEXT:    packuswb %xmm2, %xmm0
2957; SSE-NEXT:    packuswb %xmm4, %xmm0
2958; SSE-NEXT:    retq
2959;
2960; AVX1-LABEL: trunc_and_v16i64_v16i8:
2961; AVX1:       # %bb.0:
2962; AVX1-NEXT:    vmovaps {{.*#+}} ymm8 = [255,255,255,255]
2963; AVX1-NEXT:    vandps %ymm7, %ymm8, %ymm7
2964; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
2965; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
2966; AVX1-NEXT:    vpackusdw %xmm7, %xmm3, %xmm3
2967; AVX1-NEXT:    vandps %ymm6, %ymm8, %ymm6
2968; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
2969; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
2970; AVX1-NEXT:    vpackusdw %xmm6, %xmm2, %xmm2
2971; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
2972; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm3
2973; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
2974; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
2975; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
2976; AVX1-NEXT:    vandps %ymm4, %ymm8, %ymm3
2977; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
2978; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
2979; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
2980; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
2981; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2982; AVX1-NEXT:    vzeroupper
2983; AVX1-NEXT:    retq
2984;
2985; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
2986; AVX2-SLOW:       # %bb.0:
2987; AVX2-SLOW-NEXT:    vandps %ymm5, %ymm1, %ymm1
2988; AVX2-SLOW-NEXT:    vandps %ymm4, %ymm0, %ymm0
2989; AVX2-SLOW-NEXT:    vandps %ymm7, %ymm3, %ymm3
2990; AVX2-SLOW-NEXT:    vandps %ymm6, %ymm2, %ymm2
2991; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
2992; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
2993; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
2994; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
2995; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
2996; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2997; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
2998; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
2999; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
3000; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3001; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
3002; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3003; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3004; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
3005; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3006; AVX2-SLOW-NEXT:    vzeroupper
3007; AVX2-SLOW-NEXT:    retq
3008;
3009; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8:
3010; AVX2-FAST:       # %bb.0:
3011; AVX2-FAST-NEXT:    vpand %ymm5, %ymm1, %ymm1
3012; AVX2-FAST-NEXT:    vpand %ymm4, %ymm0, %ymm0
3013; AVX2-FAST-NEXT:    vpand %ymm7, %ymm3, %ymm3
3014; AVX2-FAST-NEXT:    vpand %ymm6, %ymm2, %ymm2
3015; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3016; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
3017; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
3018; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
3019; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3020; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3021; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3022; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3023; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
3024; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
3025; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
3026; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3027; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3028; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3029; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
3030; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3031; AVX2-FAST-NEXT:    vzeroupper
3032; AVX2-FAST-NEXT:    retq
3033;
3034; AVX512-LABEL: trunc_and_v16i64_v16i8:
3035; AVX512:       # %bb.0:
3036; AVX512-NEXT:    vpandq %zmm2, %zmm0, %zmm0
3037; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
3038; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3039; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3040; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3041; AVX512-NEXT:    vzeroupper
3042; AVX512-NEXT:    retq
3043  %1 = and <16 x i64> %a0, %a1
3044  %2 = trunc <16 x i64> %1 to <16 x i8>
3045  ret <16 x i8> %2
3046}
3047
3048define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3049; SSE-LABEL: trunc_and_v16i32_v16i8:
3050; SSE:       # %bb.0:
3051; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3052; SSE-NEXT:    pand %xmm8, %xmm7
3053; SSE-NEXT:    pand %xmm3, %xmm7
3054; SSE-NEXT:    pand %xmm8, %xmm6
3055; SSE-NEXT:    pand %xmm2, %xmm6
3056; SSE-NEXT:    packuswb %xmm7, %xmm6
3057; SSE-NEXT:    pand %xmm8, %xmm5
3058; SSE-NEXT:    pand %xmm1, %xmm5
3059; SSE-NEXT:    pand %xmm8, %xmm4
3060; SSE-NEXT:    pand %xmm4, %xmm0
3061; SSE-NEXT:    packuswb %xmm5, %xmm0
3062; SSE-NEXT:    packuswb %xmm6, %xmm0
3063; SSE-NEXT:    retq
3064;
3065; AVX1-LABEL: trunc_and_v16i32_v16i8:
3066; AVX1:       # %bb.0:
3067; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
3068; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3069; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
3070; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3071; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3072; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3073; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3074; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3075; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3076; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3077; AVX1-NEXT:    vzeroupper
3078; AVX1-NEXT:    retq
3079;
3080; AVX2-LABEL: trunc_and_v16i32_v16i8:
3081; AVX2:       # %bb.0:
3082; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
3083; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
3084; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3085; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3086; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3087; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3088; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
3089; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3090; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3091; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
3092; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3093; AVX2-NEXT:    vzeroupper
3094; AVX2-NEXT:    retq
3095;
3096; AVX512-LABEL: trunc_and_v16i32_v16i8:
3097; AVX512:       # %bb.0:
3098; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
3099; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3100; AVX512-NEXT:    vzeroupper
3101; AVX512-NEXT:    retq
3102  %1 = and <16 x i32> %a0, %a1
3103  %2 = trunc <16 x i32> %1 to <16 x i8>
3104  ret <16 x i8> %2
3105}
3106
3107define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3108; SSE-LABEL: trunc_and_v16i16_v16i8:
3109; SSE:       # %bb.0:
3110; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3111; SSE-NEXT:    pand %xmm4, %xmm3
3112; SSE-NEXT:    pand %xmm1, %xmm3
3113; SSE-NEXT:    pand %xmm4, %xmm2
3114; SSE-NEXT:    pand %xmm2, %xmm0
3115; SSE-NEXT:    packuswb %xmm3, %xmm0
3116; SSE-NEXT:    retq
3117;
3118; AVX1-LABEL: trunc_and_v16i16_v16i8:
3119; AVX1:       # %bb.0:
3120; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
3121; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3122; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3123; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3124; AVX1-NEXT:    vzeroupper
3125; AVX1-NEXT:    retq
3126;
3127; AVX2-LABEL: trunc_and_v16i16_v16i8:
3128; AVX2:       # %bb.0:
3129; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
3130; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3131; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3132; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3133; AVX2-NEXT:    vzeroupper
3134; AVX2-NEXT:    retq
3135;
3136; AVX512F-LABEL: trunc_and_v16i16_v16i8:
3137; AVX512F:       # %bb.0:
3138; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
3139; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3140; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3141; AVX512F-NEXT:    vzeroupper
3142; AVX512F-NEXT:    retq
3143;
3144; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
3145; AVX512BW:       # %bb.0:
3146; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
3147; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3148; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3149; AVX512BW-NEXT:    vzeroupper
3150; AVX512BW-NEXT:    retq
3151;
3152; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
3153; AVX512DQ:       # %bb.0:
3154; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
3155; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3156; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3157; AVX512DQ-NEXT:    vzeroupper
3158; AVX512DQ-NEXT:    retq
3159  %1 = and <16 x i16> %a0, %a1
3160  %2 = trunc <16 x i16> %1 to <16 x i8>
3161  ret <16 x i8> %2
3162}
3163
3164;
3165; and to constant
3166;
3167
3168define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3169; SSE-LABEL: trunc_and_const_v4i64_v4i32:
3170; SSE:       # %bb.0:
3171; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3172; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
3173; SSE-NEXT:    retq
3174;
3175; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
3176; AVX1:       # %bb.0:
3177; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3178; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3179; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
3180; AVX1-NEXT:    vzeroupper
3181; AVX1-NEXT:    retq
3182;
3183; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
3184; AVX2-SLOW:       # %bb.0:
3185; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3186; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3187; AVX2-SLOW-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
3188; AVX2-SLOW-NEXT:    vzeroupper
3189; AVX2-SLOW-NEXT:    retq
3190;
3191; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32:
3192; AVX2-FAST:       # %bb.0:
3193; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3194; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3195; AVX2-FAST-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
3196; AVX2-FAST-NEXT:    vzeroupper
3197; AVX2-FAST-NEXT:    retq
3198;
3199; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
3200; AVX512:       # %bb.0:
3201; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3202; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3203; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3204; AVX512-NEXT:    vzeroupper
3205; AVX512-NEXT:    retq
3206  %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3207  %2 = trunc <4 x i64> %1 to <4 x i32>
3208  ret <4 x i32> %2
3209}
3210
3211define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3212; SSE-LABEL: trunc_and_const_v8i64_v8i16:
3213; SSE:       # %bb.0:
3214; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3215; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3216; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3217; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3218; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3219; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3220; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3221; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3222; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3223; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3224; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3225; SSE-NEXT:    andpd {{.*}}(%rip), %xmm0
3226; SSE-NEXT:    retq
3227;
3228; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
3229; AVX1:       # %bb.0:
3230; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3231; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3232; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3233; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3234; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3235; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3236; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3237; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3238; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3239; AVX1-NEXT:    vzeroupper
3240; AVX1-NEXT:    retq
3241;
3242; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
3243; AVX2-SLOW:       # %bb.0:
3244; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
3245; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3246; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
3247; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3248; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3249; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3250; AVX2-SLOW-NEXT:    vzeroupper
3251; AVX2-SLOW-NEXT:    retq
3252;
3253; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16:
3254; AVX2-FAST:       # %bb.0:
3255; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3256; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
3257; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3258; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3259; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3260; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3261; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3262; AVX2-FAST-NEXT:    vzeroupper
3263; AVX2-FAST-NEXT:    retq
3264;
3265; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
3266; AVX512:       # %bb.0:
3267; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3268; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3269; AVX512-NEXT:    vzeroupper
3270; AVX512-NEXT:    retq
3271  %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
3272  %2 = trunc <8 x i64> %1 to <8 x i16>
3273  ret <8 x i16> %2
3274}
3275
3276define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
3277; SSE-LABEL: trunc_and_const_v8i32_v8i16:
3278; SSE:       # %bb.0:
3279; SSE-NEXT:    pslld $16, %xmm1
3280; SSE-NEXT:    psrad $16, %xmm1
3281; SSE-NEXT:    pslld $16, %xmm0
3282; SSE-NEXT:    psrad $16, %xmm0
3283; SSE-NEXT:    packssdw %xmm1, %xmm0
3284; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3285; SSE-NEXT:    retq
3286;
3287; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
3288; AVX1:       # %bb.0:
3289; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3290; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3291; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3292; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3293; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3294; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3295; AVX1-NEXT:    vzeroupper
3296; AVX1-NEXT:    retq
3297;
3298; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
3299; AVX2:       # %bb.0:
3300; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3301; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3302; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3303; AVX2-NEXT:    vzeroupper
3304; AVX2-NEXT:    retq
3305;
3306; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
3307; AVX512:       # %bb.0:
3308; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3309; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3310; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3311; AVX512-NEXT:    vzeroupper
3312; AVX512-NEXT:    retq
3313  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3314  %2 = trunc <8 x i32> %1 to <8 x i16>
3315  ret <8 x i16> %2
3316}
3317
3318define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
3319; SSE-LABEL: trunc_and_const_v16i64_v16i8:
3320; SSE:       # %bb.0:
3321; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3322; SSE-NEXT:    pand %xmm8, %xmm7
3323; SSE-NEXT:    pand %xmm8, %xmm6
3324; SSE-NEXT:    packuswb %xmm7, %xmm6
3325; SSE-NEXT:    pand %xmm8, %xmm5
3326; SSE-NEXT:    pand %xmm8, %xmm4
3327; SSE-NEXT:    packuswb %xmm5, %xmm4
3328; SSE-NEXT:    packuswb %xmm6, %xmm4
3329; SSE-NEXT:    pand %xmm8, %xmm3
3330; SSE-NEXT:    pand %xmm8, %xmm2
3331; SSE-NEXT:    packuswb %xmm3, %xmm2
3332; SSE-NEXT:    pand %xmm8, %xmm1
3333; SSE-NEXT:    pand %xmm8, %xmm0
3334; SSE-NEXT:    packuswb %xmm1, %xmm0
3335; SSE-NEXT:    packuswb %xmm2, %xmm0
3336; SSE-NEXT:    packuswb %xmm4, %xmm0
3337; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3338; SSE-NEXT:    retq
3339;
3340; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
3341; AVX1:       # %bb.0:
3342; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3343; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3344; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3345; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3346; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3347; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3348; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3349; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3350; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3351; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3352; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3353; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3354; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3355; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3356; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3357; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3358; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3359; AVX1-NEXT:    vzeroupper
3360; AVX1-NEXT:    retq
3361;
3362; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
3363; AVX2-SLOW:       # %bb.0:
3364; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
3365; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
3366; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
3367; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3368; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3369; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3370; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3371; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
3372; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
3373; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3374; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
3375; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3376; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3377; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
3378; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3379; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3380; AVX2-SLOW-NEXT:    vzeroupper
3381; AVX2-SLOW-NEXT:    retq
3382;
3383; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8:
3384; AVX2-FAST:       # %bb.0:
3385; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3386; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
3387; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
3388; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
3389; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3390; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3391; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3392; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3393; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
3394; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
3395; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
3396; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3397; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3398; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3399; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
3400; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3401; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3402; AVX2-FAST-NEXT:    vzeroupper
3403; AVX2-FAST-NEXT:    retq
3404;
3405; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
3406; AVX512:       # %bb.0:
3407; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3408; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3409; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3410; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3411; AVX512-NEXT:    vzeroupper
3412; AVX512-NEXT:    retq
3413  %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
3414  %2 = trunc <16 x i64> %1 to <16 x i8>
3415  ret <16 x i8> %2
3416}
3417
3418define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
3419; SSE-LABEL: trunc_and_const_v16i32_v16i8:
3420; SSE:       # %bb.0:
3421; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3422; SSE-NEXT:    pand %xmm4, %xmm3
3423; SSE-NEXT:    pand %xmm4, %xmm2
3424; SSE-NEXT:    packuswb %xmm3, %xmm2
3425; SSE-NEXT:    pand %xmm4, %xmm1
3426; SSE-NEXT:    pand %xmm4, %xmm0
3427; SSE-NEXT:    packuswb %xmm1, %xmm0
3428; SSE-NEXT:    packuswb %xmm2, %xmm0
3429; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3430; SSE-NEXT:    retq
3431;
3432; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
3433; AVX1:       # %bb.0:
3434; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3435; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3436; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3437; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3438; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3439; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3440; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3441; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3442; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3443; AVX1-NEXT:    vzeroupper
3444; AVX1-NEXT:    retq
3445;
3446; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
3447; AVX2:       # %bb.0:
3448; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3449; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3450; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3451; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3452; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
3453; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3454; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3455; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
3456; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3457; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3458; AVX2-NEXT:    vzeroupper
3459; AVX2-NEXT:    retq
3460;
3461; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
3462; AVX512:       # %bb.0:
3463; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3464; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3465; AVX512-NEXT:    vzeroupper
3466; AVX512-NEXT:    retq
3467  %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
3468  %2 = trunc <16 x i32> %1 to <16 x i8>
3469  ret <16 x i8> %2
3470}
3471
3472define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
3473; SSE-LABEL: trunc_and_const_v16i16_v16i8:
3474; SSE:       # %bb.0:
3475; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3476; SSE-NEXT:    pand %xmm2, %xmm1
3477; SSE-NEXT:    pand %xmm2, %xmm0
3478; SSE-NEXT:    packuswb %xmm1, %xmm0
3479; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
3480; SSE-NEXT:    retq
3481;
3482; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
3483; AVX1:       # %bb.0:
3484; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3485; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3486; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3487; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3488; AVX1-NEXT:    vzeroupper
3489; AVX1-NEXT:    retq
3490;
3491; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
3492; AVX2:       # %bb.0:
3493; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3494; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3495; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3496; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3497; AVX2-NEXT:    vzeroupper
3498; AVX2-NEXT:    retq
3499;
3500; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
3501; AVX512F:       # %bb.0:
3502; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3503; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3504; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3505; AVX512F-NEXT:    vzeroupper
3506; AVX512F-NEXT:    retq
3507;
3508; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
3509; AVX512BW:       # %bb.0:
3510; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3511; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3512; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3513; AVX512BW-NEXT:    vzeroupper
3514; AVX512BW-NEXT:    retq
3515;
3516; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
3517; AVX512DQ:       # %bb.0:
3518; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3519; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3520; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
3521; AVX512DQ-NEXT:    vzeroupper
3522; AVX512DQ-NEXT:    retq
3523  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
3524  %2 = trunc <16 x i16> %1 to <16 x i8>
3525  ret <16 x i8> %2
3526}
3527
3528;
3529; xor
3530;
3531
3532define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
3533; SSE-LABEL: trunc_xor_v4i64_v4i32:
3534; SSE:       # %bb.0:
3535; SSE-NEXT:    xorps %xmm3, %xmm1
3536; SSE-NEXT:    xorps %xmm2, %xmm0
3537; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3538; SSE-NEXT:    retq
3539;
3540; AVX1-LABEL: trunc_xor_v4i64_v4i32:
3541; AVX1:       # %bb.0:
3542; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3543; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3544; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3545; AVX1-NEXT:    vzeroupper
3546; AVX1-NEXT:    retq
3547;
3548; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
3549; AVX2-SLOW:       # %bb.0:
3550; AVX2-SLOW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3551; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3552; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3553; AVX2-SLOW-NEXT:    vzeroupper
3554; AVX2-SLOW-NEXT:    retq
3555;
3556; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32:
3557; AVX2-FAST:       # %bb.0:
3558; AVX2-FAST-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3559; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3560; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3561; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3562; AVX2-FAST-NEXT:    vzeroupper
3563; AVX2-FAST-NEXT:    retq
3564;
3565; AVX512-LABEL: trunc_xor_v4i64_v4i32:
3566; AVX512:       # %bb.0:
3567; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3568; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3569; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3570; AVX512-NEXT:    vzeroupper
3571; AVX512-NEXT:    retq
3572  %1 = xor <4 x i64> %a0, %a1
3573  %2 = trunc <4 x i64> %1 to <4 x i32>
3574  ret <4 x i32> %2
3575}
3576
3577define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
3578; SSE-LABEL: trunc_xor_v8i64_v8i16:
3579; SSE:       # %bb.0:
3580; SSE-NEXT:    pxor %xmm6, %xmm2
3581; SSE-NEXT:    pxor %xmm7, %xmm3
3582; SSE-NEXT:    pxor %xmm4, %xmm0
3583; SSE-NEXT:    pxor %xmm5, %xmm1
3584; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3585; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3586; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3587; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3588; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3589; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3590; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3591; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3592; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3593; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3594; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3595; SSE-NEXT:    retq
3596;
3597; AVX1-LABEL: trunc_xor_v8i64_v8i16:
3598; AVX1:       # %bb.0:
3599; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3600; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3601; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3602; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3603; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3604; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3605; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3606; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3607; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3608; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3609; AVX1-NEXT:    vzeroupper
3610; AVX1-NEXT:    retq
3611;
3612; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
3613; AVX2-SLOW:       # %bb.0:
3614; AVX2-SLOW-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3615; AVX2-SLOW-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3616; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
3617; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3618; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
3619; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3620; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3621; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3622; AVX2-SLOW-NEXT:    vzeroupper
3623; AVX2-SLOW-NEXT:    retq
3624;
3625; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16:
3626; AVX2-FAST:       # %bb.0:
3627; AVX2-FAST-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3628; AVX2-FAST-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3629; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3630; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
3631; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
3632; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3633; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3634; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3635; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3636; AVX2-FAST-NEXT:    vzeroupper
3637; AVX2-FAST-NEXT:    retq
3638;
3639; AVX512-LABEL: trunc_xor_v8i64_v8i16:
3640; AVX512:       # %bb.0:
3641; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
3642; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
3643; AVX512-NEXT:    vzeroupper
3644; AVX512-NEXT:    retq
3645  %1 = xor <8 x i64> %a0, %a1
3646  %2 = trunc <8 x i64> %1 to <8 x i16>
3647  ret <8 x i16> %2
3648}
3649
3650define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
3651; SSE-LABEL: trunc_xor_v8i32_v8i16:
3652; SSE:       # %bb.0:
3653; SSE-NEXT:    pxor %xmm2, %xmm0
3654; SSE-NEXT:    pxor %xmm3, %xmm1
3655; SSE-NEXT:    pslld $16, %xmm1
3656; SSE-NEXT:    psrad $16, %xmm1
3657; SSE-NEXT:    pslld $16, %xmm0
3658; SSE-NEXT:    psrad $16, %xmm0
3659; SSE-NEXT:    packssdw %xmm1, %xmm0
3660; SSE-NEXT:    retq
3661;
3662; AVX1-LABEL: trunc_xor_v8i32_v8i16:
3663; AVX1:       # %bb.0:
3664; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3665; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3666; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
3667; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
3668; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
3669; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3670; AVX1-NEXT:    vzeroupper
3671; AVX1-NEXT:    retq
3672;
3673; AVX2-LABEL: trunc_xor_v8i32_v8i16:
3674; AVX2:       # %bb.0:
3675; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3676; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
3677; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3678; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3679; AVX2-NEXT:    vzeroupper
3680; AVX2-NEXT:    retq
3681;
3682; AVX512-LABEL: trunc_xor_v8i32_v8i16:
3683; AVX512:       # %bb.0:
3684; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3685; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
3686; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3687; AVX512-NEXT:    vzeroupper
3688; AVX512-NEXT:    retq
3689  %1 = xor <8 x i32> %a0, %a1
3690  %2 = trunc <8 x i32> %1 to <8 x i16>
3691  ret <8 x i16> %2
3692}
3693
3694define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
3695; SSE-LABEL: trunc_xor_v16i64_v16i8:
3696; SSE:       # %bb.0:
3697; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm0
3698; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm1
3699; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm2
3700; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm3
3701; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm4
3702; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm5
3703; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm6
3704; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm7
3705; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3706; SSE-NEXT:    pand %xmm8, %xmm7
3707; SSE-NEXT:    pand %xmm8, %xmm6
3708; SSE-NEXT:    packuswb %xmm7, %xmm6
3709; SSE-NEXT:    pand %xmm8, %xmm5
3710; SSE-NEXT:    pand %xmm8, %xmm4
3711; SSE-NEXT:    packuswb %xmm5, %xmm4
3712; SSE-NEXT:    packuswb %xmm6, %xmm4
3713; SSE-NEXT:    pand %xmm8, %xmm3
3714; SSE-NEXT:    pand %xmm8, %xmm2
3715; SSE-NEXT:    packuswb %xmm3, %xmm2
3716; SSE-NEXT:    pand %xmm8, %xmm1
3717; SSE-NEXT:    pand %xmm8, %xmm0
3718; SSE-NEXT:    packuswb %xmm1, %xmm0
3719; SSE-NEXT:    packuswb %xmm2, %xmm0
3720; SSE-NEXT:    packuswb %xmm4, %xmm0
3721; SSE-NEXT:    retq
3722;
3723; AVX1-LABEL: trunc_xor_v16i64_v16i8:
3724; AVX1:       # %bb.0:
3725; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
3726; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
3727; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
3728; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
3729; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
3730; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
3731; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
3732; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
3733; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
3734; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
3735; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
3736; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
3737; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
3738; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3739; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3740; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
3741; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
3742; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
3743; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
3744; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3745; AVX1-NEXT:    vzeroupper
3746; AVX1-NEXT:    retq
3747;
3748; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
3749; AVX2-SLOW:       # %bb.0:
3750; AVX2-SLOW-NEXT:    vxorps %ymm5, %ymm1, %ymm1
3751; AVX2-SLOW-NEXT:    vxorps %ymm4, %ymm0, %ymm0
3752; AVX2-SLOW-NEXT:    vxorps %ymm7, %ymm3, %ymm3
3753; AVX2-SLOW-NEXT:    vxorps %ymm6, %ymm2, %ymm2
3754; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
3755; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
3756; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
3757; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3758; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3759; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3760; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
3761; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
3762; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
3763; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
3764; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
3765; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3766; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3767; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
3768; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3769; AVX2-SLOW-NEXT:    vzeroupper
3770; AVX2-SLOW-NEXT:    retq
3771;
3772; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8:
3773; AVX2-FAST:       # %bb.0:
3774; AVX2-FAST-NEXT:    vpxor %ymm5, %ymm1, %ymm1
3775; AVX2-FAST-NEXT:    vpxor %ymm4, %ymm0, %ymm0
3776; AVX2-FAST-NEXT:    vpxor %ymm7, %ymm3, %ymm3
3777; AVX2-FAST-NEXT:    vpxor %ymm6, %ymm2, %ymm2
3778; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
3779; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
3780; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
3781; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
3782; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3783; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
3784; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
3785; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
3786; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
3787; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
3788; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
3789; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
3790; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
3791; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3792; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
3793; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
3794; AVX2-FAST-NEXT:    vzeroupper
3795; AVX2-FAST-NEXT:    retq
3796;
3797; AVX512-LABEL: trunc_xor_v16i64_v16i8:
3798; AVX512:       # %bb.0:
3799; AVX512-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
3800; AVX512-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
3801; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
3802; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
3803; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3804; AVX512-NEXT:    vzeroupper
3805; AVX512-NEXT:    retq
3806  %1 = xor <16 x i64> %a0, %a1
3807  %2 = trunc <16 x i64> %1 to <16 x i8>
3808  ret <16 x i8> %2
3809}
3810
3811define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
3812; SSE-LABEL: trunc_xor_v16i32_v16i8:
3813; SSE:       # %bb.0:
3814; SSE-NEXT:    pxor %xmm4, %xmm0
3815; SSE-NEXT:    pxor %xmm5, %xmm1
3816; SSE-NEXT:    pxor %xmm6, %xmm2
3817; SSE-NEXT:    pxor %xmm7, %xmm3
3818; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
3819; SSE-NEXT:    pand %xmm4, %xmm3
3820; SSE-NEXT:    pand %xmm4, %xmm2
3821; SSE-NEXT:    packuswb %xmm3, %xmm2
3822; SSE-NEXT:    pand %xmm4, %xmm1
3823; SSE-NEXT:    pand %xmm4, %xmm0
3824; SSE-NEXT:    packuswb %xmm1, %xmm0
3825; SSE-NEXT:    packuswb %xmm2, %xmm0
3826; SSE-NEXT:    retq
3827;
3828; AVX1-LABEL: trunc_xor_v16i32_v16i8:
3829; AVX1:       # %bb.0:
3830; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
3831; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
3832; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
3833; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3834; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3835; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3836; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3837; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3838; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
3839; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3840; AVX1-NEXT:    vzeroupper
3841; AVX1-NEXT:    retq
3842;
3843; AVX2-LABEL: trunc_xor_v16i32_v16i8:
3844; AVX2:       # %bb.0:
3845; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
3846; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
3847; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
3848; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
3849; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
3850; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
3851; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
3852; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
3853; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3854; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
3855; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3856; AVX2-NEXT:    vzeroupper
3857; AVX2-NEXT:    retq
3858;
3859; AVX512-LABEL: trunc_xor_v16i32_v16i8:
3860; AVX512:       # %bb.0:
3861; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
3862; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
3863; AVX512-NEXT:    vzeroupper
3864; AVX512-NEXT:    retq
3865  %1 = xor <16 x i32> %a0, %a1
3866  %2 = trunc <16 x i32> %1 to <16 x i8>
3867  ret <16 x i8> %2
3868}
3869
3870define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
3871; SSE-LABEL: trunc_xor_v16i16_v16i8:
3872; SSE:       # %bb.0:
3873; SSE-NEXT:    pxor %xmm2, %xmm0
3874; SSE-NEXT:    pxor %xmm3, %xmm1
3875; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
3876; SSE-NEXT:    pand %xmm2, %xmm1
3877; SSE-NEXT:    pand %xmm2, %xmm0
3878; SSE-NEXT:    packuswb %xmm1, %xmm0
3879; SSE-NEXT:    retq
3880;
3881; AVX1-LABEL: trunc_xor_v16i16_v16i8:
3882; AVX1:       # %bb.0:
3883; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3884; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
3885; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3886; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3887; AVX1-NEXT:    vzeroupper
3888; AVX1-NEXT:    retq
3889;
3890; AVX2-LABEL: trunc_xor_v16i16_v16i8:
3891; AVX2:       # %bb.0:
3892; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3893; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
3894; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
3895; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
3896; AVX2-NEXT:    vzeroupper
3897; AVX2-NEXT:    retq
3898;
3899; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
3900; AVX512F:       # %bb.0:
3901; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3902; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3903; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
3904; AVX512F-NEXT:    vzeroupper
3905; AVX512F-NEXT:    retq
3906;
3907; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
3908; AVX512BW:       # %bb.0:
3909; AVX512BW-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3910; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
3911; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3912; AVX512BW-NEXT:    vzeroupper
3913; AVX512BW-NEXT:    retq
3914;
3915; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
3916; AVX512DQ:       # %bb.0:
3917; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
3918; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
3919; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
3920; AVX512DQ-NEXT:    vzeroupper
3921; AVX512DQ-NEXT:    retq
3922  %1 = xor <16 x i16> %a0, %a1
3923  %2 = trunc <16 x i16> %1 to <16 x i8>
3924  ret <16 x i8> %2
3925}
3926
3927;
3928; xor to constant
3929;
3930
3931define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
3932; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
3933; SSE:       # %bb.0:
3934; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3935; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
3936; SSE-NEXT:    retq
3937;
3938; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
3939; AVX1:       # %bb.0:
3940; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
3941; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3942; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
3943; AVX1-NEXT:    vzeroupper
3944; AVX1-NEXT:    retq
3945;
3946; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
3947; AVX2-SLOW:       # %bb.0:
3948; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
3949; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3950; AVX2-SLOW-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
3951; AVX2-SLOW-NEXT:    vzeroupper
3952; AVX2-SLOW-NEXT:    retq
3953;
3954; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32:
3955; AVX2-FAST:       # %bb.0:
3956; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
3957; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
3958; AVX2-FAST-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
3959; AVX2-FAST-NEXT:    vzeroupper
3960; AVX2-FAST-NEXT:    retq
3961;
3962; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
3963; AVX512:       # %bb.0:
3964; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
3965; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
3966; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
3967; AVX512-NEXT:    vzeroupper
3968; AVX512-NEXT:    retq
3969  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
3970  %2 = trunc <4 x i64> %1 to <4 x i32>
3971  ret <4 x i32> %2
3972}
3973
3974define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
3975; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
3976; SSE:       # %bb.0:
3977; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3978; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3979; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3980; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
3981; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3982; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3983; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
3984; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3985; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3986; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3987; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
3988; SSE-NEXT:    xorpd {{.*}}(%rip), %xmm0
3989; SSE-NEXT:    retq
3990;
3991; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
3992; AVX1:       # %bb.0:
3993; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
3994; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
3995; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
3996; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
3997; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
3998; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
3999; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4000; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4001; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4002; AVX1-NEXT:    vzeroupper
4003; AVX1-NEXT:    retq
4004;
4005; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
4006; AVX2-SLOW:       # %bb.0:
4007; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
4008; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4009; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
4010; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4011; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4012; AVX2-SLOW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4013; AVX2-SLOW-NEXT:    vzeroupper
4014; AVX2-SLOW-NEXT:    retq
4015;
4016; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16:
4017; AVX2-FAST:       # %bb.0:
4018; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
4019; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
4020; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
4021; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4022; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4023; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4024; AVX2-FAST-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4025; AVX2-FAST-NEXT:    vzeroupper
4026; AVX2-FAST-NEXT:    retq
4027;
4028; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
4029; AVX512:       # %bb.0:
4030; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4031; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4032; AVX512-NEXT:    vzeroupper
4033; AVX512-NEXT:    retq
4034  %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4035  %2 = trunc <8 x i64> %1 to <8 x i16>
4036  ret <8 x i16> %2
4037}
4038
4039define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4040; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
4041; SSE:       # %bb.0:
4042; SSE-NEXT:    pslld $16, %xmm1
4043; SSE-NEXT:    psrad $16, %xmm1
4044; SSE-NEXT:    pslld $16, %xmm0
4045; SSE-NEXT:    psrad $16, %xmm0
4046; SSE-NEXT:    packssdw %xmm1, %xmm0
4047; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4048; SSE-NEXT:    retq
4049;
4050; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
4051; AVX1:       # %bb.0:
4052; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4053; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
4054; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4055; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4056; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4057; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4058; AVX1-NEXT:    vzeroupper
4059; AVX1-NEXT:    retq
4060;
4061; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
4062; AVX2:       # %bb.0:
4063; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4064; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4065; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4066; AVX2-NEXT:    vzeroupper
4067; AVX2-NEXT:    retq
4068;
4069; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
4070; AVX512:       # %bb.0:
4071; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4072; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4073; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4074; AVX512-NEXT:    vzeroupper
4075; AVX512-NEXT:    retq
4076  %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4077  %2 = trunc <8 x i32> %1 to <8 x i16>
4078  ret <8 x i16> %2
4079}
4080
4081define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4082; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
4083; SSE:       # %bb.0:
4084; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4085; SSE-NEXT:    pand %xmm8, %xmm7
4086; SSE-NEXT:    pand %xmm8, %xmm6
4087; SSE-NEXT:    packuswb %xmm7, %xmm6
4088; SSE-NEXT:    pand %xmm8, %xmm5
4089; SSE-NEXT:    pand %xmm8, %xmm4
4090; SSE-NEXT:    packuswb %xmm5, %xmm4
4091; SSE-NEXT:    packuswb %xmm6, %xmm4
4092; SSE-NEXT:    pand %xmm8, %xmm3
4093; SSE-NEXT:    pand %xmm8, %xmm2
4094; SSE-NEXT:    packuswb %xmm3, %xmm2
4095; SSE-NEXT:    pand %xmm8, %xmm1
4096; SSE-NEXT:    pand %xmm8, %xmm0
4097; SSE-NEXT:    packuswb %xmm1, %xmm0
4098; SSE-NEXT:    packuswb %xmm2, %xmm0
4099; SSE-NEXT:    packuswb %xmm4, %xmm0
4100; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4101; SSE-NEXT:    retq
4102;
4103; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
4104; AVX1:       # %bb.0:
4105; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4106; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
4107; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
4108; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
4109; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
4110; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
4111; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
4112; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
4113; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
4114; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4115; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4116; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
4117; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4118; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
4119; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4120; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4121; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4122; AVX1-NEXT:    vzeroupper
4123; AVX1-NEXT:    retq
4124;
4125; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
4126; AVX2-SLOW:       # %bb.0:
4127; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
4128; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
4129; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
4130; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4131; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4132; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4133; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
4134; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
4135; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
4136; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4137; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
4138; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4139; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4140; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
4141; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4142; AVX2-SLOW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4143; AVX2-SLOW-NEXT:    vzeroupper
4144; AVX2-SLOW-NEXT:    retq
4145;
4146; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8:
4147; AVX2-FAST:       # %bb.0:
4148; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4149; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
4150; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
4151; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
4152; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4153; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4154; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4155; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
4156; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
4157; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
4158; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
4159; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4160; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4161; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4162; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
4163; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4164; AVX2-FAST-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4165; AVX2-FAST-NEXT:    vzeroupper
4166; AVX2-FAST-NEXT:    retq
4167;
4168; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
4169; AVX512:       # %bb.0:
4170; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
4171; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
4172; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4173; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4174; AVX512-NEXT:    vzeroupper
4175; AVX512-NEXT:    retq
4176  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4177  %2 = trunc <16 x i64> %1 to <16 x i8>
4178  ret <16 x i8> %2
4179}
4180
4181define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4182; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
4183; SSE:       # %bb.0:
4184; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4185; SSE-NEXT:    pand %xmm4, %xmm3
4186; SSE-NEXT:    pand %xmm4, %xmm2
4187; SSE-NEXT:    packuswb %xmm3, %xmm2
4188; SSE-NEXT:    pand %xmm4, %xmm1
4189; SSE-NEXT:    pand %xmm4, %xmm0
4190; SSE-NEXT:    packuswb %xmm1, %xmm0
4191; SSE-NEXT:    packuswb %xmm2, %xmm0
4192; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4193; SSE-NEXT:    retq
4194;
4195; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
4196; AVX1:       # %bb.0:
4197; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4198; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4199; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4200; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4201; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4202; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4203; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4204; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4205; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4206; AVX1-NEXT:    vzeroupper
4207; AVX1-NEXT:    retq
4208;
4209; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
4210; AVX2:       # %bb.0:
4211; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4212; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4213; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4214; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
4215; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
4216; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4217; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4218; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
4219; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4220; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4221; AVX2-NEXT:    vzeroupper
4222; AVX2-NEXT:    retq
4223;
4224; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
4225; AVX512:       # %bb.0:
4226; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4227; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4228; AVX512-NEXT:    vzeroupper
4229; AVX512-NEXT:    retq
4230  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4231  %2 = trunc <16 x i32> %1 to <16 x i8>
4232  ret <16 x i8> %2
4233}
4234
4235define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4236; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
4237; SSE:       # %bb.0:
4238; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4239; SSE-NEXT:    pand %xmm2, %xmm1
4240; SSE-NEXT:    pand %xmm2, %xmm0
4241; SSE-NEXT:    packuswb %xmm1, %xmm0
4242; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
4243; SSE-NEXT:    retq
4244;
4245; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
4246; AVX1:       # %bb.0:
4247; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
4248; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4249; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4250; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4251; AVX1-NEXT:    vzeroupper
4252; AVX1-NEXT:    retq
4253;
4254; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
4255; AVX2:       # %bb.0:
4256; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
4257; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4258; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4259; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4260; AVX2-NEXT:    vzeroupper
4261; AVX2-NEXT:    retq
4262;
4263; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
4264; AVX512F:       # %bb.0:
4265; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4266; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4267; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4268; AVX512F-NEXT:    vzeroupper
4269; AVX512F-NEXT:    retq
4270;
4271; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
4272; AVX512BW:       # %bb.0:
4273; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4274; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4275; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4276; AVX512BW-NEXT:    vzeroupper
4277; AVX512BW-NEXT:    retq
4278;
4279; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
4280; AVX512DQ:       # %bb.0:
4281; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4282; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4283; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
4284; AVX512DQ-NEXT:    vzeroupper
4285; AVX512DQ-NEXT:    retq
4286  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
4287  %2 = trunc <16 x i16> %1 to <16 x i8>
4288  ret <16 x i8> %2
4289}
4290
4291;
4292; or
4293;
4294
4295define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
4296; SSE-LABEL: trunc_or_v4i64_v4i32:
4297; SSE:       # %bb.0:
4298; SSE-NEXT:    orps %xmm3, %xmm1
4299; SSE-NEXT:    orps %xmm2, %xmm0
4300; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4301; SSE-NEXT:    retq
4302;
4303; AVX1-LABEL: trunc_or_v4i64_v4i32:
4304; AVX1:       # %bb.0:
4305; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4306; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4307; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4308; AVX1-NEXT:    vzeroupper
4309; AVX1-NEXT:    retq
4310;
4311; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
4312; AVX2-SLOW:       # %bb.0:
4313; AVX2-SLOW-NEXT:    vorps %ymm1, %ymm0, %ymm0
4314; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
4315; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4316; AVX2-SLOW-NEXT:    vzeroupper
4317; AVX2-SLOW-NEXT:    retq
4318;
4319; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32:
4320; AVX2-FAST:       # %bb.0:
4321; AVX2-FAST-NEXT:    vorps %ymm1, %ymm0, %ymm0
4322; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
4323; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
4324; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4325; AVX2-FAST-NEXT:    vzeroupper
4326; AVX2-FAST-NEXT:    retq
4327;
4328; AVX512-LABEL: trunc_or_v4i64_v4i32:
4329; AVX512:       # %bb.0:
4330; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
4331; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4332; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4333; AVX512-NEXT:    vzeroupper
4334; AVX512-NEXT:    retq
4335  %1 = or <4 x i64> %a0, %a1
4336  %2 = trunc <4 x i64> %1 to <4 x i32>
4337  ret <4 x i32> %2
4338}
4339
4340define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
4341; SSE-LABEL: trunc_or_v8i64_v8i16:
4342; SSE:       # %bb.0:
4343; SSE-NEXT:    por %xmm6, %xmm2
4344; SSE-NEXT:    por %xmm7, %xmm3
4345; SSE-NEXT:    por %xmm4, %xmm0
4346; SSE-NEXT:    por %xmm5, %xmm1
4347; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4348; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4349; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4350; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4351; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4352; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4353; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4354; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4355; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4356; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4357; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4358; SSE-NEXT:    retq
4359;
4360; AVX1-LABEL: trunc_or_v8i64_v8i16:
4361; AVX1:       # %bb.0:
4362; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4363; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4364; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4365; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4366; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4367; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4368; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4369; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4370; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4371; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4372; AVX1-NEXT:    vzeroupper
4373; AVX1-NEXT:    retq
4374;
4375; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
4376; AVX2-SLOW:       # %bb.0:
4377; AVX2-SLOW-NEXT:    vorps %ymm3, %ymm1, %ymm1
4378; AVX2-SLOW-NEXT:    vorps %ymm2, %ymm0, %ymm0
4379; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
4380; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4381; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
4382; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4383; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4384; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4385; AVX2-SLOW-NEXT:    vzeroupper
4386; AVX2-SLOW-NEXT:    retq
4387;
4388; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16:
4389; AVX2-FAST:       # %bb.0:
4390; AVX2-FAST-NEXT:    vpor %ymm3, %ymm1, %ymm1
4391; AVX2-FAST-NEXT:    vpor %ymm2, %ymm0, %ymm0
4392; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
4393; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
4394; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
4395; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4396; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4397; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4398; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4399; AVX2-FAST-NEXT:    vzeroupper
4400; AVX2-FAST-NEXT:    retq
4401;
4402; AVX512-LABEL: trunc_or_v8i64_v8i16:
4403; AVX512:       # %bb.0:
4404; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
4405; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4406; AVX512-NEXT:    vzeroupper
4407; AVX512-NEXT:    retq
4408  %1 = or <8 x i64> %a0, %a1
4409  %2 = trunc <8 x i64> %1 to <8 x i16>
4410  ret <8 x i16> %2
4411}
4412
4413define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
4414; SSE-LABEL: trunc_or_v8i32_v8i16:
4415; SSE:       # %bb.0:
4416; SSE-NEXT:    por %xmm2, %xmm0
4417; SSE-NEXT:    por %xmm3, %xmm1
4418; SSE-NEXT:    pslld $16, %xmm1
4419; SSE-NEXT:    psrad $16, %xmm1
4420; SSE-NEXT:    pslld $16, %xmm0
4421; SSE-NEXT:    psrad $16, %xmm0
4422; SSE-NEXT:    packssdw %xmm1, %xmm0
4423; SSE-NEXT:    retq
4424;
4425; AVX1-LABEL: trunc_or_v8i32_v8i16:
4426; AVX1:       # %bb.0:
4427; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4428; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4429; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
4430; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4431; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4432; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4433; AVX1-NEXT:    vzeroupper
4434; AVX1-NEXT:    retq
4435;
4436; AVX2-LABEL: trunc_or_v8i32_v8i16:
4437; AVX2:       # %bb.0:
4438; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4439; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4440; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4441; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4442; AVX2-NEXT:    vzeroupper
4443; AVX2-NEXT:    retq
4444;
4445; AVX512-LABEL: trunc_or_v8i32_v8i16:
4446; AVX512:       # %bb.0:
4447; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
4448; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4449; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4450; AVX512-NEXT:    vzeroupper
4451; AVX512-NEXT:    retq
4452  %1 = or <8 x i32> %a0, %a1
4453  %2 = trunc <8 x i32> %1 to <8 x i16>
4454  ret <8 x i16> %2
4455}
4456
4457define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
4458; SSE-LABEL: trunc_or_v16i64_v16i8:
4459; SSE:       # %bb.0:
4460; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
4461; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
4462; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm2
4463; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm3
4464; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm4
4465; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm5
4466; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm6
4467; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm7
4468; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4469; SSE-NEXT:    pand %xmm8, %xmm7
4470; SSE-NEXT:    pand %xmm8, %xmm6
4471; SSE-NEXT:    packuswb %xmm7, %xmm6
4472; SSE-NEXT:    pand %xmm8, %xmm5
4473; SSE-NEXT:    pand %xmm8, %xmm4
4474; SSE-NEXT:    packuswb %xmm5, %xmm4
4475; SSE-NEXT:    packuswb %xmm6, %xmm4
4476; SSE-NEXT:    pand %xmm8, %xmm3
4477; SSE-NEXT:    pand %xmm8, %xmm2
4478; SSE-NEXT:    packuswb %xmm3, %xmm2
4479; SSE-NEXT:    pand %xmm8, %xmm1
4480; SSE-NEXT:    pand %xmm8, %xmm0
4481; SSE-NEXT:    packuswb %xmm1, %xmm0
4482; SSE-NEXT:    packuswb %xmm2, %xmm0
4483; SSE-NEXT:    packuswb %xmm4, %xmm0
4484; SSE-NEXT:    retq
4485;
4486; AVX1-LABEL: trunc_or_v16i64_v16i8:
4487; AVX1:       # %bb.0:
4488; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
4489; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
4490; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
4491; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
4492; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4493; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
4494; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
4495; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
4496; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
4497; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
4498; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
4499; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
4500; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
4501; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4502; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4503; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
4504; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4505; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
4506; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4507; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4508; AVX1-NEXT:    vzeroupper
4509; AVX1-NEXT:    retq
4510;
4511; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
4512; AVX2-SLOW:       # %bb.0:
4513; AVX2-SLOW-NEXT:    vorps %ymm5, %ymm1, %ymm1
4514; AVX2-SLOW-NEXT:    vorps %ymm4, %ymm0, %ymm0
4515; AVX2-SLOW-NEXT:    vorps %ymm7, %ymm3, %ymm3
4516; AVX2-SLOW-NEXT:    vorps %ymm6, %ymm2, %ymm2
4517; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
4518; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
4519; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
4520; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4521; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4522; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4523; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
4524; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
4525; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
4526; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4527; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
4528; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4529; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4530; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
4531; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4532; AVX2-SLOW-NEXT:    vzeroupper
4533; AVX2-SLOW-NEXT:    retq
4534;
4535; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8:
4536; AVX2-FAST:       # %bb.0:
4537; AVX2-FAST-NEXT:    vpor %ymm5, %ymm1, %ymm1
4538; AVX2-FAST-NEXT:    vpor %ymm4, %ymm0, %ymm0
4539; AVX2-FAST-NEXT:    vpor %ymm7, %ymm3, %ymm3
4540; AVX2-FAST-NEXT:    vpor %ymm6, %ymm2, %ymm2
4541; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4542; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
4543; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
4544; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
4545; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4546; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4547; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4548; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
4549; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
4550; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
4551; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
4552; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4553; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4554; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4555; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
4556; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4557; AVX2-FAST-NEXT:    vzeroupper
4558; AVX2-FAST-NEXT:    retq
4559;
4560; AVX512-LABEL: trunc_or_v16i64_v16i8:
4561; AVX512:       # %bb.0:
4562; AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
4563; AVX512-NEXT:    vporq %zmm3, %zmm1, %zmm1
4564; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
4565; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
4566; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4567; AVX512-NEXT:    vzeroupper
4568; AVX512-NEXT:    retq
4569  %1 = or <16 x i64> %a0, %a1
4570  %2 = trunc <16 x i64> %1 to <16 x i8>
4571  ret <16 x i8> %2
4572}
4573
4574define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
4575; SSE-LABEL: trunc_or_v16i32_v16i8:
4576; SSE:       # %bb.0:
4577; SSE-NEXT:    por %xmm4, %xmm0
4578; SSE-NEXT:    por %xmm5, %xmm1
4579; SSE-NEXT:    por %xmm6, %xmm2
4580; SSE-NEXT:    por %xmm7, %xmm3
4581; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4582; SSE-NEXT:    pand %xmm4, %xmm3
4583; SSE-NEXT:    pand %xmm4, %xmm2
4584; SSE-NEXT:    packuswb %xmm3, %xmm2
4585; SSE-NEXT:    pand %xmm4, %xmm1
4586; SSE-NEXT:    pand %xmm4, %xmm0
4587; SSE-NEXT:    packuswb %xmm1, %xmm0
4588; SSE-NEXT:    packuswb %xmm2, %xmm0
4589; SSE-NEXT:    retq
4590;
4591; AVX1-LABEL: trunc_or_v16i32_v16i8:
4592; AVX1:       # %bb.0:
4593; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
4594; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
4595; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4596; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4597; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4598; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4599; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4600; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4601; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4602; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4603; AVX1-NEXT:    vzeroupper
4604; AVX1-NEXT:    retq
4605;
4606; AVX2-LABEL: trunc_or_v16i32_v16i8:
4607; AVX2:       # %bb.0:
4608; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
4609; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
4610; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4611; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4612; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4613; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
4614; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
4615; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4616; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4617; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
4618; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4619; AVX2-NEXT:    vzeroupper
4620; AVX2-NEXT:    retq
4621;
4622; AVX512-LABEL: trunc_or_v16i32_v16i8:
4623; AVX512:       # %bb.0:
4624; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
4625; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4626; AVX512-NEXT:    vzeroupper
4627; AVX512-NEXT:    retq
4628  %1 = or <16 x i32> %a0, %a1
4629  %2 = trunc <16 x i32> %1 to <16 x i8>
4630  ret <16 x i8> %2
4631}
4632
4633define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
4634; SSE-LABEL: trunc_or_v16i16_v16i8:
4635; SSE:       # %bb.0:
4636; SSE-NEXT:    por %xmm2, %xmm0
4637; SSE-NEXT:    por %xmm3, %xmm1
4638; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
4639; SSE-NEXT:    pand %xmm2, %xmm1
4640; SSE-NEXT:    pand %xmm2, %xmm0
4641; SSE-NEXT:    packuswb %xmm1, %xmm0
4642; SSE-NEXT:    retq
4643;
4644; AVX1-LABEL: trunc_or_v16i16_v16i8:
4645; AVX1:       # %bb.0:
4646; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
4647; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
4648; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4649; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4650; AVX1-NEXT:    vzeroupper
4651; AVX1-NEXT:    retq
4652;
4653; AVX2-LABEL: trunc_or_v16i16_v16i8:
4654; AVX2:       # %bb.0:
4655; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
4656; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
4657; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
4658; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4659; AVX2-NEXT:    vzeroupper
4660; AVX2-NEXT:    retq
4661;
4662; AVX512F-LABEL: trunc_or_v16i16_v16i8:
4663; AVX512F:       # %bb.0:
4664; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
4665; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4666; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
4667; AVX512F-NEXT:    vzeroupper
4668; AVX512F-NEXT:    retq
4669;
4670; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
4671; AVX512BW:       # %bb.0:
4672; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
4673; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
4674; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
4675; AVX512BW-NEXT:    vzeroupper
4676; AVX512BW-NEXT:    retq
4677;
4678; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
4679; AVX512DQ:       # %bb.0:
4680; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
4681; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
4682; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
4683; AVX512DQ-NEXT:    vzeroupper
4684; AVX512DQ-NEXT:    retq
4685  %1 = or <16 x i16> %a0, %a1
4686  %2 = trunc <16 x i16> %1 to <16 x i8>
4687  ret <16 x i8> %2
4688}
4689
4690;
4691; or to constant
4692;
4693
4694define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
4695; SSE-LABEL: trunc_or_const_v4i64_v4i32:
4696; SSE:       # %bb.0:
4697; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4698; SSE-NEXT:    orps {{.*}}(%rip), %xmm0
4699; SSE-NEXT:    retq
4700;
4701; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
4702; AVX1:       # %bb.0:
4703; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4704; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4705; AVX1-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
4706; AVX1-NEXT:    vzeroupper
4707; AVX1-NEXT:    retq
4708;
4709; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
4710; AVX2-SLOW:       # %bb.0:
4711; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
4712; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4713; AVX2-SLOW-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
4714; AVX2-SLOW-NEXT:    vzeroupper
4715; AVX2-SLOW-NEXT:    retq
4716;
4717; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32:
4718; AVX2-FAST:       # %bb.0:
4719; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
4720; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
4721; AVX2-FAST-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
4722; AVX2-FAST-NEXT:    vzeroupper
4723; AVX2-FAST-NEXT:    retq
4724;
4725; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
4726; AVX512:       # %bb.0:
4727; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4728; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
4729; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4730; AVX512-NEXT:    vzeroupper
4731; AVX512-NEXT:    retq
4732  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
4733  %2 = trunc <4 x i64> %1 to <4 x i32>
4734  ret <4 x i32> %2
4735}
4736
4737define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
4738; SSE-LABEL: trunc_or_const_v8i64_v8i16:
4739; SSE:       # %bb.0:
4740; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4741; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4742; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4743; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
4744; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4745; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4746; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
4747; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4748; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4749; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4750; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
4751; SSE-NEXT:    orpd {{.*}}(%rip), %xmm0
4752; SSE-NEXT:    retq
4753;
4754; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
4755; AVX1:       # %bb.0:
4756; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
4757; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4758; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4759; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4760; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4761; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4762; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4763; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4764; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4765; AVX1-NEXT:    vzeroupper
4766; AVX1-NEXT:    retq
4767;
4768; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
4769; AVX2-SLOW:       # %bb.0:
4770; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
4771; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4772; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
4773; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4774; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4775; AVX2-SLOW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4776; AVX2-SLOW-NEXT:    vzeroupper
4777; AVX2-SLOW-NEXT:    retq
4778;
4779; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16:
4780; AVX2-FAST:       # %bb.0:
4781; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
4782; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
4783; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
4784; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4785; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4786; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4787; AVX2-FAST-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4788; AVX2-FAST-NEXT:    vzeroupper
4789; AVX2-FAST-NEXT:    retq
4790;
4791; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
4792; AVX512:       # %bb.0:
4793; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
4794; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4795; AVX512-NEXT:    vzeroupper
4796; AVX512-NEXT:    retq
4797  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
4798  %2 = trunc <8 x i64> %1 to <8 x i16>
4799  ret <8 x i16> %2
4800}
4801
4802define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
4803; SSE-LABEL: trunc_or_const_v8i32_v8i16:
4804; SSE:       # %bb.0:
4805; SSE-NEXT:    pslld $16, %xmm1
4806; SSE-NEXT:    psrad $16, %xmm1
4807; SSE-NEXT:    pslld $16, %xmm0
4808; SSE-NEXT:    psrad $16, %xmm0
4809; SSE-NEXT:    packssdw %xmm1, %xmm0
4810; SSE-NEXT:    por {{.*}}(%rip), %xmm0
4811; SSE-NEXT:    retq
4812;
4813; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
4814; AVX1:       # %bb.0:
4815; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
4816; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
4817; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
4818; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
4819; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4820; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4821; AVX1-NEXT:    vzeroupper
4822; AVX1-NEXT:    retq
4823;
4824; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
4825; AVX2:       # %bb.0:
4826; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
4827; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4828; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4829; AVX2-NEXT:    vzeroupper
4830; AVX2-NEXT:    retq
4831;
4832; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
4833; AVX512:       # %bb.0:
4834; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
4835; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
4836; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4837; AVX512-NEXT:    vzeroupper
4838; AVX512-NEXT:    retq
4839  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4840  %2 = trunc <8 x i32> %1 to <8 x i16>
4841  ret <8 x i16> %2
4842}
4843
4844define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
4845; SSE-LABEL: trunc_or_const_v16i64_v16i8:
4846; SSE:       # %bb.0:
4847; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
4848; SSE-NEXT:    pand %xmm8, %xmm7
4849; SSE-NEXT:    pand %xmm8, %xmm6
4850; SSE-NEXT:    packuswb %xmm7, %xmm6
4851; SSE-NEXT:    pand %xmm8, %xmm5
4852; SSE-NEXT:    pand %xmm8, %xmm4
4853; SSE-NEXT:    packuswb %xmm5, %xmm4
4854; SSE-NEXT:    packuswb %xmm6, %xmm4
4855; SSE-NEXT:    pand %xmm8, %xmm3
4856; SSE-NEXT:    pand %xmm8, %xmm2
4857; SSE-NEXT:    packuswb %xmm3, %xmm2
4858; SSE-NEXT:    pand %xmm8, %xmm1
4859; SSE-NEXT:    pand %xmm8, %xmm0
4860; SSE-NEXT:    packuswb %xmm1, %xmm0
4861; SSE-NEXT:    packuswb %xmm2, %xmm0
4862; SSE-NEXT:    packuswb %xmm4, %xmm0
4863; SSE-NEXT:    por {{.*}}(%rip), %xmm0
4864; SSE-NEXT:    retq
4865;
4866; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
4867; AVX1:       # %bb.0:
4868; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
4869; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
4870; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
4871; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
4872; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
4873; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
4874; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
4875; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
4876; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
4877; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4878; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4879; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
4880; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
4881; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
4882; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
4883; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4884; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4885; AVX1-NEXT:    vzeroupper
4886; AVX1-NEXT:    retq
4887;
4888; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
4889; AVX2-SLOW:       # %bb.0:
4890; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
4891; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
4892; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6]
4893; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4894; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4895; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4896; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
4897; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm2, %xmm2
4898; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3]
4899; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
4900; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6]
4901; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4902; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4903; AVX2-SLOW-NEXT:    vpand %xmm4, %xmm0, %xmm0
4904; AVX2-SLOW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4905; AVX2-SLOW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4906; AVX2-SLOW-NEXT:    vzeroupper
4907; AVX2-SLOW-NEXT:    retq
4908;
4909; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8:
4910; AVX2-FAST:       # %bb.0:
4911; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
4912; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
4913; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
4914; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
4915; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4916; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
4917; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
4918; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
4919; AVX2-FAST-NEXT:    vpand %xmm5, %xmm2, %xmm2
4920; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
4921; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
4922; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
4923; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
4924; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4925; AVX2-FAST-NEXT:    vpand %xmm5, %xmm0, %xmm0
4926; AVX2-FAST-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
4927; AVX2-FAST-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4928; AVX2-FAST-NEXT:    vzeroupper
4929; AVX2-FAST-NEXT:    retq
4930;
4931; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
4932; AVX512:       # %bb.0:
4933; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
4934; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
4935; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4936; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4937; AVX512-NEXT:    vzeroupper
4938; AVX512-NEXT:    retq
4939  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
4940  %2 = trunc <16 x i64> %1 to <16 x i8>
4941  ret <16 x i8> %2
4942}
4943
4944define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
4945; SSE-LABEL: trunc_or_const_v16i32_v16i8:
4946; SSE:       # %bb.0:
4947; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
4948; SSE-NEXT:    pand %xmm4, %xmm3
4949; SSE-NEXT:    pand %xmm4, %xmm2
4950; SSE-NEXT:    packuswb %xmm3, %xmm2
4951; SSE-NEXT:    pand %xmm4, %xmm1
4952; SSE-NEXT:    pand %xmm4, %xmm0
4953; SSE-NEXT:    packuswb %xmm1, %xmm0
4954; SSE-NEXT:    packuswb %xmm2, %xmm0
4955; SSE-NEXT:    por {{.*}}(%rip), %xmm0
4956; SSE-NEXT:    retq
4957;
4958; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
4959; AVX1:       # %bb.0:
4960; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
4961; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
4962; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
4963; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
4964; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
4965; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
4966; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
4967; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4968; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4969; AVX1-NEXT:    vzeroupper
4970; AVX1-NEXT:    retq
4971;
4972; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
4973; AVX2:       # %bb.0:
4974; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
4975; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
4976; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4977; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
4978; AVX2-NEXT:    vpand %xmm3, %xmm1, %xmm1
4979; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
4980; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4981; AVX2-NEXT:    vpand %xmm3, %xmm0, %xmm0
4982; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
4983; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4984; AVX2-NEXT:    vzeroupper
4985; AVX2-NEXT:    retq
4986;
4987; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
4988; AVX512:       # %bb.0:
4989; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
4990; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
4991; AVX512-NEXT:    vzeroupper
4992; AVX512-NEXT:    retq
4993  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
4994  %2 = trunc <16 x i32> %1 to <16 x i8>
4995  ret <16 x i8> %2
4996}
4997
4998define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
4999; SSE-LABEL: trunc_or_const_v16i16_v16i8:
5000; SSE:       # %bb.0:
5001; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
5002; SSE-NEXT:    pand %xmm2, %xmm1
5003; SSE-NEXT:    pand %xmm2, %xmm0
5004; SSE-NEXT:    packuswb %xmm1, %xmm0
5005; SSE-NEXT:    por {{.*}}(%rip), %xmm0
5006; SSE-NEXT:    retq
5007;
5008; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
5009; AVX1:       # %bb.0:
5010; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
5011; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
5012; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5013; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
5014; AVX1-NEXT:    vzeroupper
5015; AVX1-NEXT:    retq
5016;
5017; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
5018; AVX2:       # %bb.0:
5019; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
5020; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
5021; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
5022; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
5023; AVX2-NEXT:    vzeroupper
5024; AVX2-NEXT:    retq
5025;
5026; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
5027; AVX512F:       # %bb.0:
5028; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
5029; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
5030; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
5031; AVX512F-NEXT:    vzeroupper
5032; AVX512F-NEXT:    retq
5033;
5034; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
5035; AVX512BW:       # %bb.0:
5036; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
5037; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
5038; AVX512BW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
5039; AVX512BW-NEXT:    vzeroupper
5040; AVX512BW-NEXT:    retq
5041;
5042; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
5043; AVX512DQ:       # %bb.0:
5044; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
5045; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
5046; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
5047; AVX512DQ-NEXT:    vzeroupper
5048; AVX512DQ-NEXT:    retq
5049  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
5050  %2 = trunc <16 x i16> %1 to <16 x i8>
5051  ret <16 x i8> %2
5052}
5053
5054;
5055; complex patterns - often created by the vectorizer
5056;
5057
5058define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5059; SSE-LABEL: mul_add_const_v4i64_v4i32:
5060; SSE:       # %bb.0:
5061; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5062; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
5063; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
5064; SSE-NEXT:    pmuludq %xmm2, %xmm0
5065; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
5066; SSE-NEXT:    pmuludq %xmm3, %xmm1
5067; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
5068; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
5069; SSE-NEXT:    retq
5070;
5071; AVX-LABEL: mul_add_const_v4i64_v4i32:
5072; AVX:       # %bb.0:
5073; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5074; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
5075; AVX-NEXT:    retq
5076  %1 = sext <4 x i32> %a0 to <4 x i64>
5077  %2 = sext <4 x i32> %a1 to <4 x i64>
5078  %3 = mul <4 x i64> %1, %2
5079  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
5080  %5 = trunc <4 x i64> %4 to <4 x i32>
5081  ret <4 x i32> %5
5082}
5083
5084define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5085; SSE-LABEL: mul_add_self_v4i64_v4i32:
5086; SSE:       # %bb.0:
5087; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5088; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
5089; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
5090; SSE-NEXT:    pmuludq %xmm2, %xmm0
5091; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
5092; SSE-NEXT:    pmuludq %xmm3, %xmm1
5093; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
5094; SSE-NEXT:    paddd %xmm0, %xmm0
5095; SSE-NEXT:    retq
5096;
5097; AVX-LABEL: mul_add_self_v4i64_v4i32:
5098; AVX:       # %bb.0:
5099; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
5100; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
5101; AVX-NEXT:    retq
5102  %1 = sext <4 x i32> %a0 to <4 x i64>
5103  %2 = sext <4 x i32> %a1 to <4 x i64>
5104  %3 = mul <4 x i64> %1, %2
5105  %4 = add <4 x i64> %3, %3
5106  %5 = trunc <4 x i64> %4 to <4 x i32>
5107  ret <4 x i32> %5
5108}
5109
5110define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
5111; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
5112; SSE:       # %bb.0:
5113; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
5114; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
5115; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
5116; SSE-NEXT:    pmuludq %xmm2, %xmm4
5117; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
5118; SSE-NEXT:    pmuludq %xmm3, %xmm1
5119; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
5120; SSE-NEXT:    paddd %xmm4, %xmm0
5121; SSE-NEXT:    retq
5122;
5123; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
5124; AVX:       # %bb.0:
5125; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
5126; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
5127; AVX-NEXT:    retq
5128  %1 = sext <4 x i32> %a0 to <4 x i64>
5129  %2 = sext <4 x i32> %a1 to <4 x i64>
5130  %3 = mul <4 x i64> %1, %2
5131  %4 = add <4 x i64> %1, %3
5132  %5 = trunc <4 x i64> %4 to <4 x i32>
5133  ret <4 x i32> %5
5134}
5135