1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
7
8define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
9; SSE-LABEL: mulhuw_v4i16:
10; SSE:       # %bb.0:
11; SSE-NEXT:    pmulhuw %xmm1, %xmm0
12; SSE-NEXT:    retq
13;
14; AVX-LABEL: mulhuw_v4i16:
15; AVX:       # %bb.0:
16; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
17; AVX-NEXT:    retq
18  %a1 = zext <4 x i16> %a to <4 x i32>
19  %b1 = zext <4 x i16> %b to <4 x i32>
20  %c = mul <4 x i32> %a1, %b1
21  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
22  %e = trunc <4 x i32> %d to <4 x i16>
23  ret <4 x i16> %e
24}
25
26define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
27; SSE-LABEL: mulhw_v4i16:
28; SSE:       # %bb.0:
29; SSE-NEXT:    pmulhw %xmm1, %xmm0
30; SSE-NEXT:    retq
31;
32; AVX-LABEL: mulhw_v4i16:
33; AVX:       # %bb.0:
34; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
35; AVX-NEXT:    retq
36  %a1 = sext <4 x i16> %a to <4 x i32>
37  %b1 = sext <4 x i16> %b to <4 x i32>
38  %c = mul <4 x i32> %a1, %b1
39  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
40  %e = trunc <4 x i32> %d to <4 x i16>
41  ret <4 x i16> %e
42}
43
44define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
45; SSE-LABEL: mulhuw_v8i16:
46; SSE:       # %bb.0:
47; SSE-NEXT:    pmulhuw %xmm1, %xmm0
48; SSE-NEXT:    retq
49;
50; AVX-LABEL: mulhuw_v8i16:
51; AVX:       # %bb.0:
52; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
53; AVX-NEXT:    retq
54  %a1 = zext <8 x i16> %a to <8 x i32>
55  %b1 = zext <8 x i16> %b to <8 x i32>
56  %c = mul <8 x i32> %a1, %b1
57  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
58  %e = trunc <8 x i32> %d to <8 x i16>
59  ret <8 x i16> %e
60}
61
62define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
63; SSE-LABEL: mulhw_v8i16:
64; SSE:       # %bb.0:
65; SSE-NEXT:    pmulhw %xmm1, %xmm0
66; SSE-NEXT:    retq
67;
68; AVX-LABEL: mulhw_v8i16:
69; AVX:       # %bb.0:
70; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
71; AVX-NEXT:    retq
72  %a1 = sext <8 x i16> %a to <8 x i32>
73  %b1 = sext <8 x i16> %b to <8 x i32>
74  %c = mul <8 x i32> %a1, %b1
75  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
76  %e = trunc <8 x i32> %d to <8 x i16>
77  ret <8 x i16> %e
78}
79
80define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
81; SSE-LABEL: mulhuw_v16i16:
82; SSE:       # %bb.0:
83; SSE-NEXT:    pmulhuw %xmm2, %xmm0
84; SSE-NEXT:    pmulhuw %xmm3, %xmm1
85; SSE-NEXT:    retq
86;
87; AVX-LABEL: mulhuw_v16i16:
88; AVX:       # %bb.0:
89; AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
90; AVX-NEXT:    retq
91  %a1 = zext <16 x i16> %a to <16 x i32>
92  %b1 = zext <16 x i16> %b to <16 x i32>
93  %c = mul <16 x i32> %a1, %b1
94  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
95  %e = trunc <16 x i32> %d to <16 x i16>
96  ret <16 x i16> %e
97}
98
99define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) {
100; SSE-LABEL: mulhw_v16i16:
101; SSE:       # %bb.0:
102; SSE-NEXT:    pmulhw %xmm2, %xmm0
103; SSE-NEXT:    pmulhw %xmm3, %xmm1
104; SSE-NEXT:    retq
105;
106; AVX-LABEL: mulhw_v16i16:
107; AVX:       # %bb.0:
108; AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
109; AVX-NEXT:    retq
110  %a1 = sext <16 x i16> %a to <16 x i32>
111  %b1 = sext <16 x i16> %b to <16 x i32>
112  %c = mul <16 x i32> %a1, %b1
113  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
114  %e = trunc <16 x i32> %d to <16 x i16>
115  ret <16 x i16> %e
116}
117
118define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
119; SSE-LABEL: mulhuw_v32i16:
120; SSE:       # %bb.0:
121; SSE-NEXT:    pmulhuw %xmm4, %xmm0
122; SSE-NEXT:    pmulhuw %xmm5, %xmm1
123; SSE-NEXT:    pmulhuw %xmm6, %xmm2
124; SSE-NEXT:    pmulhuw %xmm7, %xmm3
125; SSE-NEXT:    retq
126;
127; AVX2-LABEL: mulhuw_v32i16:
128; AVX2:       # %bb.0:
129; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
130; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
131; AVX2-NEXT:    retq
132;
133; AVX512F-LABEL: mulhuw_v32i16:
134; AVX512F:       # %bb.0:
135; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
136; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
137; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm3, %ymm2
138; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
139; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
140; AVX512F-NEXT:    retq
141;
142; AVX512BW-LABEL: mulhuw_v32i16:
143; AVX512BW:       # %bb.0:
144; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm0
145; AVX512BW-NEXT:    retq
146  %a1 = zext <32 x i16> %a to <32 x i32>
147  %b1 = zext <32 x i16> %b to <32 x i32>
148  %c = mul <32 x i32> %a1, %b1
149  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
150  %e = trunc <32 x i32> %d to <32 x i16>
151  ret <32 x i16> %e
152}
153
154define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) {
155; SSE-LABEL: mulhw_v32i16:
156; SSE:       # %bb.0:
157; SSE-NEXT:    pmulhw %xmm4, %xmm0
158; SSE-NEXT:    pmulhw %xmm5, %xmm1
159; SSE-NEXT:    pmulhw %xmm6, %xmm2
160; SSE-NEXT:    pmulhw %xmm7, %xmm3
161; SSE-NEXT:    retq
162;
163; AVX2-LABEL: mulhw_v32i16:
164; AVX2:       # %bb.0:
165; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
166; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
167; AVX2-NEXT:    retq
168;
169; AVX512F-LABEL: mulhw_v32i16:
170; AVX512F:       # %bb.0:
171; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
172; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
173; AVX512F-NEXT:    vpmulhw %ymm2, %ymm3, %ymm2
174; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
175; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
176; AVX512F-NEXT:    retq
177;
178; AVX512BW-LABEL: mulhw_v32i16:
179; AVX512BW:       # %bb.0:
180; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm0
181; AVX512BW-NEXT:    retq
182  %a1 = sext <32 x i16> %a to <32 x i32>
183  %b1 = sext <32 x i16> %b to <32 x i32>
184  %c = mul <32 x i32> %a1, %b1
185  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
186  %e = trunc <32 x i32> %d to <32 x i16>
187  ret <32 x i16> %e
188}
189
190define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
191; SSE-LABEL: mulhuw_v64i16:
192; SSE:       # %bb.0:
193; SSE-NEXT:    movq %rdi, %rax
194; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
195; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
196; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
197; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
198; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
199; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
200; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
201; SSE-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
202; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
203; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
204; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
205; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
206; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
207; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
208; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
209; SSE-NEXT:    movdqa %xmm0, (%rdi)
210; SSE-NEXT:    retq
211;
212; AVX2-LABEL: mulhuw_v64i16:
213; AVX2:       # %bb.0:
214; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
215; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
216; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
217; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
218; AVX2-NEXT:    retq
219;
220; AVX512F-LABEL: mulhuw_v64i16:
221; AVX512F:       # %bb.0:
222; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
223; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
224; AVX512F-NEXT:    vpmulhuw %ymm4, %ymm5, %ymm4
225; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
226; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
227; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
228; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
229; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm4, %ymm2
230; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
231; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
232; AVX512F-NEXT:    retq
233;
234; AVX512BW-LABEL: mulhuw_v64i16:
235; AVX512BW:       # %bb.0:
236; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm0
237; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
238; AVX512BW-NEXT:    retq
239  %a1 = zext <64 x i16> %a to <64 x i32>
240  %b1 = zext <64 x i16> %b to <64 x i32>
241  %c = mul <64 x i32> %a1, %b1
242  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
243  %e = trunc <64 x i32> %d to <64 x i16>
244  ret <64 x i16> %e
245}
246
247define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
248; SSE-LABEL: mulhw_v64i16:
249; SSE:       # %bb.0:
250; SSE-NEXT:    movq %rdi, %rax
251; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
252; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
253; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
254; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
255; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
256; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
257; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
258; SSE-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
259; SSE-NEXT:    movdqa %xmm7, 112(%rdi)
260; SSE-NEXT:    movdqa %xmm6, 96(%rdi)
261; SSE-NEXT:    movdqa %xmm5, 80(%rdi)
262; SSE-NEXT:    movdqa %xmm4, 64(%rdi)
263; SSE-NEXT:    movdqa %xmm3, 48(%rdi)
264; SSE-NEXT:    movdqa %xmm2, 32(%rdi)
265; SSE-NEXT:    movdqa %xmm1, 16(%rdi)
266; SSE-NEXT:    movdqa %xmm0, (%rdi)
267; SSE-NEXT:    retq
268;
269; AVX2-LABEL: mulhw_v64i16:
270; AVX2:       # %bb.0:
271; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
272; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
273; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
274; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
275; AVX2-NEXT:    retq
276;
277; AVX512F-LABEL: mulhw_v64i16:
278; AVX512F:       # %bb.0:
279; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
280; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
281; AVX512F-NEXT:    vpmulhw %ymm4, %ymm5, %ymm4
282; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
283; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
284; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
285; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
286; AVX512F-NEXT:    vpmulhw %ymm2, %ymm4, %ymm2
287; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
288; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
289; AVX512F-NEXT:    retq
290;
291; AVX512BW-LABEL: mulhw_v64i16:
292; AVX512BW:       # %bb.0:
293; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm0
294; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
295; AVX512BW-NEXT:    retq
296  %a1 = sext <64 x i16> %a to <64 x i32>
297  %b1 = sext <64 x i16> %b to <64 x i32>
298  %c = mul <64 x i32> %a1, %b1
299  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
300  %e = trunc <64 x i32> %d to <64 x i16>
301  ret <64 x i16> %e
302}
303
304define <8 x i16> @mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) {
305; SSE-LABEL: mulhuw_v8i16_i64:
306; SSE:       # %bb.0:
307; SSE-NEXT:    pmulhuw %xmm1, %xmm0
308; SSE-NEXT:    retq
309;
310; AVX-LABEL: mulhuw_v8i16_i64:
311; AVX:       # %bb.0:
312; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
313; AVX-NEXT:    retq
314  %a1 = zext <8 x i16> %a to <8 x i64>
315  %b1 = zext <8 x i16> %b to <8 x i64>
316  %c = mul <8 x i64> %a1, %b1
317  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
318  %e = trunc <8 x i64> %d to <8 x i16>
319  ret <8 x i16> %e
320}
321
322define <8 x i16> @mulhw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) {
323; SSE-LABEL: mulhw_v8i16_i64:
324; SSE:       # %bb.0:
325; SSE-NEXT:    pmulhw %xmm1, %xmm0
326; SSE-NEXT:    retq
327;
328; AVX-LABEL: mulhw_v8i16_i64:
329; AVX:       # %bb.0:
330; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
331; AVX-NEXT:    retq
332  %a1 = sext <8 x i16> %a to <8 x i64>
333  %b1 = sext <8 x i16> %b to <8 x i64>
334  %c = mul <8 x i64> %a1, %b1
335  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
336  %e = trunc <8 x i64> %d to <8 x i16>
337  ret <8 x i16> %e
338}
339
340define <4 x i32> @mulhuw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) {
341; SSE2-LABEL: mulhuw_v4i16_lshr:
342; SSE2:       # %bb.0:
343; SSE2-NEXT:    pmulhuw %xmm1, %xmm0
344; SSE2-NEXT:    pxor %xmm1, %xmm1
345; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
346; SSE2-NEXT:    retq
347;
348; SSE41-LABEL: mulhuw_v4i16_lshr:
349; SSE41:       # %bb.0:
350; SSE41-NEXT:    pmulhuw %xmm1, %xmm0
351; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
352; SSE41-NEXT:    retq
353;
354; AVX-LABEL: mulhuw_v4i16_lshr:
355; AVX:       # %bb.0:
356; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
357; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
358; AVX-NEXT:    retq
359  %a1 = zext <4 x i16> %a to <4 x i32>
360  %b1 = zext <4 x i16> %b to <4 x i32>
361  %c = mul <4 x i32> %a1, %b1
362  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
363  ret <4 x i32> %d
364}
365
366define <4 x i32> @mulhsw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) {
367; SSE2-LABEL: mulhsw_v4i16_lshr:
368; SSE2:       # %bb.0:
369; SSE2-NEXT:    pmulhw %xmm1, %xmm0
370; SSE2-NEXT:    pxor %xmm1, %xmm1
371; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
372; SSE2-NEXT:    retq
373;
374; SSE41-LABEL: mulhsw_v4i16_lshr:
375; SSE41:       # %bb.0:
376; SSE41-NEXT:    pmulhw %xmm1, %xmm0
377; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
378; SSE41-NEXT:    retq
379;
380; AVX-LABEL: mulhsw_v4i16_lshr:
381; AVX:       # %bb.0:
382; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
383; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
384; AVX-NEXT:    retq
385  %a1 = sext <4 x i16> %a to <4 x i32>
386  %b1 = sext <4 x i16> %b to <4 x i32>
387  %c = mul <4 x i32> %a1, %b1
388  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
389  ret <4 x i32> %d
390}
391
392define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) {
393; SSE2-LABEL: mulhsw_v4i16_ashr:
394; SSE2:       # %bb.0:
395; SSE2-NEXT:    pmulhw %xmm1, %xmm0
396; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
397; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
398; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
399; SSE2-NEXT:    psrad $16, %xmm0
400; SSE2-NEXT:    retq
401;
402; SSE41-LABEL: mulhsw_v4i16_ashr:
403; SSE41:       # %bb.0:
404; SSE41-NEXT:    pmulhw %xmm1, %xmm0
405; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
406; SSE41-NEXT:    retq
407;
408; AVX-LABEL: mulhsw_v4i16_ashr:
409; AVX:       # %bb.0:
410; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
411; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
412; AVX-NEXT:    retq
413  %a1 = sext <4 x i16> %a to <4 x i32>
414  %b1 = sext <4 x i16> %b to <4 x i32>
415  %c = mul <4 x i32> %a1, %b1
416  %d = ashr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
417  ret <4 x i32> %d
418}
419
420define <8 x i32> @mulhuw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
421; SSE2-LABEL: mulhuw_v8i16_lshr:
422; SSE2:       # %bb.0:
423; SSE2-NEXT:    movdqa %xmm0, %xmm2
424; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
425; SSE2-NEXT:    pxor %xmm1, %xmm1
426; SSE2-NEXT:    movdqa %xmm2, %xmm0
427; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
428; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
429; SSE2-NEXT:    movdqa %xmm2, %xmm1
430; SSE2-NEXT:    retq
431;
432; SSE41-LABEL: mulhuw_v8i16_lshr:
433; SSE41:       # %bb.0:
434; SSE41-NEXT:    movdqa %xmm0, %xmm2
435; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
436; SSE41-NEXT:    pxor %xmm1, %xmm1
437; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
438; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
439; SSE41-NEXT:    movdqa %xmm2, %xmm1
440; SSE41-NEXT:    retq
441;
442; AVX-LABEL: mulhuw_v8i16_lshr:
443; AVX:       # %bb.0:
444; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
445; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
446; AVX-NEXT:    retq
447  %a1 = zext <8 x i16> %a to <8 x i32>
448  %b1 = zext <8 x i16> %b to <8 x i32>
449  %c = mul <8 x i32> %a1, %b1
450  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
451  ret <8 x i32> %d
452}
453
454define <8 x i32> @mulhsw_v8i16_lshr(<8 x i16> %a, <8 x i16> %b) {
455; SSE2-LABEL: mulhsw_v8i16_lshr:
456; SSE2:       # %bb.0:
457; SSE2-NEXT:    movdqa %xmm0, %xmm2
458; SSE2-NEXT:    pmulhw %xmm1, %xmm2
459; SSE2-NEXT:    pxor %xmm1, %xmm1
460; SSE2-NEXT:    movdqa %xmm2, %xmm0
461; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
462; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
463; SSE2-NEXT:    movdqa %xmm2, %xmm1
464; SSE2-NEXT:    retq
465;
466; SSE41-LABEL: mulhsw_v8i16_lshr:
467; SSE41:       # %bb.0:
468; SSE41-NEXT:    movdqa %xmm0, %xmm2
469; SSE41-NEXT:    pmulhw %xmm1, %xmm2
470; SSE41-NEXT:    pxor %xmm1, %xmm1
471; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
472; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
473; SSE41-NEXT:    movdqa %xmm2, %xmm1
474; SSE41-NEXT:    retq
475;
476; AVX-LABEL: mulhsw_v8i16_lshr:
477; AVX:       # %bb.0:
478; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
479; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
480; AVX-NEXT:    retq
481  %a1 = sext <8 x i16> %a to <8 x i32>
482  %b1 = sext <8 x i16> %b to <8 x i32>
483  %c = mul <8 x i32> %a1, %b1
484  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
485  ret <8 x i32> %d
486}
487
488define <8 x i32> @mulhsw_v8i16_ashr(<8 x i16> %a, <8 x i16> %b) {
489; SSE2-LABEL: mulhsw_v8i16_ashr:
490; SSE2:       # %bb.0:
491; SSE2-NEXT:    pmulhw %xmm1, %xmm0
492; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
493; SSE2-NEXT:    psrad $16, %xmm2
494; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
495; SSE2-NEXT:    psrad $16, %xmm1
496; SSE2-NEXT:    movdqa %xmm2, %xmm0
497; SSE2-NEXT:    retq
498;
499; SSE41-LABEL: mulhsw_v8i16_ashr:
500; SSE41:       # %bb.0:
501; SSE41-NEXT:    pmulhw %xmm1, %xmm0
502; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
503; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
504; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
505; SSE41-NEXT:    movdqa %xmm2, %xmm0
506; SSE41-NEXT:    retq
507;
508; AVX-LABEL: mulhsw_v8i16_ashr:
509; AVX:       # %bb.0:
510; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
511; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
512; AVX-NEXT:    retq
513  %a1 = sext <8 x i16> %a to <8 x i32>
514  %b1 = sext <8 x i16> %b to <8 x i32>
515  %c = mul <8 x i32> %a1, %b1
516  %d = ashr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
517  ret <8 x i32> %d
518}
519
520define <16 x i32> @mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
521; SSE2-LABEL: mulhuw_v16i16_lshr:
522; SSE2:       # %bb.0:
523; SSE2-NEXT:    movdqa %xmm1, %xmm4
524; SSE2-NEXT:    movdqa %xmm0, %xmm1
525; SSE2-NEXT:    pmulhuw %xmm3, %xmm4
526; SSE2-NEXT:    pmulhuw %xmm2, %xmm1
527; SSE2-NEXT:    pxor %xmm3, %xmm3
528; SSE2-NEXT:    movdqa %xmm1, %xmm0
529; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
530; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
531; SSE2-NEXT:    movdqa %xmm4, %xmm2
532; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
533; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
534; SSE2-NEXT:    movdqa %xmm4, %xmm3
535; SSE2-NEXT:    retq
536;
537; SSE41-LABEL: mulhuw_v16i16_lshr:
538; SSE41:       # %bb.0:
539; SSE41-NEXT:    movdqa %xmm1, %xmm4
540; SSE41-NEXT:    movdqa %xmm0, %xmm1
541; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
542; SSE41-NEXT:    pxor %xmm5, %xmm5
543; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
544; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
545; SSE41-NEXT:    pmulhuw %xmm3, %xmm4
546; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
547; SSE41-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
548; SSE41-NEXT:    movdqa %xmm4, %xmm3
549; SSE41-NEXT:    retq
550;
551; AVX2-LABEL: mulhuw_v16i16_lshr:
552; AVX2:       # %bb.0:
553; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm1
554; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
555; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
556; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
557; AVX2-NEXT:    retq
558;
559; AVX512-LABEL: mulhuw_v16i16_lshr:
560; AVX512:       # %bb.0:
561; AVX512-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
562; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
563; AVX512-NEXT:    retq
564  %a1 = zext <16 x i16> %a to <16 x i32>
565  %b1 = zext <16 x i16> %b to <16 x i32>
566  %c = mul <16 x i32> %a1, %b1
567  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
568  ret <16 x i32> %d
569}
570
571define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) {
572; SSE2-LABEL: mulhsw_v16i16_lshr:
573; SSE2:       # %bb.0:
574; SSE2-NEXT:    movdqa %xmm1, %xmm4
575; SSE2-NEXT:    movdqa %xmm0, %xmm1
576; SSE2-NEXT:    pmulhw %xmm3, %xmm4
577; SSE2-NEXT:    pmulhw %xmm2, %xmm1
578; SSE2-NEXT:    pxor %xmm3, %xmm3
579; SSE2-NEXT:    movdqa %xmm1, %xmm0
580; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
581; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
582; SSE2-NEXT:    movdqa %xmm4, %xmm2
583; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
584; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
585; SSE2-NEXT:    movdqa %xmm4, %xmm3
586; SSE2-NEXT:    retq
587;
588; SSE41-LABEL: mulhsw_v16i16_lshr:
589; SSE41:       # %bb.0:
590; SSE41-NEXT:    movdqa %xmm1, %xmm4
591; SSE41-NEXT:    movdqa %xmm0, %xmm1
592; SSE41-NEXT:    pmulhw %xmm2, %xmm1
593; SSE41-NEXT:    pxor %xmm5, %xmm5
594; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
595; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
596; SSE41-NEXT:    pmulhw %xmm3, %xmm4
597; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
598; SSE41-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
599; SSE41-NEXT:    movdqa %xmm4, %xmm3
600; SSE41-NEXT:    retq
601;
602; AVX2-LABEL: mulhsw_v16i16_lshr:
603; AVX2:       # %bb.0:
604; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm1
605; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
606; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
607; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
608; AVX2-NEXT:    retq
609;
610; AVX512-LABEL: mulhsw_v16i16_lshr:
611; AVX512:       # %bb.0:
612; AVX512-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
613; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
614; AVX512-NEXT:    retq
615  %a1 = sext <16 x i16> %a to <16 x i32>
616  %b1 = sext <16 x i16> %b to <16 x i32>
617  %c = mul <16 x i32> %a1, %b1
618  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
619  ret <16 x i32> %d
620}
621
622define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) {
623; SSE2-LABEL: mulhsw_v16i16_ashr:
624; SSE2:       # %bb.0:
625; SSE2-NEXT:    pmulhw %xmm3, %xmm1
626; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
627; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
628; SSE2-NEXT:    pmulhw %xmm2, %xmm0
629; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
630; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
631; SSE2-NEXT:    psrad $16, %xmm0
632; SSE2-NEXT:    psrad $16, %xmm1
633; SSE2-NEXT:    psrad $16, %xmm4
634; SSE2-NEXT:    psrad $16, %xmm3
635; SSE2-NEXT:    movdqa %xmm4, %xmm2
636; SSE2-NEXT:    retq
637;
638; SSE41-LABEL: mulhsw_v16i16_ashr:
639; SSE41:       # %bb.0:
640; SSE41-NEXT:    pmulhw %xmm2, %xmm0
641; SSE41-NEXT:    pmovsxwd %xmm0, %xmm4
642; SSE41-NEXT:    pmulhw %xmm3, %xmm1
643; SSE41-NEXT:    pmovsxwd %xmm1, %xmm2
644; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
645; SSE41-NEXT:    pmovsxwd %xmm0, %xmm5
646; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
647; SSE41-NEXT:    pmovsxwd %xmm0, %xmm3
648; SSE41-NEXT:    movdqa %xmm4, %xmm0
649; SSE41-NEXT:    movdqa %xmm5, %xmm1
650; SSE41-NEXT:    retq
651;
652; AVX2-LABEL: mulhsw_v16i16_ashr:
653; AVX2:       # %bb.0:
654; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm1
655; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm0
656; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
657; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
658; AVX2-NEXT:    retq
659;
660; AVX512-LABEL: mulhsw_v16i16_ashr:
661; AVX512:       # %bb.0:
662; AVX512-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
663; AVX512-NEXT:    vpmovsxwd %ymm0, %zmm0
664; AVX512-NEXT:    retq
665  %a1 = sext <16 x i16> %a to <16 x i32>
666  %b1 = sext <16 x i16> %b to <16 x i32>
667  %c = mul <16 x i32> %a1, %b1
668  %d = ashr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
669  ret <16 x i32> %d
670}
671
672define <32 x i32> @mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
673; SSE2-LABEL: mulhuw_v32i16_lshr:
674; SSE2:       # %bb.0:
675; SSE2-NEXT:    movq %rdi, %rax
676; SSE2-NEXT:    pmulhuw %xmm7, %xmm3
677; SSE2-NEXT:    pmulhuw %xmm6, %xmm2
678; SSE2-NEXT:    pmulhuw %xmm5, %xmm1
679; SSE2-NEXT:    pmulhuw %xmm4, %xmm0
680; SSE2-NEXT:    pxor %xmm4, %xmm4
681; SSE2-NEXT:    movdqa %xmm0, %xmm8
682; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
683; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
684; SSE2-NEXT:    movdqa %xmm1, %xmm6
685; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
686; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
687; SSE2-NEXT:    movdqa %xmm2, %xmm7
688; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
689; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
690; SSE2-NEXT:    movdqa %xmm3, %xmm5
691; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
692; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
693; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
694; SSE2-NEXT:    movdqa %xmm5, 96(%rdi)
695; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
696; SSE2-NEXT:    movdqa %xmm7, 64(%rdi)
697; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
698; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
699; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
700; SSE2-NEXT:    movdqa %xmm8, (%rdi)
701; SSE2-NEXT:    retq
702;
703; SSE41-LABEL: mulhuw_v32i16_lshr:
704; SSE41:       # %bb.0:
705; SSE41-NEXT:    movq %rdi, %rax
706; SSE41-NEXT:    pmulhuw %xmm4, %xmm0
707; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
708; SSE41-NEXT:    pxor %xmm4, %xmm4
709; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
710; SSE41-NEXT:    pmulhuw %xmm5, %xmm1
711; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
712; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
713; SSE41-NEXT:    pmulhuw %xmm6, %xmm2
714; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
715; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
716; SSE41-NEXT:    pmulhuw %xmm7, %xmm3
717; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
718; SSE41-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
719; SSE41-NEXT:    movdqa %xmm3, 112(%rdi)
720; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
721; SSE41-NEXT:    movdqa %xmm2, 80(%rdi)
722; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
723; SSE41-NEXT:    movdqa %xmm1, 48(%rdi)
724; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
725; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
726; SSE41-NEXT:    movdqa %xmm8, (%rdi)
727; SSE41-NEXT:    retq
728;
729; AVX2-LABEL: mulhuw_v32i16_lshr:
730; AVX2:       # %bb.0:
731; AVX2-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
732; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
733; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
734; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
735; AVX2-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm1
736; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
737; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
738; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
739; AVX2-NEXT:    vmovdqa %ymm4, %ymm1
740; AVX2-NEXT:    retq
741;
742; AVX512F-LABEL: mulhuw_v32i16_lshr:
743; AVX512F:       # %bb.0:
744; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
745; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
746; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
747; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
748; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
749; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
750; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
751; AVX512F-NEXT:    retq
752;
753; AVX512BW-LABEL: mulhuw_v32i16_lshr:
754; AVX512BW:       # %bb.0:
755; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm1
756; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
757; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
758; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
759; AVX512BW-NEXT:    retq
760  %a1 = zext <32 x i16> %a to <32 x i32>
761  %b1 = zext <32 x i16> %b to <32 x i32>
762  %c = mul <32 x i32> %a1, %b1
763  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
764  ret <32 x i32> %d
765}
766
767define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) {
768; SSE2-LABEL: mulhsw_v32i16_lshr:
769; SSE2:       # %bb.0:
770; SSE2-NEXT:    movq %rdi, %rax
771; SSE2-NEXT:    pmulhw %xmm7, %xmm3
772; SSE2-NEXT:    pmulhw %xmm6, %xmm2
773; SSE2-NEXT:    pmulhw %xmm5, %xmm1
774; SSE2-NEXT:    pmulhw %xmm4, %xmm0
775; SSE2-NEXT:    pxor %xmm4, %xmm4
776; SSE2-NEXT:    movdqa %xmm0, %xmm8
777; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
778; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
779; SSE2-NEXT:    movdqa %xmm1, %xmm6
780; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
781; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
782; SSE2-NEXT:    movdqa %xmm2, %xmm7
783; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
784; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
785; SSE2-NEXT:    movdqa %xmm3, %xmm5
786; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
787; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
788; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
789; SSE2-NEXT:    movdqa %xmm5, 96(%rdi)
790; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
791; SSE2-NEXT:    movdqa %xmm7, 64(%rdi)
792; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
793; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
794; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
795; SSE2-NEXT:    movdqa %xmm8, (%rdi)
796; SSE2-NEXT:    retq
797;
798; SSE41-LABEL: mulhsw_v32i16_lshr:
799; SSE41:       # %bb.0:
800; SSE41-NEXT:    movq %rdi, %rax
801; SSE41-NEXT:    pmulhw %xmm4, %xmm0
802; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
803; SSE41-NEXT:    pxor %xmm4, %xmm4
804; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
805; SSE41-NEXT:    pmulhw %xmm5, %xmm1
806; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
807; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
808; SSE41-NEXT:    pmulhw %xmm6, %xmm2
809; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
810; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
811; SSE41-NEXT:    pmulhw %xmm7, %xmm3
812; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
813; SSE41-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
814; SSE41-NEXT:    movdqa %xmm3, 112(%rdi)
815; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
816; SSE41-NEXT:    movdqa %xmm2, 80(%rdi)
817; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
818; SSE41-NEXT:    movdqa %xmm1, 48(%rdi)
819; SSE41-NEXT:    movdqa %xmm5, 32(%rdi)
820; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
821; SSE41-NEXT:    movdqa %xmm8, (%rdi)
822; SSE41-NEXT:    retq
823;
824; AVX2-LABEL: mulhsw_v32i16_lshr:
825; AVX2:       # %bb.0:
826; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm2
827; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
828; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
829; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
830; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
831; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
832; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
833; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
834; AVX2-NEXT:    vmovdqa %ymm4, %ymm1
835; AVX2-NEXT:    retq
836;
837; AVX512F-LABEL: mulhsw_v32i16_lshr:
838; AVX512F:       # %bb.0:
839; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2
840; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
841; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
842; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
843; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
844; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
845; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
846; AVX512F-NEXT:    retq
847;
848; AVX512BW-LABEL: mulhsw_v32i16_lshr:
849; AVX512BW:       # %bb.0:
850; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm1
851; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
852; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
853; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
854; AVX512BW-NEXT:    retq
855  %a1 = sext <32 x i16> %a to <32 x i32>
856  %b1 = sext <32 x i16> %b to <32 x i32>
857  %c = mul <32 x i32> %a1, %b1
858  %d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
859  ret <32 x i32> %d
860}
861
862define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) {
863; SSE2-LABEL: mulhsw_v32i16_ashr:
864; SSE2:       # %bb.0:
865; SSE2-NEXT:    movq %rdi, %rax
866; SSE2-NEXT:    pmulhw %xmm7, %xmm3
867; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
868; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
869; SSE2-NEXT:    pmulhw %xmm6, %xmm2
870; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
871; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
872; SSE2-NEXT:    pmulhw %xmm5, %xmm1
873; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
874; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
875; SSE2-NEXT:    pmulhw %xmm4, %xmm0
876; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
877; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
878; SSE2-NEXT:    psrad $16, %xmm0
879; SSE2-NEXT:    psrad $16, %xmm4
880; SSE2-NEXT:    psrad $16, %xmm1
881; SSE2-NEXT:    psrad $16, %xmm5
882; SSE2-NEXT:    psrad $16, %xmm2
883; SSE2-NEXT:    psrad $16, %xmm6
884; SSE2-NEXT:    psrad $16, %xmm3
885; SSE2-NEXT:    psrad $16, %xmm7
886; SSE2-NEXT:    movdqa %xmm7, 112(%rdi)
887; SSE2-NEXT:    movdqa %xmm3, 96(%rdi)
888; SSE2-NEXT:    movdqa %xmm6, 80(%rdi)
889; SSE2-NEXT:    movdqa %xmm2, 64(%rdi)
890; SSE2-NEXT:    movdqa %xmm5, 48(%rdi)
891; SSE2-NEXT:    movdqa %xmm1, 32(%rdi)
892; SSE2-NEXT:    movdqa %xmm4, 16(%rdi)
893; SSE2-NEXT:    movdqa %xmm0, (%rdi)
894; SSE2-NEXT:    retq
895;
896; SSE41-LABEL: mulhsw_v32i16_ashr:
897; SSE41:       # %bb.0:
898; SSE41-NEXT:    movq %rdi, %rax
899; SSE41-NEXT:    pmulhw %xmm4, %xmm0
900; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
901; SSE41-NEXT:    pmovsxwd %xmm4, %xmm4
902; SSE41-NEXT:    pmulhw %xmm5, %xmm1
903; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
904; SSE41-NEXT:    pmovsxwd %xmm5, %xmm5
905; SSE41-NEXT:    pmulhw %xmm6, %xmm2
906; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
907; SSE41-NEXT:    pmovsxwd %xmm6, %xmm6
908; SSE41-NEXT:    pmulhw %xmm7, %xmm3
909; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3]
910; SSE41-NEXT:    pmovsxwd %xmm7, %xmm7
911; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
912; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
913; SSE41-NEXT:    pmovsxwd %xmm2, %xmm2
914; SSE41-NEXT:    pmovsxwd %xmm3, %xmm3
915; SSE41-NEXT:    movdqa %xmm3, 96(%rdi)
916; SSE41-NEXT:    movdqa %xmm2, 64(%rdi)
917; SSE41-NEXT:    movdqa %xmm1, 32(%rdi)
918; SSE41-NEXT:    movdqa %xmm0, (%rdi)
919; SSE41-NEXT:    movdqa %xmm7, 112(%rdi)
920; SSE41-NEXT:    movdqa %xmm6, 80(%rdi)
921; SSE41-NEXT:    movdqa %xmm5, 48(%rdi)
922; SSE41-NEXT:    movdqa %xmm4, 16(%rdi)
923; SSE41-NEXT:    retq
924;
925; AVX2-LABEL: mulhsw_v32i16_ashr:
926; AVX2:       # %bb.0:
927; AVX2-NEXT:    vpmulhw %ymm2, %ymm0, %ymm2
928; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm0
929; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
930; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm4
931; AVX2-NEXT:    vpmulhw %ymm3, %ymm1, %ymm1
932; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm2
933; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
934; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm3
935; AVX2-NEXT:    vmovdqa %ymm4, %ymm1
936; AVX2-NEXT:    retq
937;
938; AVX512F-LABEL: mulhsw_v32i16_ashr:
939; AVX512F:       # %bb.0:
940; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2
941; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
942; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
943; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
944; AVX512F-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
945; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm1
946; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
947; AVX512F-NEXT:    retq
948;
949; AVX512BW-LABEL: mulhsw_v32i16_ashr:
950; AVX512BW:       # %bb.0:
951; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm1
952; AVX512BW-NEXT:    vpmovsxwd %ymm1, %zmm0
953; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
954; AVX512BW-NEXT:    vpmovsxwd %ymm1, %zmm1
955; AVX512BW-NEXT:    retq
956  %a1 = sext <32 x i16> %a to <32 x i32>
957  %b1 = sext <32 x i16> %b to <32 x i32>
958  %c = mul <32 x i32> %a1, %b1
959  %d = ashr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
960  ret <32 x i32> %d
961}
962
963define <64 x i32> @mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
964; SSE2-LABEL: mulhuw_v64i16_lshr:
965; SSE2:       # %bb.0:
966; SSE2-NEXT:    movdqa %xmm7, %xmm8
967; SSE2-NEXT:    movq %rdi, %rax
968; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm8
969; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
970; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
971; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
972; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
973; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
974; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
975; SSE2-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm0
976; SSE2-NEXT:    pxor %xmm11, %xmm11
977; SSE2-NEXT:    movdqa %xmm0, %xmm7
978; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
979; SSE2-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
980; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
981; SSE2-NEXT:    movdqa %xmm1, %xmm9
982; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
983; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
984; SSE2-NEXT:    movdqa %xmm2, %xmm10
985; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
986; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
987; SSE2-NEXT:    movdqa %xmm3, %xmm12
988; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
989; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
990; SSE2-NEXT:    movdqa %xmm4, %xmm13
991; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
992; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
993; SSE2-NEXT:    movdqa %xmm5, %xmm14
994; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
995; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
996; SSE2-NEXT:    movdqa %xmm6, %xmm15
997; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
998; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
999; SSE2-NEXT:    movdqa %xmm8, %xmm7
1000; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
1001; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
1002; SSE2-NEXT:    movdqa %xmm8, 240(%rdi)
1003; SSE2-NEXT:    movdqa %xmm7, 224(%rdi)
1004; SSE2-NEXT:    movdqa %xmm6, 208(%rdi)
1005; SSE2-NEXT:    movdqa %xmm15, 192(%rdi)
1006; SSE2-NEXT:    movdqa %xmm5, 176(%rdi)
1007; SSE2-NEXT:    movdqa %xmm14, 160(%rdi)
1008; SSE2-NEXT:    movdqa %xmm4, 144(%rdi)
1009; SSE2-NEXT:    movdqa %xmm13, 128(%rdi)
1010; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
1011; SSE2-NEXT:    movdqa %xmm12, 96(%rdi)
1012; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
1013; SSE2-NEXT:    movdqa %xmm10, 64(%rdi)
1014; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1015; SSE2-NEXT:    movdqa %xmm9, 32(%rdi)
1016; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1017; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1018; SSE2-NEXT:    movaps %xmm0, (%rdi)
1019; SSE2-NEXT:    retq
1020;
1021; SSE41-LABEL: mulhuw_v64i16_lshr:
1022; SSE41:       # %bb.0:
1023; SSE41-NEXT:    movdqa %xmm0, %xmm8
1024; SSE41-NEXT:    movq %rdi, %rax
1025; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm8
1026; SSE41-NEXT:    pxor %xmm11, %xmm11
1027; SSE41-NEXT:    movdqa %xmm8, %xmm0
1028; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1029; SSE41-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1030; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm1
1031; SSE41-NEXT:    movdqa %xmm1, %xmm9
1032; SSE41-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
1033; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm2
1034; SSE41-NEXT:    movdqa %xmm2, %xmm10
1035; SSE41-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1036; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm3
1037; SSE41-NEXT:    movdqa %xmm3, %xmm12
1038; SSE41-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1039; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm4
1040; SSE41-NEXT:    movdqa %xmm4, %xmm13
1041; SSE41-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
1042; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm5
1043; SSE41-NEXT:    movdqa %xmm5, %xmm14
1044; SSE41-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
1045; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm6
1046; SSE41-NEXT:    movdqa %xmm6, %xmm15
1047; SSE41-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
1048; SSE41-NEXT:    pmulhuw {{[0-9]+}}(%rsp), %xmm7
1049; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1050; SSE41-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
1051; SSE41-NEXT:    movdqa %xmm7, 240(%rdi)
1052; SSE41-NEXT:    movdqa %xmm0, 224(%rdi)
1053; SSE41-NEXT:    movdqa %xmm15, 208(%rdi)
1054; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1055; SSE41-NEXT:    movdqa %xmm0, 192(%rdi)
1056; SSE41-NEXT:    movdqa %xmm14, 176(%rdi)
1057; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
1058; SSE41-NEXT:    movdqa %xmm0, 160(%rdi)
1059; SSE41-NEXT:    movdqa %xmm13, 144(%rdi)
1060; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
1061; SSE41-NEXT:    movdqa %xmm0, 128(%rdi)
1062; SSE41-NEXT:    movdqa %xmm12, 112(%rdi)
1063; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1064; SSE41-NEXT:    movdqa %xmm0, 96(%rdi)
1065; SSE41-NEXT:    movdqa %xmm10, 80(%rdi)
1066; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1067; SSE41-NEXT:    movdqa %xmm0, 64(%rdi)
1068; SSE41-NEXT:    movdqa %xmm9, 48(%rdi)
1069; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1070; SSE41-NEXT:    movdqa %xmm0, 32(%rdi)
1071; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1072; SSE41-NEXT:    movaps %xmm0, 16(%rdi)
1073; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
1074; SSE41-NEXT:    movdqa %xmm0, (%rdi)
1075; SSE41-NEXT:    retq
1076;
1077; AVX2-LABEL: mulhuw_v64i16_lshr:
1078; AVX2:       # %bb.0:
1079; AVX2-NEXT:    movq %rdi, %rax
1080; AVX2-NEXT:    vpmulhuw %ymm4, %ymm0, %ymm0
1081; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1082; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1083; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1084; AVX2-NEXT:    vpmulhuw %ymm5, %ymm1, %ymm1
1085; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1086; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1087; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1088; AVX2-NEXT:    vpmulhuw %ymm6, %ymm2, %ymm2
1089; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1090; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1091; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1092; AVX2-NEXT:    vpmulhuw %ymm7, %ymm3, %ymm3
1093; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1094; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
1095; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1096; AVX2-NEXT:    vmovdqa %ymm3, 224(%rdi)
1097; AVX2-NEXT:    vmovdqa %ymm7, 192(%rdi)
1098; AVX2-NEXT:    vmovdqa %ymm2, 160(%rdi)
1099; AVX2-NEXT:    vmovdqa %ymm6, 128(%rdi)
1100; AVX2-NEXT:    vmovdqa %ymm1, 96(%rdi)
1101; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
1102; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdi)
1103; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1104; AVX2-NEXT:    vzeroupper
1105; AVX2-NEXT:    retq
1106;
1107; AVX512F-LABEL: mulhuw_v64i16_lshr:
1108; AVX512F:       # %bb.0:
1109; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm4
1110; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
1111; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1112; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1113; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
1114; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1115; AVX512F-NEXT:    vpmulhuw %ymm3, %ymm1, %ymm0
1116; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1117; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
1118; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1119; AVX512F-NEXT:    vpmulhuw %ymm0, %ymm1, %ymm0
1120; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1121; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
1122; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
1123; AVX512F-NEXT:    retq
1124;
1125; AVX512BW-LABEL: mulhuw_v64i16_lshr:
1126; AVX512BW:       # %bb.0:
1127; AVX512BW-NEXT:    vpmulhuw %zmm2, %zmm0, %zmm2
1128; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1129; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1130; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1131; AVX512BW-NEXT:    vpmulhuw %zmm3, %zmm1, %zmm1
1132; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1133; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1134; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1135; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
1136; AVX512BW-NEXT:    retq
1137  %a1 = zext <64 x i16> %a to <64 x i32>
1138  %b1 = zext <64 x i16> %b to <64 x i32>
1139  %c = mul <64 x i32> %a1, %b1
1140  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1141  ret <64 x i32> %d
1142}
1143
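; Signed high-half multiply with a logical shift: sext to <64 x i32>, multiply, then lshr by 16.
; As the CHECK lines below show, every target selects pmulhw and then zero-extends each 16-bit high half to 32 bits.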
1144define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) {
1145; SSE2-LABEL: mulhsw_v64i16_lshr:
1146; SSE2:       # %bb.0:
1147; SSE2-NEXT:    movdqa %xmm7, %xmm8
1148; SSE2-NEXT:    movq %rdi, %rax
1149; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm8
1150; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
1151; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
1152; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
1153; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
1154; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
1155; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
1156; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
1157; SSE2-NEXT:    pxor %xmm11, %xmm11
1158; SSE2-NEXT:    movdqa %xmm0, %xmm7
1159; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
1160; SSE2-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1161; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1162; SSE2-NEXT:    movdqa %xmm1, %xmm9
1163; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
1164; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
1165; SSE2-NEXT:    movdqa %xmm2, %xmm10
1166; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1167; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
1168; SSE2-NEXT:    movdqa %xmm3, %xmm12
1169; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1170; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
1171; SSE2-NEXT:    movdqa %xmm4, %xmm13
1172; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
1173; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
1174; SSE2-NEXT:    movdqa %xmm5, %xmm14
1175; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
1176; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
1177; SSE2-NEXT:    movdqa %xmm6, %xmm15
1178; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
1179; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
1180; SSE2-NEXT:    movdqa %xmm8, %xmm7
1181; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
1182; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
1183; SSE2-NEXT:    movdqa %xmm8, 240(%rdi)
1184; SSE2-NEXT:    movdqa %xmm7, 224(%rdi)
1185; SSE2-NEXT:    movdqa %xmm6, 208(%rdi)
1186; SSE2-NEXT:    movdqa %xmm15, 192(%rdi)
1187; SSE2-NEXT:    movdqa %xmm5, 176(%rdi)
1188; SSE2-NEXT:    movdqa %xmm14, 160(%rdi)
1189; SSE2-NEXT:    movdqa %xmm4, 144(%rdi)
1190; SSE2-NEXT:    movdqa %xmm13, 128(%rdi)
1191; SSE2-NEXT:    movdqa %xmm3, 112(%rdi)
1192; SSE2-NEXT:    movdqa %xmm12, 96(%rdi)
1193; SSE2-NEXT:    movdqa %xmm2, 80(%rdi)
1194; SSE2-NEXT:    movdqa %xmm10, 64(%rdi)
1195; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1196; SSE2-NEXT:    movdqa %xmm9, 32(%rdi)
1197; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1198; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1199; SSE2-NEXT:    movaps %xmm0, (%rdi)
1200; SSE2-NEXT:    retq
1201;
1202; SSE41-LABEL: mulhsw_v64i16_lshr:
1203; SSE41:       # %bb.0:
1204; SSE41-NEXT:    movdqa %xmm0, %xmm8
1205; SSE41-NEXT:    movq %rdi, %rax
1206; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm8
1207; SSE41-NEXT:    pxor %xmm11, %xmm11
1208; SSE41-NEXT:    movdqa %xmm8, %xmm0
1209; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
1210; SSE41-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1211; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
1212; SSE41-NEXT:    movdqa %xmm1, %xmm9
1213; SSE41-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
1214; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
1215; SSE41-NEXT:    movdqa %xmm2, %xmm10
1216; SSE41-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1217; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
1218; SSE41-NEXT:    movdqa %xmm3, %xmm12
1219; SSE41-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1220; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
1221; SSE41-NEXT:    movdqa %xmm4, %xmm13
1222; SSE41-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
1223; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
1224; SSE41-NEXT:    movdqa %xmm5, %xmm14
1225; SSE41-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
1226; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
1227; SSE41-NEXT:    movdqa %xmm6, %xmm15
1228; SSE41-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
1229; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
1230; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
1231; SSE41-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
1232; SSE41-NEXT:    movdqa %xmm7, 240(%rdi)
1233; SSE41-NEXT:    movdqa %xmm0, 224(%rdi)
1234; SSE41-NEXT:    movdqa %xmm15, 208(%rdi)
1235; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
1236; SSE41-NEXT:    movdqa %xmm0, 192(%rdi)
1237; SSE41-NEXT:    movdqa %xmm14, 176(%rdi)
1238; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
1239; SSE41-NEXT:    movdqa %xmm0, 160(%rdi)
1240; SSE41-NEXT:    movdqa %xmm13, 144(%rdi)
1241; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
1242; SSE41-NEXT:    movdqa %xmm0, 128(%rdi)
1243; SSE41-NEXT:    movdqa %xmm12, 112(%rdi)
1244; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
1245; SSE41-NEXT:    movdqa %xmm0, 96(%rdi)
1246; SSE41-NEXT:    movdqa %xmm10, 80(%rdi)
1247; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1248; SSE41-NEXT:    movdqa %xmm0, 64(%rdi)
1249; SSE41-NEXT:    movdqa %xmm9, 48(%rdi)
1250; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
1251; SSE41-NEXT:    movdqa %xmm0, 32(%rdi)
1252; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1253; SSE41-NEXT:    movaps %xmm0, 16(%rdi)
1254; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
1255; SSE41-NEXT:    movdqa %xmm0, (%rdi)
1256; SSE41-NEXT:    retq
1257;
1258; AVX2-LABEL: mulhsw_v64i16_lshr:
1259; AVX2:       # %bb.0:
1260; AVX2-NEXT:    movq %rdi, %rax
1261; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
1262; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1263; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1264; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1265; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
1266; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1267; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1268; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1269; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
1270; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1271; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1272; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1273; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
1274; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1275; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
1276; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1277; AVX2-NEXT:    vmovdqa %ymm3, 224(%rdi)
1278; AVX2-NEXT:    vmovdqa %ymm7, 192(%rdi)
1279; AVX2-NEXT:    vmovdqa %ymm2, 160(%rdi)
1280; AVX2-NEXT:    vmovdqa %ymm6, 128(%rdi)
1281; AVX2-NEXT:    vmovdqa %ymm1, 96(%rdi)
1282; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
1283; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdi)
1284; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1285; AVX2-NEXT:    vzeroupper
1286; AVX2-NEXT:    retq
1287;
1288; AVX512F-LABEL: mulhsw_v64i16_lshr:
1289; AVX512F:       # %bb.0:
1290; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm4
1291; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
1292; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1293; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1294; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
1295; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1296; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm0
1297; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1298; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
1299; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1300; AVX512F-NEXT:    vpmulhw %ymm0, %ymm1, %ymm0
1301; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1302; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
1303; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
1304; AVX512F-NEXT:    retq
1305;
1306; AVX512BW-LABEL: mulhsw_v64i16_lshr:
1307; AVX512BW:       # %bb.0:
1308; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm2
1309; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1310; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1311; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1312; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
1313; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1314; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1315; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1316; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
1317; AVX512BW-NEXT:    retq
1318  %a1 = sext <64 x i16> %a to <64 x i32>
1319  %b1 = sext <64 x i16> %b to <64 x i32>
1320  %c = mul <64 x i32> %a1, %b1
1321  %d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1322  ret <64 x i32> %d
1323}
1324
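; Signed high-half multiply with an arithmetic shift: sext to <64 x i32>, multiply, then ashr by 16.
; As the CHECK lines below show, every target selects pmulhw and then sign-extends the 16-bit high halves (psrad / pmovsxwd).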
1325define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) {
1326; SSE2-LABEL: mulhsw_v64i16_ashr:
1327; SSE2:       # %bb.0:
1328; SSE2-NEXT:    movq %rdi, %rax
1329; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
1330; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
1331; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
1332; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
1333; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
1334; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
1335; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
1336; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7]
1337; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
1338; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
1339; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
1340; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
1341; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
1342; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1343; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1344; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
1345; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
1346; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1347; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
1348; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
1349; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1350; SSE2-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
1351; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
1352; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1353; SSE2-NEXT:    psrad $16, %xmm0
1354; SSE2-NEXT:    psrad $16, %xmm7
1355; SSE2-NEXT:    psrad $16, %xmm1
1356; SSE2-NEXT:    psrad $16, %xmm4
1357; SSE2-NEXT:    psrad $16, %xmm2
1358; SSE2-NEXT:    psrad $16, %xmm6
1359; SSE2-NEXT:    psrad $16, %xmm3
1360; SSE2-NEXT:    psrad $16, %xmm5
1361; SSE2-NEXT:    psrad $16, %xmm14
1362; SSE2-NEXT:    psrad $16, %xmm15
1363; SSE2-NEXT:    psrad $16, %xmm12
1364; SSE2-NEXT:    psrad $16, %xmm13
1365; SSE2-NEXT:    psrad $16, %xmm10
1366; SSE2-NEXT:    psrad $16, %xmm11
1367; SSE2-NEXT:    psrad $16, %xmm9
1368; SSE2-NEXT:    psrad $16, %xmm8
1369; SSE2-NEXT:    movdqa %xmm8, 240(%rdi)
1370; SSE2-NEXT:    movdqa %xmm9, 224(%rdi)
1371; SSE2-NEXT:    movdqa %xmm11, 208(%rdi)
1372; SSE2-NEXT:    movdqa %xmm10, 192(%rdi)
1373; SSE2-NEXT:    movdqa %xmm13, 176(%rdi)
1374; SSE2-NEXT:    movdqa %xmm12, 160(%rdi)
1375; SSE2-NEXT:    movdqa %xmm15, 144(%rdi)
1376; SSE2-NEXT:    movdqa %xmm14, 128(%rdi)
1377; SSE2-NEXT:    movdqa %xmm5, 112(%rdi)
1378; SSE2-NEXT:    movdqa %xmm3, 96(%rdi)
1379; SSE2-NEXT:    movdqa %xmm6, 80(%rdi)
1380; SSE2-NEXT:    movdqa %xmm2, 64(%rdi)
1381; SSE2-NEXT:    movdqa %xmm4, 48(%rdi)
1382; SSE2-NEXT:    movdqa %xmm1, 32(%rdi)
1383; SSE2-NEXT:    movdqa %xmm7, 16(%rdi)
1384; SSE2-NEXT:    movdqa %xmm0, (%rdi)
1385; SSE2-NEXT:    retq
1386;
1387; SSE41-LABEL: mulhsw_v64i16_ashr:
1388; SSE41:       # %bb.0:
1389; SSE41-NEXT:    movq %rdi, %rax
1390; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm0
1391; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
1392; SSE41-NEXT:    pmovsxwd %xmm8, %xmm8
1393; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm1
1394; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
1395; SSE41-NEXT:    pmovsxwd %xmm9, %xmm9
1396; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm2
1397; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3]
1398; SSE41-NEXT:    pmovsxwd %xmm10, %xmm10
1399; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm3
1400; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3]
1401; SSE41-NEXT:    pmovsxwd %xmm11, %xmm11
1402; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm4
1403; SSE41-NEXT:    pshufd {{.*#+}} xmm12 = xmm4[2,3,2,3]
1404; SSE41-NEXT:    pmovsxwd %xmm12, %xmm12
1405; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm5
1406; SSE41-NEXT:    pshufd {{.*#+}} xmm13 = xmm5[2,3,2,3]
1407; SSE41-NEXT:    pmovsxwd %xmm13, %xmm13
1408; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm6
1409; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
1410; SSE41-NEXT:    pmovsxwd %xmm14, %xmm14
1411; SSE41-NEXT:    pmulhw {{[0-9]+}}(%rsp), %xmm7
1412; SSE41-NEXT:    pshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
1413; SSE41-NEXT:    pmovsxwd %xmm15, %xmm15
1414; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
1415; SSE41-NEXT:    pmovsxwd %xmm1, %xmm1
1416; SSE41-NEXT:    pmovsxwd %xmm2, %xmm2
1417; SSE41-NEXT:    pmovsxwd %xmm3, %xmm3
1418; SSE41-NEXT:    pmovsxwd %xmm4, %xmm4
1419; SSE41-NEXT:    pmovsxwd %xmm5, %xmm5
1420; SSE41-NEXT:    pmovsxwd %xmm6, %xmm6
1421; SSE41-NEXT:    pmovsxwd %xmm7, %xmm7
1422; SSE41-NEXT:    movdqa %xmm7, 224(%rdi)
1423; SSE41-NEXT:    movdqa %xmm6, 192(%rdi)
1424; SSE41-NEXT:    movdqa %xmm5, 160(%rdi)
1425; SSE41-NEXT:    movdqa %xmm4, 128(%rdi)
1426; SSE41-NEXT:    movdqa %xmm3, 96(%rdi)
1427; SSE41-NEXT:    movdqa %xmm2, 64(%rdi)
1428; SSE41-NEXT:    movdqa %xmm1, 32(%rdi)
1429; SSE41-NEXT:    movdqa %xmm0, (%rdi)
1430; SSE41-NEXT:    movdqa %xmm15, 240(%rdi)
1431; SSE41-NEXT:    movdqa %xmm14, 208(%rdi)
1432; SSE41-NEXT:    movdqa %xmm13, 176(%rdi)
1433; SSE41-NEXT:    movdqa %xmm12, 144(%rdi)
1434; SSE41-NEXT:    movdqa %xmm11, 112(%rdi)
1435; SSE41-NEXT:    movdqa %xmm10, 80(%rdi)
1436; SSE41-NEXT:    movdqa %xmm9, 48(%rdi)
1437; SSE41-NEXT:    movdqa %xmm8, 16(%rdi)
1438; SSE41-NEXT:    retq
1439;
1440; AVX2-LABEL: mulhsw_v64i16_ashr:
1441; AVX2:       # %bb.0:
1442; AVX2-NEXT:    movq %rdi, %rax
1443; AVX2-NEXT:    vpmulhw %ymm4, %ymm0, %ymm0
1444; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm4
1445; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1446; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
1447; AVX2-NEXT:    vpmulhw %ymm5, %ymm1, %ymm1
1448; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm5
1449; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
1450; AVX2-NEXT:    vpmovsxwd %xmm1, %ymm1
1451; AVX2-NEXT:    vpmulhw %ymm6, %ymm2, %ymm2
1452; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm6
1453; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
1454; AVX2-NEXT:    vpmovsxwd %xmm2, %ymm2
1455; AVX2-NEXT:    vpmulhw %ymm7, %ymm3, %ymm3
1456; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm7
1457; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
1458; AVX2-NEXT:    vpmovsxwd %xmm3, %ymm3
1459; AVX2-NEXT:    vmovdqa %ymm3, 224(%rdi)
1460; AVX2-NEXT:    vmovdqa %ymm7, 192(%rdi)
1461; AVX2-NEXT:    vmovdqa %ymm2, 160(%rdi)
1462; AVX2-NEXT:    vmovdqa %ymm6, 128(%rdi)
1463; AVX2-NEXT:    vmovdqa %ymm1, 96(%rdi)
1464; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
1465; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdi)
1466; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1467; AVX2-NEXT:    vzeroupper
1468; AVX2-NEXT:    retq
1469;
1470; AVX512F-LABEL: mulhsw_v64i16_ashr:
1471; AVX512F:       # %bb.0:
1472; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm4
1473; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
1474; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1475; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1476; AVX512F-NEXT:    vpmulhw %ymm2, %ymm0, %ymm0
1477; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm5
1478; AVX512F-NEXT:    vpmulhw %ymm3, %ymm1, %ymm0
1479; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm2
1480; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
1481; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1482; AVX512F-NEXT:    vpmulhw %ymm0, %ymm1, %ymm0
1483; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm3
1484; AVX512F-NEXT:    vmovdqa64 %zmm4, %zmm0
1485; AVX512F-NEXT:    vmovdqa64 %zmm5, %zmm1
1486; AVX512F-NEXT:    retq
1487;
1488; AVX512BW-LABEL: mulhsw_v64i16_ashr:
1489; AVX512BW:       # %bb.0:
1490; AVX512BW-NEXT:    vpmulhw %zmm2, %zmm0, %zmm2
1491; AVX512BW-NEXT:    vpmovsxwd %ymm2, %zmm0
1492; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1493; AVX512BW-NEXT:    vpmovsxwd %ymm2, %zmm4
1494; AVX512BW-NEXT:    vpmulhw %zmm3, %zmm1, %zmm1
1495; AVX512BW-NEXT:    vpmovsxwd %ymm1, %zmm2
1496; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1497; AVX512BW-NEXT:    vpmovsxwd %ymm1, %zmm3
1498; AVX512BW-NEXT:    vmovdqa64 %zmm4, %zmm1
1499; AVX512BW-NEXT:    retq
1500  %a1 = sext <64 x i16> %a to <64 x i32>
1501  %b1 = sext <64 x i16> %b to <64 x i32>
1502  %c = mul <64 x i32> %a1, %b1
1503  %d = ashr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1504  ret <64 x i32> %d
1505}
1506
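; Unsigned high-half multiply widened to i64: zext to <8 x i64>, multiply, then lshr by 16.
; Per the CHECK lines below, SSE4.1 and AVX select pmulhuw followed by pmovzxwq, while plain SSE2 expands the multiply with pmuludq and shifts with psrlq.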
1507define <8 x i64> @mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
1508; SSE2-LABEL: mulhuw_v8i16_lshr_i64:
1509; SSE2:       # %bb.0:
1510; SSE2-NEXT:    pxor %xmm2, %xmm2
1511; SSE2-NEXT:    movdqa %xmm0, %xmm3
1512; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1513; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3]
1514; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
1515; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1516; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
1517; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3]
1518; SSE2-NEXT:    movdqa %xmm1, %xmm7
1519; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
1520; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3]
1521; SSE2-NEXT:    pmuludq %xmm4, %xmm0
1522; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3]
1523; SSE2-NEXT:    pmuludq %xmm3, %xmm4
1524; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1525; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
1526; SSE2-NEXT:    pmuludq %xmm5, %xmm2
1527; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3]
1528; SSE2-NEXT:    pmuludq %xmm6, %xmm3
1529; SSE2-NEXT:    psrlq $16, %xmm0
1530; SSE2-NEXT:    psrlq $16, %xmm4
1531; SSE2-NEXT:    psrlq $16, %xmm2
1532; SSE2-NEXT:    psrlq $16, %xmm3
1533; SSE2-NEXT:    movdqa %xmm4, %xmm1
1534; SSE2-NEXT:    retq
1535;
1536; SSE41-LABEL: mulhuw_v8i16_lshr_i64:
1537; SSE41:       # %bb.0:
1538; SSE41-NEXT:    pmulhuw %xmm1, %xmm0
1539; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1540; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1541; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1542; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1543; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1544; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1545; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1546; SSE41-NEXT:    movdqa %xmm4, %xmm0
1547; SSE41-NEXT:    retq
1548;
1549; AVX2-LABEL: mulhuw_v8i16_lshr_i64:
1550; AVX2:       # %bb.0:
1551; AVX2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm1
1552; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1553; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1554; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1555; AVX2-NEXT:    retq
1556;
1557; AVX512-LABEL: mulhuw_v8i16_lshr_i64:
1558; AVX512:       # %bb.0:
1559; AVX512-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
1560; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1561; AVX512-NEXT:    retq
1562  %a1 = zext <8 x i16> %a to <8 x i64>
1563  %b1 = zext <8 x i16> %b to <8 x i64>
1564  %c = mul <8 x i64> %a1, %b1
1565  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
1566  ret <8 x i64> %d
1567}
1568
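; Signed high-half multiply widened to i64 with a logical shift: sext to <8 x i64>, multiply, then lshr by 16.
; Per the CHECK lines below, SSE4.1 and AVX select pmulhw followed by pmovzxwq, while plain SSE2 expands the 64-bit multiply.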
1569define <8 x i64> @mulhsw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) {
1570; SSE2-LABEL: mulhsw_v8i16_lshr_i64:
1571; SSE2:       # %bb.0:
1572; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
1573; SSE2-NEXT:    psrad $16, %xmm6
1574; SSE2-NEXT:    pxor %xmm13, %xmm13
1575; SSE2-NEXT:    pxor %xmm10, %xmm10
1576; SSE2-NEXT:    pcmpgtd %xmm6, %xmm10
1577; SSE2-NEXT:    movdqa %xmm6, %xmm8
1578; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
1579; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3]
1580; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1581; SSE2-NEXT:    psrad $16, %xmm4
1582; SSE2-NEXT:    pxor %xmm5, %xmm5
1583; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
1584; SSE2-NEXT:    movdqa %xmm4, %xmm11
1585; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
1586; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1587; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
1588; SSE2-NEXT:    psrad $16, %xmm7
1589; SSE2-NEXT:    pxor %xmm12, %xmm12
1590; SSE2-NEXT:    pcmpgtd %xmm7, %xmm12
1591; SSE2-NEXT:    movdqa %xmm7, %xmm9
1592; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
1593; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
1594; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1595; SSE2-NEXT:    psrad $16, %xmm1
1596; SSE2-NEXT:    pcmpgtd %xmm1, %xmm13
1597; SSE2-NEXT:    movdqa %xmm1, %xmm0
1598; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
1599; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
1600; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm13[2,1,3,3]
1601; SSE2-NEXT:    pmuludq %xmm4, %xmm3
1602; SSE2-NEXT:    pmuludq %xmm1, %xmm4
1603; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm5[2,1,3,3]
1604; SSE2-NEXT:    pmuludq %xmm1, %xmm2
1605; SSE2-NEXT:    paddq %xmm2, %xmm3
1606; SSE2-NEXT:    psllq $32, %xmm3
1607; SSE2-NEXT:    paddq %xmm4, %xmm3
1608; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm13[0,1,1,3]
1609; SSE2-NEXT:    pmuludq %xmm11, %xmm2
1610; SSE2-NEXT:    pmuludq %xmm0, %xmm11
1611; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,1,1,3]
1612; SSE2-NEXT:    pmuludq %xmm0, %xmm1
1613; SSE2-NEXT:    paddq %xmm1, %xmm2
1614; SSE2-NEXT:    psllq $32, %xmm2
1615; SSE2-NEXT:    paddq %xmm11, %xmm2
1616; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3]
1617; SSE2-NEXT:    pmuludq %xmm6, %xmm1
1618; SSE2-NEXT:    pmuludq %xmm7, %xmm6
1619; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3]
1620; SSE2-NEXT:    pmuludq %xmm7, %xmm0
1621; SSE2-NEXT:    paddq %xmm0, %xmm1
1622; SSE2-NEXT:    psllq $32, %xmm1
1623; SSE2-NEXT:    paddq %xmm6, %xmm1
1624; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
1625; SSE2-NEXT:    pmuludq %xmm8, %xmm0
1626; SSE2-NEXT:    pmuludq %xmm9, %xmm8
1627; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
1628; SSE2-NEXT:    pmuludq %xmm9, %xmm4
1629; SSE2-NEXT:    paddq %xmm4, %xmm0
1630; SSE2-NEXT:    psllq $32, %xmm0
1631; SSE2-NEXT:    paddq %xmm8, %xmm0
1632; SSE2-NEXT:    psrlq $16, %xmm0
1633; SSE2-NEXT:    psrlq $16, %xmm1
1634; SSE2-NEXT:    psrlq $16, %xmm2
1635; SSE2-NEXT:    psrlq $16, %xmm3
1636; SSE2-NEXT:    retq
1637;
1638; SSE41-LABEL: mulhsw_v8i16_lshr_i64:
1639; SSE41:       # %bb.0:
1640; SSE41-NEXT:    pmulhw %xmm1, %xmm0
1641; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1642; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1643; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1644; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1645; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1646; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1647; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1648; SSE41-NEXT:    movdqa %xmm4, %xmm0
1649; SSE41-NEXT:    retq
1650;
1651; AVX2-LABEL: mulhsw_v8i16_lshr_i64:
1652; AVX2:       # %bb.0:
1653; AVX2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm1
1654; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1655; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1656; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1657; AVX2-NEXT:    retq
1658;
1659; AVX512-LABEL: mulhsw_v8i16_lshr_i64:
1660; AVX512:       # %bb.0:
1661; AVX512-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
1662; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1663; AVX512-NEXT:    retq
1664  %a1 = sext <8 x i16> %a to <8 x i64>
1665  %b1 = sext <8 x i16> %b to <8 x i64>
1666  %c = mul <8 x i64> %a1, %b1
1667  %d = lshr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
1668  ret <8 x i64> %d
1669}
1670
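; Signed high-half multiply widened to i64 with an arithmetic shift: sext to <8 x i64>, multiply, then ashr by 16.
; Per the CHECK lines below, SSE4.1 and AVX select pmulhw followed by pmovsxwq, while plain SSE2 expands the 64-bit multiply and re-packs the results.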
1671define <8 x i64> @mulhsw_v8i16_ashr_i64(<8 x i16> %a, <8 x i16> %b) {
1672; SSE2-LABEL: mulhsw_v8i16_ashr_i64:
1673; SSE2:       # %bb.0:
1674; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
1675; SSE2-NEXT:    psrad $16, %xmm5
1676; SSE2-NEXT:    pxor %xmm13, %xmm13
1677; SSE2-NEXT:    pxor %xmm10, %xmm10
1678; SSE2-NEXT:    pcmpgtd %xmm5, %xmm10
1679; SSE2-NEXT:    movdqa %xmm5, %xmm8
1680; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1]
1681; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
1682; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
1683; SSE2-NEXT:    psrad $16, %xmm2
1684; SSE2-NEXT:    pxor %xmm3, %xmm3
1685; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
1686; SSE2-NEXT:    movdqa %xmm2, %xmm11
1687; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
1688; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1689; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1690; SSE2-NEXT:    psrad $16, %xmm0
1691; SSE2-NEXT:    pxor %xmm12, %xmm12
1692; SSE2-NEXT:    pcmpgtd %xmm0, %xmm12
1693; SSE2-NEXT:    movdqa %xmm0, %xmm9
1694; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
1695; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
1696; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1697; SSE2-NEXT:    psrad $16, %xmm1
1698; SSE2-NEXT:    pcmpgtd %xmm1, %xmm13
1699; SSE2-NEXT:    movdqa %xmm1, %xmm6
1700; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1]
1701; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
1702; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[2,1,3,3]
1703; SSE2-NEXT:    pmuludq %xmm2, %xmm4
1704; SSE2-NEXT:    pmuludq %xmm1, %xmm2
1705; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,1,3,3]
1706; SSE2-NEXT:    pmuludq %xmm1, %xmm7
1707; SSE2-NEXT:    paddq %xmm7, %xmm4
1708; SSE2-NEXT:    psllq $32, %xmm4
1709; SSE2-NEXT:    paddq %xmm2, %xmm4
1710; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm13[0,1,1,3]
1711; SSE2-NEXT:    pmuludq %xmm11, %xmm7
1712; SSE2-NEXT:    pmuludq %xmm6, %xmm11
1713; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3]
1714; SSE2-NEXT:    pmuludq %xmm6, %xmm1
1715; SSE2-NEXT:    paddq %xmm1, %xmm7
1716; SSE2-NEXT:    psllq $32, %xmm7
1717; SSE2-NEXT:    paddq %xmm11, %xmm7
1718; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,1,3,3]
1719; SSE2-NEXT:    pmuludq %xmm5, %xmm1
1720; SSE2-NEXT:    pmuludq %xmm0, %xmm5
1721; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[2,1,3,3]
1722; SSE2-NEXT:    pmuludq %xmm0, %xmm2
1723; SSE2-NEXT:    paddq %xmm2, %xmm1
1724; SSE2-NEXT:    psllq $32, %xmm1
1725; SSE2-NEXT:    paddq %xmm5, %xmm1
1726; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
1727; SSE2-NEXT:    pmuludq %xmm8, %xmm0
1728; SSE2-NEXT:    pmuludq %xmm9, %xmm8
1729; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm10[0,1,1,3]
1730; SSE2-NEXT:    pmuludq %xmm9, %xmm2
1731; SSE2-NEXT:    paddq %xmm2, %xmm0
1732; SSE2-NEXT:    psllq $32, %xmm0
1733; SSE2-NEXT:    paddq %xmm8, %xmm0
1734; SSE2-NEXT:    movdqa %xmm0, %xmm2
1735; SSE2-NEXT:    psrad $16, %xmm2
1736; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
1737; SSE2-NEXT:    psrlq $16, %xmm0
1738; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1739; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1740; SSE2-NEXT:    movdqa %xmm1, %xmm2
1741; SSE2-NEXT:    psrad $16, %xmm2
1742; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
1743; SSE2-NEXT:    psrlq $16, %xmm1
1744; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1745; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1746; SSE2-NEXT:    movdqa %xmm7, %xmm2
1747; SSE2-NEXT:    psrad $16, %xmm2
1748; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
1749; SSE2-NEXT:    psrlq $16, %xmm7
1750; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
1751; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1752; SSE2-NEXT:    movdqa %xmm4, %xmm3
1753; SSE2-NEXT:    psrad $16, %xmm3
1754; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
1755; SSE2-NEXT:    psrlq $16, %xmm4
1756; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
1757; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
1758; SSE2-NEXT:    retq
1759;
1760; SSE41-LABEL: mulhsw_v8i16_ashr_i64:
1761; SSE41:       # %bb.0:
1762; SSE41-NEXT:    pmulhw %xmm1, %xmm0
1763; SSE41-NEXT:    pmovsxwq %xmm0, %xmm4
1764; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1765; SSE41-NEXT:    pmovsxwq %xmm1, %xmm1
1766; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
1767; SSE41-NEXT:    pmovsxwq %xmm2, %xmm2
1768; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1769; SSE41-NEXT:    pmovsxwq %xmm0, %xmm3
1770; SSE41-NEXT:    movdqa %xmm4, %xmm0
1771; SSE41-NEXT:    retq
1772;
1773; AVX2-LABEL: mulhsw_v8i16_ashr_i64:
1774; AVX2:       # %bb.0:
1775; AVX2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm1
1776; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm0
1777; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1778; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
1779; AVX2-NEXT:    retq
1780;
1781; AVX512-LABEL: mulhsw_v8i16_ashr_i64:
1782; AVX512:       # %bb.0:
1783; AVX512-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0
1784; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
1785; AVX512-NEXT:    retq
1786  %a1 = sext <8 x i16> %a to <8 x i64>
1787  %b1 = sext <8 x i16> %b to <8 x i64>
1788  %c = mul <8 x i64> %a1, %b1
1789  %d = ashr <8 x i64> %c, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
1790  ret <8 x i64> %d
1791}
1792