; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c

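; The kunpackd/kunpackw tests below concatenate the low halves of two compare
; masks, then AND the result with a third compare mask. Roughly, the first
; test corresponds to C intrinsics along these lines (a sketch for
; orientation, not copied verbatim from the clang test):
;   return _mm512_mask_cmpneq_epi8_mask(
;       _mm512_kunpackd(_mm512_cmpneq_epi8_mask(__B, __A),
;                       _mm512_cmpneq_epi8_mask(__C, __D)),
;       __E, __F);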
define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X86-LABEL: test_mm512_kunpackd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
; X86-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k1
; X86-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k2
; X86-NEXT:    kandd %k0, %k2, %k0
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kshiftrq $32, %k2, %k0
; X86-NEXT:    kandd %k1, %k0, %k0
; X86-NEXT:    kmovd %k0, %edx
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
; X64-NEXT:    vpcmpneqb %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckdq %k0, %k1, %k1
; X64-NEXT:    vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <64 x i8>
  %1 = bitcast <8 x i64> %__F to <64 x i8>
  %2 = bitcast <8 x i64> %__B to <64 x i8>
  %3 = bitcast <8 x i64> %__A to <64 x i8>
  %4 = icmp ne <64 x i8> %2, %3
  %5 = bitcast <8 x i64> %__C to <64 x i8>
  %6 = bitcast <8 x i64> %__D to <64 x i8>
  %7 = icmp ne <64 x i8> %5, %6
  %8 = shufflevector <64 x i1> %4, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %9 = shufflevector <64 x i1> %7, <64 x i1> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %10 = shufflevector <32 x i1> %8, <32 x i1> %9, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %11 = icmp ne <64 x i8> %0, %1
  %12 = and <64 x i1> %11, %10
  %13 = bitcast <64 x i1> %12 to i64
  ret i64 %13
}

define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
; X86-LABEL: test_mm512_kunpackw:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
; X86-NEXT:    vpcmpneqw 8(%ebp), %zmm2, %k1
; X86-NEXT:    kunpckwd %k0, %k1, %k1
; X86-NEXT:    vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackw:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
; X64-NEXT:    vpcmpneqw %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckwd %k0, %k1, %k1
; X64-NEXT:    vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <32 x i16>
  %1 = bitcast <8 x i64> %__F to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = bitcast <8 x i64> %__A to <32 x i16>
  %4 = icmp ne <32 x i16> %2, %3
  %5 = bitcast <8 x i64> %__C to <32 x i16>
  %6 = bitcast <8 x i64> %__D to <32 x i16>
  %7 = icmp ne <32 x i16> %5, %6
  %8 = shufflevector <32 x i1> %4, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %9 = shufflevector <32 x i1> %7, <32 x i1> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %10 = shufflevector <16 x i1> %8, <16 x i1> %9, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %11 = icmp ne <32 x i16> %0, %1
  %12 = and <32 x i1> %11, %10
  %13 = bitcast <32 x i1> %12 to i32
  ret i32 %13
}

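; The set1/broadcast tests splat a scalar or the low element of an xmm and
; blend with a bitcast mask. Note that on X86 an i64 mask argument arrives as
; two 32-bit stack words, so the checks rebuild the 64-bit mask with two
; kmovd loads plus a kunpckdq.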
define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
; X86-LABEL: test_mm512_mask_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kunpckdq %k1, %k0, %k1
; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <64 x i8>
  %1 = bitcast i64 %__M to <64 x i1>
  %2 = select <64 x i1> %1, <64 x i8> %vecinit63.i.i, <64 x i8> %0
  %3 = bitcast <64 x i8> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kunpckdq %k1, %k0, %k1
; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpbroadcastb %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast i64 %__M to <64 x i1>
  %1 = select <64 x i1> %0, <64 x i8> %vecinit63.i.i, <64 x i8> zeroinitializer
  %2 = bitcast <64 x i8> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A) {
; X86-LABEL: test_mm512_mask_set1_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <32 x i16>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i16> %vecinit31.i.i, <32 x i16> %0
  %3 = bitcast <32 x i16> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
  %0 = bitcast i32 %__M to <32 x i1>
  %1 = select <32 x i1> %0, <32 x i16> %vecinit31.i.i, <32 x i16> zeroinitializer
  %2 = bitcast <32 x i16> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, i64* %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastb_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastb_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %bc1 = bitcast i64* %a1 to <64 x i1>*
  %arg1 = load <64 x i1>, <64 x i1>* %bc1
  %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = select <64 x i1> %arg1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastb_epi8(i64* %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastb_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %bc0 = bitcast i64* %a0 to <64 x i1>*
  %arg0 = load <64 x i1>, <64 x i1>* %bc0
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <64 x i32> zeroinitializer
  %res1 = select <64 x i1> %arg0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastw_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastw_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastw_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <32 x i32> zeroinitializer
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

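; The bslli/bsrli tests byte-shift within each 128-bit lane: the shuffles
; below blend the source with a zero vector in 16-byte groups, matching the
; vpslldq/vpsrldq immediate forms.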
define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26],zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42],zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %res0 = shufflevector <64 x i8> zeroinitializer, <64 x i8> %arg0, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

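; The unpack tests interleave the high or low bytes/words of each 128-bit
; lane from the two sources, matching vpunpckh*/vpunpckl* semantics; the
; masked variants then select against the passthru or zero vector.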
define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

; TODO - improve support for i64 -> mmask64 on 32-bit targets
define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast i64* %a1 to <64 x i1>*
  %sel1 = load <64 x i1>, <64 x i1>* %arg1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to <64 x i1>*
  %sel0 = load <64 x i1>, <64 x i1>* %arg0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = bitcast <64 x i8> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
  %arg1 = bitcast i64* %a1 to <64 x i1>*
  %sel1 = load <64 x i1>, <64 x i1>* %arg1
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovq (%eax), %k1
; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
; X64:       # %bb.0:
; X64-NEXT:    kmovq (%rdi), %k1
; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to <64 x i1>*
  %sel0 = load <64 x i1>, <64 x i1>* %arg0
  %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
  %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
  %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
  %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
  %res2 = bitcast <64 x i8> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = bitcast <32 x i16> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
  %arg1 = bitcast i32 %a1 to <32 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi16:
; X86:       # %bb.0:
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X64-NEXT:    retq
  %arg0 = bitcast i32 %a0 to <32 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
  %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
  %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
  %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
  %res2 = bitcast <32 x i16> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

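; The test/testn tests model vptestm/vptestnm: AND the two sources, then
; compare each element against zero with icmp ne (test) or icmp eq (testn)
; and bitcast the resulting <N x i1> vector to an integer mask.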
define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}

define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp ne <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}

define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_test_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}

define i32 @test_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp ne <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}

define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast <64 x i1> %1 to i64
  ret i64 %2
}

define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi8_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
; X86-NEXT:    kshiftrq $32, %k0, %k1
; X86-NEXT:    kmovd %k1, %edx
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi8_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vptestnmb %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovq %k0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
  %1 = icmp eq <64 x i8> %0, zeroinitializer
  %2 = bitcast i64 %__U to <64 x i1>
  %3 = and <64 x i1> %1, %2
  %4 = bitcast <64 x i1> %3 to i64
  ret i64 %4
}

define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi16_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmw %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovd %k0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast <32 x i1> %1 to i32
  ret i32 %2
}

define i32 @test_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi16_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovd %k0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi16_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovd %k0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
  %1 = icmp eq <32 x i16> %0, zeroinitializer
  %2 = bitcast i32 %__U to <32 x i1>
  %3 = and <32 x i1> %1, %2
  %4 = bitcast <32 x i1> %3 to i32
  ret i32 %4
}

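; The cvtepi16_epi8 tests truncate each word to a byte (vpmovwb); the masked
; forms select between the truncated result and the passthru or zero vector.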
define <4 x i64> @test_mm512_cvtepi16_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi16_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovwb %zmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <32 x i8> %conv.i to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm512_mask_cvtepi16_epi8(<4 x i64> %__O, i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast <4 x i64> %__O to <32 x i8>
  %2 = bitcast i32 %__M to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i8> %conv.i.i, <32 x i8> %1
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm512_maskz_cvtepi16_epi8(i32 %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi16_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %conv.i.i = trunc <32 x i16> %0 to <32 x i8>
  %1 = bitcast i32 %__M to <32 x i1>
  %2 = select <32 x i1> %1, <32 x i8> %conv.i.i, <32 x i8> zeroinitializer
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

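; The permutex2var tests all go through llvm.x86.avx512.vpermi2var.hi.512;
; whether vpermi2w or vpermt2w is emitted depends on which operand the masked
; select merges into: the index vector (vpermi2w) or the first source
; (vpermt2w).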
define <8 x i64> @test_mm512_mask2_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, i32 %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %1
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast <32 x i16> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_mask_permutex2var_epi16(<8 x i64> %__A, i32 %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %0
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <32 x i16>
  %1 = bitcast <8 x i64> %__I to <32 x i16>
  %2 = bitcast <8 x i64> %__B to <32 x i16>
  %3 = tail call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %0, <32 x i16> %1, <32 x i16> %2)
  %4 = bitcast i32 %__U to <32 x i1>
  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
  %6 = bitcast <32 x i16> %5 to <8 x i64>
  ret <8 x i64> %6
}

declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)

!0 = !{i32 1}