; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=skx | FileCheck %s --check-prefixes=CHECK,SKX,X64,SKX64
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=knl | FileCheck %s --check-prefixes=CHECK,KNL,X64,KNL64
; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mcpu=skx | FileCheck %s --check-prefixes=CHECK,SKX,X86,SKX32
; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mcpu=knl | FileCheck %s --check-prefixes=CHECK,KNL,X86,KNL32

; Expand 128 -> 256, covering the <4 x float> and <2 x double> cases.
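; An "expand" shuffle interleaves the source elements, in order, with zeros.
; With AVX512VL (SKX) this matches a zero-masked VEXPANDPS/VPEXPANDD/VPEXPANDQ:
; the mask immediate has a bit set for every result lane that takes the next
; source element, and all other lanes are zeroed. In @expand below, %a's
; elements land in lanes 0 and 2, so the mask is 0b00000101 = 5 (movb $5).
; KNL lacks AVX512VL, so its 256-bit cases fall back to AVX2-style shuffles
; or to 512-bit vpermt2ps.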
define <8 x float> @expand(<4 x float> %a) {
; SKX-LABEL: expand:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX-NEXT:    movb $5, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand:
; KNL:       # %bb.0:
; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5>
   ret <8 x float> %res
}

define <8 x float> @expand1(<4 x float> %a ) {
; SKX-LABEL: expand1:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX-NEXT:    movb $-86, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand1:
; KNL:       # %bb.0:
; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    vmovaps {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3]
; KNL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; KNL-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
   ret <8 x float> %res
}

; Expand 128 -> 256 test: <2 x double> -> <4 x double>.
define <4 x double> @expand2(<2 x double> %a) {
; CHECK-LABEL: expand2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
   %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
   ret <4 x double> %res
}

; Expand 128 -> 256, integer case: <4 x i32> -> <8 x i32>.
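; Same pattern with the operands swapped: %a is the second shufflevector
; operand, and its two elements land in lanes 0 and 7, giving the expand
; mask 0b10000001 = 0x81 (movb $-127, i.e. 129 as an unsigned byte).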
define <8 x i32> @expand3(<4 x i32> %a ) {
; SKX-LABEL: expand3:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX-NEXT:    movb $-127, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand3:
; KNL:       # %bb.0:
; KNL-NEXT:    vbroadcastsd %xmm0, %ymm0
; KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5>
   ret <8 x i32> %res
}

; Expand 128 -> 256, integer case: <2 x i64> -> <4 x i64>.
define <4 x i64> @expand4(<2 x i64> %a ) {
; SKX-LABEL: expand4:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX-NEXT:    movb $9, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand4:
; KNL:       # %bb.0:
; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL-NEXT:    vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; KNL-NEXT:    vmovaps %xmm0, %xmm0
; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
   ret <4 x i64> %res
}

; Negative test for 128 -> 256.
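; Not an expand: index 4 (the first element of %a) is reused for every odd
; result lane, so the source elements are not consumed consecutively and no
; vexpandps can be formed; SKX instead uses a two-source permute (vpermt2ps)
; against a zero vector, and KNL uses a broadcast plus blend.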
define <8 x float> @expand5(<4 x float> %a ) {
; SKX-LABEL: expand5:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [8,0,10,0,12,0,14,0]
; SKX-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand5:
; KNL:       # %bb.0:
; KNL-NEXT:    vbroadcastss %xmm0, %ymm0
; KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
   ret <8 x float> %res
}

; Expand 256 -> 512, covering the <8 x float> and <16 x float> cases.
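; @expand6 is just a concatenation of zeros with %a, so it becomes a single
; vinsertf128. @expand7/@expand8 widen to 512 bits, where even KNL has the
; zero-masked vexpandps; in @expand7 the source elements land in lanes
; 0, 2, 8 and 10, giving the mask 0x505 (movw $1285).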
define <8 x float> @expand6(<4 x float> %a ) {
; CHECK-LABEL: expand6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %res
}

define <16 x float> @expand7(<8 x float> %a) {
; SKX-LABEL: expand7:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; SKX-NEXT:    movw $1285, %ax # imm = 0x505
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand7:
; KNL:       # %bb.0:
; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT:    movw $1285, %ax # imm = 0x505
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 8, i32 8, i32 8, i32 8, i32 2, i32 8, i32 3, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x float> %res
}

define <16 x float> @expand8(<8 x float> %a ) {
; SKX-LABEL: expand8:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand8:
; KNL:       # %bb.0:
; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   ret <16 x float> %res
}

; Expand 256 -> 512, covering the <4 x double> and <8 x double> cases.
define <8 x double> @expand9(<4 x double> %a) {
; SKX-LABEL: expand9:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; SKX-NEXT:    movb $-127, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand9:
; KNL:       # %bb.0:
; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT:    movb $-127, %al
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
   ret <8 x double> %res
}

define <16 x i32> @expand10(<8 x i32> %a ) {
; SKX-LABEL: expand10:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand10:
; KNL:       # %bb.0:
; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   ret <16 x i32> %res
}

define <8 x i64> @expand11(<4 x i64> %a) {
; SKX-LABEL: expand11:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; SKX-NEXT:    movb $-127, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand11:
; KNL:       # %bb.0:
; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT:    movb $-127, %al
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
   ret <8 x i64> %res
}

; Negative test for 256 -> 512.
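; Not an expand: index 8 (the first element of %a) is repeated in every odd
; lane instead of consuming consecutive source elements, so @expand12 is
; lowered as a generic vpermt2ps with a zero vector rather than a masked
; vexpandps. @expand13 is a plain concatenation and becomes vinsertf64x4.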
define <16 x float> @expand12(<8 x float> %a) {
; CHECK-LABEL: expand12:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
   ret <16 x float> %res
}

define <16 x float> @expand13(<8 x float> %a ) {
; CHECK-LABEL: expand13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x float> %res
}

; This function checks the case where the first source vector contains mixed
; values and the shuffle mask selects only the zero elements from it.

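; %addV folds to <0.0, 2.0, 4.0, 0.0>, and the shuffle only reads its zero
; lanes (indices 3 and 0), so the result is still a zero-fill expand of %a:
; %a's elements land in lanes 2 and 4, giving the mask 0b00010100 = 20.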
define <8 x float> @expand14(<4 x float> %a) {
; SKX-LABEL: expand14:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX-NEXT:    movb $20, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand14:
; KNL:       # %bb.0:
; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    vmovaps {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23]
; KNL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; KNL-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL-NEXT:    ret{{[l|q]}}
   %addV = fadd <4 x float> <float 0.0, float 1.0, float 2.0, float 0.0>, <float 0.0, float 1.0, float 2.0, float 0.0>
   %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
   ret <8 x float> %res
}

; Negative test.
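; Here the shuffle also reads lane 1 of %addV, which folds to 2.0 rather than
; zero, so the result is not a zero-fill expand of %a and is lowered as a
; permute blended against the folded constant vector from the constant pool.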
define <8 x float> @expand15(<4 x float> %a) {
; SKX-LABEL: expand15:
; SKX:       # %bb.0:
; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = <u,u,0,u,1,u,u,u>
; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; SKX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: expand15:
; KNL:       # %bb.0:
; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; KNL-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
; KNL-NEXT:    ret{{[l|q]}}
   %addV = fadd <4 x float> <float 0.0, float 1.0, float 2.0, float 0.0>, <float 0.0, float 1.0, float 2.0, float 0.0>
   %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
   ret <8 x float> %res
}


; Shuffle to blend test

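; Shuffles that take alternating elements from two sources map onto masked
; blends: the 0xAA..AA masks select the odd lanes from %A and the even lanes
; from %W. KNL lacks AVX512BW, so the byte and word cases fall back to
; vpternlog or vpblendvb. Note the SKX32 variant has no 64-bit GPR to hold a
; <64 x i1> mask, so it builds %k1 by concatenating two 32-bit halves with
; kunpckdq.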
define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
; SKX64-LABEL: test_mm512_mask_blend_epi8:
; SKX64:       # %bb.0: # %entry
; SKX64-NEXT:    movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
; SKX64-NEXT:    kmovq %rax, %k1
; SKX64-NEXT:    vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
; SKX64-NEXT:    retq
;
; KNL-LABEL: test_mm512_mask_blend_epi8:
; KNL:       # %bb.0: # %entry
; KNL-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; KNL-NEXT:    vpternlogq $216, %zmm2, %zmm1, %zmm0
; KNL-NEXT:    ret{{[l|q]}}
;
; SKX32-LABEL: test_mm512_mask_blend_epi8:
; SKX32:       # %bb.0: # %entry
; SKX32-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
; SKX32-NEXT:    kmovd %eax, %k0
; SKX32-NEXT:    kunpckdq %k0, %k0, %k1
; SKX32-NEXT:    vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
; SKX32-NEXT:    retl
entry:
  %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32>  <i32 64, i32 1, i32 66, i32 3, i32 68, i32 5, i32 70, i32 7, i32 72, i32 9, i32 74, i32 11, i32 76, i32 13, i32 78, i32 15, i32 80, i32 17, i32 82, i32 19, i32 84, i32 21, i32 86, i32 23, i32 88, i32 25, i32 90, i32 27, i32 92, i32 29, i32 94, i32 31, i32 96, i32 33, i32 98, i32 35, i32 100, i32 37, i32 102, i32 39, i32 104, i32 41, i32 106, i32 43, i32 108, i32 45, i32 110, i32 47, i32 112, i32 49, i32 114, i32 51, i32 116, i32 53, i32 118, i32 55, i32 120, i32 57, i32 122, i32 59, i32 124, i32 61, i32 126, i32 63>
  ret <64 x i8> %0
}

define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
; SKX-LABEL: test_mm512_mask_blend_epi16:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL64-LABEL: test_mm512_mask_blend_epi16:
; KNL64:       # %bb.0: # %entry
; KNL64-NEXT:    vpternlogd $216, {{.*}}(%rip){1to16}, %zmm1, %zmm0
; KNL64-NEXT:    retq
;
; KNL32-LABEL: test_mm512_mask_blend_epi16:
; KNL32:       # %bb.0: # %entry
; KNL32-NEXT:    vpternlogd $216, {{\.LCPI.*}}{1to16}, %zmm1, %zmm0
; KNL32-NEXT:    retl
entry:
  %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32>  <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
  ret <32 x i16> %0
}

define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
; SKX-LABEL: test_mm512_mask_blend_epi32:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: test_mm512_mask_blend_epi32:
; KNL:       # %bb.0: # %entry
; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT:    ret{{[l|q]}}
entry:
  %0 = shufflevector <16 x i32> %A, <16 x i32> %W, <16 x i32>  <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x i32> %0
}

define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
; SKX-LABEL: test_mm512_mask_blend_epi64:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    movb $-86, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: test_mm512_mask_blend_epi64:
; KNL:       # %bb.0: # %entry
; KNL-NEXT:    movb $-86, %al
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT:    ret{{[l|q]}}
entry:
  %0 = shufflevector <8 x i64> %A, <8 x i64> %W, <8 x i32>  <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
  ret <8 x i64> %0
}

define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
; SKX-LABEL: test_mm512_mask_blend_ps:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: test_mm512_mask_blend_ps:
; KNL:       # %bb.0: # %entry
; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT:    ret{{[l|q]}}
entry:
  %0 = shufflevector <16 x float> %A, <16 x float> %W, <16 x i32>  <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x float> %0
}

define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){
; SKX-LABEL: test_mm512_mask_blend_pd:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    movb $-88, %al
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: test_mm512_mask_blend_pd:
; KNL:       # %bb.0: # %entry
; KNL-NEXT:    movb $-88, %al
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT:    ret{{[l|q]}}
entry:
  %0 = shufflevector <8 x double> %A, <8 x double> %W, <8 x i32>  <i32 8, i32 9, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
  ret <8 x double> %0
}


define <32 x i8> @test_mm256_mask_blend_epi8(<32 x i8> %A, <32 x i8> %W){
; SKX-LABEL: test_mm256_mask_blend_epi8:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: test_mm256_mask_blend_epi8:
; KNL:       # %bb.0: # %entry
; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; KNL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; KNL-NEXT:    ret{{[l|q]}}
entry:
  %0 = shufflevector <32 x i8> %A, <32 x i8> %W, <32 x i32>  <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
  ret <32 x i8> %0
}

define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){
; SKX-LABEL: test_mm_mask_blend_epi8:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT:    ret{{[l|q]}}
;
; KNL-LABEL: test_mm_mask_blend_epi8:
; KNL:       # %bb.0: # %entry
; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; KNL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; KNL-NEXT:    ret{{[l|q]}}
entry:
  %0 = shufflevector <16 x i8> %A, <16 x i8> %W, <16 x i32>  <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x i8> %0
}

; PR34370
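; A masked permute of a loaded vector: the shuffle of %vec and the select
; against %vec2 are expected to fold into a single vpermi2ps/vpermt2ps whose
; index vector redirects the masked-off lanes to %vec2 (e.g. indices 11, 14
; and 15 in the v8f32 case).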
define <8 x float> @test_masked_permps_v8f32(<8 x float>* %vp, <8 x float> %vec2) {
; SKX64-LABEL: test_masked_permps_v8f32:
; SKX64:       # %bb.0:
; SKX64-NEXT:    vmovaps (%rdi), %ymm2
; SKX64-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15]
; SKX64-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm1
; SKX64-NEXT:    vmovaps %ymm1, %ymm0
; SKX64-NEXT:    retq
;
; KNL64-LABEL: test_masked_permps_v8f32:
; KNL64:       # %bb.0:
; KNL64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL64-NEXT:    vmovaps (%rdi), %ymm1
; KNL64-NEXT:    vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23]
; KNL64-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
; KNL64-NEXT:    vmovaps %ymm1, %ymm0
; KNL64-NEXT:    retq
;
; SKX32-LABEL: test_masked_permps_v8f32:
; SKX32:       # %bb.0:
; SKX32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX32-NEXT:    vmovaps (%eax), %ymm2
; SKX32-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,3,11,7,6,14,15]
; SKX32-NEXT:    vpermi2ps %ymm0, %ymm2, %ymm1
; SKX32-NEXT:    vmovaps %ymm1, %ymm0
; SKX32-NEXT:    retl
;
; KNL32-LABEL: test_masked_permps_v8f32:
; KNL32:       # %bb.0:
; KNL32-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL32-NEXT:    vmovaps (%eax), %ymm1
; KNL32-NEXT:    vmovaps {{.*#+}} ymm2 = [7,6,3,19,7,6,22,23]
; KNL32-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
; KNL32-NEXT:    vmovaps %ymm1, %ymm0
; KNL32-NEXT:    retl
  %vec = load <8 x float>, <8 x float>* %vp
  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>
  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
  ret <8 x float> %res
}

define <16 x float> @test_masked_permps_v16f32(<16 x float>* %vp, <16 x float> %vec2) {
; X64-LABEL: test_masked_permps_v16f32:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %zmm2
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
; X64-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
;
; X86-LABEL: test_masked_permps_v16f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %zmm2
; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [15,13,11,19,14,12,22,23,7,6,3,27,7,29,3,31]
; X86-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
; X86-NEXT:    vmovaps %zmm1, %zmm0
; X86-NEXT:    retl
  %vec = load <16 x float>, <16 x float>* %vp
  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 14, i32 12, i32 10, i32 8, i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0>
  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec2
  ret <16 x float> %res
}

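; Only elements 11 (first store) and 5/4 (second store) of the wide loads are
; demanded, so the <16 x i32> loads should shrink to 16-byte loads at offsets
; 32 and 208, with a vpshufb placing the demanded dwords and zeroing the rest.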
define void @test_demandedelts_pshufb_v32i8_v16i8(<2 x i32>* %src, <8 x i32>* %dst) {
; SKX64-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
; SKX64:       # %bb.0:
; SKX64-NEXT:    vmovdqa 32(%rdi), %xmm0
; SKX64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SKX64-NEXT:    vmovdqa %ymm0, 672(%rsi)
; SKX64-NEXT:    vmovdqa 208(%rdi), %xmm0
; SKX64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; SKX64-NEXT:    vmovdqa %ymm0, 832(%rsi)
; SKX64-NEXT:    vzeroupper
; SKX64-NEXT:    retq
;
; KNL64-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
; KNL64:       # %bb.0:
; KNL64-NEXT:    vmovdqa 32(%rdi), %xmm0
; KNL64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; KNL64-NEXT:    vmovdqa %ymm0, 672(%rsi)
; KNL64-NEXT:    vmovdqa 208(%rdi), %xmm0
; KNL64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; KNL64-NEXT:    vmovdqa %ymm0, 832(%rsi)
; KNL64-NEXT:    retq
;
; SKX32-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
; SKX32:       # %bb.0:
; SKX32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; SKX32-NEXT:    vmovdqa 32(%ecx), %xmm0
; SKX32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SKX32-NEXT:    vmovdqa %ymm0, 672(%eax)
; SKX32-NEXT:    vmovdqa 208(%ecx), %xmm0
; SKX32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; SKX32-NEXT:    vmovdqa %ymm0, 832(%eax)
; SKX32-NEXT:    vzeroupper
; SKX32-NEXT:    retl
;
; KNL32-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
; KNL32:       # %bb.0:
; KNL32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL32-NEXT:    vmovdqa 32(%eax), %xmm0
; KNL32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; KNL32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; KNL32-NEXT:    vmovdqa %ymm0, 672(%ecx)
; KNL32-NEXT:    vmovdqa 208(%eax), %xmm0
; KNL32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; KNL32-NEXT:    vmovdqa %ymm0, 832(%ecx)
; KNL32-NEXT:    retl
  %t64 = bitcast <2 x i32>* %src to <16 x i32>*
  %t87 = load <16 x i32>, <16 x i32>* %t64, align 64
  %t88 = extractelement <16 x i32> %t87, i64 11
  %t89 = insertelement <8 x i32> <i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %t88, i64 0
  %t90 = insertelement <8 x i32> %t89, i32 %t88, i64 1
  %ptridx49.i = getelementptr inbounds <8 x i32>, <8 x i32>* %dst, i64 21
  store <8 x i32> %t90, <8 x i32>* %ptridx49.i, align 32
  %ptridx56.i = getelementptr inbounds <2 x i32>, <2 x i32>* %src, i64 24
  %t00 = bitcast <2 x i32>* %ptridx56.i to <16 x i32>*
  %t09 = load <16 x i32>, <16 x i32>* %t00, align 64
  %t10 = extractelement <16 x i32> %t09, i64 5
  %t11 = insertelement <8 x i32> <i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %t10, i64 0
  %t12 = extractelement <16 x i32> %t09, i64 4
  %t13 = insertelement <8 x i32> %t11, i32 %t12, i64 1
  %ptridx64.i = getelementptr inbounds <8 x i32>, <8 x i32>* %dst, i64 26
  store <8 x i32> %t13, <8 x i32>* %ptridx64.i, align 32
  ret void
}

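; PR47534: the chain of shuffles on mostly-undef vectors should collapse into
; a single vpermi2ps against a zero vector, with the repeated index pattern
; materialized as a broadcast of a 256-bit constant.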
define <32 x float> @PR47534(<8 x float> %tmp) {
; CHECK-LABEL: PR47534:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [7,25,26,27,7,29,30,31,7,25,26,27,7,29,30,31]
; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp1 = shufflevector <8 x float> %tmp, <8 x float> undef, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %tmp2 = shufflevector <32 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <32 x float> undef, <32 x i32> <i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 30, i32 31>
  %tmp18 = shufflevector <32 x float> %tmp2, <32 x float> %tmp1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 30, i32 31>
  ret <32 x float> %tmp18
}

%union1 = type { <16 x float> }
@src1 = external dso_local local_unnamed_addr global %union1, align 64

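; PR43170: concatenating a 256-bit load of @src1 with zeros should not widen
; the load; the expected code is a ymm load (which implicitly zeroes the upper
; half of the zmm register) followed by a full 512-bit store.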
define void @PR43170(<16 x float>* %a0) {
; SKX64-LABEL: PR43170:
; SKX64:       # %bb.0: # %entry
; SKX64-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; SKX64-NEXT:    vmovaps %zmm0, (%rdi)
; SKX64-NEXT:    vzeroupper
; SKX64-NEXT:    retq
;
; KNL64-LABEL: PR43170:
; KNL64:       # %bb.0: # %entry
; KNL64-NEXT:    vmovaps {{.*}}(%rip), %ymm0
; KNL64-NEXT:    vmovaps %zmm0, (%rdi)
; KNL64-NEXT:    retq
;
; SKX32-LABEL: PR43170:
; SKX32:       # %bb.0: # %entry
; SKX32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX32-NEXT:    vmovaps src1, %ymm0
; SKX32-NEXT:    vmovaps %zmm0, (%eax)
; SKX32-NEXT:    vzeroupper
; SKX32-NEXT:    retl
;
; KNL32-LABEL: PR43170:
; KNL32:       # %bb.0: # %entry
; KNL32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL32-NEXT:    vmovaps src1, %ymm0
; KNL32-NEXT:    vmovaps %zmm0, (%eax)
; KNL32-NEXT:    retl
entry:
  %0 = load <8 x float>, <8 x float>* bitcast (%union1* @src1 to <8 x float>*), align 64
  %1 = shufflevector <8 x float> %0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x float> %1, <16 x float>* %a0, align 64
  ret void
}
