1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s
3
4; FIXME: All cases here should be fixed by PR34380
5
; Narrowing shuffle: picks 8 of the 16 i16 lanes of %vec with a constant
; shufflevector index list and returns them with no masking.
; The assertions expect one vpermw whose index register is a broadcast constant.
6define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
7; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
8; CHECK:       # %bb.0:
9; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
10; CHECK-NEXT:    # ymm1 = mem[0,1,0,1]
11; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
12; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
13; CHECK-NEXT:    vzeroupper
14; CHECK-NEXT:    retq
15  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
16  ret <8 x i16> %res
17}
; Merge-masked narrowing shuffle: shuffles %vec down to 8 x i16 with a constant
; index list, then selects per lane: lanes where %mask == 0 take the shuffled
; element, all other lanes take %vec2.
; The assertions expect vpermw, then a vptestnmw-generated k-mask driving vpblendmw.
18define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
19; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
20; CHECK:       # %bb.0:
21; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
22; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
23; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
24; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
25; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
26; CHECK-NEXT:    vzeroupper
27; CHECK-NEXT:    retq
28  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
29  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
30  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
31  ret <8 x i16> %res
32}
33
; Zero-masked narrowing shuffle: shuffles %vec down to 8 x i16 with a constant
; index list; lanes where %mask == 0 keep the shuffled element, all other lanes
; become zero.
; The assertions expect the zeroing to be folded into vpermw via {%k1} {z}.
34define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
35; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
36; CHECK:       # %bb.0:
37; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
38; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
39; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
40; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
41; CHECK-NEXT:    vzeroupper
42; CHECK-NEXT:    retq
43  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
44  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
45  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
46  ret <8 x i16> %res
47}
; Merge-masked narrowing shuffle (second index pattern): shuffles %vec down to
; 8 x i16, then lanes where %mask == 0 take the shuffled element and the
; remaining lanes take %vec2.
; The assertions expect vpermw, then a vptestnmw-generated k-mask driving vpblendmw.
48define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
49; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
50; CHECK:       # %bb.0:
51; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,12,9,4,14,15,12,14,4,12,9,4,14,15,12,14]
52; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
53; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
54; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
55; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
56; CHECK-NEXT:    vzeroupper
57; CHECK-NEXT:    retq
58  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
59  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
60  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
61  ret <8 x i16> %res
62}
63
; Zero-masked narrowing shuffle (second index pattern): lanes where %mask == 0
; keep the shuffled element of %vec, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermw via {%k1} {z}.
64define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
65; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
66; CHECK:       # %bb.0:
67; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
68; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
69; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
70; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
71; CHECK-NEXT:    vzeroupper
72; CHECK-NEXT:    retq
73  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
74  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
75  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
76  ret <8 x i16> %res
77}
; Merge-masked narrowing shuffle (third index pattern): shuffles %vec down to
; 8 x i16, then lanes where %mask == 0 take the shuffled element and the
; remaining lanes take %vec2.
; The assertions expect vpermw, then a vptestnmw-generated k-mask driving vpblendmw.
78define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
79; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
80; CHECK:       # %bb.0:
81; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,4,11,14,10,7,1,6,9]
82; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
83; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
84; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
85; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
86; CHECK-NEXT:    vzeroupper
87; CHECK-NEXT:    retq
88  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
89  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
90  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
91  ret <8 x i16> %res
92}
93
; Zero-masked narrowing shuffle (third index pattern): lanes where %mask == 0
; keep the shuffled element of %vec, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermw via {%k1} {z}.
94define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
95; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
96; CHECK:       # %bb.0:
97; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
98; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
99; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
100; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
101; CHECK-NEXT:    vzeroupper
102; CHECK-NEXT:    retq
103  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
104  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
105  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
106  ret <8 x i16> %res
107}
; Narrowing shuffle (fourth index pattern): picks 8 of the 16 i16 lanes of %vec
; with a constant shufflevector index list and returns them with no masking.
; The assertions expect one vpermw whose index register is a broadcast constant.
108define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
109; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
110; CHECK:       # %bb.0:
111; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
112; CHECK-NEXT:    # ymm1 = mem[0,1,0,1]
113; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
114; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
115; CHECK-NEXT:    vzeroupper
116; CHECK-NEXT:    retq
117  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
118  ret <8 x i16> %res
119}
; Merge-masked narrowing shuffle (fourth index pattern): shuffles %vec down to
; 8 x i16, then lanes where %mask == 0 take the shuffled element and the
; remaining lanes take %vec2.
; The assertions expect vpermw, then a vptestnmw-generated k-mask driving vpblendmw.
120define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
121; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
122; CHECK:       # %bb.0:
123; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
124; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
125; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
126; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
127; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
128; CHECK-NEXT:    vzeroupper
129; CHECK-NEXT:    retq
130  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
131  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
132  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
133  ret <8 x i16> %res
134}
135
; Zero-masked narrowing shuffle (fourth index pattern): lanes where %mask == 0
; keep the shuffled element of %vec, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermw via {%k1} {z}.
136define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
137; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
138; CHECK:       # %bb.0:
139; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
140; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
141; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
142; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
143; CHECK-NEXT:    vzeroupper
144; CHECK-NEXT:    retq
145  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
146  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
147  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
148  ret <8 x i16> %res
149}
; Narrowing shuffle with a memory source: loads a <16 x i16> from %vp, picks 8
; lanes with a constant index list and returns them with no masking.
; The assertions expect the two 128-bit halves to be combined by vpermi2w, with
; the upper half read directly from 16(%rdi).
150define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
151; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
152; CHECK:       # %bb.0:
153; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
154; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
155; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm1, %xmm0
156; CHECK-NEXT:    retq
157  %vec = load <16 x i16>, <16 x i16>* %vp
158  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
159  ret <8 x i16> %res
160}
; Merge-masked narrowing shuffle with a memory source: loads a <16 x i16> from
; %vp, shuffles it down to 8 x i16, then lanes where %mask == 0 take the
; shuffled element and the remaining lanes take %vec2.
; The assertions expect vpermi2w followed by a k-masked vmovdqu16 into %xmm0.
161define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
162; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
163; CHECK:       # %bb.0:
164; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
165; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
166; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
167; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
168; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
169; CHECK-NEXT:    retq
170  %vec = load <16 x i16>, <16 x i16>* %vp
171  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
172  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
173  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
174  ret <8 x i16> %res
175}
176
; Zero-masked narrowing shuffle with a memory source: loads a <16 x i16> from
; %vp and shuffles it down to 8 x i16; lanes where %mask == 0 keep the shuffled
; element, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermi2w via {%k1} {z}.
177define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
178; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
179; CHECK:       # %bb.0:
180; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
181; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
182; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
183; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
184; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
185; CHECK-NEXT:    retq
186  %vec = load <16 x i16>, <16 x i16>* %vp
187  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
188  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
189  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
190  ret <8 x i16> %res
191}
192
; Merge-masked narrowing shuffle with a memory source (second index pattern):
; loads a <16 x i16> from %vp, shuffles it down to 8 x i16, then lanes where
; %mask == 0 take the shuffled element and the remaining lanes take %vec2.
; The assertions expect vpermi2w followed by a k-masked vmovdqu16 into %xmm0.
193define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
194; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
195; CHECK:       # %bb.0:
196; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
197; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
198; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
199; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
200; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
201; CHECK-NEXT:    retq
202  %vec = load <16 x i16>, <16 x i16>* %vp
203  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
204  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
205  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
206  ret <8 x i16> %res
207}
208
; Zero-masked narrowing shuffle with a memory source (second index pattern):
; loads a <16 x i16> from %vp and shuffles it down to 8 x i16; lanes where
; %mask == 0 keep the shuffled element, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermi2w via {%k1} {z}.
209define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
210; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
211; CHECK:       # %bb.0:
212; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
213; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
214; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
215; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
216; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
217; CHECK-NEXT:    retq
218  %vec = load <16 x i16>, <16 x i16>* %vp
219  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
220  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
221  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
222  ret <8 x i16> %res
223}
224
; Merge-masked narrowing shuffle with a memory source (third index pattern):
; loads a <16 x i16> from %vp, shuffles it down to 8 x i16, then lanes where
; %mask == 0 take the shuffled element and the remaining lanes take %vec2.
; Note the expected assembly swaps the halves: 16(%rdi) is loaded into a
; register and (%rdi) is the memory operand, so the expected index constant
; differs from the IR index list.
225define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
226; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
227; CHECK:       # %bb.0:
228; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
229; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
230; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm3
231; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
232; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
233; CHECK-NEXT:    retq
234  %vec = load <16 x i16>, <16 x i16>* %vp
235  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
236  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
237  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
238  ret <8 x i16> %res
239}
240
; Zero-masked narrowing shuffle with a memory source (third index pattern):
; loads a <16 x i16> from %vp and shuffles it down to 8 x i16; lanes where
; %mask == 0 keep the shuffled element, all other lanes become zero.
; Note the expected assembly swaps the halves: 16(%rdi) is loaded into a
; register and (%rdi) is the memory operand of the zero-masked vpermi2w.
241define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
242; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
243; CHECK:       # %bb.0:
244; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
245; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
246; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
247; CHECK-NEXT:    vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
248; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
249; CHECK-NEXT:    retq
250  %vec = load <16 x i16>, <16 x i16>* %vp
251  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
252  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
253  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
254  ret <8 x i16> %res
255}
256
; Narrowing shuffle with a memory source (fourth index pattern): loads a
; <16 x i16> from %vp, picks 8 lanes with a constant index list and returns
; them with no masking.
; The assertions expect vpermi2w with the upper half read from 16(%rdi).
257define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
258; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
259; CHECK:       # %bb.0:
260; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
261; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
262; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm1, %xmm0
263; CHECK-NEXT:    retq
264  %vec = load <16 x i16>, <16 x i16>* %vp
265  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
266  ret <8 x i16> %res
267}
; Merge-masked narrowing shuffle with a memory source (fourth index pattern):
; loads a <16 x i16> from %vp, shuffles it down to 8 x i16, then lanes where
; %mask == 0 take the shuffled element and the remaining lanes take %vec2.
; The assertions expect vpermi2w followed by a k-masked vmovdqu16 into %xmm0.
268define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
269; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
270; CHECK:       # %bb.0:
271; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
272; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
273; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm3
274; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
275; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
276; CHECK-NEXT:    retq
277  %vec = load <16 x i16>, <16 x i16>* %vp
278  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
279  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
280  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
281  ret <8 x i16> %res
282}
283
; Zero-masked narrowing shuffle with a memory source (fourth index pattern):
; loads a <16 x i16> from %vp and shuffles it down to 8 x i16; lanes where
; %mask == 0 keep the shuffled element, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermi2w via {%k1} {z}.
284define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
285; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
286; CHECK:       # %bb.0:
287; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
288; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
289; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
290; CHECK-NEXT:    vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
291; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
292; CHECK-NEXT:    retq
293  %vec = load <16 x i16>, <16 x i16>* %vp
294  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
295  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
296  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
297  ret <8 x i16> %res
298}
299
; Narrowing shuffle: picks 16 of the 32 i16 lanes of %vec with a constant
; shufflevector index list and returns them with no masking.
; The assertions expect the upper 256 bits to be extracted and combined with
; the lower half by vpermi2w; the upper half is the first table operand, so the
; expected index constant is renumbered relative to the IR index list.
300define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
301; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
302; CHECK:       # %bb.0:
303; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
304; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
305; CHECK-NEXT:    vpermi2w %ymm0, %ymm2, %ymm1
306; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
307; CHECK-NEXT:    retq
308  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
309  ret <16 x i16> %res
310}
; Merge-masked narrowing shuffle: shuffles %vec (32 x i16) down to 16 x i16
; with a constant index list, then lanes where %mask == 0 take the shuffled
; element and the remaining lanes take %vec2.
; The assertions expect vpermi2w, then a vptestnmw-generated k-mask driving vpblendmw.
311define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
312; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
313; CHECK:       # %bb.0:
314; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
315; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
316; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
317; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
318; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
319; CHECK-NEXT:    retq
320  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
321  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
322  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
323  ret <16 x i16> %res
324}
325
; Zero-masked narrowing shuffle: shuffles %vec (32 x i16) down to 16 x i16 with
; a constant index list; lanes where %mask == 0 keep the shuffled element, all
; other lanes become zero.
; The assertions expect the zeroing to be folded into vpermi2w via {%k1} {z}.
326define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
327; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
328; CHECK:       # %bb.0:
329; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
330; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
331; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
332; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
333; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
334; CHECK-NEXT:    retq
335  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
336  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
337  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
338  ret <16 x i16> %res
339}
; Merge-masked narrowing shuffle (second index pattern): shuffles %vec
; (32 x i16) down to 16 x i16, then lanes where %mask == 0 take the shuffled
; element and the remaining lanes take %vec2.
; The assertions expect vpermi2w, then a vptestnmw-generated k-mask driving vpblendmw.
340define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
341; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
342; CHECK:       # %bb.0:
343; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
344; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
345; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
346; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
347; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
348; CHECK-NEXT:    retq
349  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
350  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
351  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
352  ret <16 x i16> %res
353}
354
; Zero-masked narrowing shuffle (second index pattern): shuffles %vec
; (32 x i16) down to 16 x i16; lanes where %mask == 0 keep the shuffled
; element, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermi2w via {%k1} {z}.
355define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
356; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
357; CHECK:       # %bb.0:
358; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
359; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
360; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
361; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
362; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
363; CHECK-NEXT:    retq
364  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
365  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
366  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
367  ret <16 x i16> %res
368}
; Merge-masked narrowing shuffle (third index pattern): shuffles %vec
; (32 x i16) down to 16 x i16, then lanes where %mask == 0 take the shuffled
; element and the remaining lanes take %vec2.
; The assertions expect vpermi2w, then a vptestnmw-generated k-mask driving vpblendmw.
369define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
370; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
371; CHECK:       # %bb.0:
372; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
373; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
374; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm4
375; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
376; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
377; CHECK-NEXT:    retq
378  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
379  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
380  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
381  ret <16 x i16> %res
382}
383
; Zero-masked narrowing shuffle (third index pattern): shuffles %vec
; (32 x i16) down to 16 x i16; lanes where %mask == 0 keep the shuffled
; element, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermi2w via {%k1} {z}.
384define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
385; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
386; CHECK:       # %bb.0:
387; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
388; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
389; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
390; CHECK-NEXT:    vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
391; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
392; CHECK-NEXT:    retq
393  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
394  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
395  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
396  ret <16 x i16> %res
397}
; Narrowing shuffle (fourth index pattern): picks 16 of the 32 i16 lanes of
; %vec with a constant shufflevector index list and returns them unmasked.
; Here the expected vpermi2w keeps the lower half as the first table operand,
; so the expected index constant equals the IR index list.
398define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
399; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
400; CHECK:       # %bb.0:
401; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
402; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
403; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1
404; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
405; CHECK-NEXT:    retq
406  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
407  ret <16 x i16> %res
408}
; Merge-masked narrowing shuffle (fourth index pattern): shuffles %vec
; (32 x i16) down to 16 x i16, then lanes where %mask == 0 take the shuffled
; element and the remaining lanes take %vec2.
; The assertions expect vpermi2w, then a vptestnmw-generated k-mask driving vpblendmw.
409define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
410; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
411; CHECK:       # %bb.0:
412; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
413; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
414; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm4
415; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
416; CHECK-NEXT:    vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
417; CHECK-NEXT:    retq
418  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
419  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
420  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
421  ret <16 x i16> %res
422}
423
; Zero-masked narrowing shuffle (fourth index pattern): shuffles %vec
; (32 x i16) down to 16 x i16; lanes where %mask == 0 keep the shuffled
; element, all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermi2w via {%k1} {z}.
424define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
425; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
426; CHECK:       # %bb.0:
427; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
428; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
429; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
430; CHECK-NEXT:    vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
431; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
432; CHECK-NEXT:    retq
433  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
434  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
435  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
436  ret <16 x i16> %res
437}
; Narrowing shuffle: picks 8 of the 32 i16 lanes of %vec with a constant
; shufflevector index list and returns them with no masking.
; The assertions expect vpermt2w on the two 256-bit halves; the extracted upper
; half is the destination table, so the expected index constant is renumbered
; relative to the IR index list.
438define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
439; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
440; CHECK:       # %bb.0:
441; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
442; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
443; CHECK-NEXT:    vpermt2w %ymm0, %ymm2, %ymm1
444; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
445; CHECK-NEXT:    vzeroupper
446; CHECK-NEXT:    retq
447  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
448  ret <8 x i16> %res
449}
; Merge-masked narrowing shuffle: shuffles %vec (32 x i16) down to 8 x i16 with
; a constant index list, then lanes where %mask == 0 take the shuffled element
; and the remaining lanes take %vec2.
; The assertions expect vpermt2w, then a vptestnmw-generated k-mask driving vpblendmw.
450define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
451; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
452; CHECK:       # %bb.0:
453; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
454; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
455; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm4
456; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
457; CHECK-NEXT:    vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
458; CHECK-NEXT:    vzeroupper
459; CHECK-NEXT:    retq
460  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
461  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
462  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
463  ret <8 x i16> %res
464}
465
; Zero-masked narrowing shuffle: shuffles %vec (32 x i16) down to 8 x i16 with
; a constant index list; lanes where %mask == 0 keep the shuffled element, all
; other lanes become zero.
; The assertions expect the zeroing to be folded into vpermt2w via {%k1} {z}.
466define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
467; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
468; CHECK:       # %bb.0:
469; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
470; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
471; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
472; CHECK-NEXT:    vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
473; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
474; CHECK-NEXT:    vzeroupper
475; CHECK-NEXT:    retq
476  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
477  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
478  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
479  ret <8 x i16> %res
480}
; Merge-masked narrowing shuffle (second index pattern): shuffles %vec
; (32 x i16) down to 8 x i16, then lanes where %mask == 0 take the shuffled
; element and the remaining lanes take %vec2.
; The assertions expect vpermt2w, then a vptestnmw-generated k-mask driving vpblendmw.
481define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
482; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
483; CHECK:       # %bb.0:
484; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
485; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
486; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
487; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
488; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
489; CHECK-NEXT:    vzeroupper
490; CHECK-NEXT:    retq
491  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
492  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
493  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
494  ret <8 x i16> %res
495}
496
; Zero-masked narrowing shuffle (second index pattern): shuffles %vec
; (32 x i16) down to 8 x i16; lanes where %mask == 0 keep the shuffled element,
; all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermt2w via {%k1} {z}.
497define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
498; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
499; CHECK:       # %bb.0:
500; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
501; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
502; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
503; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
504; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
505; CHECK-NEXT:    vzeroupper
506; CHECK-NEXT:    retq
507  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
508  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
509  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
510  ret <8 x i16> %res
511}
; Merge-masked narrowing shuffle (third index pattern): shuffles %vec
; (32 x i16) down to 8 x i16, then lanes where %mask == 0 take the shuffled
; element and the remaining lanes take %vec2.
; The assertions expect vpermt2w, then a vptestnmw-generated k-mask driving vpblendmw.
512define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
513; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
514; CHECK:       # %bb.0:
515; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
516; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
517; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
518; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
519; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
520; CHECK-NEXT:    vzeroupper
521; CHECK-NEXT:    retq
522  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
523  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
524  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
525  ret <8 x i16> %res
526}
527
; Zero-masked narrowing shuffle (third index pattern): shuffles %vec
; (32 x i16) down to 8 x i16; lanes where %mask == 0 keep the shuffled element,
; all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermt2w via {%k1} {z}.
528define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
529; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
530; CHECK:       # %bb.0:
531; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
532; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
533; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
534; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
535; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
536; CHECK-NEXT:    vzeroupper
537; CHECK-NEXT:    retq
538  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
539  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
540  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
541  ret <8 x i16> %res
542}
; Narrowing shuffle (fourth index pattern): picks 8 of the 32 i16 lanes of %vec
; with a constant shufflevector index list and returns them unmasked.
; The assertions expect vpermt2w with the lower half as the destination table,
; so the expected index constant equals the IR index list.
543define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
544; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
545; CHECK:       # %bb.0:
546; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
547; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
548; CHECK-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0
549; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
550; CHECK-NEXT:    vzeroupper
551; CHECK-NEXT:    retq
552  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
553  ret <8 x i16> %res
554}
; Merge-masked narrowing shuffle (fourth index pattern): shuffles %vec
; (32 x i16) down to 8 x i16, then lanes where %mask == 0 take the shuffled
; element and the remaining lanes take %vec2.
; The assertions expect vpermt2w, then a vptestnmw-generated k-mask driving vpblendmw.
555define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
556; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
557; CHECK:       # %bb.0:
558; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
559; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
560; CHECK-NEXT:    vpermt2w %ymm4, %ymm3, %ymm0
561; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
562; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
563; CHECK-NEXT:    vzeroupper
564; CHECK-NEXT:    retq
565  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
566  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
567  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
568  ret <8 x i16> %res
569}
570
; Zero-masked narrowing shuffle (fourth index pattern): shuffles %vec
; (32 x i16) down to 8 x i16; lanes where %mask == 0 keep the shuffled element,
; all other lanes become zero.
; The assertions expect the zeroing to be folded into vpermt2w via {%k1} {z}.
571define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
572; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
573; CHECK:       # %bb.0:
574; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
575; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
576; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
577; CHECK-NEXT:    vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
578; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
579; CHECK-NEXT:    vzeroupper
580; CHECK-NEXT:    retq
581  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
582  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
583  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
584  ret <8 x i16> %res
585}
; Narrowing shuffle with a memory source: loads a <32 x i16> from %vp, picks 16
; lanes with a constant index list and returns them with no masking.
; The assertions expect vpermi2w on the two 256-bit halves, with the upper half
; read directly from 32(%rdi).
586define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
587; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
588; CHECK:       # %bb.0:
589; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
590; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
591; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
592; CHECK-NEXT:    retq
593  %vec = load <32 x i16>, <32 x i16>* %vp
594  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
595  ret <16 x i16> %res
596}
; Merge-masked narrowing shuffle with a memory source: loads a <32 x i16> from
; %vp, shuffles it down to 16 x i16, then lanes where %mask == 0 take the
; shuffled element and the remaining lanes take %vec2.
; The assertions expect vpermi2w followed by a k-masked vmovdqu16 into %ymm0.
597define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
598; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
599; CHECK:       # %bb.0:
600; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
601; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
602; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
603; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
604; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
605; CHECK-NEXT:    retq
606  %vec = load <32 x i16>, <32 x i16>* %vp
607  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
608  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
609  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
610  ret <16 x i16> %res
611}
612
; Zero-masked variant with a memory source: same 32-to-16 lane shuffle as the
; unmasked mem_mask0 test; a result lane keeps the shuffled value where the
; matching %mask element equals zero and is zeroed otherwise.
613define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
614; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
615; CHECK:       # %bb.0:
616; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
617; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
618; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
619; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
620; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
621; CHECK-NEXT:    retq
622  %vec = load <32 x i16>, <32 x i16>* %vp
623  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
624  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
625  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
626  ret <16 x i16> %res
627}
628
; Merge-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25, and keeps each shuffled
; lane only where the matching %mask element equals zero (else the %vec2 lane).
629define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
630; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
631; CHECK:       # %bb.0:
632; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
633; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
634; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
635; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
636; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
637; CHECK-NEXT:    retq
638  %vec = load <32 x i16>, <32 x i16>* %vp
639  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
640  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
641  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
642  ret <16 x i16> %res
643}
644
; Zero-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25, and zeroes every result
; lane whose %mask element is non-zero.
645define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
646; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
647; CHECK:       # %bb.0:
648; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
649; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
650; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
651; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
652; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
653; CHECK-NEXT:    retq
654  %vec = load <32 x i16>, <32 x i16>* %vp
655  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
656  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
657  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
658  ret <16 x i16> %res
659}
660
; Merge-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16, and keeps each shuffled
; lane only where the matching %mask element equals zero (else the %vec2 lane).
; The expected assembly reads the upper 32 bytes as the first permute source,
; so each index in its table is the IR index shifted by 16 (sources commuted).
661define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
662; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
663; CHECK:       # %bb.0:
664; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
665; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
666; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm3
667; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
668; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
669; CHECK-NEXT:    retq
670  %vec = load <32 x i16>, <32 x i16>* %vp
671  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
672  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
673  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
674  ret <16 x i16> %res
675}
676
; Zero-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16, and zeroes every result
; lane whose %mask element is non-zero.
; The expected assembly reads the upper 32 bytes as the first permute source,
; so each index in its table is the IR index shifted by 16 (sources commuted).
677define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
678; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
679; CHECK:       # %bb.0:
680; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
681; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
682; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
683; CHECK-NEXT:    vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
684; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
685; CHECK-NEXT:    retq
686  %vec = load <32 x i16>, <32 x i16>* %vp
687  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
688  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
689  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
690  ret <16 x i16> %res
691}
692
; Unmasked variant with a memory source: loads a <32 x i16> vector from %vp and
; returns lanes 3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16 as <16 x i16>.
693define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
694; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
695; CHECK:       # %bb.0:
696; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
697; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
698; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
699; CHECK-NEXT:    retq
700  %vec = load <32 x i16>, <32 x i16>* %vp
701  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
702  ret <16 x i16> %res
703}
; Merge-masked variant with a memory source: same 32-to-16 lane shuffle as the
; unmasked mem_mask3 test, but a result lane takes the shuffled value only where
; the matching %mask element equals zero; other lanes come from %vec2.
704define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
705; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
706; CHECK:       # %bb.0:
707; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
708; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
709; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm3
710; CHECK-NEXT:    vptestnmw %ymm1, %ymm1, %k1
711; CHECK-NEXT:    vmovdqu16 %ymm3, %ymm0 {%k1}
712; CHECK-NEXT:    retq
713  %vec = load <32 x i16>, <32 x i16>* %vp
714  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
715  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
716  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
717  ret <16 x i16> %res
718}
719
; Zero-masked variant with a memory source: same 32-to-16 lane shuffle as the
; unmasked mem_mask3 test; a result lane keeps the shuffled value where the
; matching %mask element equals zero and is zeroed otherwise.
720define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
721; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
722; CHECK:       # %bb.0:
723; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
724; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
725; CHECK-NEXT:    vptestnmw %ymm0, %ymm0, %k1
726; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
727; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
728; CHECK-NEXT:    retq
729  %vec = load <32 x i16>, <32 x i16>* %vp
730  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
731  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
732  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
733  ret <16 x i16> %res
734}
735
; Unmasked variant with a memory source: loads a <32 x i16> vector from %vp and
; returns lanes 0,1,21,17,30,30,29,1 as <8 x i16>.
; The expected assembly uses the upper 32 bytes as the first permute source, so
; each index in its table is the IR index shifted by 16 (sources commuted).
736define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
737; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
738; CHECK:       # %bb.0:
739; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
740; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm0
741; CHECK-NEXT:    vpermt2w (%rdi), %ymm1, %ymm0
742; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
743; CHECK-NEXT:    vzeroupper
744; CHECK-NEXT:    retq
745  %vec = load <32 x i16>, <32 x i16>* %vp
746  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
747  ret <8 x i16> %res
748}
; Merge-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 0,1,21,17,30,30,29,1, and keeps each shuffled lane only where the
; matching %mask element equals zero (else the %vec2 lane).
749define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
750; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
751; CHECK:       # %bb.0:
752; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
753; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm3
754; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm3
755; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
756; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
757; CHECK-NEXT:    vzeroupper
758; CHECK-NEXT:    retq
759  %vec = load <32 x i16>, <32 x i16>* %vp
760  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
761  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
762  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
763  ret <8 x i16> %res
764}
765
; Zero-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 0,1,21,17,30,30,29,1, and zeroes every result lane whose %mask element
; is non-zero.
766define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
767; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
768; CHECK:       # %bb.0:
769; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
770; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
771; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
772; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
773; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
774; CHECK-NEXT:    vzeroupper
775; CHECK-NEXT:    retq
776  %vec = load <32 x i16>, <32 x i16>* %vp
777  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
778  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
779  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
780  ret <8 x i16> %res
781}
782
; Merge-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 23,22,20,22,28,20,11,17, and keeps each shuffled lane only where the
; matching %mask element equals zero (else the %vec2 lane).
; The expected assembly uses the upper 32 bytes as the first permute source, so
; each index in its table is the IR index shifted by 16 (sources commuted).
783define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
784; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
785; CHECK:       # %bb.0:
786; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
787; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm3
788; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm3
789; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
790; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
791; CHECK-NEXT:    vzeroupper
792; CHECK-NEXT:    retq
793  %vec = load <32 x i16>, <32 x i16>* %vp
794  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
795  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
796  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
797  ret <8 x i16> %res
798}
799
; Zero-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 23,22,20,22,28,20,11,17, and zeroes every result lane whose %mask
; element is non-zero.
800define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
801; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
802; CHECK:       # %bb.0:
803; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
804; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
805; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
806; CHECK-NEXT:    vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
807; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
808; CHECK-NEXT:    vzeroupper
809; CHECK-NEXT:    retq
810  %vec = load <32 x i16>, <32 x i16>* %vp
811  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
812  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
813  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
814  ret <8 x i16> %res
815}
816
; Merge-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 6,18,0,4,10,25,22,10, and keeps each shuffled lane only where the
; matching %mask element equals zero (else the %vec2 lane).
817define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
818; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
819; CHECK:       # %bb.0:
820; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
821; CHECK-NEXT:    vmovdqa (%rdi), %ymm3
822; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm2, %ymm3
823; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
824; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
825; CHECK-NEXT:    vzeroupper
826; CHECK-NEXT:    retq
827  %vec = load <32 x i16>, <32 x i16>* %vp
828  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
829  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
830  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
831  ret <8 x i16> %res
832}
833
; Zero-masked variant with a memory source: loads <32 x i16> from %vp, selects
; lanes 6,18,0,4,10,25,22,10, and zeroes every result lane whose %mask element
; is non-zero.
834define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
835; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
836; CHECK:       # %bb.0:
837; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
838; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
839; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
840; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
841; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
842; CHECK-NEXT:    vzeroupper
843; CHECK-NEXT:    retq
844  %vec = load <32 x i16>, <32 x i16>* %vp
845  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
846  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
847  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
848  ret <8 x i16> %res
849}
850
; Unmasked variant with a memory source: loads a <32 x i16> vector from %vp and
; returns lanes 19,1,5,31,9,12,17,9 as <8 x i16>.
851define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
852; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
853; CHECK:       # %bb.0:
854; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
855; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
856; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm1, %ymm0
857; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
858; CHECK-NEXT:    vzeroupper
859; CHECK-NEXT:    retq
860  %vec = load <32 x i16>, <32 x i16>* %vp
861  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
862  ret <8 x i16> %res
863}
; Merge-masked variant with a memory source: same 32-to-8 lane shuffle as the
; unmasked mem_mask3 test, but a result lane takes the shuffled value only where
; the matching %mask element equals zero; other lanes come from %vec2.
864define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
865; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
866; CHECK:       # %bb.0:
867; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
868; CHECK-NEXT:    vmovdqa (%rdi), %ymm3
869; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm2, %ymm3
870; CHECK-NEXT:    vptestnmw %xmm1, %xmm1, %k1
871; CHECK-NEXT:    vmovdqu16 %xmm3, %xmm0 {%k1}
872; CHECK-NEXT:    vzeroupper
873; CHECK-NEXT:    retq
874  %vec = load <32 x i16>, <32 x i16>* %vp
875  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
876  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
877  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
878  ret <8 x i16> %res
879}
880
; Zero-masked variant with a memory source: same 32-to-8 lane shuffle as the
; unmasked mem_mask3 test; a result lane keeps the shuffled value where the
; matching %mask element equals zero and is zeroed otherwise.
881define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
882; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
883; CHECK:       # %bb.0:
884; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
885; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
886; CHECK-NEXT:    vptestnmw %xmm0, %xmm0, %k1
887; CHECK-NEXT:    vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
888; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
889; CHECK-NEXT:    vzeroupper
890; CHECK-NEXT:    retq
891  %vec = load <32 x i16>, <32 x i16>* %vp
892  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
893  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
894  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
895  ret <8 x i16> %res
896}
897
; Unmasked shuffle of a <16 x i16> register argument down to <8 x i16> using
; lanes 14,8,4,12,9,4,14,15; the function-name suffix E84C94EF spells those
; eight indices as hexadecimal digits.
898define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
899; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF:
900; CHECK:       # %bb.0:
901; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [14,8,4,12,9,4,14,15,14,8,4,12,9,4,14,15]
902; CHECK-NEXT:    # ymm1 = mem[0,1,0,1]
903; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
904; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
905; CHECK-NEXT:    vzeroupper
906; CHECK-NEXT:    retq
907  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15>
908  ret <8 x i16> %res
909}
910
; Unmasked shuffle of a <8 x i32> register argument down to <4 x i32> using
; lanes 4,0,3,2.
911define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
912; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
913; CHECK:       # %bb.0:
914; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [4,0,3,2]
915; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
916; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
917; CHECK-NEXT:    vzeroupper
918; CHECK-NEXT:    retq
919  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
920  ret <4 x i32> %res
921}
; Merge-masked variant: selects lanes 4,0,3,2 of the <8 x i32> register argument
; and keeps each shuffled lane only where the matching %mask element equals
; zero; other lanes come from %vec2.
922define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
923; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
924; CHECK:       # %bb.0:
925; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,0,3,2]
926; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
927; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
928; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
929; CHECK-NEXT:    vzeroupper
930; CHECK-NEXT:    retq
931  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
932  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
933  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
934  ret <4 x i32> %res
935}
936
; Zero-masked variant: selects lanes 4,0,3,2 of the <8 x i32> register argument
; and zeroes every result lane whose %mask element is non-zero.
937define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
938; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
939; CHECK:       # %bb.0:
940; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,3,2]
941; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
942; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
943; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
944; CHECK-NEXT:    vzeroupper
945; CHECK-NEXT:    retq
946  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
947  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
948  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
949  ret <4 x i32> %res
950}
; Merge-masked variant: selects lanes 3,0,7,3 of the <8 x i32> register argument
; and keeps each shuffled lane only where the matching %mask element equals
; zero; other lanes come from %vec2.
951define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
952; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
953; CHECK:       # %bb.0:
954; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,0,7,3]
955; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
956; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
957; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
958; CHECK-NEXT:    vzeroupper
959; CHECK-NEXT:    retq
960  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
961  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
962  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
963  ret <4 x i32> %res
964}
965
; Zero-masked variant: selects lanes 3,0,7,3 of the <8 x i32> register argument
; and zeroes every result lane whose %mask element is non-zero.
966define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
967; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
968; CHECK:       # %bb.0:
969; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,0,7,3]
970; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
971; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
972; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
973; CHECK-NEXT:    vzeroupper
974; CHECK-NEXT:    retq
975  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
976  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
977  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
978  ret <4 x i32> %res
979}
; Merge-masked variant: selects lanes 6,7,2,3 of the <8 x i32> register argument
; and keeps each shuffled lane only where the matching %mask element equals
; zero; other lanes come from %vec2.
; Lanes 6,7 and 2,3 are the high 64-bit halves of the upper and lower 128 bits,
; so the expected assembly uses an extract plus vpunpckhqdq rather than a
; variable permute.
980define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
981; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
982; CHECK:       # %bb.0:
983; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
984; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
985; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
986; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
987; CHECK-NEXT:    vzeroupper
988; CHECK-NEXT:    retq
989  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
990  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
991  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
992  ret <4 x i32> %res
993}
994
; Zero-masked variant: selects lanes 6,7,2,3 of the <8 x i32> register argument
; and zeroes every result lane whose %mask element is non-zero.
; The expected assembly builds the shuffle with an extract plus vpunpckhqdq and
; applies the zeroing mask with a separate masked move.
995define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
996; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
997; CHECK:       # %bb.0:
998; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
999; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
1000; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1001; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1002; CHECK-NEXT:    vzeroupper
1003; CHECK-NEXT:    retq
1004  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
1005  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1006  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1007  ret <4 x i32> %res
1008}
; Unmasked shuffle of a <8 x i32> register argument down to <4 x i32> using
; lanes 5,3,2,5.
1009define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
1010; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
1011; CHECK:       # %bb.0:
1012; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [5,3,2,5]
1013; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1014; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1015; CHECK-NEXT:    vzeroupper
1016; CHECK-NEXT:    retq
1017  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1018  ret <4 x i32> %res
1019}
; Merge-masked variant: selects lanes 5,3,2,5 of the <8 x i32> register argument
; and keeps each shuffled lane only where the matching %mask element equals
; zero; other lanes come from %vec2.
1020define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1021; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
1022; CHECK:       # %bb.0:
1023; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,3,2,5]
1024; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
1025; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1026; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1027; CHECK-NEXT:    vzeroupper
1028; CHECK-NEXT:    retq
1029  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1030  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1031  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1032  ret <4 x i32> %res
1033}
1034
; Zero-masked variant: selects lanes 5,3,2,5 of the <8 x i32> register argument
; and zeroes every result lane whose %mask element is non-zero.
1035define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
1036; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
1037; CHECK:       # %bb.0:
1038; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,3,2,5]
1039; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1040; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1041; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1042; CHECK-NEXT:    vzeroupper
1043; CHECK-NEXT:    retq
1044  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1045  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1046  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1047  ret <4 x i32> %res
1048}
; Unmasked variant with a memory source: loads a <8 x i32> vector from %vp and
; returns lanes 7,5,0,0 as <4 x i32>.
; The expected assembly needs no index table: a single vshufps takes lanes 3,1
; of the upper 16 bytes and lane 0 (twice) of the lower 16 bytes.
1049define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
1050; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
1051; CHECK:       # %bb.0:
1052; CHECK-NEXT:    vmovaps 16(%rdi), %xmm0
1053; CHECK-NEXT:    vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0]
1054; CHECK-NEXT:    retq
1055  %vec = load <8 x i32>, <8 x i32>* %vp
1056  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1057  ret <4 x i32> %res
1058}
; Merge-masked variant with a memory source: loads <8 x i32> from %vp, selects
; lanes 7,5,0,0, and keeps each shuffled lane only where the matching %mask
; element equals zero (else the %vec2 lane).
1059define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1060; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
1061; CHECK:       # %bb.0:
1062; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
1063; CHECK-NEXT:    vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0]
1064; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1065; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1}
1066; CHECK-NEXT:    retq
1067  %vec = load <8 x i32>, <8 x i32>* %vp
1068  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1069  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1070  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1071  ret <4 x i32> %res
1072}
1073
; Zero-masked variant with a memory source: loads <8 x i32> from %vp, selects
; lanes 7,5,0,0, and zeroes every result lane whose %mask element is non-zero.
1074define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
1075; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
1076; CHECK:       # %bb.0:
1077; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
1078; CHECK-NEXT:    vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0]
1079; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1080; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1081; CHECK-NEXT:    retq
1082  %vec = load <8 x i32>, <8 x i32>* %vp
1083  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1084  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1085  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1086  ret <4 x i32> %res
1087}
1088
; Merge-masked variant with a memory source: loads <8 x i32> from %vp, selects
; lanes 5,0,0,3, and keeps each shuffled lane only where the matching %mask
; element equals zero (else the %vec2 lane).
1089define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1090; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
1091; CHECK:       # %bb.0:
1092; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1093; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,0,0,3]
1094; CHECK-NEXT:    vpermi2d 16(%rdi), %xmm2, %xmm3
1095; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1096; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1097; CHECK-NEXT:    retq
1098  %vec = load <8 x i32>, <8 x i32>* %vp
1099  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1100  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1101  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1102  ret <4 x i32> %res
1103}
1104
; Zero-masked variant with a memory source: loads <8 x i32> from %vp, selects
; lanes 5,0,0,3, and zeroes every result lane whose %mask element is non-zero.
1105define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
1106; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
1107; CHECK:       # %bb.0:
1108; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1109; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,0,0,3]
1110; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1111; CHECK-NEXT:    vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1112; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1113; CHECK-NEXT:    retq
1114  %vec = load <8 x i32>, <8 x i32>* %vp
1115  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1116  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1117  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1118  ret <4 x i32> %res
1119}
1120
; Merge-masked variant with a memory source: loads <8 x i32> from %vp, selects
; lanes 4,3,3,4, and keeps each shuffled lane only where the matching %mask
; element equals zero (else the %vec2 lane).
; The expected assembly uses the upper 16 bytes as the first permute source, so
; its index table [0,7,7,0] is the IR mask with each index shifted by 4.
1121define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1122; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
1123; CHECK:       # %bb.0:
1124; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1125; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,7,7,0]
1126; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
1127; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1128; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1129; CHECK-NEXT:    retq
1130  %vec = load <8 x i32>, <8 x i32>* %vp
1131  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1132  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1133  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1134  ret <4 x i32> %res
1135}
1136
; Zero-masked variant with a memory source: loads <8 x i32> from %vp, selects
; lanes 4,3,3,4, and zeroes every result lane whose %mask element is non-zero.
; The expected assembly uses the upper 16 bytes as the first permute source, so
; its index table [0,7,7,0] is the IR mask with each index shifted by 4.
1137define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
1138; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
1139; CHECK:       # %bb.0:
1140; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1141; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,7,7,0]
1142; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1143; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1144; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1145; CHECK-NEXT:    retq
1146  %vec = load <8 x i32>, <8 x i32>* %vp
1147  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1148  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1149  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1150  ret <4 x i32> %res
1151}
1152
; Unmasked variant with a memory source: loads a <8 x i32> vector from %vp and
; returns lanes 5,3,2,7 as <4 x i32>.
; The expected assembly broadcasts the 64 bits at offset 8 (source lanes 2,3)
; as the first permute source, which is why IR index 3 appears as 1 in the
; index table [5,1,2,7].
1153define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
1154; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
1155; CHECK:       # %bb.0:
1156; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm1
1157; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,1,2,7]
1158; CHECK-NEXT:    vpermi2d 16(%rdi), %xmm1, %xmm0
1159; CHECK-NEXT:    retq
1160  %vec = load <8 x i32>, <8 x i32>* %vp
1161  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1162  ret <4 x i32> %res
1163}
; Merge-masked variant with a memory source: loads <8 x i32> from %vp, selects
; lanes 5,3,2,7, and keeps each shuffled lane only where the matching %mask
; element equals zero (else the %vec2 lane).
1164define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1165; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
1166; CHECK:       # %bb.0:
1167; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm2
1168; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,1,2,7]
1169; CHECK-NEXT:    vpermi2d 16(%rdi), %xmm2, %xmm3
1170; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1171; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1172; CHECK-NEXT:    retq
1173  %vec = load <8 x i32>, <8 x i32>* %vp
1174  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1175  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1176  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1177  ret <4 x i32> %res
1178}
1179
; Zero-masked variant with a memory source: loads <8 x i32> from %vp, selects
; lanes 5,3,2,7, and zeroes every result lane whose %mask element is non-zero.
1180define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
1181; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
1182; CHECK:       # %bb.0:
1183; CHECK-NEXT:    vpbroadcastq 8(%rdi), %xmm2
1184; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,1,2,7]
1185; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1186; CHECK-NEXT:    vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1187; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1188; CHECK-NEXT:    retq
1189  %vec = load <8 x i32>, <8 x i32>* %vp
1190  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1191  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1192  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1193  ret <4 x i32> %res
1194}
1195
; Unmasked shuffle of a <16 x i32> register argument down to <8 x i32> using
; lanes 1,13,11,14,7,10,1,6.
1196define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
1197; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
1198; CHECK:       # %bb.0:
1199; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6]
1200; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
1201; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0
1202; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1203; CHECK-NEXT:    retq
1204  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1205  ret <8 x i32> %res
1206}
; Merge-masked variant: selects lanes 1,13,11,14,7,10,1,6 of the <16 x i32>
; register argument and keeps each shuffled lane only where the matching %mask
; element equals zero; other lanes come from %vec2.
1207define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1208; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
1209; CHECK:       # %bb.0:
1210; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6]
1211; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
1212; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
1213; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
1214; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1215; CHECK-NEXT:    retq
1216  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1217  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1218  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1219  ret <8 x i32> %res
1220}
1221
; Zero-masked variant: selects lanes 1,13,11,14,7,10,1,6 of the <16 x i32>
; register argument and zeroes every result lane whose %mask element is
; non-zero.
1222define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
1223; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
1224; CHECK:       # %bb.0:
1225; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6]
1226; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1227; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1228; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1229; CHECK-NEXT:    retq
1230  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1231  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1232  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1233  ret <8 x i32> %res
1234}
; Merge-masked 16 x i32 -> 8 x i32 shuffle with indices <3,0,15,3,2,3,6,8>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1235define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1236; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
1237; CHECK:       # %bb.0:
1238; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,15,3,2,3,6,8,3,0,15,3,2,3,6,8]
1239; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
1240; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
1241; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
1242; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1243; CHECK-NEXT:    retq
1244  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1245  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1246  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1247  ret <8 x i32> %res
1248}
1249
; Zero-masked 16 x i32 -> 8 x i32 shuffle with indices <3,0,15,3,2,3,6,8>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1250define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
1251; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
1252; CHECK:       # %bb.0:
1253; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
1254; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1255; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1256; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1257; CHECK-NEXT:    retq
1258  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1259  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1260  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1261  ret <8 x i32> %res
1262}
; Merge-masked 16 x i32 -> 8 x i32 shuffle with indices <2,15,15,2,6,10,14,7>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1263define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1264; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
1265; CHECK:       # %bb.0:
1266; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,15,15,2,6,10,14,7,2,15,15,2,6,10,14,7]
1267; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
1268; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
1269; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
1270; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1271; CHECK-NEXT:    retq
1272  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1273  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1274  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1275  ret <8 x i32> %res
1276}
1277
; Zero-masked 16 x i32 -> 8 x i32 shuffle with indices <2,15,15,2,6,10,14,7>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1278define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
1279; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
1280; CHECK:       # %bb.0:
1281; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
1282; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1283; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1284; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1285; CHECK-NEXT:    retq
1286  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1287  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1288  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1289  ret <8 x i32> %res
1290}
; Unmasked partial permute: selects elements <14,5,7,7,10,3,9,3> of the
; 16 x i32 register input %vec into an 8 x i32 result.
1291define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
1292; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
1293; CHECK:       # %bb.0:
1294; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3]
1295; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
1296; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0
1297; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1298; CHECK-NEXT:    retq
1299  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1300  ret <8 x i32> %res
1301}
; Merge-masked 16 x i32 -> 8 x i32 shuffle with indices <14,5,7,7,10,3,9,3>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1302define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1303; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
1304; CHECK:       # %bb.0:
1305; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3]
1306; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
1307; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
1308; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
1309; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1310; CHECK-NEXT:    retq
1311  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1312  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1313  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1314  ret <8 x i32> %res
1315}
1316
; Zero-masked 16 x i32 -> 8 x i32 shuffle with indices <14,5,7,7,10,3,9,3>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1317define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
1318; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
1319; CHECK:       # %bb.0:
1320; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
1321; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1322; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1323; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1324; CHECK-NEXT:    retq
1325  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1326  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1327  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1328  ret <8 x i32> %res
1329}
; Unmasked partial permute: selects elements <0,2,4,12> of the 16 x i32
; register input %vec into a 4 x i32 result.
; The assertions expect a zmm vpermps whose index constant is loaded at ymm width.
1330define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
1331; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
1332; CHECK:       # %bb.0:
1333; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
1334; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
1335; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1336; CHECK-NEXT:    vzeroupper
1337; CHECK-NEXT:    retq
1338  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1339  ret <4 x i32> %res
1340}
; Merge-masked 16 x i32 -> 4 x i32 shuffle with indices <0,2,4,12>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1341define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1342; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
1343; CHECK:       # %bb.0:
1344; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
1345; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
1346; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1347; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1348; CHECK-NEXT:    vzeroupper
1349; CHECK-NEXT:    retq
1350  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1351  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1352  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1353  ret <4 x i32> %res
1354}
1355
; Zero-masked 16 x i32 -> 4 x i32 shuffle with indices <0,2,4,12>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1356define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
1357; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
1358; CHECK:       # %bb.0:
1359; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12]
1360; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1361; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1362; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1363; CHECK-NEXT:    vzeroupper
1364; CHECK-NEXT:    retq
1365  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1366  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1367  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1368  ret <4 x i32> %res
1369}
; Merge-masked 16 x i32 -> 4 x i32 shuffle with indices <13,9,11,12>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
; Every index is >= 8, and the assertions expect the upper 256 bits to be
; extracted first and permuted at ymm width with rebased indices [5,1,3,4].
1370define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1371; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
1372; CHECK:       # %bb.0:
1373; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,1,3,4]
1374; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1375; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
1376; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1377; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1378; CHECK-NEXT:    vzeroupper
1379; CHECK-NEXT:    retq
1380  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1381  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1382  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1383  ret <4 x i32> %res
1384}
1385
; Zero-masked 16 x i32 -> 4 x i32 shuffle with indices <13,9,11,12>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
; Every index is >= 8, and the assertions expect the upper 256 bits to be
; extracted first and permuted at ymm width with rebased indices [5,1,3,4].
1386define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
1387; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
1388; CHECK:       # %bb.0:
1389; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,1,3,4]
1390; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1391; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1392; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1393; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1394; CHECK-NEXT:    vzeroupper
1395; CHECK-NEXT:    retq
1396  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1397  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1398  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1399  ret <4 x i32> %res
1400}
; Merge-masked 16 x i32 -> 4 x i32 shuffle with indices <1,1,13,0>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1401define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1402; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
1403; CHECK:       # %bb.0:
1404; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,13,0]
1405; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
1406; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1407; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1408; CHECK-NEXT:    vzeroupper
1409; CHECK-NEXT:    retq
1410  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1411  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1412  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1413  ret <4 x i32> %res
1414}
1415
; Zero-masked 16 x i32 -> 4 x i32 shuffle with indices <1,1,13,0>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1416define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
1417; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
1418; CHECK:       # %bb.0:
1419; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,13,0]
1420; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1421; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1422; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1423; CHECK-NEXT:    vzeroupper
1424; CHECK-NEXT:    retq
1425  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1426  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1427  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1428  ret <4 x i32> %res
1429}
; Unmasked partial permute: selects elements <3,0,0,13> of the 16 x i32
; register input %vec into a 4 x i32 result.
1430define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
1431; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
1432; CHECK:       # %bb.0:
1433; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [3,0,0,13]
1434; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
1435; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1436; CHECK-NEXT:    vzeroupper
1437; CHECK-NEXT:    retq
1438  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1439  ret <4 x i32> %res
1440}
; Merge-masked 16 x i32 -> 4 x i32 shuffle with indices <3,0,0,13>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1441define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1442; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
1443; CHECK:       # %bb.0:
1444; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,0,0,13]
1445; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
1446; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
1447; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1448; CHECK-NEXT:    vzeroupper
1449; CHECK-NEXT:    retq
1450  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1451  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1452  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1453  ret <4 x i32> %res
1454}
1455
; Zero-masked 16 x i32 -> 4 x i32 shuffle with indices <3,0,0,13>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1456define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
1457; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
1458; CHECK:       # %bb.0:
1459; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
1460; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1461; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1462; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1463; CHECK-NEXT:    vzeroupper
1464; CHECK-NEXT:    retq
1465  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1466  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1467  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1468  ret <4 x i32> %res
1469}
; Unmasked partial permute of a 16 x i32 vector loaded from %vp, selecting
; elements <15,8,14,8,9,10,12,12> into an 8 x i32 result.
; Every index is >= 8, and the assertions expect the load of the upper half
; (offset 32) to be folded into a ymm vpermps with rebased indices.
1470define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
1471; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
1472; CHECK:       # %bb.0:
1473; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
1474; CHECK-NEXT:    vpermps 32(%rdi), %ymm0, %ymm0
1475; CHECK-NEXT:    retq
1476  %vec = load <16 x i32>, <16 x i32>* %vp
1477  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1478  ret <8 x i32> %res
1479}
; Merge-masked shuffle <15,8,14,8,9,10,12,12> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
; The assertions expect the merge to be folded into a masked ymm vpermd that
; reads the upper half directly from memory.
1480define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1481; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
1482; CHECK:       # %bb.0:
1483; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
1484; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1485; CHECK-NEXT:    vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
1486; CHECK-NEXT:    retq
1487  %vec = load <16 x i32>, <16 x i32>* %vp
1488  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1489  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1490  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1491  ret <8 x i32> %res
1492}
1493
; Zero-masked shuffle <15,8,14,8,9,10,12,12> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1494define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) {
1495; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
1496; CHECK:       # %bb.0:
1497; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
1498; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
1499; CHECK-NEXT:    vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
1500; CHECK-NEXT:    retq
1501  %vec = load <16 x i32>, <16 x i32>* %vp
1502  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1503  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1504  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1505  ret <8 x i32> %res
1506}
1507
; Merge-masked shuffle <15,11,14,3,8,9,13,7> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
; The indices span both 256-bit halves, and the assertions expect a two-source
; ymm vpermi2d followed by a masked move into the result.
1508define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1509; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
1510; CHECK:       # %bb.0:
1511; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1512; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
1513; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
1514; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1515; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
1516; CHECK-NEXT:    retq
1517  %vec = load <16 x i32>, <16 x i32>* %vp
1518  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1519  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1520  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1521  ret <8 x i32> %res
1522}
1523
; Zero-masked shuffle <15,11,14,3,8,9,13,7> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
; The assertions expect the zeroing to be folded into a two-source ymm vpermi2d.
1524define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
1525; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
1526; CHECK:       # %bb.0:
1527; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1528; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
1529; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
1530; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1531; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1532; CHECK-NEXT:    retq
1533  %vec = load <16 x i32>, <16 x i32>* %vp
1534  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1535  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1536  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1537  ret <8 x i32> %res
1538}
1539
; Merge-masked shuffle <12,6,9,13,12,10,0,2> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1540define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1541; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
1542; CHECK:       # %bb.0:
1543; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1544; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
1545; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
1546; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1547; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
1548; CHECK-NEXT:    retq
1549  %vec = load <16 x i32>, <16 x i32>* %vp
1550  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1551  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1552  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1553  ret <8 x i32> %res
1554}
1555
; Zero-masked shuffle <12,6,9,13,12,10,0,2> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1556define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
1557; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
1558; CHECK:       # %bb.0:
1559; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1560; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
1561; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
1562; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1563; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1564; CHECK-NEXT:    retq
1565  %vec = load <16 x i32>, <16 x i32>* %vp
1566  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1567  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1568  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1569  ret <8 x i32> %res
1570}
1571
; Unmasked partial permute of a 16 x i32 vector loaded from %vp, selecting
; elements <8,4,1,13,15,4,6,12> into an 8 x i32 result.
; The assertions expect a two-source ymm vpermi2d over the two 256-bit halves,
; with the index constant left unchanged.
1572define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
1573; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
1574; CHECK:       # %bb.0:
1575; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
1576; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
1577; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm1, %ymm0
1578; CHECK-NEXT:    retq
1579  %vec = load <16 x i32>, <16 x i32>* %vp
1580  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1581  ret <8 x i32> %res
1582}
; Merge-masked shuffle <8,4,1,13,15,4,6,12> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1583define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1584; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
1585; CHECK:       # %bb.0:
1586; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
1587; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
1588; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm3
1589; CHECK-NEXT:    vptestnmd %ymm1, %ymm1, %k1
1590; CHECK-NEXT:    vmovdqa32 %ymm3, %ymm0 {%k1}
1591; CHECK-NEXT:    retq
1592  %vec = load <16 x i32>, <16 x i32>* %vp
1593  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1594  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1595  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1596  ret <8 x i32> %res
1597}
1598
; Zero-masked shuffle <8,4,1,13,15,4,6,12> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1599define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
1600; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
1601; CHECK:       # %bb.0:
1602; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
1603; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
1604; CHECK-NEXT:    vptestnmd %ymm0, %ymm0, %k1
1605; CHECK-NEXT:    vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1606; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1607; CHECK-NEXT:    retq
1608  %vec = load <16 x i32>, <16 x i32>* %vp
1609  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1610  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1611  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1612  ret <8 x i32> %res
1613}
1614
; Unmasked partial permute of a 16 x i32 vector loaded from %vp, selecting
; elements <13,0,0,6> into a 4 x i32 result.
; The assertions expect a two-source ymm vpermt2d over the two 256-bit halves.
1615define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
1616; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
1617; CHECK:       # %bb.0:
1618; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [13,0,0,6]
1619; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
1620; CHECK-NEXT:    vpermt2d 32(%rdi), %ymm1, %ymm0
1621; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1622; CHECK-NEXT:    vzeroupper
1623; CHECK-NEXT:    retq
1624  %vec = load <16 x i32>, <16 x i32>* %vp
1625  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1626  ret <4 x i32> %res
1627}
; Merge-masked shuffle <13,0,0,6> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1628define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1629; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
1630; CHECK:       # %bb.0:
1631; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
1632; CHECK-NEXT:    vmovdqa (%rdi), %ymm3
1633; CHECK-NEXT:    vpermt2d 32(%rdi), %ymm2, %ymm3
1634; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1635; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1636; CHECK-NEXT:    vzeroupper
1637; CHECK-NEXT:    retq
1638  %vec = load <16 x i32>, <16 x i32>* %vp
1639  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1640  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1641  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1642  ret <4 x i32> %res
1643}
1644
; Zero-masked shuffle <13,0,0,6> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1645define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
1646; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
1647; CHECK:       # %bb.0:
1648; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
1649; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
1650; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1651; CHECK-NEXT:    vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1652; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1653; CHECK-NEXT:    vzeroupper
1654; CHECK-NEXT:    retq
1655  %vec = load <16 x i32>, <16 x i32>* %vp
1656  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1657  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1658  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1659  ret <4 x i32> %res
1660}
1661
; Merge-masked shuffle <7,13,11,10> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
; The assertions expect a ymm vpermi2d whose index constant has undefined upper
; lanes, since only the low four result elements are used.
1662define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1663; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
1664; CHECK:       # %bb.0:
1665; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1666; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = <15,5,3,2,u,u,u,u>
1667; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm3
1668; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1669; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1670; CHECK-NEXT:    vzeroupper
1671; CHECK-NEXT:    retq
1672  %vec = load <16 x i32>, <16 x i32>* %vp
1673  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1674  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1675  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1676  ret <4 x i32> %res
1677}
1678
; Zero-masked shuffle <7,13,11,10> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1679define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
1680; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
1681; CHECK:       # %bb.0:
1682; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
1683; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = <15,5,3,2,u,u,u,u>
1684; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1685; CHECK-NEXT:    vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1686; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1687; CHECK-NEXT:    vzeroupper
1688; CHECK-NEXT:    retq
1689  %vec = load <16 x i32>, <16 x i32>* %vp
1690  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1691  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1692  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1693  ret <4 x i32> %res
1694}
1695
; Merge-masked shuffle <2,15,6,9> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1696define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1697; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
1698; CHECK:       # %bb.0:
1699; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
1700; CHECK-NEXT:    vmovdqa (%rdi), %ymm3
1701; CHECK-NEXT:    vpermt2d 32(%rdi), %ymm2, %ymm3
1702; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1703; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1704; CHECK-NEXT:    vzeroupper
1705; CHECK-NEXT:    retq
1706  %vec = load <16 x i32>, <16 x i32>* %vp
1707  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1708  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1709  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1710  ret <4 x i32> %res
1711}
1712
; Zero-masked shuffle <2,15,6,9> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1713define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
1714; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
1715; CHECK:       # %bb.0:
1716; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
1717; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
1718; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1719; CHECK-NEXT:    vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1720; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1721; CHECK-NEXT:    vzeroupper
1722; CHECK-NEXT:    retq
1723  %vec = load <16 x i32>, <16 x i32>* %vp
1724  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1725  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1726  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1727  ret <4 x i32> %res
1728}
1729
; Unmasked partial permute of a 16 x i32 vector loaded from %vp, selecting
; elements <6,0,7,2> into a 4 x i32 result.
; Every index is below 8, and the assertions expect an xmm-wide two-source
; vpermi2d that reads only the first two 128-bit chunks (offsets 0 and 16).
1730define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
1731; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
1732; CHECK:       # %bb.0:
1733; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
1734; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,4,3,6]
1735; CHECK-NEXT:    vpermi2d (%rdi), %xmm1, %xmm0
1736; CHECK-NEXT:    retq
1737  %vec = load <16 x i32>, <16 x i32>* %vp
1738  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1739  ret <4 x i32> %res
1740}
; Merge-masked shuffle <6,0,7,2> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1741define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1742; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
1743; CHECK:       # %bb.0:
1744; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1745; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,4,3,6]
1746; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm3
1747; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
1748; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1}
1749; CHECK-NEXT:    retq
1750  %vec = load <16 x i32>, <16 x i32>* %vp
1751  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1752  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1753  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
1754  ret <4 x i32> %res
1755}
1756
; Zero-masked shuffle <6,0,7,2> of a 16 x i32 vector loaded from %vp.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1757define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
1758; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
1759; CHECK:       # %bb.0:
1760; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1761; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,4,3,6]
1762; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
1763; CHECK-NEXT:    vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1764; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
1765; CHECK-NEXT:    retq
1766  %vec = load <16 x i32>, <16 x i32>* %vp
1767  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1768  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1769  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
1770  ret <4 x i32> %res
1771}
1772
; Unmasked partial permute: selects elements <12,9,4,10> of the 16 x i32
; register input %vec into a 4 x i32 result.
1773define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
1774; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
1775; CHECK:       # %bb.0:
1776; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [12,9,4,10]
1777; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
1778; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1779; CHECK-NEXT:    vzeroupper
1780; CHECK-NEXT:    retq
1781  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
1782  ret <4 x i32> %res
1783}
1784
; Unmasked partial permute: selects elements <2,0> of the 4 x i64 register
; input %vec into a 2 x i64 result.
; The assertions expect an immediate-controlled ymm vpermpd.
1785define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
1786; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
1787; CHECK:       # %bb.0:
1788; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
1789; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1790; CHECK-NEXT:    vzeroupper
1791; CHECK-NEXT:    retq
1792  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1793  ret <2 x i64> %res
1794}
; Merge-masked 4 x i64 -> 2 x i64 shuffle with indices <2,0>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1795define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1796; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
1797; CHECK:       # %bb.0:
1798; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
1799; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
1800; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1801; CHECK-NEXT:    vzeroupper
1802; CHECK-NEXT:    retq
1803  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1804  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1805  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
1806  ret <2 x i64> %res
1807}
1808
; Zero-masked 4 x i64 -> 2 x i64 shuffle with indices <2,0>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1809define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
1810; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
1811; CHECK:       # %bb.0:
1812; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
1813; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
1814; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1815; CHECK-NEXT:    vzeroupper
1816; CHECK-NEXT:    retq
1817  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1818  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1819  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
1820  ret <2 x i64> %res
1821}
; Merge-masked 4 x i64 -> 2 x i64 shuffle with indices <2,1>.
; Lanes where %mask is zero take the shuffled element; all other lanes take %vec2.
1822define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1823; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
1824; CHECK:       # %bb.0:
1825; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
1826; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
1827; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1828; CHECK-NEXT:    vzeroupper
1829; CHECK-NEXT:    retq
1830  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1831  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1832  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
1833  ret <2 x i64> %res
1834}
1835
; Zero-masked 4 x i64 -> 2 x i64 shuffle with indices <2,1>.
; Lanes where %mask is zero take the shuffled element; all other lanes are zero.
1836define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
1837; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
1838; CHECK:       # %bb.0:
1839; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
1840; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
1841; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1842; CHECK-NEXT:    vzeroupper
1843; CHECK-NEXT:    retq
1844  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1845  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1846  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
1847  ret <2 x i64> %res
1848}
; Unmasked partial permute of a 4 x i64 vector loaded from %vp, selecting
; elements <1,3> into a 2 x i64 result.
; The assertions expect an unpack-high of the two 128-bit halves, with the
; second half folded as a memory operand.
1849define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
1850; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
1851; CHECK:       # %bb.0:
1852; CHECK-NEXT:    vmovaps (%rdi), %xmm0
1853; CHECK-NEXT:    vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1]
1854; CHECK-NEXT:    retq
1855  %vec = load <4 x i64>, <4 x i64>* %vp
1856  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1857  ret <2 x i64> %res
1858}
; Merge-masked partial permute from memory: loads a 4 x i64 from %vp and takes
; its elements <1,3>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: merge-masked vpunpckhqdq with the high half as a memory
; operand.
1859define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1860; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
1861; CHECK:       # %bb.0:
1862; CHECK-NEXT:    vmovdqa (%rdi), %xmm2
1863; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
1864; CHECK-NEXT:    vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1]
1865; CHECK-NEXT:    retq
1866  %vec = load <4 x i64>, <4 x i64>* %vp
1867  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1868  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1869  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
1870  ret <2 x i64> %res
1871}
1872
; Zero-masked partial permute from memory: loads a 4 x i64 from %vp and takes
; its elements <1,3>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: zero-masked vpunpckhqdq with the high half as a memory
; operand.
1873define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) {
1874; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
1875; CHECK:       # %bb.0:
1876; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
1877; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
1878; CHECK-NEXT:    vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1]
1879; CHECK-NEXT:    retq
1880  %vec = load <4 x i64>, <4 x i64>* %vp
1881  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1882  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1883  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
1884  ret <2 x i64> %res
1885}
1886
; Merge-masked partial permute from memory: loads a 4 x i64 from %vp and takes
; its elements <2,1>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: vpblendd of the two 128-bit halves, then a merge-masked
; vmovdqa64.
1887define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1888; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
1889; CHECK:       # %bb.0:
1890; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm2
1891; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
1892; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
1893; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
1894; CHECK-NEXT:    retq
1895  %vec = load <4 x i64>, <4 x i64>* %vp
1896  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1897  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1898  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
1899  ret <2 x i64> %res
1900}
1901
; Zero-masked partial permute from memory: loads a 4 x i64 from %vp and takes
; its elements <2,1>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: vpblendd of the two 128-bit halves, then a zero-masked
; vmovdqa64.
1902define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) {
1903; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
1904; CHECK:       # %bb.0:
1905; CHECK-NEXT:    vmovdqa 16(%rdi), %xmm1
1906; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
1907; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
1908; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
1909; CHECK-NEXT:    retq
1910  %vec = load <4 x i64>, <4 x i64>* %vp
1911  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1912  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1913  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
1914  ret <2 x i64> %res
1915}
1916
; Unmasked partial permute: returns elements <6,7,6,5> of the 8 x i64 %vec.
; Every index lies in the upper 256 bits of the source.
; Expected lowering: extract the upper half, then an immediate vpermpd on it.
1917define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
1918; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
1919; CHECK:       # %bb.0:
1920; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
1921; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
1922; CHECK-NEXT:    retq
1923  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1924  ret <4 x i64> %res
1925}
; Merge-masked partial permute: takes elements <6,7,6,5> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: extract the upper half, merge-masked vpermq into the
; passthrough register, then a register copy into ymm0.
1926define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1927; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
1928; CHECK:       # %bb.0:
1929; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1930; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1931; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
1932; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
1933; CHECK-NEXT:    retq
1934  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1935  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1936  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1937  ret <4 x i64> %res
1938}
1939
; Zero-masked partial permute: takes elements <6,7,6,5> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: extract the upper half, then a zero-masked vpermq.
1940define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
1941; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
1942; CHECK:       # %bb.0:
1943; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1944; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1945; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
1946; CHECK-NEXT:    retq
1947  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1948  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1949  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1950  ret <4 x i64> %res
1951}
; Merge-masked partial permute: takes elements <6,4,6,1> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: full-width vpermq driven by a broadcast index vector,
; followed by a masked vpblendmq on the low 256 bits.
1952define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1953; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1954; CHECK:       # %bb.0:
1955; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [6,4,6,1,6,4,6,1]
1956; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
1957; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
1958; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1959; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
1960; CHECK-NEXT:    retq
1961  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1962  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1963  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1964  ret <4 x i64> %res
1965}
1966
; Zero-masked partial permute: takes elements <6,4,6,1> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: zero-masked full-width vpermq with a 256-bit index
; constant; only the low 256 bits of the result are returned.
1967define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
1968; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1969; CHECK:       # %bb.0:
1970; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1]
1971; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
1972; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1973; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1974; CHECK-NEXT:    retq
1975  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1976  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1977  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1978  ret <4 x i64> %res
1979}
; Merge-masked partial permute: takes elements <6,3,6,3> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: full-width vpermq driven by a 128-bit index pattern
; broadcast to 512 bits, followed by a masked vpblendmq.
1980define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1981; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
1982; CHECK:       # %bb.0:
1983; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [6,3,6,3,6,3,6,3]
1984; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1985; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
1986; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
1987; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
1988; CHECK-NEXT:    retq
1989  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
1990  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1991  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1992  ret <4 x i64> %res
1993}
1994
; Zero-masked partial permute: takes elements <6,3,6,3> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: index pattern loaded with vbroadcasti128, then a
; zero-masked full-width vpermq; only the low 256 bits are returned.
1995define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
1996; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
1997; CHECK:       # %bb.0:
1998; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3]
1999; CHECK-NEXT:    # ymm2 = mem[0,1,0,1]
2000; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2001; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2002; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2003; CHECK-NEXT:    retq
2004  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2005  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2006  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2007  ret <4 x i64> %res
2008}
; Unmasked partial permute: returns elements <6,0,0,7> of the 8 x i64 %vec.
; The indices span both 256-bit halves of the source.
; Expected lowering: full-width vpermq driven by a broadcast index vector;
; only the low 256 bits of the result are returned.
2009define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
2010; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
2011; CHECK:       # %bb.0:
2012; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [6,0,0,7,6,0,0,7]
2013; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
2014; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0
2015; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2016; CHECK-NEXT:    retq
2017  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2018  ret <4 x i64> %res
2019}
; Merge-masked partial permute: takes elements <6,0,0,7> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: full-width vpermq driven by a broadcast index vector,
; followed by a masked vpblendmq on the low 256 bits.
2020define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2021; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2022; CHECK:       # %bb.0:
2023; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [6,0,0,7,6,0,0,7]
2024; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
2025; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
2026; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2027; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2028; CHECK-NEXT:    retq
2029  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2030  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2031  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2032  ret <4 x i64> %res
2033}
2034
; Zero-masked partial permute: takes elements <6,0,0,7> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: zero-masked full-width vpermq with a 256-bit index
; constant; only the low 256 bits of the result are returned.
2035define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
2036; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2037; CHECK:       # %bb.0:
2038; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7]
2039; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2040; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2041; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2042; CHECK-NEXT:    retq
2043  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2044  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2045  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2046  ret <4 x i64> %res
2047}
; Merge-masked partial permute: takes elements <3,7,7,5> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: full-width vpermq driven by a broadcast index vector,
; followed by a masked vpblendmq on the low 256 bits.
2048define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2049; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2050; CHECK:       # %bb.0:
2051; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,7,7,5,3,7,7,5]
2052; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
2053; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
2054; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2055; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2056; CHECK-NEXT:    retq
2057  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2058  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2059  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2060  ret <4 x i64> %res
2061}
2062
; Zero-masked partial permute: takes elements <3,7,7,5> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: zero-masked full-width vpermq with a 256-bit index
; constant; only the low 256 bits of the result are returned.
2063define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
2064; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2065; CHECK:       # %bb.0:
2066; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,7,5]
2067; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2068; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2069; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2070; CHECK-NEXT:    retq
2071  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2072  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2073  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2074  ret <4 x i64> %res
2075}
; Merge-masked partial permute: takes elements <4,1,0,6> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: full-width vpermq driven by a broadcast index vector,
; followed by a masked vpblendmq on the low 256 bits.
2076define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2077; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
2078; CHECK:       # %bb.0:
2079; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [4,1,0,6,4,1,0,6]
2080; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
2081; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
2082; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2083; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2084; CHECK-NEXT:    retq
2085  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2086  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2087  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2088  ret <4 x i64> %res
2089}
2090
; Zero-masked partial permute: takes elements <4,1,0,6> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: zero-masked full-width vpermq with a 256-bit index
; constant; only the low 256 bits of the result are returned.
2091define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
2092; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
2093; CHECK:       # %bb.0:
2094; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
2095; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2096; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2097; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2098; CHECK-NEXT:    retq
2099  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2100  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2101  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2102  ret <4 x i64> %res
2103}
; Unmasked partial permute: returns elements <7,6,5,3> of the 8 x i64 %vec.
; The indices span both 256-bit halves of the source.
; Expected lowering: full-width vpermq driven by a broadcast index vector;
; only the low 256 bits of the result are returned.
2104define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
2105; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
2106; CHECK:       # %bb.0:
2107; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [7,6,5,3,7,6,5,3]
2108; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
2109; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0
2110; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2111; CHECK-NEXT:    retq
2112  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2113  ret <4 x i64> %res
2114}
; Merge-masked partial permute: takes elements <7,6,5,3> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: full-width vpermq driven by a broadcast index vector,
; followed by a masked vpblendmq on the low 256 bits.
2115define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2116; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2117; CHECK:       # %bb.0:
2118; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [7,6,5,3,7,6,5,3]
2119; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
2120; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
2121; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2122; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2123; CHECK-NEXT:    retq
2124  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2125  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2126  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2127  ret <4 x i64> %res
2128}
2129
; Zero-masked partial permute: takes elements <7,6,5,3> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: zero-masked full-width vpermq with a 256-bit index
; constant; only the low 256 bits of the result are returned.
2130define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
2131; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2132; CHECK:       # %bb.0:
2133; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,3]
2134; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2135; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2136; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2137; CHECK-NEXT:    retq
2138  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2139  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2140  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2141  ret <4 x i64> %res
2142}
; Merge-masked partial permute: takes elements <2,0,3,4> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: extract 128-bit lane 2 of the source, combine it with the
; low 256 bits via a two-source vpermi2q, then a masked vpblendmq.
2143define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2144; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
2145; CHECK:       # %bb.0:
2146; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
2147; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
2148; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm4
2149; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
2150; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2151; CHECK-NEXT:    retq
2152  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2153  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2154  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2155  ret <4 x i64> %res
2156}
2157
; Zero-masked partial permute: takes elements <2,0,3,4> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: extract 128-bit lane 2 of the source, then a zero-masked
; two-source vpermi2q whose result is copied into ymm0.
2158define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
2159; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
2160; CHECK:       # %bb.0:
2161; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
2162; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
2163; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2164; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
2165; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
2166; CHECK-NEXT:    retq
2167  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2168  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2169  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2170  ret <4 x i64> %res
2171}
; Unmasked partial permute: returns elements <3,0> of the 8 x i64 %vec.
; Both indices lie in the low 256 bits of the source.
; Expected lowering: extract the second 128-bit lane, then vpalignr to
; concatenate its upper qword with the lower qword of the first lane.
2172define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
2173; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
2174; CHECK:       # %bb.0:
2175; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
2176; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
2177; CHECK-NEXT:    vzeroupper
2178; CHECK-NEXT:    retq
2179  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2180  ret <2 x i64> %res
2181}
; Merge-masked partial permute: takes elements <3,0> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: extract the second 128-bit lane, merge-masked valignq
; into the passthrough register, then a register copy into xmm0.
2182define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2183; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
2184; CHECK:       # %bb.0:
2185; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
2186; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
2187; CHECK-NEXT:    valignq {{.*#+}} xmm1 {%k1} = xmm3[1],xmm0[0]
2188; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
2189; CHECK-NEXT:    vzeroupper
2190; CHECK-NEXT:    retq
2191  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2192  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2193  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
2194  ret <2 x i64> %res
2195}
2196
; Zero-masked partial permute: takes elements <3,0> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: extract the second 128-bit lane, then a zero-masked
; valignq.
2197define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
2198; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
2199; CHECK:       # %bb.0:
2200; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
2201; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2202; CHECK-NEXT:    valignq {{.*#+}} xmm0 {%k1} {z} = xmm2[1],xmm0[0]
2203; CHECK-NEXT:    vzeroupper
2204; CHECK-NEXT:    retq
2205  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2206  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2207  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
2208  ret <2 x i64> %res
2209}
; Merge-masked partial permute: takes elements <6,5> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes keep %vec2.
; Expected lowering: extract the upper 256 bits, unmasked vpermq, then a
; masked vpblendmq on the low 128 bits.
2210define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2211; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
2212; CHECK:       # %bb.0:
2213; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
2214; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
2215; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
2216; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2217; CHECK-NEXT:    vzeroupper
2218; CHECK-NEXT:    retq
2219  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2220  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2221  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
2222  ret <2 x i64> %res
2223}
2224
; Zero-masked partial permute: takes elements <6,5> of the 8 x i64 %vec.
; Lanes whose %mask element equals zero receive the shuffled value; the
; remaining lanes are set to zero.
; Expected lowering: extract the upper 256 bits, then a zero-masked vpermq;
; only the low 128 bits of the result are returned.
2225define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
2226; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
2227; CHECK:       # %bb.0:
2228; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
2229; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2230; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
2231; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2232; CHECK-NEXT:    vzeroupper
2233; CHECK-NEXT:    retq
2234  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2235  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2236  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
2237  ret <2 x i64> %res
2238}
; Unmasked partial permute from memory: loads an 8 x i64 from %vp and returns
; its elements <0,2,0,2>. All indices lie in the low 256 bits of the source.
; Expected lowering: a single immediate vpermpd with a folded memory operand.
2239define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
2240; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
2241; CHECK:       # %bb.0:
2242; CHECK-NEXT:    vpermpd $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2]
2243; CHECK-NEXT:    retq
2244  %vec = load <8 x i64>, <8 x i64>* %vp
2245  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2246  ret <4 x i64> %res
2247}
; Merge-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <0,2,0,2>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: a merge-masked immediate vpermq with a folded memory
; operand.
2248define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2249; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
2250; CHECK:       # %bb.0:
2251; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2252; CHECK-NEXT:    vpermq $136, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[0,2,0,2]
2253; CHECK-NEXT:    retq
2254  %vec = load <8 x i64>, <8 x i64>* %vp
2255  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2256  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2257  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2258  ret <4 x i64> %res
2259}
2260
; Zero-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <0,2,0,2>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: a zero-masked immediate vpermq with a folded memory
; operand.
2261define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
2262; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
2263; CHECK:       # %bb.0:
2264; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2265; CHECK-NEXT:    vpermq $136, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[0,2,0,2]
2266; CHECK-NEXT:    retq
2267  %vec = load <8 x i64>, <8 x i64>* %vp
2268  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2269  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2270  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2271  ret <4 x i64> %res
2272}
2273
; Merge-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <0,7,6,0>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: upper half loaded into a register, two-source vpermi2q
; against the lower half in memory, then a merge-masked vmovdqa64.
2274define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2275; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2276; CHECK:       # %bb.0:
2277; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2278; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,4]
2279; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
2280; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2281; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2282; CHECK-NEXT:    retq
2283  %vec = load <8 x i64>, <8 x i64>* %vp
2284  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2285  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2286  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2287  ret <4 x i64> %res
2288}
2289
; Zero-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <0,7,6,0>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: upper half loaded into a register, zero-masked two-source
; vpermi2q against the lower half in memory, then a copy into ymm0.
2290define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
2291; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2292; CHECK:       # %bb.0:
2293; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2294; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
2295; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2296; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2297; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2298; CHECK-NEXT:    retq
2299  %vec = load <8 x i64>, <8 x i64>* %vp
2300  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2301  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2302  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2303  ret <4 x i64> %res
2304}
2305
; Merge-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <7,1,1,5>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: upper half loaded into a register, two-source vpermi2q
; against the lower half in memory, then a merge-masked vmovdqa64.
2306define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2307; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2308; CHECK:       # %bb.0:
2309; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2310; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,5,5,1]
2311; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
2312; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2313; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2314; CHECK-NEXT:    retq
2315  %vec = load <8 x i64>, <8 x i64>* %vp
2316  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2317  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2318  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2319  ret <4 x i64> %res
2320}
2321
; Zero-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <7,1,1,5>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: upper half loaded into a register, zero-masked two-source
; vpermi2q against the lower half in memory, then a copy into ymm0.
2322define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
2323; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2324; CHECK:       # %bb.0:
2325; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2326; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
2327; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2328; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2329; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2330; CHECK-NEXT:    retq
2331  %vec = load <8 x i64>, <8 x i64>* %vp
2332  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2333  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2334  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2335  ret <4 x i64> %res
2336}
2337
; Unmasked partial permute from memory: loads an 8 x i64 from %vp and returns
; its elements <7,0,0,2>.
; Expected lowering: lower half loaded into a register, then a two-source
; vpermi2q against the upper half in memory.
2338define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
2339; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2340; CHECK:       # %bb.0:
2341; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
2342; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [7,0,0,2]
2343; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm1, %ymm0
2344; CHECK-NEXT:    retq
2345  %vec = load <8 x i64>, <8 x i64>* %vp
2346  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2347  ret <4 x i64> %res
2348}
; Merge-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <7,0,0,2>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: lower half loaded into a register, two-source vpermi2q
; against the upper half in memory, then a merge-masked vmovdqa64.
2349define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2350; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2351; CHECK:       # %bb.0:
2352; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
2353; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,0,0,2]
2354; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
2355; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2356; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2357; CHECK-NEXT:    retq
2358  %vec = load <8 x i64>, <8 x i64>* %vp
2359  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2360  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2361  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2362  ret <4 x i64> %res
2363}
2364
; Zero-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <7,0,0,2>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: lower half loaded into a register, zero-masked two-source
; vpermi2q against the upper half in memory, then a copy into ymm0.
2365define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
2366; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2367; CHECK:       # %bb.0:
2368; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
2369; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
2370; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2371; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2372; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2373; CHECK-NEXT:    retq
2374  %vec = load <8 x i64>, <8 x i64>* %vp
2375  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2376  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2377  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2378  ret <4 x i64> %res
2379}
2380
; Merge-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <0,4,6,1>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: lower half loaded into a register, two-source vpermi2q
; against the upper half in memory, then a merge-masked vmovdqa64.
2381define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2382; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
2383; CHECK:       # %bb.0:
2384; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
2385; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,4,6,1]
2386; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
2387; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2388; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2389; CHECK-NEXT:    retq
2390  %vec = load <8 x i64>, <8 x i64>* %vp
2391  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2392  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2393  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2394  ret <4 x i64> %res
2395}
2396
; Zero-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <0,4,6,1>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: lower half loaded into a register, zero-masked two-source
; vpermi2q against the upper half in memory, then a copy into ymm0.
2397define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
2398; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
2399; CHECK:       # %bb.0:
2400; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
2401; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
2402; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2403; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2404; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2405; CHECK-NEXT:    retq
2406  %vec = load <8 x i64>, <8 x i64>* %vp
2407  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2408  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2409  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2410  ret <4 x i64> %res
2411}
2412
; Merge-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <0,2,7,1>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: lower half loaded into a register, two-source vpermi2q
; against the upper half in memory, then a merge-masked vmovdqa64.
2413define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2414; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2415; CHECK:       # %bb.0:
2416; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
2417; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,7,1]
2418; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
2419; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2420; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2421; CHECK-NEXT:    retq
2422  %vec = load <8 x i64>, <8 x i64>* %vp
2423  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2424  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2425  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2426  ret <4 x i64> %res
2427}
2428
; Zero-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <0,2,7,1>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: lower half loaded into a register, zero-masked two-source
; vpermi2q against the upper half in memory, then a copy into ymm0.
2429define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
2430; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2431; CHECK:       # %bb.0:
2432; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
2433; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
2434; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2435; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2436; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2437; CHECK-NEXT:    retq
2438  %vec = load <8 x i64>, <8 x i64>* %vp
2439  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2440  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2441  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2442  ret <4 x i64> %res
2443}
2444
; Unmasked partial permute from memory: loads an 8 x i64 from %vp and returns
; its elements <7,2,3,2>.
; Expected lowering: lower half loaded into a register, then a two-source
; vpermi2q against the upper half in memory.
2445define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
2446; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
2447; CHECK:       # %bb.0:
2448; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
2449; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
2450; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm1, %ymm0
2451; CHECK-NEXT:    retq
2452  %vec = load <8 x i64>, <8 x i64>* %vp
2453  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2454  ret <4 x i64> %res
2455}
; Merge-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <7,2,3,2>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: lower half loaded into a register, two-source vpermi2q
; against the upper half in memory, then a merge-masked vmovdqa64.
2456define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2457; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
2458; CHECK:       # %bb.0:
2459; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
2460; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [7,2,3,2]
2461; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm3
2462; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2463; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2464; CHECK-NEXT:    retq
2465  %vec = load <8 x i64>, <8 x i64>* %vp
2466  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2467  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2468  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2469  ret <4 x i64> %res
2470}
2471
; Zero-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <7,2,3,2>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes are set to zero.
; Expected lowering: lower half loaded into a register, zero-masked two-source
; vpermi2q against the upper half in memory, then a copy into ymm0.
2472define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
2473; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
2474; CHECK:       # %bb.0:
2475; CHECK-NEXT:    vmovdqa (%rdi), %ymm2
2476; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
2477; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2478; CHECK-NEXT:    vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2479; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2480; CHECK-NEXT:    retq
2481  %vec = load <8 x i64>, <8 x i64>* %vp
2482  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2483  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2484  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2485  ret <4 x i64> %res
2486}
2487
; Merge-masked partial permute from memory: loads an 8 x i64 from %vp and takes
; its elements <7,7,5,1>. Lanes whose %mask element equals zero receive the
; shuffled value; the remaining lanes keep %vec2.
; Expected lowering: upper half loaded into a register, two-source vpermi2q
; against the lower half in memory, then a merge-masked vmovdqa64.
2488define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2489; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2490; CHECK:       # %bb.0:
2491; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2492; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,3,1,5]
2493; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm3
2494; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
2495; CHECK-NEXT:    vmovdqa64 %ymm3, %ymm0 {%k1}
2496; CHECK-NEXT:    retq
2497  %vec = load <8 x i64>, <8 x i64>* %vp
2498  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2499  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2500  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
2501  ret <4 x i64> %res
2502}
2503
; Zero-masked extract-shuffle from memory: lanes <7,7,5,1> of the <8 x i64>
; loaded via %vp; result lanes are zeroed wherever the %mask lane is nonzero.
2504define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
2505; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2506; CHECK:       # %bb.0:
2507; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2508; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
2509; CHECK-NEXT:    vptestnmq %ymm0, %ymm0, %k1
2510; CHECK-NEXT:    vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2511; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
2512; CHECK-NEXT:    retq
2513  %vec = load <8 x i64>, <8 x i64>* %vp
2514  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2515  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2516  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
2517  ret <4 x i64> %res
2518}
2519
; Unmasked baseline: loads <8 x i64> from %vp and returns lanes <4,1> as a
; <2 x i64>. The expected code combines the two 16-byte pieces with a blend.
2520define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
2521; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2522; CHECK:       # %bb.0:
2523; CHECK-NEXT:    vmovaps 32(%rdi), %xmm0
2524; CHECK-NEXT:    vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
2525; CHECK-NEXT:    retq
2526  %vec = load <8 x i64>, <8 x i64>* %vp
2527  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2528  ret <2 x i64> %res
2529}
; Merge-masked variant of the <4,1> extract from memory: the shuffled lane is
; used where the %mask lane equals zero, otherwise the lane of %vec2 is kept.
2530define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2531; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2532; CHECK:       # %bb.0:
2533; CHECK-NEXT:    vmovdqa 32(%rdi), %xmm2
2534; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
2535; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2536; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
2537; CHECK-NEXT:    retq
2538  %vec = load <8 x i64>, <8 x i64>* %vp
2539  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2540  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2541  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
2542  ret <2 x i64> %res
2543}
2544
; Zero-masked variant of the <4,1> extract from memory: lanes whose %mask
; element is nonzero are set to zero.
2545define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
2546; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2547; CHECK:       # %bb.0:
2548; CHECK-NEXT:    vmovdqa 32(%rdi), %xmm1
2549; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
2550; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
2551; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2552; CHECK-NEXT:    retq
2553  %vec = load <8 x i64>, <8 x i64>* %vp
2554  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2555  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2556  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
2557  ret <2 x i64> %res
2558}
2559
; Merge-masked extract-shuffle from memory: lanes <6,2> of the <8 x i64>
; loaded via %vp; the shuffled lane is chosen where the %mask lane equals
; zero, otherwise the lane of %vec2. The expected code works on 256-bit
; registers, hence the trailing vzeroupper.
2560define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2561; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
2562; CHECK:       # %bb.0:
2563; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm2
2564; CHECK-NEXT:    vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2565; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
2566; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
2567; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
2568; CHECK-NEXT:    vzeroupper
2569; CHECK-NEXT:    retq
2570  %vec = load <8 x i64>, <8 x i64>* %vp
2571  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2572  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2573  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
2574  ret <2 x i64> %res
2575}
2576
; Zero-masked extract-shuffle from memory: lanes <6,2> of the <8 x i64>
; loaded via %vp; lanes whose %mask element is nonzero are set to zero.
2577define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
2578; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
2579; CHECK:       # %bb.0:
2580; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
2581; CHECK-NEXT:    vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2582; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
2583; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
2584; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2585; CHECK-NEXT:    vzeroupper
2586; CHECK-NEXT:    retq
2587  %vec = load <8 x i64>, <8 x i64>* %vp
2588  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2589  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2590  %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
2591  ret <2 x i64> %res
2592}
2593
; Unmasked baseline: returns lanes <0,3,4,5> of the <8 x float> argument as a
; <4 x float>. The expected code extracts the upper 128 bits and uses one
; two-source vshufps.
2594define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
2595; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
2596; CHECK:       # %bb.0:
2597; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
2598; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
2599; CHECK-NEXT:    vzeroupper
2600; CHECK-NEXT:    retq
2601  %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2602  ret <4 x float> %res
2603}
; Merge-masked variant of the <0,3,4,5> extract: the shuffled lane is used
; where the %mask lane compares ordered-equal to zero, otherwise the lane of
; %vec2 is kept. The expected code folds the select into a masked vshufps.
2604define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2605; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
2606; CHECK:       # %bb.0:
2607; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
2608; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
2609; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
2610; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
2611; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2612; CHECK-NEXT:    vzeroupper
2613; CHECK-NEXT:    retq
2614  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2615  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2616  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2617  ret <4 x float> %res
2618}
2619
; Zero-masked variant of the <0,3,4,5> extract: lanes whose %mask element
; does not compare ordered-equal to zero are set to zero.
2620define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
2621; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
2622; CHECK:       # %bb.0:
2623; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
2624; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2625; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
2626; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
2627; CHECK-NEXT:    vzeroupper
2628; CHECK-NEXT:    retq
2629  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2630  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2631  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2632  ret <4 x float> %res
2633}
; Merge-masked extract-shuffle: lanes <1,3,5,0> of the <8 x float> argument;
; the shuffled lane is used where the %mask lane compares ordered-equal to
; zero, otherwise the lane of %vec2 is kept.
2634define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2635; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
2636; CHECK:       # %bb.0:
2637; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [1,3,5,0]
2638; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm0
2639; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2640; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
2641; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2642; CHECK-NEXT:    vzeroupper
2643; CHECK-NEXT:    retq
2644  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2645  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2646  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2647  ret <4 x float> %res
2648}
2649
; Zero-masked extract-shuffle: lanes <1,3,5,0> of the <8 x float> argument;
; lanes whose %mask element does not compare ordered-equal to zero become
; zero. The expected code folds the select into a zero-masked vpermps.
2650define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
2651; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
2652; CHECK:       # %bb.0:
2653; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [1,3,5,0]
2654; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2655; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
2656; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2657; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2658; CHECK-NEXT:    vzeroupper
2659; CHECK-NEXT:    retq
2660  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2661  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2662  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2663  ret <4 x float> %res
2664}
; Merge-masked extract-shuffle: lanes <3,2,7,0> of the <8 x float> argument;
; the shuffled lane is used where the %mask lane compares ordered-equal to
; zero, otherwise the lane of %vec2 is kept.
2665define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2666; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
2667; CHECK:       # %bb.0:
2668; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [3,2,7,0]
2669; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm0
2670; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2671; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
2672; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2673; CHECK-NEXT:    vzeroupper
2674; CHECK-NEXT:    retq
2675  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2676  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2677  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2678  ret <4 x float> %res
2679}
2680
; Zero-masked extract-shuffle: lanes <3,2,7,0> of the <8 x float> argument;
; lanes whose %mask element does not compare ordered-equal to zero become
; zero.
2681define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
2682; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
2683; CHECK:       # %bb.0:
2684; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [3,2,7,0]
2685; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2686; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
2687; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2688; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2689; CHECK-NEXT:    vzeroupper
2690; CHECK-NEXT:    retq
2691  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2692  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2693  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2694  ret <4 x float> %res
2695}
; Unmasked baseline: returns lanes <3,3,5,2> of the <8 x float> argument as a
; <4 x float>. The expected code is a single variable vpermps on the full
; 256-bit register, of which only the low 128 bits are returned.
2696define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
2697; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
2698; CHECK:       # %bb.0:
2699; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [3,3,5,2]
2700; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2701; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2702; CHECK-NEXT:    vzeroupper
2703; CHECK-NEXT:    retq
2704  %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2705  ret <4 x float> %res
2706}
; Merge-masked variant of the <3,3,5,2> extract: the shuffled lane is used
; where the %mask lane compares ordered-equal to zero, otherwise the lane of
; %vec2 is kept.
2707define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2708; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
2709; CHECK:       # %bb.0:
2710; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [3,3,5,2]
2711; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm0
2712; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2713; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
2714; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2715; CHECK-NEXT:    vzeroupper
2716; CHECK-NEXT:    retq
2717  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2718  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2719  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2720  ret <4 x float> %res
2721}
2722
; Zero-masked variant of the <3,3,5,2> extract: lanes whose %mask element
; does not compare ordered-equal to zero become zero.
2723define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
2724; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
2725; CHECK:       # %bb.0:
2726; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [3,3,5,2]
2727; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2728; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
2729; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2730; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2731; CHECK-NEXT:    vzeroupper
2732; CHECK-NEXT:    retq
2733  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2734  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2735  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2736  ret <4 x float> %res
2737}
; Unmasked baseline from memory: loads <8 x float> from %vp and returns lanes
; <6,2,4,5> as a <4 x float>. The expected code loads the upper 16 bytes into
; a register and permutes against the lower 16 bytes directly from memory, so
; the index constant is expressed relative to that operand order.
2738define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
2739; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
2740; CHECK:       # %bb.0:
2741; CHECK-NEXT:    vmovaps 16(%rdi), %xmm1
2742; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [2,6,0,1]
2743; CHECK-NEXT:    vpermi2ps (%rdi), %xmm1, %xmm0
2744; CHECK-NEXT:    retq
2745  %vec = load <8 x float>, <8 x float>* %vp
2746  %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2747  ret <4 x float> %res
2748}
; Merge-masked variant of the <6,2,4,5> extract from memory: the shuffled
; lane is used where the %mask lane compares ordered-equal to zero, otherwise
; the lane of %vec2 is kept.
2749define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2750; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
2751; CHECK:       # %bb.0:
2752; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
2753; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [2,6,0,1]
2754; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
2755; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2756; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
2757; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
2758; CHECK-NEXT:    retq
2759  %vec = load <8 x float>, <8 x float>* %vp
2760  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2761  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2762  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2763  ret <4 x float> %res
2764}
2765
; Zero-masked variant of the <6,2,4,5> extract from memory: lanes whose %mask
; element does not compare ordered-equal to zero become zero.
2766define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
2767; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
2768; CHECK:       # %bb.0:
2769; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
2770; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [2,6,0,1]
2771; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2772; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
2773; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2774; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2775; CHECK-NEXT:    retq
2776  %vec = load <8 x float>, <8 x float>* %vp
2777  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2778  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2779  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2780  ret <4 x float> %res
2781}
2782
; Merge-masked extract-shuffle from memory: lanes <6,3,3,6> of the
; <8 x float> loaded via %vp; the shuffled lane is used where the %mask lane
; compares ordered-equal to zero, otherwise the lane of %vec2 is kept.
2783define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2784; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
2785; CHECK:       # %bb.0:
2786; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
2787; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [2,7,7,2]
2788; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm3
2789; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2790; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
2791; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
2792; CHECK-NEXT:    retq
2793  %vec = load <8 x float>, <8 x float>* %vp
2794  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2795  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2796  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2797  ret <4 x float> %res
2798}
2799
; Zero-masked extract-shuffle from memory: lanes <6,3,3,6> of the <8 x float>
; loaded via %vp; lanes whose %mask element does not compare ordered-equal to
; zero become zero.
2800define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
2801; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
2802; CHECK:       # %bb.0:
2803; CHECK-NEXT:    vmovaps 16(%rdi), %xmm2
2804; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [2,7,7,2]
2805; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2806; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
2807; CHECK-NEXT:    vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2808; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2809; CHECK-NEXT:    retq
2810  %vec = load <8 x float>, <8 x float>* %vp
2811  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2812  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2813  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2814  ret <4 x float> %res
2815}
2816
; Merge-masked extract-shuffle from memory: lanes <3,1,3,7> of the
; <8 x float> loaded via %vp; the shuffled lane is used where the %mask lane
; compares ordered-equal to zero, otherwise the lane of %vec2 is kept.
2817define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2818; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
2819; CHECK:       # %bb.0:
2820; CHECK-NEXT:    vmovaps (%rdi), %xmm2
2821; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [3,1,3,7]
2822; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm3
2823; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2824; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
2825; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
2826; CHECK-NEXT:    retq
2827  %vec = load <8 x float>, <8 x float>* %vp
2828  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
2829  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2830  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2831  ret <4 x float> %res
2832}
2833
; Zero-masked extract-shuffle from memory: lanes <3,1,3,7> of the <8 x float>
; loaded via %vp; lanes whose %mask element does not compare ordered-equal to
; zero become zero.
2834define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
2835; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
2836; CHECK:       # %bb.0:
2837; CHECK-NEXT:    vmovaps (%rdi), %xmm2
2838; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [3,1,3,7]
2839; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2840; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
2841; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
2842; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2843; CHECK-NEXT:    retq
2844  %vec = load <8 x float>, <8 x float>* %vp
2845  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
2846  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2847  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2848  ret <4 x float> %res
2849}
2850
; Unmasked baseline from memory: loads <8 x float> from %vp and returns lanes
; <1,3,5,3> as a <4 x float>. The expected code is a two-source vpermi2ps
; over the two 16-byte halves of the load.
2851define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
2852; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
2853; CHECK:       # %bb.0:
2854; CHECK-NEXT:    vmovaps (%rdi), %xmm1
2855; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [1,3,5,3]
2856; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm1, %xmm0
2857; CHECK-NEXT:    retq
2858  %vec = load <8 x float>, <8 x float>* %vp
2859  %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2860  ret <4 x float> %res
2861}
; Merge-masked variant of the <1,3,5,3> extract from memory: the shuffled
; lane is used where the %mask lane compares ordered-equal to zero, otherwise
; the lane of %vec2 is kept.
2862define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2863; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
2864; CHECK:       # %bb.0:
2865; CHECK-NEXT:    vmovaps (%rdi), %xmm2
2866; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [1,3,5,3]
2867; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm3
2868; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
2869; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
2870; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
2871; CHECK-NEXT:    retq
2872  %vec = load <8 x float>, <8 x float>* %vp
2873  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2874  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2875  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2876  ret <4 x float> %res
2877}
2878
; Zero-masked variant of the <1,3,5,3> extract from memory: lanes whose %mask
; element does not compare ordered-equal to zero become zero.
2879define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
2880; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
2881; CHECK:       # %bb.0:
2882; CHECK-NEXT:    vmovaps (%rdi), %xmm2
2883; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [1,3,5,3]
2884; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2885; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
2886; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
2887; CHECK-NEXT:    vmovaps %xmm1, %xmm0
2888; CHECK-NEXT:    retq
2889  %vec = load <8 x float>, <8 x float>* %vp
2890  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2891  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2892  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2893  ret <4 x float> %res
2894}
2895
; Unmasked baseline: returns lanes <0,4,12,10,8,2,11,7> of the <16 x float>
; argument as an <8 x float>. The assertions below currently expect the index
; vector to be broadcast to 512 bits and an integer-domain vpermd to be used.
2896define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
2897; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
2898; CHECK:       # %bb.0:
2899; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7]
2900; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
2901; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0
2902; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2903; CHECK-NEXT:    retq
2904  %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2905  ret <8 x float> %res
2906}
; Merge-masked variant of the <0,4,12,10,8,2,11,7> extract: the shuffled lane
; is used where the %mask lane compares ordered-equal to zero, otherwise the
; lane of %vec2 is kept. The expected code permutes first and blends after.
2907define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
2908; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
2909; CHECK:       # %bb.0:
2910; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7]
2911; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
2912; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
2913; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2914; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
2915; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
2916; CHECK-NEXT:    retq
2917  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2918  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2919  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2920  ret <8 x float> %res
2921}
2922
; Zero-masked variant of the <0,4,12,10,8,2,11,7> extract: lanes whose %mask
; element does not compare ordered-equal to zero become zero. The expected
; code folds the select into a zero-masked vpermps.
2923define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
2924; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
2925; CHECK:       # %bb.0:
2926; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
2927; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2928; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
2929; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2930; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2931; CHECK-NEXT:    retq
2932  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2933  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2934  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2935  ret <8 x float> %res
2936}
; Merge-masked extract-shuffle: lanes <10,12,3,12,4,15,1,14> of the
; <16 x float> argument; the shuffled lane is used where the %mask lane
; compares ordered-equal to zero, otherwise the lane of %vec2 is kept.
2937define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
2938; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
2939; CHECK:       # %bb.0:
2940; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [10,12,3,12,4,15,1,14,10,12,3,12,4,15,1,14]
2941; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
2942; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
2943; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2944; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
2945; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
2946; CHECK-NEXT:    retq
2947  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
2948  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2949  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2950  ret <8 x float> %res
2951}
2952
; Zero-masked extract-shuffle: lanes <10,12,3,12,4,15,1,14> of the
; <16 x float> argument; lanes whose %mask element does not compare
; ordered-equal to zero become zero.
2953define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
2954; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
2955; CHECK:       # %bb.0:
2956; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14]
2957; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2958; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
2959; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2960; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2961; CHECK-NEXT:    retq
2962  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
2963  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2964  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2965  ret <8 x float> %res
2966}
; Merge-masked extract-shuffle: lanes <0,4,8,9,6,1,4,4> of the <16 x float>
; argument; the shuffled lane is used where the %mask lane compares
; ordered-equal to zero, otherwise the lane of %vec2 is kept.
2967define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
2968; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
2969; CHECK:       # %bb.0:
2970; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4]
2971; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
2972; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2973; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
2974; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
2975; CHECK-NEXT:    retq
2976  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
2977  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2978  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2979  ret <8 x float> %res
2980}
2981
; Zero-masked extract-shuffle: lanes <0,4,8,9,6,1,4,4> of the <16 x float>
; argument; lanes whose %mask element does not compare ordered-equal to zero
; become zero.
2982define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
2983; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
2984; CHECK:       # %bb.0:
2985; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4]
2986; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
2987; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
2988; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2989; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2990; CHECK-NEXT:    retq
2991  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
2992  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2993  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2994  ret <8 x float> %res
2995}
; Unmasked baseline: returns lanes <12,14,9,0,12,4,5,8> of the <16 x float>
; argument as an <8 x float>. The assertions below currently expect a
; broadcast index vector and an integer-domain vpermd.
2996define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
2997; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
2998; CHECK:       # %bb.0:
2999; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8]
3000; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
3001; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0
3002; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3003; CHECK-NEXT:    retq
3004  %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3005  ret <8 x float> %res
3006}
; Merge-masked variant of the <12,14,9,0,12,4,5,8> extract: the shuffled lane
; is used where the %mask lane compares ordered-equal to zero, otherwise the
; lane of %vec2 is kept.
3007define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3008; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
3009; CHECK:       # %bb.0:
3010; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8]
3011; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
3012; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
3013; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3014; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
3015; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3016; CHECK-NEXT:    retq
3017  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3018  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3019  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3020  ret <8 x float> %res
3021}
3022
; Zero-masked variant of the <12,14,9,0,12,4,5,8> extract: lanes whose %mask
; element does not compare ordered-equal to zero become zero.
3023define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
3024; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
3025; CHECK:       # %bb.0:
3026; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8]
3027; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3028; CHECK-NEXT:    vcmpeqps %ymm3, %ymm1, %k1
3029; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3030; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3031; CHECK-NEXT:    retq
3032  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3033  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3034  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3035  ret <8 x float> %res
3036}
; Unmasked baseline: returns lanes <4,8,9,10> of the <16 x float> argument as
; a <4 x float>. The expected code is a single 512-bit vpermps whose low 128
; bits are returned.
3037define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
3038; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
3039; CHECK:       # %bb.0:
3040; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [4,8,9,10]
3041; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
3042; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3043; CHECK-NEXT:    vzeroupper
3044; CHECK-NEXT:    retq
3045  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3046  ret <4 x float> %res
3047}
; Merge-masked variant of the <4,8,9,10> extract: the shuffled lane is used
; where the %mask lane compares ordered-equal to zero, otherwise the lane of
; %vec2 is kept.
3048define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3049; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
3050; CHECK:       # %bb.0:
3051; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [4,8,9,10]
3052; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
3053; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3054; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
3055; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3056; CHECK-NEXT:    vzeroupper
3057; CHECK-NEXT:    retq
3058  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3059  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3060  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3061  ret <4 x float> %res
3062}
3063
; Zero-masked variant of the <4,8,9,10> extract: lanes whose %mask element
; does not compare ordered-equal to zero become zero.
3064define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
3065; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
3066; CHECK:       # %bb.0:
3067; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [4,8,9,10]
3068; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3069; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3070; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3071; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3072; CHECK-NEXT:    vzeroupper
3073; CHECK-NEXT:    retq
3074  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3075  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3076  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3077  ret <4 x float> %res
3078}
; Merge-masked extract-shuffle: lanes <8,6,10,6> of the <16 x float>
; argument; the shuffled lane is used where the %mask lane compares
; ordered-equal to zero, otherwise the lane of %vec2 is kept.
3079define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3080; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3081; CHECK:       # %bb.0:
3082; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [8,6,10,6]
3083; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
3084; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3085; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
3086; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3087; CHECK-NEXT:    vzeroupper
3088; CHECK-NEXT:    retq
3089  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3090  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3091  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3092  ret <4 x float> %res
3093}
3094
; Zero-masked extract-shuffle: lanes <8,6,10,6> of the <16 x float> argument;
; lanes whose %mask element does not compare ordered-equal to zero become
; zero.
3095define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3096; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3097; CHECK:       # %bb.0:
3098; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [8,6,10,6]
3099; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3100; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3101; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3102; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3103; CHECK-NEXT:    vzeroupper
3104; CHECK-NEXT:    retq
3105  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3106  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3107  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3108  ret <4 x float> %res
3109}
; Merge-masked extract-shuffle: takes elements <12,12,4,5> of the 16 x float
; %vec. Lanes where %mask compares oeq to 0.0 get the shuffled value; the
; other lanes keep %vec2.
; Expected asm: vextractf64x4 + vshufps build the data, and the mask is applied
; on the final vextractf32x4 into the %vec2 register.
3110define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3111; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
3112; CHECK:       # %bb.0:
3113; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
3114; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
3115; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3116; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
3117; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm1 {%k1}
3118; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3119; CHECK-NEXT:    vzeroupper
3120; CHECK-NEXT:    retq
3121  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3122  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3123  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3124  ret <4 x float> %res
3125}
3126
; Zero-masked extract-shuffle: takes elements <12,12,4,5> of the 16 x float
; %vec. Lanes where %mask compares oeq to 0.0 get the shuffled value; the
; other lanes are zero.
; Expected asm: vextractf64x4 + vshufps, then a zero-masked vextractf32x4.
3127define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
3128; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
3129; CHECK:       # %bb.0:
3130; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
3131; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
3132; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3133; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3134; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
3135; CHECK-NEXT:    vzeroupper
3136; CHECK-NEXT:    retq
3137  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3138  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3139  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3140  ret <4 x float> %res
3141}
; Unmasked extract-shuffle: returns elements <10,2,11,6> of the 16 x float
; %vec as a 4 x float.
; Expected asm: one full-width vpermps with a constant index vector.
3142define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
3143; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
3144; CHECK:       # %bb.0:
3145; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [10,2,11,6]
3146; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
3147; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3148; CHECK-NEXT:    vzeroupper
3149; CHECK-NEXT:    retq
3150  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3151  ret <4 x float> %res
3152}
; Merge-masked extract-shuffle: takes elements <10,2,11,6> of the 16 x float
; %vec. Lanes where %mask compares oeq to 0.0 get the shuffled value; the
; other lanes keep %vec2.
; Expected asm: full-width vpermps, then a %k1-controlled vblendmps on xmm.
3153define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3154; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3155; CHECK:       # %bb.0:
3156; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [10,2,11,6]
3157; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
3158; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3159; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
3160; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3161; CHECK-NEXT:    vzeroupper
3162; CHECK-NEXT:    retq
3163  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3164  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3165  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3166  ret <4 x float> %res
3167}
3168
; Zero-masked extract-shuffle: takes elements <10,2,11,6> of the 16 x float
; %vec. Lanes where %mask compares oeq to 0.0 get the shuffled value; the
; other lanes are zero.
; Expected asm: the mask is folded into a single vpermps {%k1} {z}.
3169define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3170; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3171; CHECK:       # %bb.0:
3172; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [10,2,11,6]
3173; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3174; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1
3175; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3176; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3177; CHECK-NEXT:    vzeroupper
3178; CHECK-NEXT:    retq
3179  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3180  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3181  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3182  ret <4 x float> %res
3183}
; Unmasked extract-shuffle from memory: loads a 16 x float from %vp and returns
; its elements <7,6,7,11,5,10,0,4> as an 8 x float.
; Expected asm: the low 256 bits are loaded into a register and vpermi2ps takes
; the high 256 bits (32(%rdi)) as its memory operand.
3184define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
3185; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
3186; CHECK:       # %bb.0:
3187; CHECK-NEXT:    vmovaps (%rdi), %ymm1
3188; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
3189; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm1, %ymm0
3190; CHECK-NEXT:    retq
3191  %vec = load <16 x float>, <16 x float>* %vp
3192  %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3193  ret <8 x float> %res
3194}
; Merge-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <7,6,7,11,5,10,0,4>. Lanes where %mask compares oeq to 0.0
; get the shuffled value; the other lanes keep %vec2.
; Expected asm: unmasked vpermi2ps, then a %k1-masked vmovaps into %vec2's register.
3195define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3196; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
3197; CHECK:       # %bb.0:
3198; CHECK-NEXT:    vmovaps (%rdi), %ymm2
3199; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
3200; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm3
3201; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3202; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
3203; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
3204; CHECK-NEXT:    retq
3205  %vec = load <16 x float>, <16 x float>* %vp
3206  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3207  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3208  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3209  ret <8 x float> %res
3210}
3211
; Zero-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <7,6,7,11,5,10,0,4>. Lanes where %mask compares oeq to 0.0
; get the shuffled value; the other lanes are zero.
; Expected asm: the mask is folded into vpermi2ps {%k1} {z}, followed by a
; register copy into ymm0.
3212define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) {
3213; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
3214; CHECK:       # %bb.0:
3215; CHECK-NEXT:    vmovaps (%rdi), %ymm2
3216; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
3217; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3218; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
3219; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3220; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3221; CHECK-NEXT:    retq
3222  %vec = load <16 x float>, <16 x float>* %vp
3223  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3224  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3225  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3226  ret <8 x float> %res
3227}
3228
; Merge-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <11,0,9,0,7,14,0,8>. Lanes where %mask compares oeq to 0.0
; get the shuffled value; the other lanes keep %vec2.
; Expected asm: unmasked vpermi2ps, then a %k1-masked vmovaps into %vec2's register.
3229define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3230; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
3231; CHECK:       # %bb.0:
3232; CHECK-NEXT:    vmovaps (%rdi), %ymm2
3233; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
3234; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm3
3235; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3236; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
3237; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
3238; CHECK-NEXT:    retq
3239  %vec = load <16 x float>, <16 x float>* %vp
3240  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3241  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3242  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3243  ret <8 x float> %res
3244}
3245
; Zero-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <11,0,9,0,7,14,0,8>. Lanes where %mask compares oeq to 0.0
; get the shuffled value; the other lanes are zero.
; Expected asm: the mask is folded into vpermi2ps {%k1} {z}, followed by a
; register copy into ymm0.
3246define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) {
3247; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
3248; CHECK:       # %bb.0:
3249; CHECK-NEXT:    vmovaps (%rdi), %ymm2
3250; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
3251; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3252; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
3253; CHECK-NEXT:    vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3254; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3255; CHECK-NEXT:    retq
3256  %vec = load <16 x float>, <16 x float>* %vp
3257  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3258  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3259  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3260  ret <8 x float> %res
3261}
3262
; Merge-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <1,13,10,11,10,0,0,9>. Lanes where %mask compares oeq to 0.0
; get the shuffled value; the other lanes keep %vec2.
; Expected asm: the high half (32(%rdi)) is the register source and the low
; half the memory source, so the index constant [9,5,2,3,2,8,8,1] is the IR
; mask renumbered for that swapped operand order.
3263define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3264; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3265; CHECK:       # %bb.0:
3266; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3267; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
3268; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
3269; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3270; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
3271; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
3272; CHECK-NEXT:    retq
3273  %vec = load <16 x float>, <16 x float>* %vp
3274  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3275  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3276  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3277  ret <8 x float> %res
3278}
3279
; Zero-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <1,13,10,11,10,0,0,9>. Lanes where %mask compares oeq to 0.0
; get the shuffled value; the other lanes are zero.
; Expected asm: vpermi2ps {%k1} {z} with the high half in a register and the
; low half as memory operand (indices renumbered accordingly).
3280define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) {
3281; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3282; CHECK:       # %bb.0:
3283; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3284; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3285; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3286; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
3287; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3288; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3289; CHECK-NEXT:    retq
3290  %vec = load <16 x float>, <16 x float>* %vp
3291  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3292  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3293  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3294  ret <8 x float> %res
3295}
3296
; Unmasked extract-shuffle from memory: loads a 16 x float from %vp and returns
; its elements <15,13,11,11,3,12,4,1> as an 8 x float.
; Expected asm: vpermi2ps with the high half in a register and the low half as
; memory operand; the index constant [7,5,3,3,11,4,12,9] is the IR mask
; renumbered for that operand order.
3297define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
3298; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
3299; CHECK:       # %bb.0:
3300; CHECK-NEXT:    vmovaps 32(%rdi), %ymm1
3301; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
3302; CHECK-NEXT:    vpermi2ps (%rdi), %ymm1, %ymm0
3303; CHECK-NEXT:    retq
3304  %vec = load <16 x float>, <16 x float>* %vp
3305  %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3306  ret <8 x float> %res
3307}
; Merge-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <15,13,11,11,3,12,4,1>. Lanes where %mask compares oeq to 0.0
; get the shuffled value; the other lanes keep %vec2.
; Expected asm: unmasked vpermi2ps, then a %k1-masked vmovaps into %vec2's register.
3308define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3309; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
3310; CHECK:       # %bb.0:
3311; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3312; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
3313; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
3314; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3315; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
3316; CHECK-NEXT:    vmovaps %ymm3, %ymm0 {%k1}
3317; CHECK-NEXT:    retq
3318  %vec = load <16 x float>, <16 x float>* %vp
3319  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3320  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3321  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3322  ret <8 x float> %res
3323}
3324
; Zero-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <15,13,11,11,3,12,4,1>. Lanes where %mask compares oeq to 0.0
; get the shuffled value; the other lanes are zero.
; Expected asm: the mask is folded into vpermi2ps {%k1} {z}, followed by a
; register copy into ymm0.
3325define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
3326; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
3327; CHECK:       # %bb.0:
3328; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3329; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
3330; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3331; CHECK-NEXT:    vcmpeqps %ymm3, %ymm0, %k1
3332; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3333; CHECK-NEXT:    vmovaps %ymm1, %ymm0
3334; CHECK-NEXT:    retq
3335  %vec = load <16 x float>, <16 x float>* %vp
3336  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3337  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3338  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3339  ret <8 x float> %res
3340}
3341
; Unmasked extract-shuffle from memory: loads a 16 x float from %vp and returns
; its elements <14,6,7,11> as a 4 x float.
; Expected asm: vpermpd $231 on the high half gathers elements 14,15,10,11 into
; the low xmm, then a 128-bit vpermi2ps combines them with 16(%rdi)
; (elements 4..7).
3342define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
3343; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
3344; CHECK:       # %bb.0:
3345; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3]
3346; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [0,6,7,3]
3347; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm1, %xmm0
3348; CHECK-NEXT:    vzeroupper
3349; CHECK-NEXT:    retq
3350  %vec = load <16 x float>, <16 x float>* %vp
3351  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3352  ret <4 x float> %res
3353}
; Merge-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <14,6,7,11>. Lanes where %mask compares oeq to 0.0 get the
; shuffled value; the other lanes keep %vec2.
; Expected asm: vpermpd + 128-bit vpermi2ps, then a %k1-masked vmovaps into
; %vec2's register.
3354define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3355; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3356; CHECK:       # %bb.0:
3357; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3358; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [0,6,7,3]
3359; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm3
3360; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3361; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3362; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
3363; CHECK-NEXT:    vzeroupper
3364; CHECK-NEXT:    retq
3365  %vec = load <16 x float>, <16 x float>* %vp
3366  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3367  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3368  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3369  ret <4 x float> %res
3370}
3371
; Zero-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <14,6,7,11>. Lanes where %mask compares oeq to 0.0 get the
; shuffled value; the other lanes are zero.
; Expected asm: vpermpd, then the mask is folded into the 128-bit
; vpermi2ps {%k1} {z}.
3372define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
3373; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3374; CHECK:       # %bb.0:
3375; CHECK-NEXT:    vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3376; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [0,6,7,3]
3377; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3378; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
3379; CHECK-NEXT:    vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3380; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3381; CHECK-NEXT:    vzeroupper
3382; CHECK-NEXT:    retq
3383  %vec = load <16 x float>, <16 x float>* %vp
3384  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3385  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3386  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3387  ret <4 x float> %res
3388}
3389
; Merge-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <8,2,14,7>. Lanes where %mask compares oeq to 0.0 get the
; shuffled value; the other lanes keep %vec2.
; Expected asm: 256-bit vpermi2ps (upper four indices undefined), then a
; %k1-masked xmm vmovaps into %vec2's register.
3390define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3391; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
3392; CHECK:       # %bb.0:
3393; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3394; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u>
3395; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
3396; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3397; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3398; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
3399; CHECK-NEXT:    vzeroupper
3400; CHECK-NEXT:    retq
3401  %vec = load <16 x float>, <16 x float>* %vp
3402  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3403  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3404  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3405  ret <4 x float> %res
3406}
3407
; Zero-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <8,2,14,7>. Lanes where %mask compares oeq to 0.0 get the
; shuffled value; the other lanes are zero.
; Expected asm: the mask is folded into the 256-bit vpermi2ps {%k1} {z}; the
; low xmm is then copied to xmm0.
3408define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
3409; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
3410; CHECK:       # %bb.0:
3411; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3412; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u>
3413; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3414; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
3415; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3416; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3417; CHECK-NEXT:    vzeroupper
3418; CHECK-NEXT:    retq
3419  %vec = load <16 x float>, <16 x float>* %vp
3420  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3421  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3422  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3423  ret <4 x float> %res
3424}
3425
; Merge-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <12,6,12,6>. Lanes where %mask compares oeq to 0.0 get the
; shuffled value; the other lanes keep %vec2.
; Expected asm: the repeating index pair is materialised with vbroadcastsd;
; 60129542148 == 0x0000000E00000004, i.e. the dword indices [4,14].
3426define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3427; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
3428; CHECK:       # %bb.0:
3429; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3430; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [60129542148,60129542148,60129542148,60129542148]
3431; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm3
3432; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3433; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3434; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
3435; CHECK-NEXT:    vzeroupper
3436; CHECK-NEXT:    retq
3437  %vec = load <16 x float>, <16 x float>* %vp
3438  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3439  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3440  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3441  ret <4 x float> %res
3442}
3443
; Zero-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <12,6,12,6>. Lanes where %mask compares oeq to 0.0 get the
; shuffled value; the other lanes are zero.
; Expected asm: vbroadcastsd index constant (60129542148 ==
; 0x0000000E00000004, dword indices [4,14]) feeding vpermi2ps {%k1} {z}.
3444define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
3445; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
3446; CHECK:       # %bb.0:
3447; CHECK-NEXT:    vmovaps 32(%rdi), %ymm2
3448; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [60129542148,60129542148,60129542148,60129542148]
3449; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3450; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
3451; CHECK-NEXT:    vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3452; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3453; CHECK-NEXT:    vzeroupper
3454; CHECK-NEXT:    retq
3455  %vec = load <16 x float>, <16 x float>* %vp
3456  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3457  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3458  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3459  ret <4 x float> %res
3460}
3461
; Unmasked extract-shuffle from memory: loads a 16 x float from %vp and returns
; its elements <3,3,15,9> as a 4 x float.
; Expected asm: vpermt2ps with the low half in a register and the high half
; (32(%rdi)) as memory operand, so the indices equal the IR mask.
3462define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
3463; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
3464; CHECK:       # %bb.0:
3465; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [3,3,15,9]
3466; CHECK-NEXT:    vmovaps (%rdi), %ymm0
3467; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm1, %ymm0
3468; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3469; CHECK-NEXT:    vzeroupper
3470; CHECK-NEXT:    retq
3471  %vec = load <16 x float>, <16 x float>* %vp
3472  %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3473  ret <4 x float> %res
3474}
; Merge-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <3,3,15,9>. Lanes where %mask compares oeq to 0.0 get the
; shuffled value; the other lanes keep %vec2.
; Expected asm: unmasked vpermt2ps, then a %k1-masked xmm vmovaps into %vec2's
; register.
3475define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3476; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
3477; CHECK:       # %bb.0:
3478; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3479; CHECK-NEXT:    vmovaps (%rdi), %ymm3
3480; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm2, %ymm3
3481; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
3482; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
3483; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1}
3484; CHECK-NEXT:    vzeroupper
3485; CHECK-NEXT:    retq
3486  %vec = load <16 x float>, <16 x float>* %vp
3487  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3488  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3489  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3490  ret <4 x float> %res
3491}
3492
; Zero-masked extract-shuffle from memory: loads a 16 x float from %vp and
; takes elements <3,3,15,9>. Lanes where %mask compares oeq to 0.0 get the
; shuffled value; the other lanes are zero.
; Expected asm: the mask is folded into vpermt2ps {%k1} {z}; the low xmm is
; then copied to xmm0.
3493define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
3494; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
3495; CHECK:       # %bb.0:
3496; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3497; CHECK-NEXT:    vmovaps (%rdi), %ymm1
3498; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
3499; CHECK-NEXT:    vcmpeqps %xmm3, %xmm0, %k1
3500; CHECK-NEXT:    vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3501; CHECK-NEXT:    vmovaps %xmm1, %xmm0
3502; CHECK-NEXT:    vzeroupper
3503; CHECK-NEXT:    retq
3504  %vec = load <16 x float>, <16 x float>* %vp
3505  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3506  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3507  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3508  ret <4 x float> %res
3509}
3510
; Unmasked extract-shuffle: returns elements <2,0> of the 4 x double %vec as a
; 2 x double.
; Expected asm: one immediate-form vpermpd on ymm; the result is its low xmm.
3511define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
3512; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
3513; CHECK:       # %bb.0:
3514; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3515; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3516; CHECK-NEXT:    vzeroupper
3517; CHECK-NEXT:    retq
3518  %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3519  ret <2 x double> %res
3520}
; Merge-masked extract-shuffle: takes elements <2,0> of the 4 x double %vec.
; Lanes where %mask compares oeq to 0.0 get the shuffled value; the other
; lanes keep %vec2.
; Expected asm: unmasked vpermpd, then a %k1-controlled vblendmpd on xmm.
3521define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3522; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
3523; CHECK:       # %bb.0:
3524; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3525; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3526; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
3527; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3528; CHECK-NEXT:    vzeroupper
3529; CHECK-NEXT:    retq
3530  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3531  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3532  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3533  ret <2 x double> %res
3534}
3535
; Zero-masked extract-shuffle: takes elements <2,0> of the 4 x double %vec.
; Lanes where %mask compares oeq to 0.0 get the shuffled value; the other
; lanes are zero.
; Expected asm: the mask is folded into a single vpermpd {%k1} {z}.
3536define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
3537; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
3538; CHECK:       # %bb.0:
3539; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3540; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
3541; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
3542; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3543; CHECK-NEXT:    vzeroupper
3544; CHECK-NEXT:    retq
3545  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3546  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3547  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3548  ret <2 x double> %res
3549}
; Merge-masked extract-shuffle: takes elements <1,3> of the 4 x double %vec.
; Lanes where %mask compares oeq to 0.0 get the shuffled value; the other
; lanes keep %vec2.
; Expected asm: unmasked vpermpd, then a %k1-controlled vblendmpd on xmm.
3550define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3551; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
3552; CHECK:       # %bb.0:
3553; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
3554; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3555; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
3556; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3557; CHECK-NEXT:    vzeroupper
3558; CHECK-NEXT:    retq
3559  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3560  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3561  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3562  ret <2 x double> %res
3563}
3564
; Zero-masked extract-shuffle: takes elements <1,3> of the 4 x double %vec.
; Lanes where %mask compares oeq to 0.0 get the shuffled value; the other
; lanes are zero.
; Expected asm: the mask is folded into a single vpermpd {%k1} {z}.
3565define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
3566; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
3567; CHECK:       # %bb.0:
3568; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3569; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
3570; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3]
3571; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
3572; CHECK-NEXT:    vzeroupper
3573; CHECK-NEXT:    retq
3574  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3575  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3576  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3577  ret <2 x double> %res
3578}
; Unmasked extract-shuffle from memory: loads a 4 x double from %vp and returns
; its elements <2,1> as a 2 x double.
; Expected asm: load of the low 128 bits, then vblendps $3 taking the first
; 64 bits from 16(%rdi) (element 2) and keeping element 1 from the register.
3579define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
3580; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
3581; CHECK:       # %bb.0:
3582; CHECK-NEXT:    vmovaps (%rdi), %xmm0
3583; CHECK-NEXT:    vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
3584; CHECK-NEXT:    retq
3585  %vec = load <4 x double>, <4 x double>* %vp
3586  %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3587  ret <2 x double> %res
3588}
; Merge-masked extract-shuffle from memory: loads a 4 x double from %vp and
; takes elements <2,1>. Lanes where %mask compares oeq to 0.0 get the shuffled
; value; the other lanes keep %vec2.
; Expected asm: vblendpd $1 with 16(%rdi), then a %k1-masked vmovapd into
; %vec2's register.
3589define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
3590; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
3591; CHECK:       # %bb.0:
3592; CHECK-NEXT:    vmovapd (%rdi), %xmm2
3593; CHECK-NEXT:    vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
3594; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3595; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
3596; CHECK-NEXT:    vmovapd %xmm2, %xmm0 {%k1}
3597; CHECK-NEXT:    retq
3598  %vec = load <4 x double>, <4 x double>* %vp
3599  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3600  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3601  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3602  ret <2 x double> %res
3603}
3604
; Zero-masked extract-shuffle from memory: loads a 4 x double from %vp and
; takes elements <2,1>. Lanes where %mask compares oeq to 0.0 get the shuffled
; value; the other lanes are zero.
; Expected asm: vblendpd $1 with 16(%rdi), then a zero-masked vmovapd.
3605define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
3606; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
3607; CHECK:       # %bb.0:
3608; CHECK-NEXT:    vmovapd (%rdi), %xmm1
3609; CHECK-NEXT:    vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
3610; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3611; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
3612; CHECK-NEXT:    vmovapd %xmm1, %xmm0 {%k1} {z}
3613; CHECK-NEXT:    retq
3614  %vec = load <4 x double>, <4 x double>* %vp
3615  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3616  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3617  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3618  ret <2 x double> %res
3619}
3620
; Merge-masked extract-shuffle from memory: loads a 4 x double from %vp and
; takes elements <2,0>. Lanes where %mask compares oeq to 0.0 get the shuffled
; value; the other lanes keep %vec2.
; Expected asm: the mask is folded into a vunpcklpd whose register source is
; the high half (16(%rdi)) and whose memory operand is (%rdi).
3621define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
3622; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
3623; CHECK:       # %bb.0:
3624; CHECK-NEXT:    vmovapd 16(%rdi), %xmm2
3625; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3626; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
3627; CHECK-NEXT:    vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
3628; CHECK-NEXT:    retq
3629  %vec = load <4 x double>, <4 x double>* %vp
3630  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3631  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3632  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3633  ret <2 x double> %res
3634}
3635
; Zero-masked extract-shuffle from memory: loads a 4 x double from %vp and
; takes elements <2,0>. Lanes where %mask compares oeq to 0.0 get the shuffled
; value; the other lanes are zero.
; Expected asm: the mask is folded into vunpcklpd {%k1} {z} with (%rdi) as
; memory operand.
3636define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
3637; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
3638; CHECK:       # %bb.0:
3639; CHECK-NEXT:    vmovapd 16(%rdi), %xmm1
3640; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3641; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
3642; CHECK-NEXT:    vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
3643; CHECK-NEXT:    retq
3644  %vec = load <4 x double>, <4 x double>* %vp
3645  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3646  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3647  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3648  ret <2 x double> %res
3649}
3650
; Unmasked extract-shuffle: returns elements <7,3,7,3> of the 8 x double %vec
; as a 4 x double.
; Expected asm: a broadcast index vector feeding a full-width vpermq; the
; result is the low ymm.
3651define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
3652; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
3653; CHECK:       # %bb.0:
3654; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [7,3,7,3,7,3,7,3]
3655; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3656; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0
3657; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3658; CHECK-NEXT:    retq
3659  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3660  ret <4 x double> %res
3661}
; Merge-masked extract-shuffle: takes elements <7,3,7,3> of the 8 x double
; %vec. Lanes where %mask compares oeq to 0.0 get the shuffled value; the
; other lanes keep %vec2.
; Expected asm: unmasked full-width vpermq, then a %k1-controlled vblendmpd
; on ymm.
3662define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3663; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
3664; CHECK:       # %bb.0:
3665; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [7,3,7,3,7,3,7,3]
3666; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3667; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
3668; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3669; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
3670; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3671; CHECK-NEXT:    retq
3672  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3673  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3674  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3675  ret <4 x double> %res
3676}
3677
; Zero-masked extract-shuffle: takes elements <7,3,7,3> of the 8 x double
; %vec. Lanes where %mask compares oeq to 0.0 get the shuffled value; the
; other lanes are zero.
; Expected asm: the mask is folded into a full-width vpermpd {%k1} {z}.
3678define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
3679; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
3680; CHECK:       # %bb.0:
3681; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,3,7,3]
3682; CHECK-NEXT:    # ymm2 = mem[0,1,0,1]
3683; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3684; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
3685; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3686; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3687; CHECK-NEXT:    retq
3688  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3689  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3690  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3691  ret <4 x double> %res
3692}
; Merge-masked extract-shuffle: takes elements <2,0,7,6> of the 8 x double
; %vec. Lanes where %mask compares oeq to 0.0 get the shuffled value; the
; other lanes keep %vec2.
; Expected asm: unmasked full-width vpermq, then a %k1-controlled vblendmpd
; on ymm.
3693define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3694; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
3695; CHECK:       # %bb.0:
3696; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,7,6,2,0,7,6]
3697; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
3698; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
3699; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3700; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
3701; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3702; CHECK-NEXT:    retq
3703  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3704  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3705  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3706  ret <4 x double> %res
3707}
3708
; Zero-masked extract-shuffle: takes elements <2,0,7,6> of the 8 x double
; %vec. Lanes where %mask compares oeq to 0.0 get the shuffled value; the
; other lanes are zero.
; Expected asm: the mask is folded into a full-width vpermpd {%k1} {z}.
3709define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
3710; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
3711; CHECK:       # %bb.0:
3712; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [2,0,7,6]
3713; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3714; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
3715; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3716; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3717; CHECK-NEXT:    retq
3718  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3719  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3720  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3721  ret <4 x double> %res
3722}
; Merge-masked partial permute: picks elements <2,3,2,0> of the <8 x double>
; %vec (all indices fall in the low four elements). Lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes take %vec2.
3723define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3724; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
3725; CHECK:       # %bb.0:
3726; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3727; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
3728; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
3729; CHECK-NEXT:    vmovapd %ymm1, %ymm0
3730; CHECK-NEXT:    retq
3731  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3732  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3733  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3734  ret <4 x double> %res
3735}
3736
; Zero-masked partial permute: picks elements <2,3,2,0> of the <8 x double>
; %vec (all indices fall in the low four elements). Lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
3737define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
3738; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
3739; CHECK:       # %bb.0:
3740; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
3741; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
3742; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
3743; CHECK-NEXT:    retq
3744  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3745  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3746  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3747  ret <4 x double> %res
3748}
; Unmasked partial permute: returns elements <0,2,1,4> of the <8 x double>
; %vec as a <4 x double>.
3749define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
3750; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
3751; CHECK:       # %bb.0:
3752; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,2,1,4,0,2,1,4]
3753; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
3754; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0
3755; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3756; CHECK-NEXT:    retq
3757  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3758  ret <4 x double> %res
3759}
; Merge-masked partial permute: picks elements <0,2,1,4> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes take %vec2.
3760define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3761; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
3762; CHECK:       # %bb.0:
3763; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,0,2,1,4]
3764; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
3765; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
3766; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3767; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
3768; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3769; CHECK-NEXT:    retq
3770  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3771  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3772  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3773  ret <4 x double> %res
3774}
3775
; Zero-masked partial permute: picks elements <0,2,1,4> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes are zero.
3776define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
3777; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
3778; CHECK:       # %bb.0:
3779; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [0,2,1,4]
3780; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3781; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
3782; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3783; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3784; CHECK-NEXT:    retq
3785  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3786  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3787  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3788  ret <4 x double> %res
3789}
; Merge-masked partial permute: picks elements <1,1,5,5> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes take %vec2.
3790define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3791; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
3792; CHECK:       # %bb.0:
3793; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
3794; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,5,5]
3795; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
3796; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
3797; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
3798; CHECK-NEXT:    vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3799; CHECK-NEXT:    retq
3800  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
3801  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3802  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3803  ret <4 x double> %res
3804}
3805
; Zero-masked partial permute: picks elements <1,1,5,5> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes are zero.
3806define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
3807; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
3808; CHECK:       # %bb.0:
3809; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
3810; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [1,1,5,5]
3811; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
3812; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
3813; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3814; CHECK-NEXT:    vmovapd %ymm2, %ymm0
3815; CHECK-NEXT:    retq
3816  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
3817  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3818  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3819  ret <4 x double> %res
3820}
; Merge-masked partial permute: picks elements <2,6,2,2> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes take %vec2.
3821define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3822; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
3823; CHECK:       # %bb.0:
3824; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [2,6,2,2,2,6,2,2]
3825; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
3826; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
3827; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3828; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
3829; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3830; CHECK-NEXT:    retq
3831  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
3832  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3833  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3834  ret <4 x double> %res
3835}
3836
; Zero-masked partial permute: picks elements <2,6,2,2> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes are zero.
3837define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
3838; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
3839; CHECK:       # %bb.0:
3840; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [2,6,2,2]
3841; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3842; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
3843; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3844; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3845; CHECK-NEXT:    retq
3846  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
3847  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3848  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3849  ret <4 x double> %res
3850}
; Unmasked partial permute: returns elements <5,0,7,0> of the <8 x double>
; %vec as a <4 x double>.
; NOTE(review): the expected index vector is [5,8,7,8] rather than [5,0,7,0];
; both vpermt2pd sources are %zmm0, so index 8 presumably addresses element 0
; of the second source -- confirm against the instruction reference.
3851define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
3852; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
3853; CHECK:       # %bb.0:
3854; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [5,8,7,8,5,8,7,8]
3855; CHECK-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
3856; CHECK-NEXT:    vpermt2pd %zmm0, %zmm1, %zmm0
3857; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3858; CHECK-NEXT:    retq
3859  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3860  ret <4 x double> %res
3861}
; Merge-masked partial permute: picks elements <5,0,7,0> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes take %vec2.
; NOTE(review): the expected index vector is [5,8,7,8]; both vpermi2pd sources
; are %zmm0, so index 8 presumably addresses element 0 of the second source.
3862define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3863; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
3864; CHECK:       # %bb.0:
3865; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [5,8,7,8,5,8,7,8]
3866; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
3867; CHECK-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm3
3868; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
3869; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
3870; CHECK-NEXT:    vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
3871; CHECK-NEXT:    retq
3872  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3873  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3874  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3875  ret <4 x double> %res
3876}
3877
; Zero-masked partial permute: picks elements <5,0,7,0> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes are zero.
; NOTE(review): the expected index vector is [5,8,7,8]; both vpermt2pd sources
; are %zmm0, so index 8 presumably addresses element 0 of the second source.
3878define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
3879; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
3880; CHECK:       # %bb.0:
3881; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [5,8,7,8]
3882; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3883; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
3884; CHECK-NEXT:    vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z}
3885; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3886; CHECK-NEXT:    retq
3887  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3888  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3889  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3890  ret <4 x double> %res
3891}
; Merge-masked partial permute: picks elements <3,5,0,6> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes take %vec2.
3892define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3893; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
3894; CHECK:       # %bb.0:
3895; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,5,0,6,3,5,0,6]
3896; CHECK-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
3897; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
3898; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3
3899; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
3900; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3901; CHECK-NEXT:    retq
3902  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
3903  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3904  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3905  ret <4 x double> %res
3906}
3907
; Zero-masked partial permute: picks elements <3,5,0,6> of the <8 x double>
; %vec. Lanes where %mask compares ordered-equal to zero take the shuffled
; value; all other lanes are zero.
3908define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
3909; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
3910; CHECK:       # %bb.0:
3911; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [3,5,0,6]
3912; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3913; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
3914; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3915; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
3916; CHECK-NEXT:    retq
3917  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
3918  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3919  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3920  ret <4 x double> %res
3921}
; Unmasked partial permute: returns elements <0,6> of the <8 x double> %vec
; as a <2 x double>.
3922define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
3923; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
3924; CHECK:       # %bb.0:
3925; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
3926; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
3927; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3928; CHECK-NEXT:    vzeroupper
3929; CHECK-NEXT:    retq
3930  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
3931  ret <2 x double> %res
3932}
; Merge-masked partial permute: picks elements <0,6> of the <8 x double> %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value;
; all other lanes take %vec2.
3933define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3934; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
3935; CHECK:       # %bb.0:
3936; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
3937; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm3
3938; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
3939; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
3940; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
3941; CHECK-NEXT:    vmovapd %xmm1, %xmm0
3942; CHECK-NEXT:    vzeroupper
3943; CHECK-NEXT:    retq
3944  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
3945  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3946  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3947  ret <2 x double> %res
3948}
3949
; Zero-masked partial permute: picks elements <0,6> of the <8 x double> %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value;
; all other lanes are zero.
3950define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
3951; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
3952; CHECK:       # %bb.0:
3953; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
3954; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm2
3955; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3956; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
3957; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
3958; CHECK-NEXT:    vzeroupper
3959; CHECK-NEXT:    retq
3960  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
3961  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3962  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3963  ret <2 x double> %res
3964}
; Merge-masked partial permute: picks elements <3,7> of the <8 x double> %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value;
; all other lanes take %vec2.
3965define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3966; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
3967; CHECK:       # %bb.0:
3968; CHECK-NEXT:    vmovapd {{.*#+}} xmm3 = [3,7]
3969; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
3970; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3971; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
3972; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3973; CHECK-NEXT:    vzeroupper
3974; CHECK-NEXT:    retq
3975  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
3976  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3977  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3978  ret <2 x double> %res
3979}
3980
; Zero-masked partial permute: picks elements <3,7> of the <8 x double> %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value;
; all other lanes are zero.
3981define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
3982; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
3983; CHECK:       # %bb.0:
3984; CHECK-NEXT:    vmovapd {{.*#+}} xmm2 = [3,7]
3985; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
3986; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
3987; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3988; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3989; CHECK-NEXT:    vzeroupper
3990; CHECK-NEXT:    retq
3991  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
3992  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3993  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3994  ret <2 x double> %res
3995}
; Unmasked partial permute from memory: loads an <8 x double> from %vp and
; returns its elements <1,6,7,2> as a <4 x double>.
3996define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
3997; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
3998; CHECK:       # %bb.0:
3999; CHECK-NEXT:    vmovapd (%rdi), %ymm1
4000; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [1,6,7,2]
4001; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
4002; CHECK-NEXT:    retq
4003  %vec = load <8 x double>, <8 x double>* %vp
4004  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4005  ret <4 x double> %res
4006}
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <1,6,7,2>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes take %vec2.
4007define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4008; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
4009; CHECK:       # %bb.0:
4010; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4011; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,6,7,2]
4012; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
4013; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4014; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4015; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4016; CHECK-NEXT:    retq
4017  %vec = load <8 x double>, <8 x double>* %vp
4018  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4019  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4020  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4021  ret <4 x double> %res
4022}
4023
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <1,6,7,2>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes are zero.
4024define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
4025; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
4026; CHECK:       # %bb.0:
4027; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4028; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [1,6,7,2]
4029; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4030; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4031; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4032; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4033; CHECK-NEXT:    retq
4034  %vec = load <8 x double>, <8 x double>* %vp
4035  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4036  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4037  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4038  ret <4 x double> %res
4039}
4040
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <3,4,2,4>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes take %vec2.
; NOTE(review): the expected index vector is [3,4,2,6], not [3,4,2,4]; the
; second vpermi2pd source is a {1to4} broadcast of 32(%rdi), so presumably any
; second-source index yields the same element -- confirm.
4041define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4042; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4043; CHECK:       # %bb.0:
4044; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4045; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
4046; CHECK-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
4047; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4048; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4049; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4050; CHECK-NEXT:    retq
4051  %vec = load <8 x double>, <8 x double>* %vp
4052  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4053  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4054  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4055  ret <4 x double> %res
4056}
4057
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <3,4,2,4>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes are zero.
; NOTE(review): the expected index vector is [3,4,2,6], not [3,4,2,4]; the
; second vpermi2pd source is a {1to4} broadcast of 32(%rdi), so presumably any
; second-source index yields the same element -- confirm.
4058define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
4059; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4060; CHECK:       # %bb.0:
4061; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4062; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
4063; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4064; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4065; CHECK-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
4066; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4067; CHECK-NEXT:    retq
4068  %vec = load <8 x double>, <8 x double>* %vp
4069  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4070  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4071  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4072  ret <4 x double> %res
4073}
4074
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <1,2,3,4>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes take %vec2.
4075define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4076; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4077; CHECK:       # %bb.0:
4078; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4079; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [1,2,3,4]
4080; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
4081; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4082; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4083; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4084; CHECK-NEXT:    retq
4085  %vec = load <8 x double>, <8 x double>* %vp
4086  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4087  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4088  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4089  ret <4 x double> %res
4090}
4091
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <1,2,3,4>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes are zero.
4092define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
4093; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4094; CHECK:       # %bb.0:
4095; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4096; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [1,2,3,4]
4097; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4098; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4099; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4100; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4101; CHECK-NEXT:    retq
4102  %vec = load <8 x double>, <8 x double>* %vp
4103  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4104  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4105  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4106  ret <4 x double> %res
4107}
4108
; Unmasked partial permute from memory: loads an <8 x double> from %vp and
; returns its elements <4,2,1,0> as a <4 x double>.
4109define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
4110; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
4111; CHECK:       # %bb.0:
4112; CHECK-NEXT:    vmovapd (%rdi), %ymm1
4113; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [4,2,1,0]
4114; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm1, %ymm0
4115; CHECK-NEXT:    retq
4116  %vec = load <8 x double>, <8 x double>* %vp
4117  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4118  ret <4 x double> %res
4119}
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <4,2,1,0>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes take %vec2.
4120define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4121; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
4122; CHECK:       # %bb.0:
4123; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4124; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [4,2,1,0]
4125; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm3
4126; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4127; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4128; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4129; CHECK-NEXT:    retq
4130  %vec = load <8 x double>, <8 x double>* %vp
4131  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4132  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4133  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4134  ret <4 x double> %res
4135}
4136
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <4,2,1,0>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes are zero.
4137define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
4138; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
4139; CHECK:       # %bb.0:
4140; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4141; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [4,2,1,0]
4142; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4143; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4144; CHECK-NEXT:    vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4145; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4146; CHECK-NEXT:    retq
4147  %vec = load <8 x double>, <8 x double>* %vp
4148  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4149  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4150  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4151  ret <4 x double> %res
4152}
4153
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <6,0,5,1>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes take %vec2.
; The expected code loads the upper half (32(%rdi)) into a register and uses
; (%rdi) as the memory operand, so the index vector [2,4,1,5] differs from the
; IR mask.
4154define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4155; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
4156; CHECK:       # %bb.0:
4157; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
4158; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [2,4,1,5]
4159; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
4160; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4161; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4162; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4163; CHECK-NEXT:    retq
4164  %vec = load <8 x double>, <8 x double>* %vp
4165  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4166  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4167  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4168  ret <4 x double> %res
4169}
4170
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <6,0,5,1>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes are zero.
; The expected code loads the upper half (32(%rdi)) into a register and uses
; (%rdi) as the memory operand, so the index vector [2,4,1,5] differs from the
; IR mask.
4171define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
4172; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
4173; CHECK:       # %bb.0:
4174; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
4175; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [2,4,1,5]
4176; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4177; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4178; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4179; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4180; CHECK-NEXT:    retq
4181  %vec = load <8 x double>, <8 x double>* %vp
4182  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4183  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4184  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4185  ret <4 x double> %res
4186}
4187
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <2,5,5,5>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes take %vec2.
; The expected code uses no index vector: a 128-bit lane permute followed by a
; masked vshufpd whose memory operand is element 5 (offset 40) broadcast.
4188define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4189; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
4190; CHECK:       # %bb.0:
4191; CHECK-NEXT:    vmovapd (%rdi), %ymm2
4192; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
4193; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4194; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4195; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4196; CHECK-NEXT:    retq
4197  %vec = load <8 x double>, <8 x double>* %vp
4198  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4199  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4200  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4201  ret <4 x double> %res
4202}
4203
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <2,5,5,5>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes are zero.
; The expected code uses no index vector: a 128-bit lane permute followed by a
; masked vshufpd whose memory operand is element 5 (offset 40) broadcast.
4204define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
4205; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
4206; CHECK:       # %bb.0:
4207; CHECK-NEXT:    vmovapd (%rdi), %ymm1
4208; CHECK-NEXT:    vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
4209; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4210; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
4211; CHECK-NEXT:    vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4212; CHECK-NEXT:    retq
4213  %vec = load <8 x double>, <8 x double>* %vp
4214  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4215  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4216  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4217  ret <4 x double> %res
4218}
4219
; Unmasked partial permute from memory: loads an <8 x double> from %vp and
; returns its elements <4,6,0,5> as a <4 x double>.
; The expected code loads the upper half (32(%rdi)) into a register and uses
; (%rdi) as the memory operand, so the index vector [0,2,4,1] differs from the
; IR mask.
4220define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
4221; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
4222; CHECK:       # %bb.0:
4223; CHECK-NEXT:    vmovapd 32(%rdi), %ymm1
4224; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [0,2,4,1]
4225; CHECK-NEXT:    vpermi2pd (%rdi), %ymm1, %ymm0
4226; CHECK-NEXT:    retq
4227  %vec = load <8 x double>, <8 x double>* %vp
4228  %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4229  ret <4 x double> %res
4230}
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <4,6,0,5>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes take %vec2.
; The expected code loads the upper half (32(%rdi)) into a register and uses
; (%rdi) as the memory operand, so the index vector [0,2,4,1] differs from the
; IR mask.
4231define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4232; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
4233; CHECK:       # %bb.0:
4234; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
4235; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [0,2,4,1]
4236; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm3
4237; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4238; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
4239; CHECK-NEXT:    vmovapd %ymm3, %ymm0 {%k1}
4240; CHECK-NEXT:    retq
4241  %vec = load <8 x double>, <8 x double>* %vp
4242  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4243  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4244  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4245  ret <4 x double> %res
4246}
4247
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <4,6,0,5>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes are zero.
; The expected code loads the upper half (32(%rdi)) into a register and uses
; (%rdi) as the memory operand, so the index vector [0,2,4,1] differs from the
; IR mask.
4248define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
4249; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
4250; CHECK:       # %bb.0:
4251; CHECK-NEXT:    vmovapd 32(%rdi), %ymm2
4252; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,4,1]
4253; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4254; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
4255; CHECK-NEXT:    vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4256; CHECK-NEXT:    vmovapd %ymm1, %ymm0
4257; CHECK-NEXT:    retq
4258  %vec = load <8 x double>, <8 x double>* %vp
4259  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4260  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4261  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4262  ret <4 x double> %res
4263}
4264
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <0,5,2,5>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes take %vec2.
; The expected code broadcasts element 5 (offset 40) and blends it with the
; low half from memory, then applies the mask with a masked move.
4265define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4266; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
4267; CHECK:       # %bb.0:
4268; CHECK-NEXT:    vbroadcastsd 40(%rdi), %ymm2
4269; CHECK-NEXT:    vblendpd $5, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0],ymm2[1],mem[2],ymm2[3]
4270; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4271; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm1, %k1
4272; CHECK-NEXT:    vmovapd %ymm2, %ymm0 {%k1}
4273; CHECK-NEXT:    retq
4274  %vec = load <8 x double>, <8 x double>* %vp
4275  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4276  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4277  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4278  ret <4 x double> %res
4279}
4280
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <0,5,2,5>. Lanes where %mask compares ordered-equal to zero
; take the shuffled value; all other lanes are zero.
; The expected code broadcasts element 5 (offset 40) and blends it with the
; low half from memory, then applies the mask with a zeroing masked move.
4281define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
4282; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
4283; CHECK:       # %bb.0:
4284; CHECK-NEXT:    vbroadcastsd 40(%rdi), %ymm1
4285; CHECK-NEXT:    vblendpd $5, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0],ymm1[1],mem[2],ymm1[3]
4286; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4287; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm0, %k1
4288; CHECK-NEXT:    vmovapd %ymm1, %ymm0 {%k1} {z}
4289; CHECK-NEXT:    retq
4290  %vec = load <8 x double>, <8 x double>* %vp
4291  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4292  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4293  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4294  ret <4 x double> %res
4295}
4296
; Unmasked partial permute from memory: loads an <8 x double> from %vp and
; returns its elements <1,6> as a <2 x double>.
4297define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
4298; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
4299; CHECK:       # %bb.0:
4300; CHECK-NEXT:    vmovapd (%rdi), %xmm0
4301; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
4302; CHECK-NEXT:    retq
4303  %vec = load <8 x double>, <8 x double>* %vp
4304  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4305  ret <2 x double> %res
4306}
; Merge-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <1,6>. Lanes where %mask compares ordered-equal to zero take
; the shuffled value; all other lanes take %vec2.
4307define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
4308; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
4309; CHECK:       # %bb.0:
4310; CHECK-NEXT:    vmovapd (%rdi), %xmm2
4311; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4312; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
4313; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
4314; CHECK-NEXT:    retq
4315  %vec = load <8 x double>, <8 x double>* %vp
4316  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4317  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4318  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4319  ret <2 x double> %res
4320}
4321
; Zero-masked partial permute from memory: loads an <8 x double> from %vp and
; picks elements <1,6>. Lanes where %mask compares ordered-equal to zero take
; the shuffled value; all other lanes are zero.
4322define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
4323; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
4324; CHECK:       # %bb.0:
4325; CHECK-NEXT:    vmovapd (%rdi), %xmm1
4326; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4327; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
4328; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
4329; CHECK-NEXT:    retq
4330  %vec = load <8 x double>, <8 x double>* %vp
4331  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4332  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4333  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4334  ret <2 x double> %res
4335}
4336
; Merge-masked extraction of elements 1 and 4 from an <8 x double> loaded from
; %vp. Lanes whose %mask element compares ordered-equal to 0.0 receive the
; shuffled value; the others keep the corresponding element of %vec2. The
; expected assembly loads element 1 from byte offset 8 and pairs it with
; element 4 read from byte offset 32 (4 * 8) under the %k1 predicate.
4337define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
4338; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
4339; CHECK:       # %bb.0:
4340; CHECK-NEXT:    vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
4341; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
4342; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
4343; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
4344; CHECK-NEXT:    retq
4345  %vec = load <8 x double>, <8 x double>* %vp
  ; Result lane 0 = source element 1, result lane 1 = source element 4.
4346  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
  ; Per-lane predicate: true where the %mask element is ordered-equal to zero.
4347  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  ; True lanes take the shuffle result; false lanes pass %vec2 through.
4348  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4349  ret <2 x double> %res
4350}
4351
; Zero-masked extraction of elements 1 and 4 from an <8 x double> loaded from
; %vp. Lanes whose %mask element compares ordered-equal to 0.0 receive the
; shuffled value; all other lanes are zero. The expected assembly mirrors the
; merge-masked form of this shuffle but uses zeroing ({z}) on the final
; instruction.
4352define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
4353; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
4354; CHECK:       # %bb.0:
4355; CHECK-NEXT:    vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
4356; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
4357; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
4358; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
4359; CHECK-NEXT:    retq
4360  %vec = load <8 x double>, <8 x double>* %vp
  ; Result lane 0 = source element 1, result lane 1 = source element 4.
4361  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
  ; Per-lane predicate: true where the %mask element is ordered-equal to zero.
4362  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  ; True lanes take the shuffle result; false lanes are zeroed.
4363  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4364  ret <2 x double> %res
4365}
4366
4367; PR35977
; Loads an <8 x i8> from %arg, widens every byte to i16 one lane at a time
; (extractelement / zext / insertelement), shifts each 16-bit lane left by 8
; and stores the <8 x i16> result to %arg1. The expected assembly shows the
; whole scalarized chain collapsing into a single zero-extending vector load
; followed by one vector shift and one store.
4368define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
4369; CHECK-LABEL: test_zext_v8i8_to_v8i16:
4370; CHECK:       # %bb.0:
4371; CHECK-NEXT:    vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
4372; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
4373; CHECK-NEXT:    vmovdqa %xmm0, (%rsi)
4374; CHECK-NEXT:    retq
  ; GEP with index 0: %tmp addresses the same location as %arg.
4375  %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
4376  %tmp2 = load <8 x i8>, <8 x i8>* %tmp
  ; Lane 0: extract the byte, zero-extend to i16, start the result vector.
4377  %tmp3 = extractelement <8 x i8> %tmp2, i32 0
4378  %tmp4 = zext i8 %tmp3 to i16
4379  %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
  ; Lanes 1-7 repeat the same extract / zext / insert pattern, each inserting
  ; into the vector produced by the previous lane.
4380  %tmp6 = extractelement <8 x i8> %tmp2, i32 1
4381  %tmp7 = zext i8 %tmp6 to i16
4382  %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
4383  %tmp9 = extractelement <8 x i8> %tmp2, i32 2
4384  %tmp10 = zext i8 %tmp9 to i16
4385  %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
4386  %tmp12 = extractelement <8 x i8> %tmp2, i32 3
4387  %tmp13 = zext i8 %tmp12 to i16
4388  %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
4389  %tmp15 = extractelement <8 x i8> %tmp2, i32 4
4390  %tmp16 = zext i8 %tmp15 to i16
4391  %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
4392  %tmp18 = extractelement <8 x i8> %tmp2, i32 5
4393  %tmp19 = zext i8 %tmp18 to i16
4394  %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
4395  %tmp21 = extractelement <8 x i8> %tmp2, i32 6
4396  %tmp22 = zext i8 %tmp21 to i16
4397  %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
4398  %tmp24 = extractelement <8 x i8> %tmp2, i32 7
4399  %tmp25 = zext i8 %tmp24 to i16
4400  %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
  ; Shift every 16-bit lane left by 8 bits.
4401  %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ; GEP with index 0: %tmp28 addresses the same location as %arg1.
4402  %tmp28 = getelementptr <8 x i16>, <8 x i16>* %arg1, i32 0
4403  store <8 x i16> %tmp27, <8 x i16>* %tmp28
4404  ret void
4405}
4406